# Importing and installing

In [None]:
#Pip install required

!pip install datasets
!pip install transformers datasets evaluate
!pip install --upgrade torch torchvision torchaudio
!python -m spacy download en_core_web_lg
!pip install --upgrade torch torchvision torchaudio
!pip install --upgrade evaluate datasets

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
import csv
from sklearn.manifold import TSNE
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

# Load spaCy's large English pipeline (fetched by the `spacy download` step in the install cell).
nlp = spacy.load('en_core_web_lg')
# Download the NLTK data packages: the 'punkt' and 'punkt_tab' resources and the stop-word corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Preprocessing and Cleaning

In [None]:
# Load the Yelp Review Full dataset from the Hugging Face Hub
dataset = load_dataset("yelp_review_full")

# Convert the training split to pandas.
# Dataset.to_pandas() converts the Arrow-backed split directly, instead of
# having pd.DataFrame(...) assemble the frame from individual Python rows.
df = dataset['train'].to_pandas()
display(df.head())




In [None]:
# Sanity-check class balance and missing values on the loaded training frame.
# This cell runs before `df_sample` is created further down, so it must use
# `df`; referencing `df_sample` here raises NameError on a fresh kernel.
label_counts = df['label'].value_counts()
print(label_counts)

null_values = df.isnull().sum()
print(null_values)

In [None]:
#Create function to generally clean text
def clean_text(text):
    """Normalize a raw review string for bag-of-words style processing.

    Steps, in order: turn escaped newlines into spaces, lowercase, drop
    URLs, @mentions and #hashtags, strip ASCII punctuation and digits,
    collapse whitespace runs to a single space, and trim both ends.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned, lowercased text with words separated by single spaces.
    """
    # The Yelp CSVs store line breaks as a literal backslash followed by "n".
    # Replace that two-character sequence with a space *before* punctuation is
    # stripped; otherwise only the backslash is removed and the leftover "n"
    # fuses the neighbouring words together (e.g. "food.\nwill" -> "foodnwill").
    text = re.sub(r"\\n", " ", text)
    text = text.lower()  # lowercase words
    text = re.sub(r"http\S+", "", text)  # Remove links
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # Remove mentions
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)  # Remove hashtags
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r"\s+", " ", text)  # Uniform spaces
    return text.strip()

#Create function to remove stop words

# Lazily-built cache of the English stop-word list. Built on first use so the
# corpus is read once rather than once per token.
_ENGLISH_STOPWORDS = None

def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize``; tokens found in NLTK's English
    stop-word list are dropped and the rest are re-joined with single spaces.

    Args:
        text: The text to filter (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A frozenset gives constant-time membership tests; calling
        # stopwords.words('english') inside the comprehension reloaded the
        # list and scanned it linearly for every single token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in _ENGLISH_STOPWORDS)

#Lemmatize words
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet data packages, then build the shared lemmatizer instance
for wordnet_package in ('wordnet', 'omw-1.4'):
    nltk.download(wordnet_package)
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each token of ``text`` and re-join the results with spaces.

    Tokens are produced by ``word_tokenize`` and each one is passed to the
    module-level ``lemmatizer.lemmatize`` with its default arguments.

    Args:
        text: The text to lemmatize (a ``str``).

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, word_tokenize(text)))

#Apply cleaning

# Run the three cleaning stages in sequence over the full training frame.
df['clean_text'] = df['text'].apply(clean_text)
df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df['clean_text'] = df['clean_text'].apply(lemmatize_text)
# "\n" (not "\C", which is an invalid escape and printed a literal backslash)
print("\nCleaned Words")

#Print head to confirm cleaning
display(df[['text', 'clean_text']].head())

# The 10,000-row subset (df_sample) is created in the next cell; the
# identical statement that used to sit here was a redundant duplicate.

In [None]:
# Draw a reproducible 10,000-row random subset (seed 42) with a fresh 0..n-1 index
df_sample = (
    df
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)

In [None]:
#Cleaning test dataset as well

# Dataset.to_pandas() converts the Arrow-backed test split directly, instead
# of having pd.DataFrame(...) assemble the frame from individual Python rows.
df_test = dataset['test'].to_pandas()

# Same three-stage cleaning pipeline that is applied to the training data.
df_test['clean_text'] = (
    df_test['text']
    .apply(clean_text)
    .apply(remove_stopwords)
    .apply(lemmatize_text)
)

#Print cleaned data
display(df_test[['text', 'clean_text']].head())

# Exploratory Data Analysis (EDA)

In [None]:
# Histogram of review length, measured in whitespace-separated words of the cleaned text
df_sample['text_length'] = df_sample['clean_text'].map(lambda review: len(review.split()))

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_sample['text_length'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title("Distribution of Review Length (in words)")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
plt.show()

# Count every word across the sampled cleaned reviews and chart the 20 most frequent
word_counts = Counter()
for review in df_sample['clean_text']:
    word_counts.update(review.split())

common_df = pd.DataFrame(word_counts.most_common(20), columns=['Word', 'Frequency'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Frequency', y='Word', data=common_df, palette='viridis', ax=ax)
ax.set_title("Top 20 Most Common Words in Reviews")
ax.set_xlabel("Frequency")
ax.set_ylabel("Words")
plt.show()



# Bar chart of the number of sampled reviews per label, ordered by label value
(
    df_sample['label']
    .value_counts()
    .sort_index()
    .plot.bar(title='Label Distribution (Yelp)')
)