# Alex Liddle

# COMP4448

## Mini Project Sentiment Analysis

In [4]:
import nltk
import string
import re
import sklearn
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from scipy import stats
#nltk.download('stopwords') #<---uncomment if you haven't downloaded the stopwords library
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Import the dataset

In [12]:
df_reviews_raw = pd.read_csv('train_40k.csv')
df_reviews_raw.head()

Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3
0,B000E46LYG,Golden Valley Natural Buffalo Jerky,A3MQDNGHDJU4MK,0/0,3.0,-1,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,B000GRA6N8,Westing Game,unknown,0/0,5.0,860630400,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,B000GRA6N8,Westing Game,unknown,0/0,5.0,883008000,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown
3,B000GRA6N8,Westing Game,unknown,0/0,5.0,897696000,I got the book at my bookfair at school lookin...,toys games,games,unknown
4,B00000DMDQ,I SPY A is For Jigsaw Puzzle 63pc,unknown,2/4,5.0,911865600,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,jigsaw puzzles


All we care about for the purpose of sentiment analysis is the text (our feature) and the score (our label). Furthermore, we'll remove scores of 3.0 and recode 1.0 & 2.0 to 'bad' and 4.0 & 5.0 to 'good'. Lastly, we don't want to consider reviews with few words, so we will arbitrarily filter out reviews with less than 60 words.

### Clean the data

In [30]:
df_reviews = df_reviews_raw[['Text', 'Score']]
df_reviews = df_reviews[(df_reviews.Score < 3.0) | (df_reviews.Score > 3.0)]
df_reviews = df_reviews[df_reviews.Text.str.split().str.len().ge(60)]
df_reviews.replace([1.0, 2.0], 0, inplace=True)
df_reviews.replace([4.0, 5.0], 1, inplace=True)
df_reviews.head()

Unnamed: 0,Text,Score
2,"I am a first year teacher, teaching 5th grade....",1.0
3,I got the book at my bookfair at school lookin...,1.0
4,Hi! I'm Martine Redman and I created this puzz...,1.0
6,The real joy of this movie doesn't lie in its ...,1.0
13,"Parents, don't try to play this game with your...",1.0


### Examine the data

In [31]:
df_reviews.describe()

Unnamed: 0,Score
count,18936.0
mean,0.794149
std,0.404333
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


We want an approximately equal number of good and bad reviews for training purposes, so we'll create an evenly distributed subset by sampling the full dataset.

In [32]:
df_reviews_sampled = df_reviews.groupby('Score').apply(lambda x: x.sample(3000)).reset_index(drop=True)
df_reviews_sampled.describe()

Unnamed: 0,Score
count,6000.0
mean,0.5
std,0.500042
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


Now that the dataset is evenly distributed, it is time to conduct some preprocessing on the text data (i.e., remove stopwords, punctuation, etc.).

### Text Preprocessing

In [None]:
print("Before Preprocessing:")
print(df_reviews.Text.head(1))

tqdm.pandas()
stop = stopwords.words()

df_reviews.Text = df_reviews.Text.str.replace("[^\w\s]", "").str.lower()
df_reviews.Text = df_reviews.Text.progress_apply(lambda x: ' '.join([item for item in x.split() 
                                                               if item not in stop]))

print("After Preprocessing:")
print(df_reviews.Text.head(1))

Before Preprocessing:
2    i am a first year teacher teaching 5th grade i...
Name: Text, dtype: object


 20%|██        | 3820/18936 [00:29<02:17, 109.82it/s]