In [1]:
import zipfile

import os

import pandas as pd
import numpy as np

import datetime

import re

import nltk

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4') # uses Multilingual Wordnet Data from OMW with newer Wordnet versions

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/r.shahukaru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/r.shahukaru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/r.shahukaru/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
from textblob import TextBlob # to use .correct() method to correct spellings

In [5]:
%%time

with zipfile.ZipFile('../data/txt_reviews.zip') as zip_ref:
    zip_ref.extractall('../data/')

CPU times: user 29.8 s, sys: 1min 53s, total: 2min 23s
Wall time: 7min 31s


In [6]:
files = [f'../data/txt_reviews/{x}' for x in os.listdir('../data/txt_reviews')]

In [7]:
%%time

info = []

for file in files:
    with open(file, 'r') as f:
        lines = f.readlines()
        
        info.append([each_line.split(':')[1].strip() for each_line in lines])

CPU times: user 5.99 s, sys: 26.3 s, total: 32.3 s
Wall time: 1min 56s


In [8]:
with open(files[0], 'r') as f:
    
    lines = f.readlines()
    cols = [each_line.split(':')[0] for each_line in lines]

In [9]:
df = pd.DataFrame(columns = cols)

In [10]:
df['ProductId'] = [information[0] for information in info]

df['UserId'] = [information[1] for information in info]

df['ProfileName'] = [information[2] for information in info]

df['HelpfulnessNumerator'] = [information[3] for information in info]

df['HelpfulnessDenominator'] = [information[4] for information in info]

df['Score'] = [information[5] for information in info]

df['Time'] = [information[6] for information in info]

df['ReviewSummary'] = [information[7] for information in info]

df['ReviewText'] = [information[8] for information in info]

In [11]:
df

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,ReviewSummary,ReviewText
0,B0000GHNUE,A3D7GP8AS2PRIT,S.C.,1,1,5,1268611200,Hot but still flavorful,We had the Green Chile Habanero one at a local...
1,B000TTDDWE,AA1TQ4QJ4Y94P,barb,0,0,5,1267660800,Yummy,I bought this item at Costco at Christmas time...
2,B0000ICLLS,A2OXNQ43JBMAEI,nene,0,0,5,1341619200,candy,This is the best candy that I have ever had an...
3,B000JWGFQC,A2N9T4CS40KDJE,"D. Wilson ""Euro writer""",1,1,3,1189209600,The muffins were a pleasant low carb treat,As the subject said the muffins were a pleasan...
4,B000KFXEYE,A2M9ANEOKBVD2D,Vinegar Jim,3,9,1,1291075200,Surprise...bait and switch.,The company does not operate properly I ordere...
...,...,...,...,...,...,...,...,...,...
568449,B000A0WLFC,A1HH7L6EJI6N0,Paul Fillmore,4,5,4,1230940800,Great gift,I received this as a gift for xmas of 2007 and...
568450,B004M050W2,A3NGUULXOMWAMM,A. Tang,3,4,5,1302393600,Excellent product,I was given a bottle from a friend. Tasted an...
568451,B001VNGK6I,A2RBGOWJO35P93,Marie Flowers,8,9,4,1310601600,How Curcumin is Absorbed Properly,I bought some of Frontier's organic tumeric at...
568452,B000EY5COG,A1W7CVE5I70HVI,"B. Kramer ""Baker Boy""",3,4,5,1196294400,Good Stuff,Honestly I prefer the powder-version of this p...


In [12]:
df.dropna(inplace= True)

In [13]:
df.shape

(568454, 9)

In [14]:
df.drop_duplicates(inplace= True)
df.shape

(568167, 9)

## Splitting into train and test

In [15]:
X = df[[x for x in df.columns if x != 'Score']]
y = df['Score']

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.85, random_state = 100, stratify= y)

In [17]:
X_train

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Time,ReviewSummary,ReviewText
380818,B001CU0N9K,A2FQCGYGQRQ6DR,KathleenP,1,1,1300492800,"Great bang for the ""caloric"" buck!",At only 80 calories these are a great bang for...
21338,B0051WBNG2,AIYYSXZJRYYDR,F. Cohen,0,0,1349222400,My Favorite,"I sent for the tea, because I can't buy it in ..."
368711,B004NSH6O8,A3GFK7F5IUF60X,"Myra Schjelderup ""Ignolopi""",1,1,1320105600,New Love!,I absolutely love these chocolate bars! They c...
141202,B000CQID2Y,AQGZUG5IHZ9X7,"Ibrahim Al-Abdulwahab ""IbrahimZen""",0,0,1308096000,Very fine Chamomile tea,I have been drinking this Chamomile tea for th...
180881,B003VIWN26,A2CN496XAPHDL4,Sue M.,0,0,1302566400,Wellness Simple Solutions Dog Food,My dogs do well on Wellness Simple Solutions; ...
...,...,...,...,...,...,...,...,...
150122,B003M5TG28,A14VQRIH3FDX5K,kittenkatt,5,6,1309910400,"Sickening filler ingredients, high price",I think the ingredients in this food speak for...
295370,B0049ULB78,A3C2Z6EYQQBXZV,Shari Karanas,0,0,1320451200,Bold Flavor,I have tried many k-cups and this one is by fa...
118004,B000084E6V,A37FHDFU0H1G5J,Jaimi DeFeo,6,7,1288656000,Nylabone Durable Dental Dinosaur,My dogs had this chewed apart in the matter of...
303837,B005K4Q37A,A367O562XAG9K9,Goldie,1,1,1327363200,Pretty damn good....but I LIKE the Wawa/Conven...,"First of all, let me say that I LIKE the stuff..."


In [18]:
y_train

380818    5
21338     5
368711    5
141202    5
180881    5
         ..
150122    1
295370    5
118004    1
303837    4
1800      5
Name: Score, Length: 482941, dtype: object

In [19]:
df['Score'].value_counts(normalize= True)

Score
5    0.638773
4    0.141907
1    0.091929
3    0.075003
2    0.052388
Name: proportion, dtype: float64

In [20]:
y_train.value_counts(normalize= True)

Score
5    0.638774
4    0.141908
1    0.091928
3    0.075003
2    0.052387
Name: proportion, dtype: float64

### Creating a function to clean X

In [21]:
def clean_X(X): 
    
    X.drop(['ProductId', 'UserId', 'ProfileName', 'Time', 'HelpfulnessNumerator', 'HelpfulnessDenominator'], axis = 1, inplace = True)

    return X

### Creating a function to clean y

In [22]:
def clean_y(y):
    y_clean = []
    
    for score in y:
        if score in ['1', '2']:
            y_clean.append('bad')
        elif score == '3':
            y_clean.append('neutral')
        else:
            y_clean.append('good')
            
    return pd.Series(y_clean, name= 'score')

## Creating a function to preprocess the data

In [23]:
# initialization
stemmer = PorterStemmer()

In [24]:
# initialization
lemmatizer = WordNetLemmatizer()

In [25]:
def preprocess(text, flag):
    
    #1. Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", text)
    
    #2. Converting to lowercase
    sentence_1 = sentence.lower()
    
    #3. Tokenization (Word-level)
    tokens = sentence_1.split()
    
    #3.1. TextBlob- Correcting the spellings
    tokens_correct_spell = [str(TextBlob(token)) for token in tokens]

    #4. Removing stopwords
    clean_tokens = [token for token in tokens_correct_spell if token not in stopwords.words("english")]

    #5. lemmatization
    if(flag == 'stem'):
        clean_tokens_final = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens_final = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens_final), len(clean_tokens_final)])

# 1. Train data

## 1.1.1. Cleaning X_train

In [26]:
%%time

X_train = clean_X(X_train)
X_train

CPU times: user 31.1 ms, sys: 3.73 ms, total: 34.8 ms
Wall time: 34.2 ms


Unnamed: 0,ReviewSummary,ReviewText
380818,"Great bang for the ""caloric"" buck!",At only 80 calories these are a great bang for...
21338,My Favorite,"I sent for the tea, because I can't buy it in ..."
368711,New Love!,I absolutely love these chocolate bars! They c...
141202,Very fine Chamomile tea,I have been drinking this Chamomile tea for th...
180881,Wellness Simple Solutions Dog Food,My dogs do well on Wellness Simple Solutions; ...
...,...,...
150122,"Sickening filler ingredients, high price",I think the ingredients in this food speak for...
295370,Bold Flavor,I have tried many k-cups and this one is by fa...
118004,Nylabone Durable Dental Dinosaur,My dogs had this chewed apart in the matter of...
303837,Pretty damn good....but I LIKE the Wawa/Conven...,"First of all, let me say that I LIKE the stuff..."


## 1.1.2. Cleaning y_train

In [27]:
y_train = clean_y(y_train)
y_train

0         good
1         good
2         good
3         good
4         good
          ... 
482936     bad
482937    good
482938     bad
482939    good
482940    good
Name: score, Length: 482941, dtype: object

## 1.2 Preprocessing X_train

In [28]:
from tqdm import tqdm, tqdm_notebook

In [29]:
tqdm.pandas()

In [30]:
temp_df_1 = X_train['ReviewSummary'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_1.columns = ['revsum_clean_text_stem', 'revsum_text_length_stem']

100%|█████████████████████████████████| 482941/482941 [01:52<00:00, 4288.44it/s]


In [31]:
temp_df_2 = X_train['ReviewText'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_2.columns = ['revtext_clean_text_stem', 'revtext_text_length_stem']

100%|██████████████████████████████████| 482941/482941 [26:04<00:00, 308.68it/s]


In [32]:
temp_df = pd.concat([temp_df_1, temp_df_2], axis = 1)
temp_df

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
380818,great bang calor buck,4,calori great bang calor buck br low sugar low ...,30
21338,favorit,1,sent tea buy smaller town move glad back shelf,9
368711,new love,2,absolut love chocol bar come nice wrap thin oz...,97
141202,fine chamomil tea,3,drink chamomil tea past year late night need r...,16
180881,well simpl solut dog food,5,dog well well simpl solut great dog allergi pu...,12
...,...,...,...,...
150122,sicken filler ingredi high price,5,think ingredi food speak cheap filler byproduc...,35
295370,bold flavor,2,tri mani k cup one far favorit deep flavor wit...,12
118004,nylabon durabl dental dinosaur,4,dog chew apart matter minut eat small part dog...,11
303837,pretti damn good like wawa conveni store stuff,8,first let say like stuff wawa machin look want...,36


In [33]:
X_train.drop(columns= ['ReviewSummary', 'ReviewText'], inplace = True)

In [34]:
X_train = pd.concat([X_train, temp_df], axis = 1)
X_train

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
380818,great bang calor buck,4,calori great bang calor buck br low sugar low ...,30
21338,favorit,1,sent tea buy smaller town move glad back shelf,9
368711,new love,2,absolut love chocol bar come nice wrap thin oz...,97
141202,fine chamomil tea,3,drink chamomil tea past year late night need r...,16
180881,well simpl solut dog food,5,dog well well simpl solut great dog allergi pu...,12
...,...,...,...,...
150122,sicken filler ingredi high price,5,think ingredi food speak cheap filler byproduc...,35
295370,bold flavor,2,tri mani k cup one far favorit deep flavor wit...,12
118004,nylabon durabl dental dinosaur,4,dog chew apart matter minut eat small part dog...,11
303837,pretti damn good like wawa conveni store stuff,8,first let say like stuff wawa machin look want...,36


## 1.3 Data Transformation

### 1.3.1 Bag of Words approach

#### Just considering revtext and not revsum

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# Initializing the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vocab = CountVectorizer()

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.

X_train_bow = vocab.fit_transform(X_train['revtext_clean_text_stem'])

#### Unique Words

In [36]:
# We can look at unique words by using 'vocabulary_'

vocab.vocabulary_

{'calori': 8367,
 'great': 24702,
 'bang': 4412,
 'calor': 8365,
 'buck': 7551,
 'br': 6897,
 'low': 33691,
 'sugar': 55540,
 'fat': 20348,
 'high': 26691,
 'protein': 45483,
 'tast': 56895,
 'like': 32883,
 'pepperoni': 42672,
 'stick': 54753,
 'without': 64011,
 'mayb': 35208,
 'littl': 33162,
 'dri': 17078,
 'side': 51887,
 'bad': 4178,
 'use': 61290,
 'workout': 64323,
 'sent': 50951,
 'tea': 57054,
 'buy': 8002,
 'smaller': 52658,
 'town': 58891,
 'move': 37473,
 'glad': 23765,
 'back': 4112,
 'shelf': 51393,
 'absolut': 220,
 'love': 33641,
 'chocol': 10508,
 'bar': 4457,
 'come': 11830,
 'nice': 38824,
 'wrap': 64448,
 'thin': 57848,
 'oz': 41487,
 'troubl': 59379,
 'set': 51103,
 'groov': 24981,
 'pattern': 42243,
 'make': 34342,
 'easi': 17604,
 'break': 7051,
 'triangl': 59202,
 'though': 57972,
 'prefer': 44778,
 'crack': 13197,
 'open': 40615,
 'eat': 17638,
 'tini': 58301,
 'piec': 43390,
 'sweet': 56221,
 'milk': 36281,
 'overli': 41303,
 'probabl': 45156,
 'thank': 57585

In [37]:
print("Total unique words", len(vocab.vocabulary_))

print("Type of train features:", type(X_train_bow))

print("Shape of input data:", X_train_bow.shape)

Total unique words 65679
Type of train features: <class 'scipy.sparse._csr.csr_matrix'>
Shape of input data: (482941, 65679)


## 2.1.1. Cleaning X_test

In [38]:
%%time

X_test = clean_X(X_test)
X_test

CPU times: user 16.3 ms, sys: 172 ms, total: 188 ms
Wall time: 212 ms


Unnamed: 0,ReviewSummary,ReviewText
529445,One of the best breakfast teas,Yorkshire Gold is one of my favorite breakfast...
271152,Grilled is the best!,My cat loves the Fancy Feast grilled style. H...
415447,"Wolfgang Puck Coffee, Vienna Coffee House","great tasting coffee. strong, but not obnoxiou..."
333254,Everything a Cola should be,This is the best Cola I have ever tasted. I f...
71360,Like Something You might find for sale at a fo...,"I, too, first tasted this at a ""caf&eacute; ne..."
...,...,...
152671,Excellent Oatmeal Bar,These Bars are the best that I have ever had. ...
435958,Almond Anise Biscotti,Celiac so this is a real treat in the morning ...
170682,they just dont like em,only 1 of my 2 cats have shown any interest at...
391149,great snack,This product is a healthy snack during my work...


## 2.1.2. Cleaning y_teset

In [39]:
y_test = clean_y(y_test)
y_test

0        good
1        good
2        good
3        good
4         bad
         ... 
85221    good
85222    good
85223     bad
85224    good
85225    good
Name: score, Length: 85226, dtype: object

## 2.2. Preprocessing X_test

In [40]:
temp_df_1 = X_test['ReviewSummary'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_1.columns = ['revsum_clean_text_stem', 'revsum_text_length_stem']

100%|███████████████████████████████████| 85226/85226 [00:21<00:00, 4044.85it/s]


In [41]:
temp_df_2 = X_test['ReviewText'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_2.columns = ['revtext_clean_text_stem', 'revtext_text_length_stem']

100%|████████████████████████████████████| 85226/85226 [05:39<00:00, 250.85it/s]


In [42]:
temp_df = pd.concat([temp_df_1, temp_df_2], axis = 1)
temp_df

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
529445,one best breakfast tea,4,yorkshir gold one favorit breakfast tea like t...,66
271152,grill best,2,cat love fanci feast grill style favorit flavo...,37
415447,wolfgang puck coffe vienna coffe hous,6,great tast coffe strong obnoxi strong nasti af...,14
333254,everyth cola,2,best cola ever tast first tri decid stop buy p...,101
71360,like someth might find sale four year old juic...,10,first tast caf eacut next moulin roug pari deg...,192
...,...,...,...,...
152671,excel oatmeal bar,3,bar best ever sweet great textur even realiz e...,17
435958,almond anis biscotti,3,celiac real treat morn cup coffe,6
170682,dont like em,3,cat shown interest mylar toy she attract crink...,38
391149,great snack,2,product healthi snack workday kid homework tim...,15


In [43]:
X_test.drop(columns= ['ReviewSummary', 'ReviewText'], inplace = True)

In [44]:
X_test = pd.concat([X_test, temp_df], axis = 1)
X_test

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
529445,one best breakfast tea,4,yorkshir gold one favorit breakfast tea like t...,66
271152,grill best,2,cat love fanci feast grill style favorit flavo...,37
415447,wolfgang puck coffe vienna coffe hous,6,great tast coffe strong obnoxi strong nasti af...,14
333254,everyth cola,2,best cola ever tast first tri decid stop buy p...,101
71360,like someth might find sale four year old juic...,10,first tast caf eacut next moulin roug pari deg...,192
...,...,...,...,...
152671,excel oatmeal bar,3,bar best ever sweet great textur even realiz e...,17
435958,almond anis biscotti,3,celiac real treat morn cup coffe,6
170682,dont like em,3,cat shown interest mylar toy she attract crink...,38
391149,great snack,2,product healthi snack workday kid homework tim...,15


## 2.3 Data Transformation

### Bag of Words

#### Just considering revtext and not revsum

In [45]:
X_test_bow = vocab.transform(X_test['revtext_clean_text_stem'])

In [46]:
print("Total unique words", len(vocab.vocabulary_))

print("Type of train features:", type(X_test_bow))

Total unique words 65679
Type of train features: <class 'scipy.sparse._csr.csr_matrix'>


# 3. Model

## Logistic Regression

In [47]:
X_train.shape

(482941, 4)

In [48]:
X_train_bow.shape

(482941, 65679)

In [49]:
len(y_train)

482941

In [50]:
%%time

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

CPU times: user 13.8 s, sys: 1.23 s, total: 15 s
Wall time: 14.3 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
y_test_pred = classifier.predict(X_test_bow)

In [52]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.8632459578063032
              precision    recall  f1-score   support

         bad       0.75      0.64      0.69     12300
        good       0.89      0.97      0.93     66534
     neutral       0.53      0.20      0.29      6392

    accuracy                           0.86     85226
   macro avg       0.72      0.60      0.64     85226
weighted avg       0.84      0.86      0.85     85226



## Decision Tree

In [None]:
%%time

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_bow, y_train)

In [None]:
y_test_pred = classifier.predict(X_test_bow)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

## Random Forest

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_bow, y_train)

In [None]:
y_test_pred = classifier.predict(X_test_bow)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

In [None]:
X_test_bow

In [None]:
def preprocess_str(text):
    
    flag = 'stem'
    
    text = re.sub("n't", " not", text)
    
    #1. Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", text)
    
    #2. Converting to lowercase
    sentence_1 = sentence.lower()
    
    #3. Tokenization (Word-level)
    tokens = sentence_1.split()
    
    #3.1. TextBlob- Correcting the spellings
    tokens_correct_spell = [str(TextBlob(token)) for token in tokens]

    #4. Removing stopwords
    clean_tokens = [token for token in tokens_correct_spell if token not in stopwords.words("english")]

    #5. lemmatization
    if(flag == 'stem'):
        clean_tokens_final = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens_final = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens_final)])

In [None]:
preprocess_str("Very good burger.")

In [None]:
preprocess_str("Hated the food. Wouldn't recommend it.")

In [None]:
a = vocab.transform(preprocess_str("Very good burger."))
a

In [None]:
a[0]

In [None]:
classifier.predict(a)[0]

In [None]:
text = str(input("Please input a review: "))

a = vocab.transform(preprocess_str(text))

print()
print(classifier.predict(a)[0])