In [1]:
import zipfile

import os

import pandas as pd
import numpy as np

import datetime

import re

import nltk

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4') # uses Multilingual Wordnet Data from OMW with newer Wordnet versions

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/r.shahukaru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/r.shahukaru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/r.shahukaru/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
from textblob import TextBlob # to use .correct() method to correct spellings

In [5]:
%%time

with zipfile.ZipFile('../data/txt_reviews.zip') as zip_ref:
    zip_ref.extractall('../data/')

CPU times: user 38.2 s, sys: 2min 8s, total: 2min 47s
Wall time: 6min 29s


In [6]:
files = [f'../data/txt_reviews/{x}' for x in os.listdir('../data/txt_reviews')]

In [7]:
%%time

info = []

for file in files:
    with open(file, 'r') as f:
        lines = f.readlines()
        
        info.append([each_line.split(':')[1].strip() for each_line in lines])

CPU times: user 9.41 s, sys: 1min 10s, total: 1min 20s
Wall time: 3min 24s


In [8]:
with open(files[0], 'r') as f:
    
    lines = f.readlines()
    cols = [each_line.split(':')[0] for each_line in lines]

In [9]:
df = pd.DataFrame(columns = cols)

In [10]:
df['ProductId'] = [information[0] for information in info]

df['UserId'] = [information[1] for information in info]

df['ProfileName'] = [information[2] for information in info]

df['HelpfulnessNumerator'] = [information[3] for information in info]

df['HelpfulnessDenominator'] = [information[4] for information in info]

df['Score'] = [information[5] for information in info]

df['Time'] = [information[6] for information in info]

df['ReviewSummary'] = [information[7] for information in info]

df['ReviewText'] = [information[8] for information in info]

In [11]:
df

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,ReviewSummary,ReviewText
0,B0000GHNUE,A3D7GP8AS2PRIT,S.C.,1,1,5,1268611200,Hot but still flavorful,We had the Green Chile Habanero one at a local...
1,B000TTDDWE,AA1TQ4QJ4Y94P,barb,0,0,5,1267660800,Yummy,I bought this item at Costco at Christmas time...
2,B0000ICLLS,A2OXNQ43JBMAEI,nene,0,0,5,1341619200,candy,This is the best candy that I have ever had an...
3,B000JWGFQC,A2N9T4CS40KDJE,"D. Wilson ""Euro writer""",1,1,3,1189209600,The muffins were a pleasant low carb treat,As the subject said the muffins were a pleasan...
4,B000KFXEYE,A2M9ANEOKBVD2D,Vinegar Jim,3,9,1,1291075200,Surprise...bait and switch.,The company does not operate properly I ordere...
...,...,...,...,...,...,...,...,...,...
568449,B000A0WLFC,A1HH7L6EJI6N0,Paul Fillmore,4,5,4,1230940800,Great gift,I received this as a gift for xmas of 2007 and...
568450,B004M050W2,A3NGUULXOMWAMM,A. Tang,3,4,5,1302393600,Excellent product,I was given a bottle from a friend. Tasted an...
568451,B001VNGK6I,A2RBGOWJO35P93,Marie Flowers,8,9,4,1310601600,How Curcumin is Absorbed Properly,I bought some of Frontier's organic tumeric at...
568452,B000EY5COG,A1W7CVE5I70HVI,"B. Kramer ""Baker Boy""",3,4,5,1196294400,Good Stuff,Honestly I prefer the powder-version of this p...


In [12]:
df.dropna(inplace= True)

In [13]:
df.shape

(568454, 9)

In [14]:
df.drop_duplicates(inplace= True)
df.shape

(568167, 9)

## Splitting into train and test

In [15]:
X = df[[x for x in df.columns if x != 'Score']]
y = df['Score']

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 100, stratify= y)

In [17]:
X_train

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Time,ReviewSummary,ReviewText
297536,B0041FWCYM,A2ZWGWU97T6K3M,phi,1,1,1343001600,I love this seasoning!,I love this seasoning! I use it with a slight ...
309715,B004JRKEH4,A12O5SEIF162P8,"William A. Nolan ""freeholder""",0,0,1314230400,Good flavor,"I used this on a London broil I had grilled, a..."
548765,B0013A0QXC,A34OZ261SP7CU9,Earl D. Wallace,0,0,1348617600,Senso coffee,I have purchased this product many times over....
63672,B000FFRU3U,AYDE7GGEGO9QQ,Rosemary P. Mcpharlin,0,0,1315699200,"healthful, not too sweet hot cereal",This cereal has great texture (not over-proces...
458398,B003TOES7K,APOODDMKAG57Y,marcia,0,0,1322179200,Keurig,I love my new Keurig and am very happy to have...
...,...,...,...,...,...,...,...,...
150122,B003M5TG28,A14VQRIH3FDX5K,kittenkatt,5,6,1309910400,"Sickening filler ingredients, high price",I think the ingredients in this food speak for...
51681,B001ELL4E0,A2Q5YH8827Z3VS,Mary L Levesque,2,2,1316131200,Just as good or better than McCann's,"I'm eating a bowl of it right now, and it is g..."
118004,B000084E6V,A37FHDFU0H1G5J,Jaimi DeFeo,6,7,1288656000,Nylabone Durable Dental Dinosaur,My dogs had this chewed apart in the matter of...
129611,B001FA1E7M,A2P0254ZZ42ZOJ,Roni Kauffman,0,2,1306281600,no free shipping?,I love these and they used to be such great va...


In [18]:
y_train

297536    5
309715    4
548765    5
63672     4
458398    5
         ..
150122    1
51681     5
118004    1
129611    4
284895    5
Name: Score, Length: 454533, dtype: object

In [19]:
df['Score'].value_counts(normalize= True)

Score
5    0.638773
4    0.141907
1    0.091929
3    0.075003
2    0.052388
Name: proportion, dtype: float64

In [20]:
y_train.value_counts(normalize= True)

Score
5    0.638774
4    0.141906
1    0.091930
3    0.075002
2    0.052388
Name: proportion, dtype: float64

### Creating a function to clean X

In [21]:
def clean_X(X): 
    
    X.drop(['ProductId', 'UserId', 'ProfileName', 'Time', 'HelpfulnessNumerator', 'HelpfulnessDenominator'], axis = 1, inplace = True)

    return X

### Creating a function to clean y

In [22]:
def clean_y(y):
    y_clean = []
    
    for score in y:
        if score in ['1', '2']:
            y_clean.append('bad')
        elif score == '3':
            y_clean.append('neutral')
        else:
            y_clean.append('good')
            
    return pd.Series(y_clean, name= 'score')

## Creating a function to preprocess the data

In [23]:
# initialization
stemmer = PorterStemmer()

In [24]:
# initialization
lemmatizer = WordNetLemmatizer()

In [25]:
def preprocess(text, flag):
    
    #1. Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", text)
    
    #2. Converting to lowercase
    sentence_1 = sentence.lower()
    
    #3. Tokenization (Word-level)
    tokens = sentence_1.split()
    
    #3.1. TextBlob- Correcting the spellings
    tokens_correct_spell = [str(TextBlob(token)) for token in tokens]

    #4. Removing stopwords
    clean_tokens = [token for token in tokens_correct_spell if token not in stopwords.words("english")]

    #5. lemmatization
    if(flag == 'stem'):
        clean_tokens_final = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens_final = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens_final), len(clean_tokens_final)])

# 1. Train data

## 1.1.1. Cleaning X_train

In [26]:
%%time

X_train = clean_X(X_train)
X_train

CPU times: user 46.4 ms, sys: 224 ms, total: 271 ms
Wall time: 445 ms


Unnamed: 0,ReviewSummary,ReviewText
297536,I love this seasoning!,I love this seasoning! I use it with a slight ...
309715,Good flavor,"I used this on a London broil I had grilled, a..."
548765,Senso coffee,I have purchased this product many times over....
63672,"healthful, not too sweet hot cereal",This cereal has great texture (not over-proces...
458398,Keurig,I love my new Keurig and am very happy to have...
...,...,...
150122,"Sickening filler ingredients, high price",I think the ingredients in this food speak for...
51681,Just as good or better than McCann's,"I'm eating a bowl of it right now, and it is g..."
118004,Nylabone Durable Dental Dinosaur,My dogs had this chewed apart in the matter of...
129611,no free shipping?,I love these and they used to be such great va...


## 1.1.2. Cleaning y_train

In [27]:
y_train = clean_y(y_train)
y_train

0         good
1         good
2         good
3         good
4         good
          ... 
454528     bad
454529    good
454530     bad
454531    good
454532    good
Name: score, Length: 454533, dtype: object

## 1.2 Preprocessing X_train

In [28]:
from tqdm import tqdm, tqdm_notebook

In [29]:
tqdm.pandas()

In [30]:
temp_df_1 = X_train['ReviewSummary'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_1.columns = ['revsum_clean_text_stem', 'revsum_text_length_stem']

100%|█████████████████████████████████| 454533/454533 [02:32<00:00, 2980.16it/s]


In [31]:
temp_df_2 = X_train['ReviewText'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_2.columns = ['revtext_clean_text_stem', 'revtext_text_length_stem']

100%|██████████████████████████████████| 454533/454533 [42:00<00:00, 180.33it/s]


In [32]:
temp_df = pd.concat([temp_df_1, temp_df_2], axis = 1)
temp_df

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
297536,love season,2,love season use slight butter spray asparagu g...,21
309715,good flavor,2,use london broil grill complement meat well ma...,24
548765,senso coffe,2,purchas product mani time although cost risen ...,12
63672,health sweet hot cereal,4,cereal great textur process portion size gener...,34
458398,keurig,1,love new keurig happi found place order k cup ...,17
...,...,...,...,...
150122,sicken filler ingredi high price,5,think ingredi food speak cheap filler byproduc...,35
51681,good better mccann,3,eat bowl right great actual like better mccann...,31
118004,nylabon durabl dental dinosaur,4,dog chew apart matter minut eat small part dog...,11
129611,free ship,2,love use great valu amazon happen free ship bu...,12


In [33]:
X_train.drop(columns= ['ReviewSummary', 'ReviewText'], inplace = True)

In [34]:
X_train = pd.concat([X_train, temp_df], axis = 1)
X_train

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
297536,love season,2,love season use slight butter spray asparagu g...,21
309715,good flavor,2,use london broil grill complement meat well ma...,24
548765,senso coffe,2,purchas product mani time although cost risen ...,12
63672,health sweet hot cereal,4,cereal great textur process portion size gener...,34
458398,keurig,1,love new keurig happi found place order k cup ...,17
...,...,...,...,...
150122,sicken filler ingredi high price,5,think ingredi food speak cheap filler byproduc...,35
51681,good better mccann,3,eat bowl right great actual like better mccann...,31
118004,nylabon durabl dental dinosaur,4,dog chew apart matter minut eat small part dog...,11
129611,free ship,2,love use great valu amazon happen free ship bu...,12


## 1.3 Data Transformation

### TF-IDF approach

#### Just considering revtext and not revsum

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing the "TfidfVectorizer" object
vectorizer = TfidfVectorizer()

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.

X_train_tfidf = vectorizer.fit_transform(X_train['revtext_clean_text_stem'])

In [36]:
print(vectorizer.vocabulary_)

print("Type of train features:", type(X_train_tfidf))

print("Shape of input data:", X_train_tfidf.shape)

{'love': 32785, 'season': 49358, 'use': 59749, 'slight': 51191, 'butter': 7736, 'spray': 52707, 'asparagu': 3307, 'grill': 24249, 'steamer': 53227, 'bag': 4095, 'frozen': 21884, 'veggi': 60201, 'bring': 7065, 'flavor': 20731, 'everyth': 18926, 'without': 62401, 'overwhelm': 40383, 'chang': 9503, 'way': 61424, 'cook': 12329, 'london': 32554, 'broil': 7155, 'complement': 11703, 'meat': 34519, 'well': 61627, 'mani': 33734, 'wonder': 62542, 'hot': 26672, 'tough': 57368, 'question': 45107, 'sinc': 50816, 'realli': 45754, 'like': 32051, 'pepper': 41582, 'kind': 30360, 'would': 62772, 'call': 8115, 'pretti': 43847, 'mild': 35335, 'tast': 55451, 'may': 34309, 'vari': 60034, 'purchas': 44724, 'product': 44141, 'time': 56775, 'although': 1691, 'cost': 12613, 'risen': 47386, 'find': 20386, 'still': 53404, 'good': 23590, 'brand': 6783, 'coffe': 11244, 'cereal': 9301, 'great': 24087, 'textur': 56075, 'process': 44089, 'portion': 43278, 'size': 50957, 'gener': 22691, 'also': 1648, 'overli': 40266, '

## 2.1.1. Cleaning X_test

In [37]:
%%time

X_test = clean_X(X_test)
X_test

CPU times: user 31.1 ms, sys: 290 ms, total: 321 ms
Wall time: 459 ms


Unnamed: 0,ReviewSummary,ReviewText
289270,Works great and we will order more,We are very satisfied with Natural Vitality-Na...
93271,Will not buy this product again,The hospital where I worked bought this coffee...
291601,great coffee,We found this k-cup coffee concept to be quick...
347392,Peppers were very dry,The peppers were very dry because the oil came...
261121,"If you like weak tea, this is for you","This tea isn't bad, it just made (to me) a wea..."
...,...,...
397590,A bit gritty and green,I have to put this in a travel cup for my 5 ye...
82608,Kiddo loves it!,We use jar baby food to supplement what we mak...
343778,Not as described,"These are not really anchovie stuffed olives, ..."
297557,"Very good in smoothies, but why the short shel...",I really like the taste of this powder. It is...


## 2.1.2. Cleaning y_teset

In [38]:
y_test = clean_y(y_test)
y_test

0            good
1             bad
2            good
3             bad
4             bad
           ...   
113629    neutral
113630       good
113631        bad
113632    neutral
113633       good
Name: score, Length: 113634, dtype: object

## 2.2. Preprocessing X_test

In [39]:
temp_df_1 = X_test['ReviewSummary'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_1.columns = ['revsum_clean_text_stem', 'revsum_text_length_stem']

100%|█████████████████████████████████| 113634/113634 [01:09<00:00, 1632.63it/s]


In [40]:
temp_df_2 = X_test['ReviewText'].progress_apply(lambda x: preprocess(x, 'stem'))
temp_df_2.columns = ['revtext_clean_text_stem', 'revtext_text_length_stem']

100%|██████████████████████████████████| 113634/113634 [11:46<00:00, 160.82it/s]


In [41]:
temp_df = pd.concat([temp_df_1, temp_df_2], axis = 1)
temp_df

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
289270,work great order,3,satisfi natur vital natur calm oz powder work ...,11
93271,buy product,2,hospit work bought coffe singl use pack everi ...,56
291601,great coffe,2,found k cup coffe concept quick tidi newman pu...,15
347392,pepper dri,2,pepper dri oil came box jar br would purchas item,10
261121,like weak tea,3,tea bad made weak cup use recommend pod holder...,31
...,...,...,...,...
397590,bit gritti green,3,put travel cup year old els drink use smooth c...,49
82608,kiddo love,2,use jar babi food supplement make homemad some...,32
343778,describ,1,realli anchovi stuf oliv littl anchovi past ne...,16
297557,good smoothi short shelf life,5,realli like tast powder great protein shake es...,39


In [42]:
X_test.drop(columns= ['ReviewSummary', 'ReviewText'], inplace = True)

In [43]:
X_test = pd.concat([X_test, temp_df], axis = 1)
X_test

Unnamed: 0,revsum_clean_text_stem,revsum_text_length_stem,revtext_clean_text_stem,revtext_text_length_stem
289270,work great order,3,satisfi natur vital natur calm oz powder work ...,11
93271,buy product,2,hospit work bought coffe singl use pack everi ...,56
291601,great coffe,2,found k cup coffe concept quick tidi newman pu...,15
347392,pepper dri,2,pepper dri oil came box jar br would purchas item,10
261121,like weak tea,3,tea bad made weak cup use recommend pod holder...,31
...,...,...,...,...
397590,bit gritti green,3,put travel cup year old els drink use smooth c...,49
82608,kiddo love,2,use jar babi food supplement make homemad some...,32
343778,describ,1,realli anchovi stuf oliv littl anchovi past ne...,16
297557,good smoothi short shelf life,5,realli like tast powder great protein shake es...,39


## 2.3 Data Transformation

### TF-IDF

#### Just considering revtext and not revsum

In [44]:
X_test_tfidf = vectorizer.transform(X_test['revtext_clean_text_stem'])

In [45]:
print(vectorizer.vocabulary_)
print("Type of train features:", type(X_test_tfidf))

{'love': 32785, 'season': 49358, 'use': 59749, 'slight': 51191, 'butter': 7736, 'spray': 52707, 'asparagu': 3307, 'grill': 24249, 'steamer': 53227, 'bag': 4095, 'frozen': 21884, 'veggi': 60201, 'bring': 7065, 'flavor': 20731, 'everyth': 18926, 'without': 62401, 'overwhelm': 40383, 'chang': 9503, 'way': 61424, 'cook': 12329, 'london': 32554, 'broil': 7155, 'complement': 11703, 'meat': 34519, 'well': 61627, 'mani': 33734, 'wonder': 62542, 'hot': 26672, 'tough': 57368, 'question': 45107, 'sinc': 50816, 'realli': 45754, 'like': 32051, 'pepper': 41582, 'kind': 30360, 'would': 62772, 'call': 8115, 'pretti': 43847, 'mild': 35335, 'tast': 55451, 'may': 34309, 'vari': 60034, 'purchas': 44724, 'product': 44141, 'time': 56775, 'although': 1691, 'cost': 12613, 'risen': 47386, 'find': 20386, 'still': 53404, 'good': 23590, 'brand': 6783, 'coffe': 11244, 'cereal': 9301, 'great': 24087, 'textur': 56075, 'process': 44089, 'portion': 43278, 'size': 50957, 'gener': 22691, 'also': 1648, 'overli': 40266, '

# 3. Model

## Logistic Regression

In [46]:
X_train.shape

(454533, 4)

In [47]:
X_train_tfidf.shape

(454533, 64031)

In [48]:
len(y_train)

454533

In [49]:
%%time

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

CPU times: user 19.2 s, sys: 1.7 s, total: 20.9 s
Wall time: 21.1 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
y_test_pred = classifier.predict(X_test_tfidf)

In [51]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.8655156027245366
              precision    recall  f1-score   support

         bad       0.75      0.66      0.70     16399
        good       0.89      0.97      0.93     88712
     neutral       0.55      0.17      0.26      8523

    accuracy                           0.87    113634
   macro avg       0.73      0.60      0.63    113634
weighted avg       0.85      0.87      0.85    113634



## Decision Tree

In [None]:
%%time

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_tfidf, y_train)

In [None]:
y_test_pred = classifier.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

## Random Forest

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_tfidf, y_train)

In [None]:
y_test_pred = classifier.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))