In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Shape of the dataframe
df.shape

(568454, 10)

In [5]:
# Take sample of 1000

sample_df = df.sample(1000)
# Print shape of the sample
sample_df.shape

(1000, 10)

In [6]:
sample_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
72121,72122,B000PEA4G6,A10A9H10YAQSA0,happytoshop,0,0,5,1350604800,Great marinade,I really liked this. I am on a low sodium diet...
78816,78817,B001E5DZSO,A13B5VIYPEC4IV,D. Schmidli,6,6,5,1219017600,Delicious!,I ordered this on a whim after reading the rev...
35492,35493,B0007OPW66,A2K41JWLWXORKW,LisaVK,0,0,2,1341619200,Don't Recommend,I do not recommend this. Half of them were st...
62108,62109,B000CQG8K8,AE10DV4Y7NFUA,Meisha4,1,1,5,1318204800,Excellent!,"This tea is very hearty...definitely ""double"" ..."
139723,139724,B001BCXTGS,AAG0O8X2PYE6G,"J. Aragon ""Feminist Educator""",0,1,5,1227657600,The Cats' meow,We have two cats and they are picky with their...


In [7]:
# Distribution of score
sample_df['Score'].value_counts()

5    639
4    143
1     95
2     64
3     59
Name: Score, dtype: int64

In [8]:
# Distribution of score
df['Score'].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [20]:
# Since the scores are not uniformly distributed, let us take 200 from each category

sample_df = df.sample(df.shape[0]).groupby(['Score']).head(200)
sample_df.shape

(1000, 10)

In [21]:
sample_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
40103,40104,B001E50X66,A3IHTXQYL7SJ6Z,lynne Perrin,0,0,5,1341705600,mmmmmm mmmmmm good,I like this product very much. I use them ever...
381058,381059,B001EPQMMK,A13T0V3LHOTHDL,"E. Treants ""tree""",0,0,5,1294272000,OMG GOOD! Tastes as good as the old Wally Worl...,"Actually, I do not like this, BUT my wife thin..."
560667,560668,B000F7V872,APHAQ7GNQWDQW,J. Mattison,4,4,5,1233619200,Fabulous!,My favorite coffee creamer years ago was a Kah...
399479,399480,B003ZA46X4,A1X9U49ZF0RNVB,Kimmer,1,1,5,1309996800,So delicious!,I really had no idea what to expect with Dark ...
525245,525246,B000F5DRVO,AD309KSBRGWB,Pod,0,0,5,1341100800,Great!,This tastes great inside of rice balls. I love...


In [98]:
review_df = sample_df[['Text', 'Score']]
review_df.head()

Unnamed: 0,Text,Score
40103,I like this product very much. I use them ever...,5
381058,"Actually, I do not like this, BUT my wife thin...",5
560667,My favorite coffee creamer years ago was a Kah...,5
399479,I really had no idea what to expect with Dark ...,5
525245,This tastes great inside of rice balls. I love...,5


In [99]:
review_df.Score.value_counts()

1    200
2    200
3    200
4    200
5    200
Name: Score, dtype: int64

In [100]:
review_df['Score'] = review_df['Score'].apply(lambda s: 1 if  s>3 else 0)

In [101]:
review_df['Score'].value_counts()

0    600
1    400
Name: Score, dtype: int64

In [102]:
# Importing the required libraries
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
import string

lemmatizer = WordNetLemmatizer()

english_stop_words = stopwords.words('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vlekkala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [103]:
def clean_text(sentence):
    sentence = sentence.lower()
    words = word_tokenize(sentence)
    lemmatized = [lemmatizer.lemmatize(word) 
                  for word in words 
                  if word not in english_stop_words and word not in string.punctuation and word.isalpha()]
    return " ".join(lemmatized)

In [115]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [140]:

def clean_text_spacy(sentence):
    corpus = []
    
    doc = nlp(sentence.lower())
    for token in doc:
        if token.pos_ in ['ADV','ADJ', 'VERB'] and (not token.is_punct) and (token.text not in english_stop_words):
            corpus.append(token.lemma_)

    return " ".join(corpus)

In [141]:
print(clean_text('I am very greatful to the product 10'))
print(clean_text_spacy('I am very greatful to the product 10'))

greatful product
greatful


In [142]:
review_df.head()['Text'].apply(clean_text_spacy).head()

40103           like much use tasty consistent reorder many
381058    actually like think good ever get wally long a...
560667    favorite ago flavor disappear never return thr...
399479    really expect dark figure mean go wrong truly ...
525245    taste great inside love individual use last ke...
Name: Text, dtype: object

In [143]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

tfidf = TfidfVectorizer(tokenizer=clean_text_spacy)
model = RandomForestClassifier()

clf = Pipeline([('tfidf', tfidf),('clf', model)])

X = review_df['Text']
y = review_df['Score']

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.2)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [144]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.91      0.74       117
           1       0.63      0.23      0.34        83

    accuracy                           0.62       200
   macro avg       0.63      0.57      0.54       200
weighted avg       0.63      0.62      0.57       200



In [148]:
clf.predict(['This is a very good product'])

array([1], dtype=int64)

In [111]:
import spacy
nlp = spacy.load('en_core_web_sm')

This DET
is AUX
a DET
good ADJ
product NOUN


In [121]:
doc = nlp("I am very greatful to the product 10")
for token in doc:
    print(token, token.pos_)

I PRON
am AUX
very ADV
greatful ADJ
to ADP
the DET
product NOUN
10 NUM


In [113]:
review_df.iloc[0,0]

'I like this product very much. I use them every other day. They are tasty, the product is consistent. I will reorder many times.'

In [None]:
from sklearn.feature_extraction.text import 