In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Shape of the dataframe
df.shape

(568454, 10)

In [4]:
# Take sample of 1000

sample_df = df.sample(1000)
# Print shape of the sample
sample_df.shape

(1000, 10)

In [5]:
sample_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
495772,495773,B0098WV8F2,A2PV0TBG04BZKF,jump_ace,1,1,3,1334966400,Ok tasting on its own.,This was ok on its own mixed with water. I muc...
552799,552800,B001SB4LSC,A2Q2GDB5XDYQV5,From The Tree House,2,2,5,1256428800,Absolutely perfect rice.,"Light, long grain, tasty and a brilliant white..."
383057,383058,B001PMDYV4,AVSTWD92CNM89,Caroline,0,0,5,1338076800,Cats like it,I have two Siamese kittens who are on a mostly...
357408,357409,B006DR0HMU,A3HVK0BC3VYDH5,Enya,1,1,5,1336003200,My favorite dark chocolate,"I love this chocolate. It has 85% cacao, so it..."
205681,205682,B000E65OII,A3NOBH42C7UI5M,"Carol ""kepela""",2,3,5,1193788800,I love the smell of this tea and the flavor,I drink a lot of tea and I've tried many diffe...


In [6]:
# Distribution of score
sample_df['Score'].value_counts()

5    638
4    147
1     95
3     65
2     55
Name: Score, dtype: int64

In [7]:
# Distribution of score
df['Score'].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [8]:
# Since the scores are not uniformly distributed, let us take 200 from each category

sample_df = df.sample(df.shape[0]).groupby(['Score']).head(200)
sample_df.shape

(1000, 10)

In [9]:
sample_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
155433,155434,B000GAT6NG,A1W1V6RD1I9R2D,"Cecil ""Bay Area Pilot""",1,5,5,1320710400,Happy! Would recommend to anyone!,Item arrived when promised and works as descri...
26919,26920,B004NDV46E,ARM0FJD9YRWV6,kk,0,0,5,1320796800,Great product,I have been using emergen-c packets for years ...
484306,484307,B00020HHGS,A1YJNI5NUYVJKG,RLH,0,0,5,1333929600,Love this tea,Better than many I have tried. With a little ...
547177,547178,B002ESMK4U,A1UD8NKXB3868M,B. Ryan-Cabot,2,2,3,1277942400,Just OK,"I was attracted to the price, size and pecan f..."
359134,359135,B0051SU8BC,AI1SSQELYZGLB,C. Hinton,0,0,3,1343001600,People at work like it,I ordered this based on overall positive revie...


In [10]:
review_df = sample_df[['Text', 'Score']]
review_df.head()

Unnamed: 0,Text,Score
155433,Item arrived when promised and works as descri...,5
26919,I have been using emergen-c packets for years ...,5
484306,Better than many I have tried. With a little ...,5
547177,"I was attracted to the price, size and pecan f...",3
359134,I ordered this based on overall positive revie...,3


In [11]:
review_df.Score.value_counts()

1    200
2    200
3    200
4    200
5    200
Name: Score, dtype: int64

In [12]:
review_df['Score'] = review_df['Score'].apply(lambda s: 1 if  s>3 else 0)

In [13]:
review_df['Score'].value_counts()

0    600
1    400
Name: Score, dtype: int64

In [14]:
# Importing the required libraries
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
import string

lemmatizer = WordNetLemmatizer()

english_stop_words = stopwords.words('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vlekkala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def clean_text(sentence):
    sentence = sentence.lower()
    words = word_tokenize(sentence)
    lemmatized = [lemmatizer.lemmatize(word) 
                  for word in words 
                  if word not in english_stop_words and word not in string.punctuation and word.isalpha()]
    return " ".join(lemmatized)

In [16]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [17]:

def clean_text_spacy(sentence):
    corpus = []
    
    doc = nlp(sentence.lower())
    for token in doc:
        if token.pos_ in ['ADV','ADJ', 'VERB'] and (not token.is_punct) and (token.text not in english_stop_words):
            corpus.append(token.lemma_)

    return " ".join(corpus)

In [18]:
print(clean_text('I am very greatful to the product 10'))
print(clean_text_spacy('I am very greatful to the product 10'))

greatful product
greatful


In [19]:
review_df.head()['Text'].apply(clean_text_spacy).head()

155433    arrive promise work describe use season eat ta...
26919                use love give good fall well important
484306    well many try little terrific go order nice sm...
547177                          attract pecan bad find weak
359134    order base overall positive tender crazy seem ...
Name: Text, dtype: object

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

tfidf = TfidfVectorizer(tokenizer=clean_text_spacy)
model = RandomForestClassifier()

clf = Pipeline([('tfidf', tfidf),('clf', model)])

X = review_df['Text']
y = review_df['Score']

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.2)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.91      0.74       114
           1       0.71      0.28      0.40        86

    accuracy                           0.64       200
   macro avg       0.67      0.60      0.57       200
weighted avg       0.66      0.64      0.60       200



In [22]:
clf.predict(['This is a very good product'])

array([1], dtype=int64)

In [23]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [24]:
doc = nlp("I am very greatful to the product 10")
for token in doc:
    print(token, token.pos_)

I PRON
am AUX
very ADV
greatful ADJ
to ADP
the DET
product NOUN
10 NUM


In [25]:
review_df.iloc[0,0]

'Item arrived when promised and works as described.  Using for popcorn seasoned with Flavocal - I am now eating popcorn that tastes EXACTLY like movie popcorn.<br /><br />SPECIAL NOTE:  I will likely stop offering to review products if Amazon continues with the silly 20 word minimum.  If they want to establish minimums then I want a discount or rebate on future items I may order.'

In [38]:
import pycaret
from pycaret.nlp import *
#intialize the setup
exp_clf = setup(data = sample_df, target = 'Score', session_id=12)

Description,Value
session_id,12
Documents,1000
Vocab Size,0
Custom Stopwords,False


In [40]:
#lda = create_model(model = 'lda')
# Not really sure why this is failing