In [1]:
import numpy as np
import pandas as pd

##### Data uploading

In [3]:
data = pd.read_csv("Reviews.csv")

In [4]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


##### Data initial exploration

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
data.shape

(568454, 10)

In [7]:
data.isna().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [10]:
data.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


##### Importing necessary NLTK libraries 

In [16]:
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


In [25]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\19737\AppData\Roaming\nltk_data...


True

In [26]:
stop_words = set(nltk.corpus.stopwords.words('english'))

##### text data Preprocessing

In [27]:
#we will apply Lemmatization to the Dataset 
#We are creating a function here 

def clean_text_data(text):
    le = WordNetLemmatizer()
    word_token = word_tokenize(text)
    tokens = [le.lemmatize(x) for x in word_token if x not in stop_words and len(x)>3]
    cleaned_text = " ".join(tokens)
    return cleaned_text


In [28]:
#applying the function on the dataset
data['cleaned_text']= data['Text'].apply(clean_text_data)

##### TFIDF Vectorization on text columns

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [48]:
tfidfvectorizer = TfidfVectorizer(stop_words='english', max_features=1000)


In [52]:
vect_text = tfidfvectorizer.fit_transform(data['cleaned_text'])

##### LDA on Vectorized text

In [56]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=42, max_iter=1)
lda_top=lda_model.fit_transform(vect_text)

In [59]:
#checking the output
lda_top[0:5]

array([[0.02341706, 0.0234179 , 0.02341956, 0.02341794, 0.31378355,
        0.02342413, 0.0234194 , 0.12577401, 0.02342208, 0.39650437],
       [0.22822555, 0.12898856, 0.02642639, 0.27171894, 0.0264333 ,
        0.0264245 , 0.02642634, 0.02640721, 0.21250359, 0.02644563],
       [0.02272474, 0.0226792 , 0.02267194, 0.02268252, 0.79588356,
        0.02267184, 0.0226708 , 0.02266954, 0.0226707 , 0.02267516],
       [0.0231176 , 0.02311831, 0.02312499, 0.02311768, 0.02312547,
        0.02311998, 0.37114395, 0.02311714, 0.44389168, 0.0231232 ],
       [0.02766786, 0.14364936, 0.02766968, 0.02767097, 0.02767272,
        0.02767311, 0.02766891, 0.02767561, 0.63498248, 0.02766929]])

##### Checking the results

In [60]:
print("Document 0: ")
for i,topic in enumerate(lda_top[0]):
    print("Topic ",i,":",topic*100,"%")

Document 0: 
Topic  0 : 2.341706372425738 %
Topic  1 : 2.3417900119648496 %
Topic  2 : 2.3419556953566034 %
Topic  3 : 2.3417941445308688 %
Topic  4 : 31.378354750706972 %
Topic  5 : 2.342413439889342 %
Topic  6 : 2.341939902452953 %
Topic  7 : 12.577400802225464 %
Topic  8 : 2.3422083355725616 %
Topic  9 : 39.65043654487465 %


##### Analyzing the topics

In [116]:
#Let us check what are the top words that comprise the topics. 
#This would give us a view of what defines each of these topics.

vocab = tfidfvectorizer.get_feature_names()
for i, comp in enumerate(lda_model.components_):
     vocab_comp = zip(vocab, comp)
     sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
     print("Topic "+str(i)+": ")
     for t in sorted_words:
            print(t[0], end=" ")
           
            
            

    

Topic 0: 
treat dog chew training grain jerky teeth beef love salmon Topic 1: 
peanut chocolate butter bar candy love snack cracker yummy almond Topic 2: 
coffee flavor taste like good strong cup green drink dark Topic 3: 
hair ball soft puppy cake bread skin filling morning oatmeal Topic 4: 
chip like taste good flavor great cooky salt snack love Topic 5: 
amazon price product shipping store great order love good time Topic 6: 
sugar water coconut drink taste product com like bottle chai Topic 7: 
sauce store local grocery raisin amazon awesome online buying price Topic 8: 
arrived quickly gift order delivery product ordered received great thanks Topic 9: 
food love like product cat good year time baby really 