In [1]:
import numpy as np 
import pandas as pd

In [2]:
df = pd.read_csv("watch_reviews.tsv", sep = '\t', header = 0, error_bad_lines = False)

b'Skipping line 8704: expected 15 fields, saw 22\nSkipping line 16933: expected 15 fields, saw 22\nSkipping line 23726: expected 15 fields, saw 22\n'
b'Skipping line 85637: expected 15 fields, saw 22\n'
b'Skipping line 132136: expected 15 fields, saw 22\nSkipping line 158070: expected 15 fields, saw 22\nSkipping line 166007: expected 15 fields, saw 22\nSkipping line 171877: expected 15 fields, saw 22\nSkipping line 177756: expected 15 fields, saw 22\nSkipping line 181773: expected 15 fields, saw 22\nSkipping line 191085: expected 15 fields, saw 22\nSkipping line 196273: expected 15 fields, saw 22\nSkipping line 196331: expected 15 fields, saw 22\n'
b'Skipping line 197000: expected 15 fields, saw 22\nSkipping line 197011: expected 15 fields, saw 22\nSkipping line 197432: expected 15 fields, saw 22\nSkipping line 208016: expected 15 fields, saw 22\nSkipping line 214110: expected 15 fields, saw 22\nSkipping line 244328: expected 15 fields, saw 22\nSkipping line 248519: expected 15 fields,

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960204 entries, 0 to 960203
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   marketplace        960204 non-null  object
 1   customer_id        960204 non-null  int64 
 2   review_id          960204 non-null  object
 3   product_id         960204 non-null  object
 4   product_parent     960204 non-null  int64 
 5   product_title      960202 non-null  object
 6   product_category   960204 non-null  object
 7   star_rating        960204 non-null  int64 
 8   helpful_votes      960204 non-null  int64 
 9   total_votes        960204 non-null  int64 
 10  vine               960204 non-null  object
 11  verified_purchase  960204 non-null  object
 12  review_headline    960197 non-null  object
 13  review_body        960056 non-null  object
 14  review_date        960200 non-null  object
dtypes: int64(5), object(10)
memory usage: 109.9+ MB


In [4]:
df.dropna(subset=['review_body'], inplace = True) # inplace?

In [5]:
# use the first 10000 data as out training data
data = df.loc[:10007, 'review_body'].tolist()
len(data)

10000

## Tokenizing and stemming

In [6]:
import nltk

In [7]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append("'m")
stopwords.append("'s")
stopwords.append("br")
stopwords.append("n't")
stopwords.append("ve")
print(stopwords[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [8]:
from nltk.stem.snowball import SnowballStemmer
import re

In [9]:
stemmer = SnowballStemmer('english')

In [10]:
def tokenizing_and_stemming(text):
    tokens = []
    for word in nltk.word_tokenize(text):
        if word.lower() not in stopwords:
            tokens.append(word.lower())
    filtered_tokens = []
    
    for token in tokens:
        if re.search('[a-z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [11]:
tokenizing_and_stemming(data[9999])

['watch', 'face', 'great', 'leather', 'band', 'tear', 'alreadi', 'wear', 'day']

## TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [13]:
tfidf_model = TfidfVectorizer(max_df = 0.99, max_features = 1500, 
                             min_df= 0.01, stop_words='english',
                             use_idf = True, tokenizer= tokenizing_and_stemming, ngram_range=(1,2))
tfidf_matrix = tfidf_model.fit_transform(data)
print("review number:", tfidf_matrix.shape[0], 'terms number:',tfidf_matrix.shape[1])

  'stop_words.' % sorted(inconsistent))


review number: 10000 terms number: 258


In [54]:
# tfidf_model.get_params()
tf_selected_words = tfidf_model.get_feature_names()

In [19]:
tfidf_matrix

<10000x258 sparse matrix of type '<class 'numpy.float64'>'
	with 78851 stored elements in Compressed Sparse Row format>

## Clustering

In [42]:
from sklearn.cluster import KMeans
num_clusters = 5

km = KMeans(n_clusters = num_clusters)
km.fit(tfidf_matrix) 

# array to list
clusters = km.labels_.tolist()

In [37]:
df[:10].review_body

0    Absolutely love this watch! Get compliments al...
1         I love this watch it keeps time wonderfully.
2                                            Scratches
3    It works well on me. However, I found cheaper ...
4    Beautiful watch face.  The band looks nice all...
5    i love this watch for my purpose, about the pe...
6    for my wife and she loved it, looks great and ...
7    I was about to buy this thinking it was a Swis...
8    Watch is perfect. Rugged with the metal &#34;B...
9    Great quality and build.<br />The motors are r...
Name: review_body, dtype: object

In [43]:
reviewcode = {
    "review" : df[:10000].review_body,
    "cluster": clusters
}
codedf = pd.DataFrame(reviewcode, columns = ["review","cluster"])

In [44]:
codedf.head(10)

Unnamed: 0,review,cluster
0,Absolutely love this watch! Get compliments al...,0
1,I love this watch it keeps time wonderfully.,0
2,Scratches,4
3,"It works well on me. However, I found cheaper ...",4
4,Beautiful watch face. The band looks nice all...,4
5,"i love this watch for my purpose, about the pe...",0
6,"for my wife and she loved it, looks great and ...",2
7,I was about to buy this thinking it was a Swis...,4
8,Watch is perfect. Rugged with the metal &#34;B...,2
9,Great quality and build.<br />The motors are r...,2


In [45]:
codedf['cluster'].value_counts().to_frame()

Unnamed: 0,cluster
4,6598
2,1169
0,968
3,639
1,626


In [46]:
km.cluster_centers_

array([[0.00136575, 0.02621321, 0.00050246, ..., 0.00285365, 0.00698811,
        0.0087124 ],
       [0.00084447, 0.        , 0.00159272, ..., 0.0036197 , 0.00346334,
        0.00148756],
       [0.00185264, 0.00231729, 0.00471825, ..., 0.00479869, 0.00926884,
        0.01232898],
       [0.        , 0.        , 0.00155345, ..., 0.00371263, 0.01063194,
        0.0015916 ],
       [0.00526333, 0.00436649, 0.00622658, ..., 0.00866152, 0.02330434,
        0.01529667]])

In [47]:
order_centroids = km.cluster_centers_.argsort()

In [52]:
# sort the feature by decresing importance
order_centroids[:,::-1]

array([[136, 137, 232, ..., 167, 166, 127],
       [ 94, 172,  97, ..., 196, 195, 127],
       [ 99, 102, 232, ..., 195, 166,  32],
       [148, 150, 232, ..., 167,  33,   0],
       [232, 129, 123, ..., 253, 100, 101]], dtype=int64)

In [56]:
cluster_keyword={}
for i in range(num_clusters):
    print("cluster ", i, "words:", end = '')
    cluster_keyword[i] = []
    for ind in order_centroids[i, :10]:
        cluster_keyword[i].append(tf_selected_words[ind])
        print(tf_selected_words[ind],",", end= '')
    print()

cluster  0 words:ll ,pin ,plastic ,origin ,great look ,hour ,button ,good watch ,second hand ,movement ,
cluster  1 words:ll ,second hand ,seiko ,bought watch ,watch love ,bracelet ,sinc ,watch great ,watch face ,small wrist ,
cluster  2 words:broke ,pin ,second hand ,like watch ,bracelet ,review ,easili ,quit ,ok ,model ,
cluster  3 words:abl ,button ,plastic ,origin ,cute ,new ,metal ,digit ,bought watch ,hour ,
cluster  4 words:great price ,great look ,work great ,watch great ,great watch ,good qualiti ,good watch ,watch good ,water resist ,look great ,


## Latent Dirichlet Allocation (LDA)

In [57]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 5)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
tfidf_lda = CountVectorizer(max_df=0.99, max_features=1000,
                           min_df=0.01, stop_words = 'english',
                           tokenizer=tokenizing_and_stemming, ngram_range=(1,2))

tfidf_matrix_lda = tfidf_lda.fit_transform(data)
print("reviews:",tfidf_matrix_lda.shape[0], "feature", tfidf_matrix_lda.shape[1])

  'stop_words.' % sorted(inconsistent))


reviews: 10000 feature 258


In [59]:
lda_output = lda.fit_transform(tfidf_matrix_lda)
print(lda_output)

[[0.02258025 0.02241308 0.67944206 0.02285546 0.25270915]
 [0.04060411 0.04068079 0.59234168 0.04128062 0.28509279]
 [0.10000049 0.1009091  0.10000036 0.10086956 0.59822048]
 ...
 [0.00889445 0.15054642 0.22948582 0.60204023 0.00903309]
 [0.01011648 0.01018586 0.08810265 0.01017458 0.88142042]
 [0.020284   0.02009447 0.15395578 0.29438745 0.5112783 ]]


In [60]:
topic_word = lda.components_
print(topic_word)

[[2.02088384e-01 2.03887426e-01 2.00357546e+01 ... 4.46877932e+01
  5.33766960e+01 3.26986306e-01]
 [2.02123680e-01 2.11131816e-01 2.67615040e+01 ... 4.43596798e+01
  1.48671398e+01 2.69605469e-01]
 [2.03392814e-01 1.26611337e+02 2.02404703e-01 ... 2.01223972e+01
  1.92697030e+02 2.34791470e+01]
 [6.27145104e+01 2.07710581e-01 1.11143376e+01 ... 8.89311562e+01
  4.02952140e+01 4.30784285e+02]
 [1.01677885e+02 2.37659331e+01 1.27885999e+02 ... 2.78989735e+01
  5.98763920e+02 1.12139976e+02]]


In [62]:
topic_name = ["topic" + str(i) for i in range(lda.n_components)]
doc_names = ["Doc" + str(i) for i in range(len(data))]

df_doc_topic = pd.DataFrame(np.round(lda_output, 2), columns= topic_name, index= doc_names)
topic = np.argmax(df_doc_topic.values, axis = 1) # get the max value of the topic 
df_doc_topic['topic'] = topic
df_doc_topic.head()


Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic
Doc0,0.02,0.02,0.68,0.02,0.25,2
Doc1,0.04,0.04,0.59,0.04,0.29,2
Doc2,0.1,0.1,0.1,0.1,0.6,4
Doc3,0.43,0.03,0.03,0.47,0.03,3
Doc4,0.15,0.15,0.14,0.14,0.41,4


In [63]:
df_doc_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
4,2676
2,2231
3,1921
0,1714
1,1458


In [64]:
df_topic_words = pd.DataFrame(lda.components_)

df_topic_words.columns = tfidf_lda.get_feature_names()
df_topic_words.index = topic_name
df_topic_words.head()

Unnamed: 0,abl,absolut,accur,actual,adjust,alarm,alreadi,alway,amaz,amazon,...,weight,white,wife,wish,work,work great,worn,worth,wrist,year
topic0,0.202088,0.203887,20.035755,19.759086,0.203891,0.200855,0.429116,0.20378,0.226788,43.032776,...,24.733577,31.954617,129.681643,27.964251,0.413478,0.201453,4.625367,44.687793,53.376696,0.326986
topic1,0.202124,0.211132,26.761504,1.231702,10.803901,24.210906,0.261693,14.631208,0.203015,0.202593,...,15.053441,0.201929,5.509059,0.203214,89.330168,1.041585,0.202665,44.35968,14.86714,0.269605
topic2,0.203393,126.611337,0.202405,8.874735,15.455996,0.202435,2.768709,18.327735,141.342631,0.434699,...,23.306913,0.205083,3.255911,0.961538,115.584576,124.8608,0.204043,20.122397,192.69703,23.479147
topic3,62.71451,0.207711,11.114338,28.881846,1.392283,0.203656,104.716629,48.92303,22.949872,230.545662,...,0.201698,0.203933,0.203517,20.579897,854.999679,13.254165,47.850486,88.931156,40.295214,430.784285
topic4,101.677885,23.765933,127.885999,140.252632,238.143929,136.182148,8.823853,88.914247,19.277694,24.78427,...,91.704371,102.434437,12.349871,109.291101,291.672098,0.641996,93.117439,27.898974,598.76392,112.139976


In [66]:
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names())
    topic_words = []
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words))
    return topic_words

topic_keywords = print_topic_words(tfidf_model = tfidf_lda, lda_model= lda, n_words=15)

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,watch,nice,beauti,price,nice watch,great,awesom,color,thank,beauti watch,cool,realli,purchas,best,watch price
Topic 1,good,look,qualiti,watch,excel,product,price,nice,cheap,look good,perfect,look watch,nice look,time,easi
Topic 2,love,watch,great,look,love watch,great watch,look great,gift,expect,husband,perfect,band,wrist,big,watch look
Topic 3,watch,work,band,time,batteri,replac,year,recommend,strap,day,month,use,bought,buy,receiv
Topic 4,watch,like,time,look,band,face,use,wear,wrist,hand,read,easi,light,day,realli
