In [1]:
# Load essential libraries
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from math import sqrt
import copy
from sklearn.feature_extraction.text import TfidfTransformer
from math import sqrt
import copy

## Question 1 Explore and clean the data

In [2]:
# Load the data
data = pd.read_csv("amazon-reviews.csv.bz2", sep="\t")

In [3]:
# Check first few observations
data.head()

Unnamed: 0,date,summary,review,rating
0,2013-07-16,Awesine,Perfect for new parents. We were able to keep ...,5
1,2013-06-29,Should be required for all new parents!,This book is such a life saver. It has been s...,5
2,2014-03-19,Grandmother watching baby,Helps me know exactly how my babies day has go...,5
3,2013-08-17,repeat buyer,I bought this a few times for my older son and...,5
4,2014-04-01,Great,I wanted an alternative to printing out daily ...,4


In [4]:
# Checking the shape of data
data.shape

(205331, 4)

In [5]:
# Check data types
data.dtypes

date       object
summary    object
review     object
rating      int64
dtype: object

In [6]:
# Check null values in reviews
data['review'].isna().value_counts()

False    205251
True         80
Name: review, dtype: int64

There are 80 null values in reviews

In [7]:
# Checking duplicate rows
len(data['review'][data.duplicated()])

9500

There are 9500 duplicate reviews

In [8]:
# Remove missing and empty reviews
data.drop_duplicates(subset = None, keep = 'first', inplace = True)
data.drop(data[data['review'].isna()].index, inplace=True)
data.drop(list(data['review'].filter(regex = '\\s+')), axis = 1, inplace = True)
data.drop(data[data['review'].str.len() == 0].index, axis = 1, inplace = True)

In [9]:
# Shape of data after removing missing, duplicate and empty values
data.shape

(195758, 4)

In [10]:
# Setting random seed
random.seed(123)

# Taking 1000 random samples of data
reviews = data.sample(n = 1000) 

# Checking the shape of data
reviews.shape

(1000, 4)

## Question 2 Implement TF-IDF transform

In [11]:
# Tokenize reviews
bow_transformer = CountVectorizer().fit(reviews['review'])

# Print total number of vocab words 
print(len(bow_transformer.vocabulary_))

reviews_bow = bow_transformer.transform(reviews['review'])

print('Shape of sparse matrix', reviews_bow.shape)
print('Amount of non-zero occurrences', reviews_bow.nnz)
sparsity = (100.0 * reviews_bow.nnz/ ( reviews_bow.shape[0] * reviews_bow.shape[1]))
print("sparsity: {}".format(round(sparsity)))

6988
Shape of sparse matrix (1000, 6988)
Amount of non-zero occurrences 63336
sparsity: 1


In [12]:
# implement Tf-idf
Y = reviews_bow.toarray()

# sparse matrix
Y_df = pd.DataFrame(Y, columns = bow_transformer.get_feature_names() )  
len_df = Y_df.shape[0]

# create tf_df dataframe
tf_df = np.log(1 + Y_df) 

In [13]:
# create idf
contain_word = Y_df > 0
contain_word = contain_word.astype(int)

# count of documents that contain the word
sum_word_cont = contain_word.sum(axis = 0)

#  Calculate idf 
idf =  np.log(len_df/(1 + sum_word_cont))
idf_array = np.array(idf)
idf_dataframe = pd.DataFrame([idf_array], columns = bow_transformer.get_feature_names(),index = [0])

tf_idf = tf_df * np.squeeze(idf_dataframe)

# Final tf_idf data frame
tf_idf.head()

Unnamed: 0,00,02,0241,03,05,0mm,10,100,1000,10lb,...,zipper,zippered,zippers,zippy,zips,zit,zoli,zone,zoom,zooms
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Converting data frame to array
reviews_array = np.asarray(tf_idf)


## Question 3 Implement k-Means Clustering

Taking k = 3

In [15]:


# Set random seed
random.seed(122)

# Pick random 3 reviews as centers
center =  random.sample(range(0,reviews_array.shape[0]-1), 3)

centers = []
for i in center:
    centers.append(reviews_array[i])
    
# validate centers   
centers

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.])]

In [16]:
# Creating clusters
clusters = [[] for c in centers]

# Initially assigning every review to the same cluster
for i in range(reviews_array.shape[0]-1):
    clusters[1].append(reviews_array[i])

In [17]:
# Computing norm of review vectors
norm_reviews = []
for i in range(reviews_array.shape[0]-1):
    norm_reviews.append(np.linalg.norm(reviews_array[i]))   

norm_reviews

[17.30482520021442,
 9.44418602694101,
 7.348225767866009,
 16.419603158368822,
 19.77691860635892,
 19.50177443672591,
 15.13396178462844,
 10.038117738984784,
 16.987722087115035,
 34.50637853091263,
 10.717939846984764,
 16.37097222672165,
 24.117854137139894,
 48.21055883541764,
 17.80757753667318,
 18.23239024711451,
 16.357574777679428,
 17.916056874343894,
 12.758587274131338,
 2.5741447961235897,
 12.186532150661668,
 9.907052489426412,
 14.037058413038446,
 17.331678316631145,
 14.0052639108716,
 24.75973187711804,
 32.65517757449065,
 17.428642628684166,
 16.051330343455597,
 21.82074874460699,
 12.850513475603734,
 17.434469638865227,
 15.852843504847812,
 19.48350416793917,
 22.086134713162668,
 42.32958505941374,
 12.307961023762301,
 9.398852659737972,
 18.456392270317146,
 28.46956984721138,
 15.83329875869177,
 18.4247355215224,
 36.289903779556724,
 16.068501933815906,
 24.174010397378613,
 16.672601908714025,
 32.14535581573317,
 30.66871166323363,
 15.012170085797322

In [18]:
# Computing the norm of the cluster center vectors
norm_centers = []
for i in range(len(centers)):
     norm_centers.append(np.linalg.norm(centers[i]))  
norm_centers

[18.5383719083467, 23.30726732886451, 21.31139550297487]

In [19]:
# Function to compute dot product
def dotproduct(vector1, vector2):
    return sum(x*y for x,y in zip(vector1, vector2))

# Function to compute cosine similarity
def similarity(vector1, vector2):
    return dotproduct(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

In [20]:
# Update clusters
indexes = [[] for c in centers]
def update_clusters():
    def closest_center(vector):
        similarity_vector = lambda center : similarity(center, vector)
        center = max(centers, key=similarity_vector)
        return np.where(np.all(center==centers,axis=1))[0][0]
    
    for i, vector in enumerate(reviews_array):
        index = closest_center(vector)
        indexes[index].append(i)
        clusters[index].append(vector)
        

In the above function, we are finding the cosine similarity of each review with every center and then the reviews are classified to that cluster with whose center they had the highest cosine similarity. 

In [21]:
# Update centers
def update_centers():
    new_centers = []
    for cluster in clusters:
        center = np.array([np.mean(c) for c in zip(*cluster)])
        new_centers.append(center)
    return new_centers

In the above function, we are updating our centers by taking the mean of all the reviews present in that cluster.

In [22]:
# Updating the clusters and centers 30 times
for i in range(30):
    update_clusters()
    centers = update_centers()

In [23]:
# check the centers of cluster
centers

[array([0.00999265, 0.01121418, 0.        , ..., 0.        , 0.        ,
        0.0112471 ]),
 array([0.01233009, 0.0117932 , 0.00042054, ..., 0.00042054, 0.01261634,
        0.00042054]),
 array([0.01239106, 0.        , 0.01394659, ..., 0.01394659, 0.00046489,
        0.        ])]

In [24]:
# Check the centers after 31st interation
update_clusters()
centers = update_centers()
print(centers)

[array([0.00998844, 0.01118686, 0.        , ..., 0.        , 0.        ,
       0.01124236]), array([0.01233289, 0.01183057, 0.00040827, ..., 0.00040827, 0.01265632,
       0.00040827]), array([0.01239736, 0.        , 0.01395369, ..., 0.01395369, 0.00045012,
       0.        ])]


Since there's not much change in the center in 30th and 31st iteration and taking running time into account, we are stopping the iterations. Check for K=5 in the next section. 

The algorithm is condsidering the similar products mentioned within the reviews. For e.g.: Some products have reviews of baby products, and some have reviews for electronics.

## Question 4 Play and Analyze Your Clusters

In [25]:
# Printing out a set of reviews from each cluster

sample = []
for i in range(len(indexes)):
    sample.append(indexes[i][50:70])
    
for i in range(len(sample)):
    print('-------------------------------------------------')
    print("Some reviews in cluster number : ", i)
    for j in range(len(sample[i])):
        print(reviews.iloc[sample[i][j]]['review'])
        print()

-------------------------------------------------
Some reviews in cluster number :  0
Arrived early. Looks and works really good. Can't wait to give it to my daughtor on her birthday. She is going to love it.

The perfect bed before going to a full size twin bed, and not ugly like the race car or princess beds. My son loves his, and it is simple. The only down-side is that there is a weight limit, so I cannot sit on it with him when putting him to bed.

My baby was knawing on her fingers, she is 5 months.  This has been a great thing when she can figure out how to hold it right.

Really like this shampoo, it has the mint/tea tree scent and feel but unlike many other mint shampoos it doesn't make your head feel like you've been dunked into a bucket of Listerine and are on fire! It's very subtle and mild. It is a bit drying as most shampoos with sulfates are so I'd recommend using it with conditioner.

I returned this. It didn't fit and it was really scratchy.

My 1-yr-old is incredibly 

While taking 3 clusters we can see that many of the reviews in the 3 clusters are baby products related.

There are some reviews for other products like musical instruments but the bulk of the reviews are related to baby products.

We are now going to play with k=10.

### Taking K=10

In [26]:
# Set random seed
random.seed(122)

# Pick random 10 reviews as centers
center =  random.sample(range(0,reviews_array.shape[0]-1), 10)

centers = []
for i in center:
    centers.append(reviews_array[i])

# Creating clusters
clusters = [[] for c in centers]

# Initially assigning every review to the same cluster
for i in range(reviews_array.shape[0]-1):
    clusters[1].append(reviews_array[i])

In [27]:
# Updating the clusters and centers 30 times
indexes = [[] for c in centers]
for i in range(30):
    update_clusters()
    centers = update_centers()

In [28]:
# Printing out a set of reviews from each cluster
sample = []
for i in range(len(indexes)):
    sample.append(indexes[i][0:10])
    
for i in range(len(sample)):
    print('-----------------------------------------------------------------------------------------------')
    print("Some reviews in cluster number : ", i)
    for j in range(len(sample[i])):
        print(reviews.iloc[sample[i][j]]['review'])
        print()

-----------------------------------------------------------------------------------------------
Some reviews in cluster number :  0
I got this for a friend she really likes it.The quality and color are very nice.  I am thinking of getting one for myself :)

I already had the girl pack, but these are so useful and so good, I wanted more. So I came and got this pack in order to have more variety of colors. Below is my original review for the girl pack:&#34;My baby has a little bit of reflux and she's ALWAYS spitting. Not only after feedings, but sometimes even two hours after feeding we have some spitting episodes. So of course she is ALWAYS wearing a bib. What I loved about this:- Ten pack of great solid colors, now the bibs are accessories that she wears with her outfits, not ugly items she HAS to wear.- Now she has enough bibs for two days, at least.- Waterproof lining, so it prevents her clothes from getting wet.- The lining is SO thin I can't tell it is there. I have other bibs with

The algorithm is condsidering the similar products mentioned within the reviews. For e.g.: Some products have reviews of baby products, and some have reviews for electronics.

When we take k=10, we can see that more clusters define the reviews better.
For e.g.:

Cluster 0 is centered around baby products

Cluster 1 is centered around beauty products

Cluster 6 is centered around electronics etc.

With more number of clusters 30-40, the data would fit better but our local machine cannot take the load, therefore we are stopping at this iteration.