# Clustering

In [52]:
import seaborn as sns
import pandas as pd
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
%matplotlib inline


In [53]:
# import data
test_data = pd.read_csv('Test.csv')

test_data.describe(include='all')


Unnamed: 0,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style,category
count,4500,4500,4500,4500,4500,4500,4497,4500.0,922.0,96,2363,4500
unique,2,2317,4455,4312,4069,4418,3460,,,96,1178,6
top,True,"12 26, 2016",D014D493AC2EC7F701D5D5E8827858B4,849AF742279BA95B2DFB8C2D14AAB61D,760C63E8E5E8DC3FAA01878D37BA5678,good,Five Stars,,,['https://images-na.ssl-images-amazon.com/imag...,{'Format:': ' Audio CD'},automotive
freq,3581,10,4,8,166,22,294,,,1,673,750
mean,,,,,,,,1409970000.0,7.039046,,,
std,,,,,,,,113578500.0,16.246092,,,
min,,,,,,,,909014400.0,2.0,,,
25%,,,,,,,,1390565000.0,2.0,,,
50%,,,,,,,,1439942000.0,3.0,,,
75%,,,,,,,,1479686000.0,6.0,,,


In [54]:
import re

# Preprocessing data


# edit summary content
def edit_text(content):

    # step 1 - convert the text to only lower case
    content = content.lower()

    filtered_words = [word for word in content.split()]
    text = " ".join(filtered_words)

    return text

# create categories for vote label


def assign_vote_label(i):
    if i <= 2.0:
        return 'low'
    if i > 2.0 and i <= 10.0:
        return 'medium'
    if i > 10.0 and i <= 50.0:
        return 'good'
    if i > 50.0:
        return 'high'


# preprocess
def preprocess(data):

    # filter out only important columns
    X_cols = ['reviewText', 'summary', 'verified', 'vote', 'style']

    # editing text of "reviewText" & "summary"
    data['reviewText'] = data['reviewText'].astype(str).apply(
        edit_text)
    data['summary'] = data['summary'].astype(str).apply(edit_text)

    # editing vote - assigning NAN values to 0
    data['vote'] = data['vote'].fillna(0)
    data['vote'] = data['vote'].apply(lambda x: assign_vote_label(x))

    data['style'] = data['style'].fillna('')

    X = test_data[X_cols]

    return X


In [55]:
# Data
X = preprocess(test_data)


In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder

# define the column transformer for all data
ct1 = ColumnTransformer(
    [
        ("vect_summary", CountVectorizer(), 'summary'),
        ("vect_reviewText", CountVectorizer(), 'reviewText'),
        ("vect_style", CountVectorizer(), 'style'),
        ("encd_verified", OrdinalEncoder(dtype=int), ['verified']),
        ("encd_votes", OrdinalEncoder(dtype=int), ['vote']),
    ])


# define the column transformer for non-text data
ct2 = ColumnTransformer(
    [
        ("vect_style", CountVectorizer(), 'style'),
        ("encd_verified", OrdinalEncoder(dtype=int), ['verified']),
        ("encd_votes", OrdinalEncoder(dtype=int), ['vote']),
    ])

# define the column transformer for only text-data
ct3 = ColumnTransformer(
    [
        ("vect_summary", CountVectorizer(), 'summary'),
        ("vect_reviewText", CountVectorizer(), 'reviewText'),
    ])

# define the column transformer for only style data
ct4 = ColumnTransformer(
    [
        ("vect_style", CountVectorizer(), 'style'),
    ])

# For Results
data = []
n_clust = []
best_sil_score = []
best_rand_score = []


# Kmeans Clustering

# For all data

In [57]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score

n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 300, 500]
sil_scores_1 = []
rand_scores_1 = []

for n in n_clusters:
    # Fit
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=101)
    transformed_X = ct1.fit_transform(X)
    kmeans.fit(transformed_X)

    # silhouette score
    silhouette_avg = silhouette_score(transformed_X, kmeans.labels_)
    sil_scores_1.append(silhouette_avg)

    # rand index
    ari = adjusted_rand_score(test_data["category"], kmeans.labels_)
    rand_scores_1.append(ari)

print(pd.DataFrame({"Clusters": n_clusters,
      "silhouette_score": sil_scores_1, "rand_scores": rand_scores_1}))

data.append("All Data")
best_sil_score.append(max(sil_scores_1))
n_clust.append(n_clusters[sil_scores_1.index(max(sil_scores_1))])
best_rand_score.append(rand_scores_1[sil_scores_1.index(max(sil_scores_1))])


    Clusters  silhouette_score  rand_scores
0          2          0.669292     0.002898
1          3          0.454393     0.011391
2          4          0.427389     0.012330
3          5          0.346668     0.013058
4          6          0.242157     0.013754
5          7          0.242090     0.013758
6          8          0.207398     0.013897
7          9          0.171408     0.014159
8         10          0.148355     0.014417
9        100         -0.058228     0.029597
10       300         -0.069182     0.033779
11       500         -0.068950     0.010179


# For non-text data


In [58]:

n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 300, 500]
sil_scores_2 = []
rand_scores_2 = []

for n in n_clusters:
    # Fit
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=101)
    transformed_X = ct2.fit_transform(X)
    kmeans.fit(transformed_X)

    # silhouette score
    silhouette_avg = silhouette_score(transformed_X, kmeans.labels_)
    sil_scores_2.append(silhouette_avg)

    # rand index
    ari = adjusted_rand_score(test_data["category"], kmeans.labels_)
    rand_scores_2.append(ari)

print(pd.DataFrame({"Clusters": n_clusters,
      "silhouette_score": sil_scores_2, "rand_scores": rand_scores_2}))

data.append("Non-Text Data")
best_sil_score.append(max(sil_scores_2))
n_clust.append(n_clusters[sil_scores_2.index(max(sil_scores_2))])
best_rand_score.append(rand_scores_2[sil_scores_2.index(max(sil_scores_2))])


    Clusters  silhouette_score  rand_scores
0          2          0.363078     0.112200
1          3          0.378734     0.158963
2          4          0.394763     0.177285
3          5          0.406634     0.174259
4          6          0.421249     0.163726
5          7          0.426161     0.159127
6          8          0.414054     0.177877
7          9          0.434594     0.182155
8         10          0.425720     0.136177
9        100          0.621245     0.087932
10       300          0.657036     0.086033
11       500          0.692069     0.084945


# For only text data


In [59]:

n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 300, 500]
sil_scores_3 = []
rand_scores_3 = []

for n in n_clusters:
    # Fit
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=101)
    transformed_X = ct3.fit_transform(X)
    kmeans.fit(transformed_X)

    # silhouette score
    silhouette_avg = silhouette_score(transformed_X, kmeans.labels_)
    sil_scores_3.append(silhouette_avg)

    # rand index
    ari = adjusted_rand_score(test_data["category"], kmeans.labels_)
    rand_scores_3.append(ari)

print(pd.DataFrame({"Clusters": n_clusters,
      "silhouette_score": sil_scores_3, "rand_scores": rand_scores_3}))

data.append("Only Text Data")
best_sil_score.append(max(sil_scores_3))
n_clust.append(n_clusters[sil_scores_3.index(max(sil_scores_3))])
best_rand_score.append(rand_scores_2[sil_scores_3.index(max(sil_scores_3))])


    Clusters  silhouette_score  rand_scores
0          2          0.675094     0.002798
1          3          0.463293     0.010859
2          4          0.437115     0.011946
3          5          0.346559     0.012891
4          6          0.267419     0.013063
5          7          0.250198     0.013527
6          8          0.208802     0.013640
7          9          0.167885     0.013662
8         10          0.159410     0.014268
9        100         -0.062177     0.010673
10       300         -0.059621     0.009933
11       500         -0.059989     0.008167


# For only style data

In [60]:

n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 300, 500]
sil_scores_4 = []
rand_scores_4 = []

for n in n_clusters:
    # Fit
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=101)
    transformed_X = ct4.fit_transform(X)
    kmeans.fit(transformed_X)

    # silhouette score
    silhouette_avg = silhouette_score(transformed_X, kmeans.labels_)
    sil_scores_4.append(silhouette_avg)

    # rand index
    ari = adjusted_rand_score(test_data["category"], kmeans.labels_)
    rand_scores_4.append(ari)

print(pd.DataFrame({"Clusters": n_clusters,
      "silhouette_score": sil_scores_4, "rand_scores": rand_scores_4}))

data.append("Only Style Data")
best_sil_score.append(max(sil_scores_4))
n_clust.append(n_clusters[sil_scores_4.index(max(sil_scores_4))])
best_rand_score.append(rand_scores_4[sil_scores_4.index(max(sil_scores_4))])


    Clusters  silhouette_score  rand_scores
0          2          0.441155     0.112200
1          3          0.496103     0.158963
2          4          0.536410     0.177285
3          5          0.558401     0.174141
4          6          0.565777     0.172626
5          7          0.579977     0.167456
6          8          0.575522     0.167501
7          9          0.576248     0.165509
8         10          0.590985     0.163837
9        100          0.679890     0.161545
10       300          0.708331     0.162622
11       500          0.733584     0.161114


# Results

For data including Text n = 2, provides the best silhouette_score.

For data not including Text n >> 6(500), provides the best silhouette_score.

In [62]:
pd.DataFrame({"Data Used": data, "Number of  clusers": n_clust,"Silhouette score": best_sil_score,
              "Rand Index Score": best_rand_score}).head()


Unnamed: 0,Data Used,Number of clusers,Silhouette score,Rand Index Score
0,All Data,2,0.669292,0.002898
1,Non-Text Data,500,0.692069,0.084945
2,Only Text Data,2,0.675094,0.1122
3,Only Style Data,500,0.733584,0.161114
