# Cluster Analysis- User Clusters and Movie Clusters (kmeans)

In [12]:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode()
from sklearn.decomposition import PCA

In [13]:
# pass in column names for each CSV and read them using pandas. 
# Column names available in the readme file

#Reading data file:

rating_df = pd.read_csv('../data/ml-latest-small/ratings.csv', sep=',',header=0)
movie_df = pd.read_csv('../data/ml-latest-small/movies.csv',sep=',', header=0)

In [14]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [16]:
combined_df=pd.merge(movie_df,rating_df,how='inner', on='movieId')

In [17]:
combined_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091


# User X Item Matrix

In [18]:
table=pd.pivot_table(index='userId', columns='movieId', values='rating', data=rating_df, fill_value=0)

In [19]:
table=table.reset_index()

In [20]:
rating_df.userId.unique().shape

(671,)

In [21]:
cluster=KMeans(n_clusters=5)

In [22]:
table['cluster']=cluster.fit_predict(table)

In [23]:
cols=table.columns

## Principal Component Analysis to reduce the dataset to 2 dimensions

In [25]:
pca=PCA(n_components=2)
table['x']=pca.fit_transform(table[cols])[:,0]
table['y']=pca.fit_transform(table[cols])[:,1]
table=table.reset_index()

In [26]:
table.head()

movieId,index,userId,1,2,3,4,5,6,7,8,...,161830,161918,161944,162376,162542,162672,163949,cluster,x,y
0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0,0.0,0,0,0,4,335.011706,-10.885999
1,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0,0.0,0,0,0,4,333.997945,-5.257939
2,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0,0.0,0,0,0,4,333.022564,-6.082293
3,3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0,0.0,0,0,0,4,332.00759,15.837411
4,4,5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0,0.0,0,0,0,4,331.001337,0.797163


In [27]:
user_clusters=table[['userId','cluster','x','y']]

In [28]:
user_clusters.tail()

movieId,userId,cluster,x,y
666,667,0,-331.000862,-7.42444
667,668,0,-331.997375,-10.477362
668,669,0,-332.981665,-10.042687
669,670,0,-333.996988,-8.398167
670,671,0,-335.003125,5.392092


## User clusters

In [19]:
trace0=go.Scatter(x=user_clusters[user_clusters.cluster==0]['x'],y=user_clusters[user_clusters.cluster==0]['y'],
                  name='Cluster 1', 
                  marker=dict(size=10,
                             color="rgba(15, 152, 152, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace1=go.Scatter(x=user_clusters[user_clusters.cluster==1]['x'],y=user_clusters[user_clusters.cluster==1]['y'],
                  name='Cluster 2', 
                  marker=dict(size=10,
                             color="rgba(122, 152, 152, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace2=go.Scatter(x=user_clusters[user_clusters.cluster==2]['x'],y=user_clusters[user_clusters.cluster==2]['y'],
                  name='Cluster 3', 
                  marker=dict(size=10,
                             color="rgba(132, 132, 132, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace3=go.Scatter(x=user_clusters[user_clusters.cluster==3]['x'],y=user_clusters[user_clusters.cluster==3]['y'],
                  name='Cluster 4', 
                  marker=dict(size=10,
                             color="rgba(122, 122, 12, 0.8)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace4=go.Scatter(x=user_clusters[user_clusters.cluster==4]['x'],y=user_clusters[user_clusters.cluster==4]['y'],
                  name='Cluster 5', 
                  marker=dict(size=10,
                             color="rgba(230, 20, 30, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
data=[trace0, trace1, trace2, trace3, trace4]
iplot(data)

In [29]:
user_clusters[user_clusters.cluster==1].shape[0]

133

## Movie Clusters

In [30]:
table=pd.pivot_table(index='userId', columns='movieId', values='rating', data=rating_df, fill_value=0)
table=table.T

cluster=KMeans(n_clusters=5)
table['cluster']=cluster.fit_predict(table)
cols=table.columns
pca=PCA(n_components=2)
table['x']=pca.fit_transform(table[cols])[:,0]
table['y']=pca.fit_transform(table[cols])[:,1]
table=table.reset_index()
movie_clusters=table[['movieId','cluster','x','y']]

trace0=go.Scatter(x=movie_clusters[movie_clusters.cluster==0]['x'],y=movie_clusters[movie_clusters.cluster==0]['y'],
                  name='Cluster 1', 
                  marker=dict(size=10,
                             color="rgba(15, 152, 152, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace1=go.Scatter(x=movie_clusters[movie_clusters.cluster==1]['x'],y=movie_clusters[movie_clusters.cluster==1]['y'],
                  name='Cluster 2', 
                  marker=dict(size=10,
                             color="rgba(15, 152, 152, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace2=go.Scatter(x=movie_clusters[movie_clusters.cluster==2]['x'],y=movie_clusters[movie_clusters.cluster==2]['y'],
                  name='Cluster 3', 
                  marker=dict(size=10,
                             color="rgba(132, 132, 132, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace3=go.Scatter(x=movie_clusters[movie_clusters.cluster==3]['x'],y=movie_clusters[movie_clusters.cluster==3]['y'],
                  name='Cluster 4', 
                  marker=dict(size=10,
                             color="rgba(122, 122, 12, 0.8)",
                             line=dict(width=1, color="rgb(0,0,0)")))
trace4=go.Scatter(x=movie_clusters[movie_clusters.cluster==4]['x'],y=movie_clusters[movie_clusters.cluster==4]['y'],
                  name='Cluster 5', 
                  marker=dict(size=10,
                             color="rgba(230, 20, 30, 0.5)",
                             line=dict(width=1, color="rgb(0,0,0)")))
data=[trace0, trace1, trace2, trace3, trace4]
iplot(data)

<img src="newplot.png">

## Cluster Analysis

In [31]:
Cluster_1=movie_clusters['movieId'][movie_clusters.cluster==0].values
Cluster_2=movie_clusters['movieId'][movie_clusters.cluster==1].values
Cluster_3=movie_clusters['movieId'][movie_clusters.cluster==2].values
Cluster_4=movie_clusters['movieId'][movie_clusters.cluster==3].values
Cluster_5=movie_clusters['movieId'][movie_clusters.cluster==4].values

In [32]:
movie_df[movie_df.movieId.isin(Cluster_1)].shape

(796, 3)

In [33]:
movie_df[movie_df.movieId.isin(Cluster_2)].shape

(7690, 3)

In [34]:
movie_df[movie_df.movieId.isin(Cluster_3)].shape

(390, 3)

In [35]:
movie_df[movie_df.movieId.isin(Cluster_4)].shape

(53, 3)

In [36]:
movie_df[movie_df.movieId.isin(Cluster_5)].shape

(137, 3)