<a href="https://colab.research.google.com/github/rposhala/Recommender-System-on-MovieLens-dataset/blob/main/Item_based_Collaborative_Recommender_System_using_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### KMeans clusteting model to build item-based collaborative Recommender System.

In [2]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

## Loading refined_dataset

In [3]:
refined_dataset = pd.read_csv("../data/refined_dataset.csv")

#### Working test a small dataset

In [4]:
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144999 entries, 0 to 144998
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  144999 non-null  int64  
 1   userId      144999 non-null  int64  
 2   title       144999 non-null  object 
 3   movieId     144999 non-null  int64  
 4   rating      144999 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 5.5+ MB


In [5]:
refined_dataset = refined_dataset.loc[0:1000000,:]
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144999 entries, 0 to 144998
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  144999 non-null  int64  
 1   userId      144999 non-null  int64  
 2   title       144999 non-null  object 
 3   movieId     144999 non-null  int64  
 4   rating      144999 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 5.5+ MB


In [6]:
refined_dataset = refined_dataset.drop('Unnamed: 0', axis='columns')
refined_dataset.head()

Unnamed: 0,userId,title,movieId,rating
0,1,2001: A Space Odyssey (1968),924,3.5
1,1,28 Days Later (2002),6502,3.5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0
4,1,Alien (1979),1214,4.0


### Recommender System by User
#### Inputs = userId
#### output = 5 movies

In [7]:
dataset_user = refined_dataset.loc[:, ['movieId', 'rating']] 
dataset_user

Unnamed: 0,movieId,rating
0,924,3.5
1,6502,3.5
2,3153,4.0
3,4467,4.0
4,1214,4.0
...,...,...
144994,1934,3.5
144995,4563,3.0
144996,1378,3.0
144997,4816,3.5


In [8]:
kmeans_user = KMeans(25)
kmeans_user.fit(dataset_user)

# Get the cluster labels
labels = kmeans_user.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_user, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_user, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_user, labels)
print("Calinski Harabasz score:", ch_score)


  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4662174219805517
Calinski Harabasz score: 8284606.698188847


In [9]:
identified_users= kmeans_user.fit_predict(dataset_user)
identified_users = list(identified_users)

  super()._check_params_vs_input(X, default_n_init=10)


In [10]:
refined_dataset['loc_clusters_users'] = identified_users
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,16
1,1,28 Days Later (2002),6502,3.5,15
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,22
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,4
4,1,Alien (1979),1214,4.0,16
...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,10
144995,971,Young Einstein (1988),4563,3.0,24
144996,971,Young Guns (1988),1378,3.0,16
144997,971,Zoolander (2001),4816,3.5,24


In [11]:
print(refined_dataset['loc_clusters_users'].value_counts())

0     33308
16    26957
10    18269
22    16372
4     13310
24     8842
15     8171
7      3184
17     1845
12     1837
5      1472
1      1342
14     1326
6      1060
8      1000
11      970
20      868
23      816
21      758
18      650
13      591
2       574
3       560
19      556
9       361
Name: loc_clusters_users, dtype: int64


In [12]:
input_user = 100
input_user = int(input_user)
cluster_users = refined_dataset.loc[refined_dataset['userId'] == input_user, 'loc_clusters_users']
print(cluster_users.value_counts())
cluster_users = Counter(cluster_users).most_common(1)[0] # 4, 6 times
print(cluster_users)
cluster_users[0]

0     46
16     6
Name: loc_clusters_users, dtype: int64
(0, 46)


0

#### Check the name of the movie from its ID

In [13]:
users = refined_dataset.loc[refined_dataset['loc_clusters_users'] == cluster_users[0], 'userId']
users_list = users.tolist()
list_movie_id = []
list_movies_title = []

for c in users_list:
        movie = refined_dataset.loc[refined_dataset['movieId'] == users.iloc[c]]
        if len(movie) != 0:
            movie_data = movie.iloc[0]
            movie_title = movie_data.title
            movie_id = movie_data.movieId
            list_movie_id.append(movie_id)
            list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df = movie_reco_df.drop_duplicates()
movie_reco_df.sample(n=5) 

Unnamed: 0,list_movies_title,list_movie_id
11236,Dracula: Dead and Loving It (1995),12
17738,Sense and Sensibility (1995),17
27907,Othello (1995),26
18025,Ace Ventura: When Nature Calls (1995),19
1093,Grumpier Old Men (1995),3


In [14]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,Toy Story (1995),1
539,Jumanji (1995),2
1093,Grumpier Old Men (1995),3
1877,Waiting to Exhale (1995),4
2589,Father of the Bride Part II (1995),5
4420,Heat (1995),6
5123,Sabrina (1995),7
6783,Tom and Huck (1995),8
8821,Sudden Death (1995),9
8839,GoldenEye (1995),10


In [15]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29 entries, 0 to 31280
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  29 non-null     object
 1   list_movie_id      29 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 696.0+ bytes


### Recommender System by Movie
#### Inputs = movieId
#### Output = 5 movies

In [16]:
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,16
1,1,28 Days Later (2002),6502,3.5,15
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,22
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,4
4,1,Alien (1979),1214,4.0,16
...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,10
144995,971,Young Einstein (1988),4563,3.0,24
144996,971,Young Guns (1988),1378,3.0,16
144997,971,Zoolander (2001),4816,3.5,24


In [17]:
dataset_movie = refined_dataset.loc[:, ['userId', 'rating']] 

In [18]:
kmeans_movie = KMeans(25)
kmeans_movie.fit(dataset_user)

# Get the cluster labels
labels = kmeans_movie.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_movie, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_movie, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_movie, labels)
print("Calinski Harabasz score:", ch_score)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 871.4482840414654
Calinski Harabasz score: 18.785424119566382


In [19]:
identified_movies = kmeans_movie.fit_predict(dataset_movie)
identified_movies = list(identified_movies)

  super()._check_params_vs_input(X, default_n_init=10)


In [20]:
refined_dataset['loc_clusters_movies'] = identified_movies
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,16,11
1,1,28 Days Later (2002),6502,3.5,15,11
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,22,11
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,4,11
4,1,Alien (1979),1214,4.0,16,11
...,...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,10,19
144995,971,Young Einstein (1988),4563,3.0,24,19
144996,971,Young Guns (1988),1378,3.0,16,19
144997,971,Zoolander (2001),4816,3.5,24,19


In [21]:
print(refined_dataset['loc_clusters_movies'].value_counts())

16    8596
9     8304
17    7536
8     7420
7     7339
13    7118
12    7100
10    6332
18    6310
14    6246
2     5771
15    5645
4     5608
21    5311
3     5209
1     5118
0     4934
20    4810
23    4706
22    4626
19    4538
5     4396
11    4305
24    3867
6     3854
Name: loc_clusters_movies, dtype: int64


In [22]:
input_movie = 467
input_movie = int(input_movie)
cluster_movies = refined_dataset.loc[refined_dataset['movieId'] == input_movie, 'loc_clusters_movies']
print(cluster_movies.value_counts())
cluster_movies = Counter(cluster_movies).most_common(1)[0] # 4, 6 times
print(cluster_movies)
cluster_movies[0]

17    2
11    1
15    1
23    1
16    1
24    1
7     1
21    1
13    1
22    1
19    1
Name: loc_clusters_movies, dtype: int64
(17, 2)


17

In [23]:
movie_reco_df = []
movies = refined_dataset.loc[refined_dataset['loc_clusters_movies'] == cluster_movies[0], 'movieId']
movies
list_movies_title = []
list_movie_id = []
for c in range(len(movies)):
    if movies.iloc[c] == int(input_movie):
        continue
    else:
        movie = refined_dataset.loc[refined_dataset['movieId'] == movies.iloc[c]]
        movie_data = movie.iloc[0]
        movie_title = movie_data.title
        movie_id = movie_data.movieId
        list_movie_id.append(movie_id)
        list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df.sample(n=5)    
        

Unnamed: 0,list_movies_title,list_movie_id
6628,"Kingdom, The (2007)",54736
1957,"Island, The (2005)",34319
4503,Monty Python's Life of Brian (1979),1080
6064,Jean de Florette (1986),1131
6899,Ronin (1998),2278


In [24]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,10 Things I Hate About You (1999),2572
1,13 Going on 30 (2004),7444
2,21 Grams (2003),6953
3,3-Iron (Bin-jip) (2004),30803
4,8 Women (2002),5613
...,...,...
7529,"Wedding Singer, The (1998)",1777
7530,Westworld (1973),2527
7531,What Ever Happened to Baby Jane? (1962),3546
7532,When Harry Met Sally... (1989),1307


In [25]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7534 entries, 0 to 7533
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  7534 non-null   object
 1   list_movie_id      7534 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 117.8+ KB


### Export the model 

In [26]:
with open("../models/kmeans_movie.pkl", "wb") as f:
    pickle.dump(kmeans_movie, f)

In [27]:
with open("../models/kmeans_user.pkl", "wb") as f:
    pickle.dump(kmeans_user, f)

### Export the final_db ti csv and pickle

In [28]:
final_db = refined_dataset
final_db.head()

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,16,11
1,1,28 Days Later (2002),6502,3.5,15,11
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,22,11
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,4,11
4,1,Alien (1979),1214,4.0,16,11


In [29]:
final_db.to_pickle("../data/final_db.pkl")

In [30]:
final_db.to_csv('../data/final_db.csv', sep=',', encoding='utf-8')