<a href="https://colab.research.google.com/github/rposhala/Recommender-System-on-MovieLens-dataset/blob/main/Item_based_Collaborative_Recommender_System_using_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### KMeans clusteting model to build item-based collaborative Recommender System.

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn import datasets, cluster
from sklearn.cluster import AgglomerativeClustering

## Loading refined_dataset

In [2]:
refined_dataset = pd.read_csv("../data/refined_dataset.csv")

#### Working test a small dataset

In [3]:
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144999 entries, 0 to 144998
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  144999 non-null  int64  
 1   userId      144999 non-null  int64  
 2   title       144999 non-null  object 
 3   movieId     144999 non-null  int64  
 4   rating      144999 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 5.5+ MB


In [4]:
refined_dataset = refined_dataset.loc[0:1000000,:]
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144999 entries, 0 to 144998
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  144999 non-null  int64  
 1   userId      144999 non-null  int64  
 2   title       144999 non-null  object 
 3   movieId     144999 non-null  int64  
 4   rating      144999 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 5.5+ MB


In [5]:
refined_dataset = refined_dataset.drop('Unnamed: 0', axis='columns')
refined_dataset.head()

Unnamed: 0,userId,title,movieId,rating
0,1,2001: A Space Odyssey (1968),924,3.5
1,1,28 Days Later (2002),6502,3.5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0
4,1,Alien (1979),1214,4.0


### Recommender System by User
#### Inputs = userId
#### output = 5 movies

In [6]:
dataset_user = refined_dataset.loc[:, ['movieId', 'rating']] 
dataset_user

Unnamed: 0,movieId,rating
0,924,3.5
1,6502,3.5
2,3153,4.0
3,4467,4.0
4,1214,4.0
...,...,...
144994,1934,3.5
144995,4563,3.0
144996,1378,3.0
144997,4816,3.5


### Kmeans 15

In [7]:
kmeans_user_15 = KMeans(15)
kmeans_user_15.fit(dataset_user)

# Get the cluster labels
labels_15 = kmeans_user_15.labels_

# Calculate Davies-Bouldin Index
db_score_user_15 = davies_bouldin_score(dataset_user, labels_15)
print("Davies-Bouldin score:", db_score_user_15)


ch_score_user_15 = calinski_harabasz_score(dataset_user, labels_15)
print("Calinski Harabasz score:", ch_score_user_15)


  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.49344049470372586
Calinski Harabasz score: 4683158.228648474


### Kmeans 25

In [8]:
kmeans_user_25 = KMeans(25)
kmeans_user_25.fit(dataset_user)

# Get the cluster labels
labels_25 = kmeans_user_25.labels_

# Calculate Davies-Bouldin Index
db_score_user_25 = davies_bouldin_score(dataset_user, labels_25)
print("Davies-Bouldin score:", db_score_user_25)


ch_score_user_25 = calinski_harabasz_score(dataset_user, labels_25)
print("Calinski Harabasz score:", ch_score_user_25)


  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4620841911804072
Calinski Harabasz score: 8115978.014425691


### Kmeans 35

In [9]:
kmeans_user_35 = KMeans(35)
kmeans_user_35.fit(dataset_user)

# Get the cluster labels
labels_35 = kmeans_user_35.labels_

# Calculate Davies-Bouldin Index
db_score_user_35 = davies_bouldin_score(dataset_user, labels_35)
print("Davies-Bouldin score:", db_score_user_35)


ch_score_user_35 = calinski_harabasz_score(dataset_user, labels_35)
print("Calinski Harabasz score:", ch_score_user_35)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4739652898653966
Calinski Harabasz score: 11674312.353188436


### Kmeans 45

In [10]:
kmeans_user_45 = KMeans(45)
kmeans_user_45.fit(dataset_user)

# Get the cluster labels
labels_45 = kmeans_user_45.labels_

# Calculate Davies-Bouldin Index
db_score_user_45 = davies_bouldin_score(dataset_user, labels_45)
print("Davies-Bouldin score:", db_score_user_45)


ch_score_user_45 = calinski_harabasz_score(dataset_user, labels_45)
print("Calinski Harabasz score:", ch_score_user_45)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4752483436826433
Calinski Harabasz score: 14895074.798272185


### Comparing Davies-Bouldin

In [11]:
db_score_user = [['db_score_user_15', db_score_user_15], ['db_score_user_25', db_score_user_25], ['db_score_user_35' , db_score_user_35], ['db_score_user_45', db_score_user_45]]
 
# Create the pandas DataFrame
DB_score_user = pd.DataFrame(db_score_user, columns=['num_cluster_user', 'Davies-Bouldin_score'])
DB_score_user.head()

Unnamed: 0,num_cluster_user,Davies-Bouldin_score
0,db_score_user_15,0.49344
1,db_score_user_25,0.462084
2,db_score_user_35,0.473965
3,db_score_user_45,0.475248


In [13]:
best_DB_score_user = DB_score_user.loc[DB_score_user['Davies-Bouldin_score'].idxmin()]
num_clus_user = int(best_DB_score_user.num_cluster_user[14:16])
num_clus_user


25

In [15]:
kmeans_user = KMeans(num_clus_user)

In [16]:
identified_users= kmeans_user.fit_predict(dataset_user)
identified_users = list(identified_users)

  super()._check_params_vs_input(X, default_n_init=10)


In [17]:
refined_dataset['loc_clusters_users'] = identified_users
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,8
1,1,28 Days Later (2002),6502,3.5,4
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,23
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,18
4,1,Alien (1979),1214,4.0,8
...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,1
144995,971,Young Einstein (1988),4563,3.0,12
144996,971,Young Guns (1988),1378,3.0,8
144997,971,Zoolander (2001),4816,3.5,12


In [18]:
print(refined_dataset['loc_clusters_users'].value_counts())

14    33159
8     25670
1     18478
23    17230
18    13655
12     8866
4      8171
17     3184
11     1926
6      1420
9      1321
19     1298
21     1179
2      1053
22     1039
15     1013
7       970
20      943
10      891
0       814
5       754
24      593
16      570
3       500
13      302
Name: loc_clusters_users, dtype: int64


In [19]:
input_user = 100
input_user = int(input_user)
cluster_users = refined_dataset.loc[refined_dataset['userId'] == input_user, 'loc_clusters_users']
print(cluster_users.value_counts())
cluster_users = Counter(cluster_users).most_common(1)[0] # 4, 6 times
print(cluster_users)
cluster_users[0]

14    46
8      6
Name: loc_clusters_users, dtype: int64
(14, 46)


14

#### Check the name of the movie from its ID

In [20]:
users = refined_dataset.loc[refined_dataset['loc_clusters_users'] == cluster_users[0], 'userId']
users_list = users.tolist()
list_movie_id = []
list_movies_title = []

for c in users_list:
        movie = refined_dataset.loc[refined_dataset['movieId'] == users.iloc[c]]
        if len(movie) != 0:
            movie_data = movie.iloc[0]
            movie_title = movie_data.title
            movie_id = movie_data.movieId
            list_movie_id.append(movie_id)
            list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df = movie_reco_df.drop_duplicates()
movie_reco_df.sample(n=5) 

Unnamed: 0,list_movies_title,list_movie_id
4401,Heat (1995),6
19063,Money Train (1995),20
17925,Ace Ventura: When Nature Calls (1995),19
17190,Casino (1995),16
6726,Tom and Huck (1995),8


In [21]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,Toy Story (1995),1
537,Jumanji (1995),2
1089,Grumpier Old Men (1995),3
1870,Waiting to Exhale (1995),4
2581,Father of the Bride Part II (1995),5
4401,Heat (1995),6
5084,Sabrina (1995),7
6726,Tom and Huck (1995),8
8738,Sudden Death (1995),9
8788,GoldenEye (1995),10


In [22]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29 entries, 0 to 31104
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  29 non-null     object
 1   list_movie_id      29 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 696.0+ bytes


### Recommender System by Movie
#### Inputs = movieId
#### Output = 5 movies

In [23]:
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,8
1,1,28 Days Later (2002),6502,3.5,4
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,23
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,18
4,1,Alien (1979),1214,4.0,8
...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,1
144995,971,Young Einstein (1988),4563,3.0,12
144996,971,Young Guns (1988),1378,3.0,8
144997,971,Zoolander (2001),4816,3.5,12


In [24]:
dataset_movie = refined_dataset.loc[:, ['userId', 'rating']] 

### Kmeans 15

In [25]:
kmeans_movie_15 = KMeans(15)
kmeans_movie_15.fit(dataset_movie)

# Get the cluster labels
labels_15 = kmeans_movie_15.labels_

# Calculate Davies-Bouldin Index
db_score_movie_15 = davies_bouldin_score(dataset_movie, labels_15)
print("Davies-Bouldin score:", db_score_movie_15)

ch_score_movie_15 = calinski_harabasz_score(dataset_movie, labels_15)
print("Calinski Harabasz score:", ch_score_movie_15)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.49038151276109093
Calinski Harabasz score: 2430950.6569256373


### Kmeans 25

In [26]:
kmeans_movie_25 = KMeans(25)
kmeans_movie_25.fit(dataset_movie)

# Get the cluster labels
labels_25 = kmeans_movie_25.labels_

# Calculate Davies-Bouldin Index
db_score_movie_25 = davies_bouldin_score(dataset_movie, labels_25)
print("Davies-Bouldin score:", db_score_movie_25)

ch_score_movie_25 = calinski_harabasz_score(dataset_movie, labels_25)
print("Calinski Harabasz score:", ch_score_movie_25)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4515360341522074
Calinski Harabasz score: 4416161.444776686


### Kmeans 35

In [27]:
kmeans_movie_35 = KMeans(35)
kmeans_movie_35.fit(dataset_movie)

# Get the cluster labels
labels_35 = kmeans_movie_35.labels_

# Calculate Davies-Bouldin Index
db_score_movie_35 = davies_bouldin_score(dataset_movie, labels_35)
print("Davies-Bouldin score:", db_score_movie_35)

ch_score_movie_35 = calinski_harabasz_score(dataset_movie, labels_35)
print("Calinski Harabasz score:", ch_score_movie_35)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4812208442552928
Calinski Harabasz score: 6064560.954918882


### Kmeans 45

In [28]:
kmeans_movie_45 = KMeans(45)
kmeans_movie_45.fit(dataset_movie)

# Get the cluster labels
labels_45 = kmeans_movie_45.labels_

# Calculate Davies-Bouldin Index
db_score_movie_45 = davies_bouldin_score(dataset_movie, labels_45)
print("Davies-Bouldin score:", db_score_movie_45)

ch_score_movie_45 = calinski_harabasz_score(dataset_movie, labels_45)
print("Calinski Harabasz score:", ch_score_movie_45)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.48911157211429823
Calinski Harabasz score: 7760412.83325567


In [29]:
db_score_movie = [['db_score_movie_15', db_score_movie_15], ['db_score_movie_25', db_score_movie_25], ['db_score_movie_35' , db_score_movie_35], ['db_score_movie_45', db_score_movie_45]]
 
# Create the pandas DataFrame
DB_score_movie = pd.DataFrame(db_score_movie, columns=['num_cluster_movie', 'Davies-Bouldin_score'])
DB_score_movie.head()

Unnamed: 0,num_cluster_movie,Davies-Bouldin_score
0,db_score_movie_15,0.490382
1,db_score_movie_25,0.451536
2,db_score_movie_35,0.481221
3,db_score_movie_45,0.489112


In [30]:
best_DB_score_movie = DB_score_movie.loc[DB_score_movie['Davies-Bouldin_score'].idxmin()]
num_clus_movie = int(best_DB_score_movie.num_cluster_movie[15:17])
num_clus_movie

25

In [31]:
kmeans_movie = KMeans(num_clus_movie)

In [32]:
identified_movies = kmeans_movie.fit_predict(dataset_movie)
identified_movies = list(identified_movies)

  super()._check_params_vs_input(X, default_n_init=10)


In [33]:
refined_dataset['loc_clusters_movies'] = identified_movies
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,8,23
1,1,28 Days Later (2002),6502,3.5,4,23
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,23,23
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,18,23
4,1,Alien (1979),1214,4.0,8,23
...,...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,1,22
144995,971,Young Einstein (1988),4563,3.0,12,22
144996,971,Young Guns (1988),1378,3.0,8,22
144997,971,Zoolander (2001),4816,3.5,12,22


In [34]:
print(refined_dataset['loc_clusters_movies'].value_counts())

8     8719
18    8295
13    7507
11    7200
10    7185
20    7048
0     6980
14    6602
24    6367
16    6272
2     6169
6     6117
15    6006
5     5648
12    5513
1     5057
19    4934
7     4608
22    4513
4     4419
23    4305
17    4291
9     3819
21    3778
3     3647
Name: loc_clusters_movies, dtype: int64


In [35]:
input_movie = 467
input_movie = int(input_movie)
cluster_movies = refined_dataset.loc[refined_dataset['movieId'] == input_movie, 'loc_clusters_movies']
print(cluster_movies.value_counts())
cluster_movies = Counter(cluster_movies).most_common(1)[0] # 4, 6 times
print(cluster_movies)
cluster_movies[0]

13    2
23    1
16    1
8     1
20    1
9     1
5     1
21    1
0     1
7     1
22    1
Name: loc_clusters_movies, dtype: int64
(13, 2)


13

In [36]:
movie_reco_df = []
movies = refined_dataset.loc[refined_dataset['loc_clusters_movies'] == cluster_movies[0], 'movieId']
movies
list_movies_title = []
list_movie_id = []
for c in range(len(movies)):
    if movies.iloc[c] == int(input_movie):
        continue
    else:
        movie = refined_dataset.loc[refined_dataset['movieId'] == movies.iloc[c]]
        movie_data = movie.iloc[0]
        movie_title = movie_data.title
        movie_id = movie_data.movieId
        list_movie_id.append(movie_id)
        list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df.sample(n=5)    
        

Unnamed: 0,list_movies_title,list_movie_id
4926,"Stepford Wives, The (1975)",2346
3134,"Celluloid Closet, The (1995)",581
6185,Fly Away Home (1996),986
2815,Angel Heart (1987),3706
7351,Gladiator (1992),8132


In [37]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,Amadeus (1984),1225
1,Apocalypse Now (1979),1208
2,"Blues Brothers, The (1980)",1220
3,"Client, The (1994)",350
4,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",3996
...,...,...
7500,"Wizard of Oz, The (1939)",919
7501,"Wrestler, The (2008)",64839
7502,X-Men (2000),3793
7503,X-Men: First Class (2011),87232


In [38]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7505 entries, 0 to 7504
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  7505 non-null   object
 1   list_movie_id      7505 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 117.4+ KB


### Export the model 

In [40]:
with open("../models/kmeans_movie.pkl", "wb") as f:
    pickle.dump(kmeans_movie, f)

In [41]:
with open("../models/kmeans_user.pkl", "wb") as f:
    pickle.dump(kmeans_user, f)

### Export the final_db ti csv and pickle

In [42]:
final_db = refined_dataset
final_db.head()

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,8,23
1,1,28 Days Later (2002),6502,3.5,4,23
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,23,23
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,18,23
4,1,Alien (1979),1214,4.0,8,23


In [43]:
final_db.to_pickle("../data/final_db.pkl")

In [44]:
final_db.to_csv('../data/final_db.csv', sep=',', encoding='utf-8', index=False)