<a href="https://colab.research.google.com/github/rposhala/Recommender-System-on-MovieLens-dataset/blob/main/Item_based_Collaborative_Recommender_System_using_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### KMeans clusteting model to build item-based collaborative Recommender System.

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

## Loading refined_dataset

In [2]:
refined_dataset = pd.read_csv("../data/refined_dataset.csv")

#### Working test a small dataset

In [3]:
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144999 entries, 0 to 144998
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  144999 non-null  int64  
 1   userId      144999 non-null  int64  
 2   title       144999 non-null  object 
 3   movieId     144999 non-null  int64  
 4   rating      144999 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 5.5+ MB


In [4]:
refined_dataset = refined_dataset.loc[0:1000000,:]
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144999 entries, 0 to 144998
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  144999 non-null  int64  
 1   userId      144999 non-null  int64  
 2   title       144999 non-null  object 
 3   movieId     144999 non-null  int64  
 4   rating      144999 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 5.5+ MB


In [5]:
refined_dataset = refined_dataset.drop('Unnamed: 0', axis='columns')
refined_dataset.head()

Unnamed: 0,userId,title,movieId,rating
0,1,2001: A Space Odyssey (1968),924,3.5
1,1,28 Days Later (2002),6502,3.5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0
4,1,Alien (1979),1214,4.0


### Recommender System by User
#### Inputs = userId
#### output = 5 movies

In [6]:
dataset_user = refined_dataset.loc[:, ['movieId', 'rating']] 
dataset_user

Unnamed: 0,movieId,rating
0,924,3.5
1,6502,3.5
2,3153,4.0
3,4467,4.0
4,1214,4.0
...,...,...
144994,1934,3.5
144995,4563,3.0
144996,1378,3.0
144997,4816,3.5


In [7]:
kmeans_user = KMeans(15)
kmeans_user.fit(dataset_user)

# Get the cluster labels
labels = kmeans_user.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_user, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_user, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_user, labels)
print("Calinski Harabasz score:", ch_score)


  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4771538061456546
Calinski Harabasz score: 4790444.849948682


In [8]:
identified_users= kmeans_user.fit_predict(dataset_user)
identified_users = list(identified_users)

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
refined_dataset['loc_clusters_users'] = identified_users
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,7
1,1,28 Days Later (2002),6502,3.5,10
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,4
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,0
4,1,Alien (1979),1214,4.0,7
...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,4
144995,971,Young Einstein (1988),4563,3.0,0
144996,971,Young Guns (1988),1378,3.0,7
144997,971,Zoolander (2001),4816,3.5,0


In [10]:
print(refined_dataset['loc_clusters_users'].value_counts())

7     58093
4     36712
0     21537
10    12071
11     2355
5      2338
3      2081
1      1950
9      1929
6      1170
14     1148
13     1101
12      970
2       824
8       720
Name: loc_clusters_users, dtype: int64


In [11]:
input_user = 100
input_user = int(input_user)
cluster_users = refined_dataset.loc[refined_dataset['userId'] == input_user, 'loc_clusters_users']
print(cluster_users.value_counts())
cluster_users = Counter(cluster_users).most_common(1)[0] # 4, 6 times
print(cluster_users)
cluster_users[0]

7    52
Name: loc_clusters_users, dtype: int64
(7, 52)


7

#### Check the name of the movie from its ID

In [12]:
users = refined_dataset.loc[refined_dataset['loc_clusters_users'] == cluster_users[0], 'userId']
users_list = users.tolist()
list_movie_id = []
list_movies_title = []

for c in users_list:
        movie = refined_dataset.loc[refined_dataset['movieId'] == users.iloc[c]]
        if len(movie) != 0:
            movie_data = movie.iloc[0]
            movie_title = movie_data.title
            movie_id = movie_data.movieId
            list_movie_id.append(movie_id)
            list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df = movie_reco_df.drop_duplicates()
movie_reco_df.sample(n=5) 

Unnamed: 0,list_movies_title,list_movie_id
34152,Dracula: Dead and Loving It (1995),12
21980,Tom and Huck (1995),8
50244,Sense and Sensibility (1995),17
15349,Heat (1995),6
40379,Nixon (1995),14


In [13]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,Toy Story (1995),1
3343,Jumanji (1995),2
4364,Grumpier Old Men (1995),3
10692,Waiting to Exhale (1995),4
11750,Father of the Bride Part II (1995),5
15349,Heat (1995),6
16928,Sabrina (1995),7
21980,Tom and Huck (1995),8
26750,Sudden Death (1995),9
26933,GoldenEye (1995),10


In [14]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21 entries, 0 to 56196
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  21 non-null     object
 1   list_movie_id      21 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 504.0+ bytes


### Recommender System by Movie
#### Inputs = movieId
#### Output = 5 movies

In [15]:
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,7
1,1,28 Days Later (2002),6502,3.5,10
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,4
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,0
4,1,Alien (1979),1214,4.0,7
...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,4
144995,971,Young Einstein (1988),4563,3.0,0
144996,971,Young Guns (1988),1378,3.0,7
144997,971,Zoolander (2001),4816,3.5,0


In [16]:
dataset_movie = refined_dataset.loc[:, ['userId', 'rating']] 

In [17]:
kmeans_movie = KMeans(15)
kmeans_movie.fit(dataset_user)

# Get the cluster labels
labels = kmeans_movie.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_movie, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_movie, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_movie, labels)
print("Calinski Harabasz score:", ch_score)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 611.4101652502048
Calinski Harabasz score: 29.166257683486613


In [18]:
identified_movies = kmeans_movie.fit_predict(dataset_movie)
identified_movies = list(identified_movies)

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
refined_dataset['loc_clusters_movies'] = identified_movies
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,7,14
1,1,28 Days Later (2002),6502,3.5,10,14
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,4,14
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,0,14
4,1,Alien (1979),1214,4.0,7,14
...,...,...,...,...,...,...
144994,971,You Can't Take It with You (1938),1934,3.5,4,12
144995,971,Young Einstein (1988),4563,3.0,0,12
144996,971,Young Guns (1988),1378,3.0,7,12
144997,971,Zoolander (2001),4816,3.5,0,12


In [20]:
print(refined_dataset['loc_clusters_movies'].value_counts())

6     12327
7     11882
13    11835
0     11561
10    11394
4     10856
1      9695
11     9667
9      9446
8      8652
5      8599
14     7477
12     7312
2      7160
3      7136
Name: loc_clusters_movies, dtype: int64


In [21]:
input_movie = 467
input_movie = int(input_movie)
cluster_movies = refined_dataset.loc[refined_dataset['movieId'] == input_movie, 'loc_clusters_movies']
print(cluster_movies.value_counts())
cluster_movies = Counter(cluster_movies).most_common(1)[0] # 4, 6 times
print(cluster_movies)
cluster_movies[0]

0     2
8     2
14    1
3     1
11    1
9     1
7     1
6     1
13    1
12    1
Name: loc_clusters_movies, dtype: int64
(0, 2)


0

In [22]:
movie_reco_df = []
movies = refined_dataset.loc[refined_dataset['loc_clusters_movies'] == cluster_movies[0], 'movieId']
movies
list_movies_title = []
list_movie_id = []
for c in range(len(movies)):
    if movies.iloc[c] == int(input_movie):
        continue
    else:
        movie = refined_dataset.loc[refined_dataset['movieId'] == movies.iloc[c]]
        movie_data = movie.iloc[0]
        movie_title = movie_data.title
        movie_id = movie_data.movieId
        list_movie_id.append(movie_id)
        list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df.sample(n=5)    
        

Unnamed: 0,list_movies_title,list_movie_id
883,Citizen Kane (1941),923
6045,X-Men Origins: Wolverine (2009),68319
8212,Reality Bites (1994),372
188,Friday the 13th Part 3: 3D (1982),1976
9040,"Grapes of Wrath, The (1940)",3095


In [23]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,"'burbs, The (1989)",2072
1,101 Dalmatians (1996),1367
2,101 Dalmatians (One Hundred and One Dalmatians...,2085
3,12 Angry Men (1957),1203
4,"20,000 Leagues Under the Sea (1954)",1019
...,...,...
11554,"Thing, The (1982)",2288
11555,Waterworld (1995),208
11556,Willy Wonka & the Chocolate Factory (1971),1073
11557,"X-Files: Fight the Future, The (1998)",1909


In [24]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11559 entries, 0 to 11558
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  11559 non-null  object
 1   list_movie_id      11559 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 180.7+ KB


### Export the model 

In [25]:
with open("../models/kmeans_movie.pkl", "wb") as f:
    pickle.dump(kmeans_movie, f)

In [26]:
with open("../models/kmeans_user.pkl", "wb") as f:
    pickle.dump(kmeans_user, f)

### Export the final_db ti csv and pickle

In [27]:
final_db = refined_dataset
final_db.head()

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,7,14
1,1,28 Days Later (2002),6502,3.5,10,14
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,4,14
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,0,14
4,1,Alien (1979),1214,4.0,7,14


In [28]:
final_db.to_pickle("../data/final_db.pkl")

In [30]:
final_db.to_csv('../data/final_db.csv', sep=',', encoding='utf-8', index=False)