<a href="https://colab.research.google.com/github/rposhala/Recommender-System-on-MovieLens-dataset/blob/main/Item_based_Collaborative_Recommender_System_using_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### KMeans clusteting model to build item-based collaborative Recommender System.

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

## Loading refined_dataset

In [2]:
refined_dataset = pd.read_csv("../ml-20m/refined_dataset.csv")

#### Working test a small dataset

In [3]:
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   userId      int64  
 2   title       object 
 3   movieId     int64  
 4   rating      float64
dtypes: float64(1), int64(3), object(1)
memory usage: 762.9+ MB


In [4]:
refined_dataset = refined_dataset.loc[0:1000000,:]
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000001 entries, 0 to 1000000
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   Unnamed: 0  1000001 non-null  int64  
 1   userId      1000001 non-null  int64  
 2   title       1000001 non-null  object 
 3   movieId     1000001 non-null  int64  
 4   rating      1000001 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 38.1+ MB


In [5]:
refined_dataset = refined_dataset.drop('Unnamed: 0', axis='columns')
refined_dataset.head()

Unnamed: 0,userId,title,movieId,rating
0,1,2001: A Space Odyssey (1968),924,3.5
1,1,28 Days Later (2002),6502,3.5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0
4,1,Alien (1979),1214,4.0


### Recommender System by User
#### Inputs = userId
#### output = 5 movies

In [6]:
dataset_user = refined_dataset.loc[:, ['movieId', 'rating']] 
dataset_user

Unnamed: 0,movieId,rating
0,924,3.5
1,6502,3.5
2,3153,4.0
3,4467,4.0
4,1214,4.0
...,...,...
999996,455,2.0
999997,3510,4.5
999998,1271,2.5
999999,799,4.0


In [7]:
kmeans_user = KMeans(25)
kmeans_user.fit(dataset_user)

# Get the cluster labels
labels = kmeans_user.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_user, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_user, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_user, labels)
print("Calinski Harabasz score:", ch_score)


  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.4633252520883584
Calinski Harabasz score: 54328408.59010631


In [8]:
identified_users= kmeans_user.fit_predict(dataset_user)
identified_users = list(identified_users)

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
refined_dataset['loc_clusters_users'] = identified_users
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,22
1,1,28 Days Later (2002),6502,3.5,5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,18
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,11
4,1,Alien (1979),1214,4.0,22
...,...,...,...,...,...
999996,6743,Free Willy (1993),455,2.0,9
999997,6743,Frequency (2000),3510,4.5,11
999998,6743,Fried Green Tomatoes (1991),1271,2.5,22
999999,6743,"Frighteners, The (1996)",799,4.0,22


In [10]:
print(refined_dataset['loc_clusters_users'].value_counts())

9     230416
22    191582
0     123991
18    111349
11     89848
24     66162
5      51573
17     22747
2      13512
8      13127
4      10656
10      9361
21      9352
12      6751
6       6610
1       6421
14      6043
16      5312
13      5251
15      3944
23      3802
3       3698
20      3479
7       3329
19      1685
Name: loc_clusters_users, dtype: int64


In [11]:
input_user = 100
input_user = int(input_user)
cluster_users = refined_dataset.loc[refined_dataset['userId'] == input_user, 'loc_clusters_users']
print(cluster_users.value_counts())
cluster_users = Counter(cluster_users).most_common(1)[0] # 4, 6 times
print(cluster_users)
cluster_users[0]

9     46
22     6
Name: loc_clusters_users, dtype: int64
(9, 46)


9

#### Check the name of the movie from its ID

In [12]:
users = refined_dataset.loc[refined_dataset['loc_clusters_users'] == cluster_users[0], 'userId']
users_list = users.tolist()
list_movie_id = []
list_movies_title = []

for c in users_list:
        movie = refined_dataset.loc[refined_dataset['movieId'] == users.iloc[c]]
        if len(movie) != 0:
            movie_data = movie.iloc[0]
            movie_title = movie_data.title
            movie_id = movie_data.movieId
            list_movie_id.append(movie_id)
            list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df = movie_reco_df.drop_duplicates()
movie_reco_df.sample(n=5) 

Unnamed: 0,list_movies_title,list_movie_id
63896,Kids of the Round Table (1995),56
73830,Two Bits (1995),67
90987,Black Sheep (1996),88
118877,Anne Frank Remembered (1995),116
47277,"Cry, the Beloved Country (1995)",40


In [13]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,Toy Story (1995),1
539,Jumanji (1995),2
1093,Grumpier Old Men (1995),3
1877,Waiting to Exhale (1995),4
2589,Father of the Bride Part II (1995),5
...,...,...
214902,Nadja (1994),184
220410,"Net, The (1995)",185
221067,Nine Months (1995),186
223051,Party Girl (1995),187


In [14]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 223279
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  183 non-null    object
 1   list_movie_id      183 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.3+ KB


### Recommender System by Movie
#### Inputs = movieId
#### Output = 5 movies

In [15]:
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,22
1,1,28 Days Later (2002),6502,3.5,5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,18
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,11
4,1,Alien (1979),1214,4.0,22
...,...,...,...,...,...
999996,6743,Free Willy (1993),455,2.0,9
999997,6743,Frequency (2000),3510,4.5,11
999998,6743,Fried Green Tomatoes (1991),1271,2.5,22
999999,6743,"Frighteners, The (1996)",799,4.0,22


In [16]:
dataset_movie = refined_dataset.loc[:, ['userId', 'rating']] 

In [17]:
kmeans_movie = KMeans(25)
kmeans_movie.fit(dataset_user)

# Get the cluster labels
labels = kmeans_movie.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_movie, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_movie, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_movie, labels)
print("Calinski Harabasz score:", ch_score)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 4425.281721599264
Calinski Harabasz score: 6.518235525231497


In [18]:
identified_movies = kmeans_movie.fit_predict(dataset_movie)
identified_movies = list(identified_movies)

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
refined_dataset['loc_clusters_movies'] = identified_movies
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,22,3
1,1,28 Days Later (2002),6502,3.5,5,3
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,18,3
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,11,3
4,1,Alien (1979),1214,4.0,22,3
...,...,...,...,...,...,...
999996,6743,Free Willy (1993),455,2.0,9,0
999997,6743,Frequency (2000),3510,4.5,11,0
999998,6743,Fried Green Tomatoes (1991),1271,2.5,22,0
999999,6743,"Frighteners, The (1996)",799,4.0,22,0


In [20]:
print(refined_dataset['loc_clusters_movies'].value_counts())

17    52448
7     51816
15    48134
11    46570
2     45826
24    42415
14    41143
21    41098
5     39808
20    39753
16    39200
12    39124
9     38760
10    38300
13    38220
6     38197
23    37750
4     37474
8     36821
18    36783
1     36161
3     34283
19    33597
22    33300
0     33020
Name: loc_clusters_movies, dtype: int64


In [21]:
input_movie = 3467
input_movie = int(input_movie)
cluster_movies = refined_dataset.loc[refined_dataset['movieId'] == input_movie, 'loc_clusters_movies']
print(cluster_movies.value_counts())
cluster_movies = Counter(cluster_movies).most_common(1)[0] # 4, 6 times
print(cluster_movies)
cluster_movies[0]

21    8
20    6
18    6
0     5
6     4
23    4
15    4
5     3
16    3
2     3
17    3
10    3
1     3
13    3
22    2
19    2
11    2
24    2
8     2
3     2
4     1
7     1
12    1
9     1
14    1
Name: loc_clusters_movies, dtype: int64
(21, 8)


21

In [22]:
movie_reco_df = []
movies = refined_dataset.loc[refined_dataset['loc_clusters_movies'] == cluster_movies[0], 'movieId']
movies
list_movies_title = []
list_movie_id = []
for c in range(len(movies)):
    if movies.iloc[c] == int(input_movie):
        continue
    else:
        movie = refined_dataset.loc[refined_dataset['movieId'] == movies.iloc[c]]
        movie_data = movie.iloc[0]
        movie_title = movie_data.title
        movie_id = movie_data.movieId
        list_movie_id.append(movie_id)
        list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df.sample(n=5)    
        

Unnamed: 0,list_movies_title,list_movie_id
5240,Under Siege (1992),1385
26592,"Silence of the Lambs, The (1991)",593
25837,Love Don't Cost a Thing (2003),7148
39335,National Security (2003),6014
29218,Ghost Dog: The Way of the Samurai (1999),3328


In [23]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,Alien³ (a.k.a. Alien 3) (1992),1320
1,Boogie Nights (1997),1673
2,First Knight (1995),168
3,"Game, The (1997)",1625
4,"Green Mile, The (1999)",3147
...,...,...
41085,RoboCop 3 (1993),519
41086,Rocky II (1979),2409
41087,"Shawshank Redemption, The (1994)",318
41088,Sweet November (2001),4155


In [24]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41090 entries, 0 to 41089
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  41090 non-null  object
 1   list_movie_id      41090 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 642.2+ KB


### Export the model 

In [25]:
with open("../models/kmeans_movie.pkl", "wb") as f:
    pickle.dump(kmeans_movie, f)

In [26]:
with open("../models/kmeans_user.pkl", "wb") as f:
    pickle.dump(kmeans_user, f)

### Export the final_db ti csv and pickle

In [27]:
final_db = refined_dataset
final_db.head()

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,22,3
1,1,28 Days Later (2002),6502,3.5,5,3
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,18,3
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,11,3
4,1,Alien (1979),1214,4.0,22,3


In [28]:
final_db.to_pickle("final_db.pkl")

In [29]:
final_db.to_csv('final_db.csv', sep=',', encoding='utf-8')