<a href="https://colab.research.google.com/github/rposhala/Recommender-System-on-MovieLens-dataset/blob/main/Item_based_Collaborative_Recommender_System_using_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### KMeans clusteting model to build item-based collaborative Recommender System.

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

## Loading refined_dataset

In [2]:
refined_dataset = pd.read_csv("../data/refined_dataset.csv")

#### Working test a small dataset

In [3]:
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324215 entries, 0 to 324214
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  324215 non-null  int64  
 1   userId      324215 non-null  int64  
 2   title       324215 non-null  object 
 3   movieId     324215 non-null  int64  
 4   rating      324215 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 12.4+ MB


In [4]:
refined_dataset = refined_dataset.loc[0:1000000,:]
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324215 entries, 0 to 324214
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  324215 non-null  int64  
 1   userId      324215 non-null  int64  
 2   title       324215 non-null  object 
 3   movieId     324215 non-null  int64  
 4   rating      324215 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 12.4+ MB


In [5]:
refined_dataset = refined_dataset.drop('Unnamed: 0', axis='columns')
refined_dataset.head()

Unnamed: 0,userId,title,movieId,rating
0,1,2001: A Space Odyssey (1968),924,3.5
1,1,28 Days Later (2002),6502,3.5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0
4,1,Alien (1979),1214,4.0


### Recommender System by User
#### Inputs = userId
#### output = 5 movies

In [6]:
dataset_user = refined_dataset.loc[:, ['movieId', 'rating']] 
dataset_user

Unnamed: 0,movieId,rating
0,924,3.5
1,6502,3.5
2,3153,4.0
3,4467,4.0
4,1214,4.0
...,...,...
324210,1,4.0
324211,380,4.0
324212,32,1.5
324213,50,3.5


In [7]:
kmeans_user = KMeans(25)
kmeans_user.fit(dataset_user)

# Get the cluster labels
labels = kmeans_user.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_user, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_user, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_user, labels)
print("Calinski Harabasz score:", ch_score)


  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 0.46293120750166367
Calinski Harabasz score: 17480716.687836777


In [8]:
identified_users= kmeans_user.fit_predict(dataset_user)
identified_users = list(identified_users)

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
refined_dataset['loc_clusters_users'] = identified_users
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,13
1,1,28 Days Later (2002),6502,3.5,20
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,21
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,12
4,1,Alien (1979),1214,4.0,13
...,...,...,...,...,...
324210,2215,Toy Story (1995),1,4.0,9
324211,2215,True Lies (1994),380,4.0,9
324212,2215,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),32,1.5,9
324213,2215,"Usual Suspects, The (1995)",50,3.5,9


In [10]:
print(refined_dataset['loc_clusters_users'].value_counts())

9     78127
13    62916
0     41041
21    35746
12    27293
7     20857
20    16295
10     7315
3      4337
16     3700
1      3259
6      2946
23     2860
14     2104
5      2103
2      1996
17     1878
11     1701
19     1574
22     1251
18     1144
4      1139
24     1081
15     1024
8       528
Name: loc_clusters_users, dtype: int64


In [11]:
input_user = 100
input_user = int(input_user)
cluster_users = refined_dataset.loc[refined_dataset['userId'] == input_user, 'loc_clusters_users']
print(cluster_users.value_counts())
cluster_users = Counter(cluster_users).most_common(1)[0] # 4, 6 times
print(cluster_users)
cluster_users[0]

9     48
13     4
Name: loc_clusters_users, dtype: int64
(9, 48)


9

#### Check the name of the movie from its ID

In [12]:
users = refined_dataset.loc[refined_dataset['loc_clusters_users'] == cluster_users[0], 'userId']
users_list = users.tolist()
list_movie_id = []
list_movies_title = []

for c in users_list:
        movie = refined_dataset.loc[refined_dataset['movieId'] == users.iloc[c]]
        if len(movie) != 0:
            movie_data = movie.iloc[0]
            movie_title = movie_data.title
            movie_id = movie_data.movieId
            list_movie_id.append(movie_id)
            list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df = movie_reco_df.drop_duplicates()
movie_reco_df.sample(n=5) 

Unnamed: 0,list_movies_title,list_movie_id
52214,How to Make an American Quilt (1995),46
29834,Othello (1995),26
42686,Wings of Courage (1995),33
75173,Eye for an Eye (1996),61
51443,Dead Presidents (1995),42


In [13]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,Toy Story (1995),1
558,Jumanji (1995),2
1125,Grumpier Old Men (1995),3
1986,Waiting to Exhale (1995),4
2722,Father of the Bride Part II (1995),5
...,...,...
75173,Eye for an Eye (1996),61
76529,Mr. Holland's Opus (1995),62
76734,Don't Be a Menace to South Central While Drink...,63
76852,Two if by Sea (1996),64


In [14]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64 entries, 0 to 77987
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  64 non-null     object
 1   list_movie_id      64 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ KB


### Recommender System by Movie
#### Inputs = movieId
#### Output = 5 movies

In [15]:
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,13
1,1,28 Days Later (2002),6502,3.5,20
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,21
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,12
4,1,Alien (1979),1214,4.0,13
...,...,...,...,...,...
324210,2215,Toy Story (1995),1,4.0,9
324211,2215,True Lies (1994),380,4.0,9
324212,2215,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),32,1.5,9
324213,2215,"Usual Suspects, The (1995)",50,3.5,9


In [16]:
dataset_movie = refined_dataset.loc[:, ['userId', 'rating']] 

In [17]:
kmeans_movie = KMeans(25)
kmeans_movie.fit(dataset_user)

# Get the cluster labels
labels = kmeans_movie.labels_

# Calculate Davies-Bouldin Index
db_score = davies_bouldin_score(dataset_movie, labels)
print("Davies-Bouldin score:", db_score)

# Calculate silhouette score
#si_score = silhouette_score(dataset_movie, labels)
#print("Silhouette score:", si_score)

ch_score = calinski_harabasz_score(dataset_movie, labels)
print("Calinski Harabasz score:", ch_score)

  super()._check_params_vs_input(X, default_n_init=10)


Davies-Bouldin score: 757.89880899625
Calinski Harabasz score: 10.28843440219295


In [18]:
identified_movies = kmeans_movie.fit_predict(dataset_movie)
identified_movies = list(identified_movies)

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
refined_dataset['loc_clusters_movies'] = identified_movies
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,13,14
1,1,28 Days Later (2002),6502,3.5,20,14
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,21,14
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,12,14
4,1,Alien (1979),1214,4.0,13,14
...,...,...,...,...,...,...
324210,2215,Toy Story (1995),1,4.0,9,9
324211,2215,True Lies (1994),380,4.0,9,9
324212,2215,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),32,1.5,9,9
324213,2215,"Usual Suspects, The (1995)",50,3.5,9,9


In [20]:
print(refined_dataset['loc_clusters_movies'].value_counts())

8     16341
0     15848
6     15530
17    15195
2     14702
24    14649
23    14585
19    14386
21    14185
4     13546
11    13395
3     13324
5     13319
10    13196
9     12810
20    12262
22    12022
15    11661
12    11599
16    11546
7     10608
18    10550
1     10218
13     9528
14     9210
Name: loc_clusters_movies, dtype: int64


In [21]:
input_movie = 3467
input_movie = int(input_movie)
cluster_movies = refined_dataset.loc[refined_dataset['movieId'] == input_movie, 'loc_clusters_movies']
print(cluster_movies.value_counts())
cluster_movies = Counter(cluster_movies).most_common(1)[0] # 4, 6 times
print(cluster_movies)
cluster_movies[0]

0     4
8     4
2     3
16    3
3     2
4     2
24    2
23    2
13    1
10    1
1     1
14    1
7     1
12    1
17    1
11    1
20    1
21    1
Name: loc_clusters_movies, dtype: int64
(0, 4)


0

In [22]:
movie_reco_df = []
movies = refined_dataset.loc[refined_dataset['loc_clusters_movies'] == cluster_movies[0], 'movieId']
movies
list_movies_title = []
list_movie_id = []
for c in range(len(movies)):
    if movies.iloc[c] == int(input_movie):
        continue
    else:
        movie = refined_dataset.loc[refined_dataset['movieId'] == movies.iloc[c]]
        movie_data = movie.iloc[0]
        movie_title = movie_data.title
        movie_id = movie_data.movieId
        list_movie_id.append(movie_id)
        list_movies_title.append(movie_title)
        #print('Recommended movie title:', movie_reco.iloc[0])
        #print('Recommended movie id:', movie_id.iloc[0])


movie_reco_df = pd.DataFrame(list(zip(list_movies_title, list_movie_id)), columns=['list_movies_title', 'list_movie_id'])
movie_reco_df.sample(n=5)    
        

Unnamed: 0,list_movies_title,list_movie_id
2309,Sherlock Holmes: A Game of Shadows (2011),91542
6137,Out of Sight (1998),1912
11715,Star Trek VI: The Undiscovered Country (1991),1372
8484,"Thin Blue Line, The (1988)",1189
9382,Signs (2002),5502


In [23]:
movie_reco_df

Unnamed: 0,list_movies_title,list_movie_id
0,2001: A Space Odyssey (1968),924
1,28 Days Later (2002),6502
2,A.I. Artificial Intelligence (2001),4370
3,Aladdin (1992),588
4,Alien (1979),1214
...,...,...
15839,Sleeper (1973),1077
15840,Spy Hard (1996),743
15841,Three Colors: Blue (Trois couleurs: Bleu) (1993),307
15842,Three Colors: Red (Trois couleurs: Rouge) (1994),306


In [24]:
movie_reco_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15844 entries, 0 to 15843
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   list_movies_title  15844 non-null  object
 1   list_movie_id      15844 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 247.7+ KB


### Export the model 

In [25]:
with open("../models/kmeans_movie.pkl", "wb") as f:
    pickle.dump(kmeans_movie, f)

In [26]:
with open("../models/kmeans_user.pkl", "wb") as f:
    pickle.dump(kmeans_user, f)

### Export the final_db ti csv and pickle

In [27]:
final_db = refined_dataset
final_db.head()

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,13,14
1,1,28 Days Later (2002),6502,3.5,20,14
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,21,14
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,12,14
4,1,Alien (1979),1214,4.0,13,14


In [28]:
final_db.to_pickle("../data/final_db.pkl")

In [29]:
final_db.to_csv('../data/final_db.csv', sep=',', encoding='utf-8')