<a href="https://colab.research.google.com/github/rposhala/Recommender-System-on-MovieLens-dataset/blob/main/Item_based_Collaborative_Recommender_System_using_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### KMeans clusteting model to build item-based collaborative Recommender System.

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from collections import Counter

## Loading refined_dataset

In [2]:
refined_dataset = pd.read_csv("../ml-20m/refined_dataset.csv")

#### Working test a small dataset

In [3]:
refined_dataset = refined_dataset.loc[0:150000,:]
refined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  150001 non-null  int64  
 1   userId      150001 non-null  int64  
 2   title       150001 non-null  object 
 3   movieId     150001 non-null  int64  
 4   rating      150001 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 5.7+ MB


In [4]:
refined_dataset = refined_dataset.drop('Unnamed: 0', axis='columns')
refined_dataset.head()

Unnamed: 0,userId,title,movieId,rating
0,1,2001: A Space Odyssey (1968),924,3.5
1,1,28 Days Later (2002),6502,3.5
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0
4,1,Alien (1979),1214,4.0


### Recommender System by User
#### Inputs = userId
#### output = 5 movies

In [5]:
dataset_user = refined_dataset.loc[:, ['movieId', 'rating']] 
dataset_user

Unnamed: 0,movieId,rating
0,924,3.5
1,6502,3.5
2,3153,4.0
3,4467,4.0
4,1214,4.0
...,...,...
149996,3809,5.0
149997,4018,4.0
149998,3263,4.0
149999,5668,4.0


In [6]:
kmeans_user = KMeans(10)
kmeans_user.fit(dataset_user)
identified_users= kmeans_user.fit_predict(dataset_user)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [7]:
identified_users = list(identified_users)

In [8]:
refined_dataset['loc_clusters_users'] = identified_users
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users
0,1,2001: A Space Odyssey (1968),924,3.5,0
1,1,28 Days Later (2002),6502,3.5,7
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,4
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,4
4,1,Alien (1979),1214,4.0,0
...,...,...,...,...,...
149996,994,What About Bob? (1991),3809,5.0,4
149997,994,What Women Want (2000),4018,4.0,4
149998,994,White Men Can't Jump (1992),3263,4.0,4
149999,994,White Oleander (2002),5668,4.0,7


In [9]:
print(refined_dataset['loc_clusters_users'].value_counts())

0    67443
4    45481
7    19884
9     3714
2     3534
5     3399
1     2439
3     1600
6     1511
8      996
Name: loc_clusters_users, dtype: int64


In [18]:
input_user = input("Enter a userid: ")
input_user = int(input_user)
cluster_users = refined_dataset.loc[refined_dataset['userId'] == input_user, 'loc_clusters_users']
print(cluster_users.value_counts())
cluster_users = Counter(cluster_users).most_common(1)[0] # 4, 6 times
print(cluster_users)
cluster_users[0]

Enter a userid: 994
4    106
7    100
0     91
5     23
9     18
2     11
1      7
6      1
3      1
Name: loc_clusters_users, dtype: int64
(4, 106)


4

In [19]:
users = refined_dataset.loc[refined_dataset['loc_clusters_users'] == cluster_users[0], 'userId']
users
list_movies = []
for c in range(1000):
    if users.iloc[c] == input_user:
        continue
    else:
       movie1 = refined_dataset.loc[refined_dataset['movieId'] == users.iloc[c]]
       movie1 = movie1['title']
       movie1 = movie1.iloc[1]
       list_movies.append(movie1)
uniqueElements = list(set(list_movies))
print(uniqueElements[0:5])

['Four Rooms (1995)', 'American President, The (1995)', 'Casino (1995)', 'GoldenEye (1995)', 'Jumanji (1995)']


### Recommender System by Movie
#### Inputs = movieId
#### Output = 5 movies

In [20]:
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,0,7
1,1,28 Days Later (2002),6502,3.5,7,7
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,4,7
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,4,7
4,1,Alien (1979),1214,4.0,0,7
...,...,...,...,...,...,...
149996,994,What About Bob? (1991),3809,5.0,4,4
149997,994,What Women Want (2000),4018,4.0,4,4
149998,994,White Men Can't Jump (1992),3263,4.0,4,4
149999,994,White Oleander (2002),5668,4.0,7,4


In [21]:
dataset_movie = refined_dataset.loc[:, ['userId', 'rating']] 

In [22]:
kmeans_movie = KMeans(10)
kmeans_movie.fit(dataset_movie)
identified_movies = kmeans_movie.fit_predict(dataset_movie)
identified_movies = list(identified_movies)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [23]:
refined_dataset['loc_clusters_movies'] = identified_movies
refined_dataset

Unnamed: 0,userId,title,movieId,rating,loc_clusters_users,loc_clusters_movies
0,1,2001: A Space Odyssey (1968),924,3.5,0,8
1,1,28 Days Later (2002),6502,3.5,7,8
2,1,"7th Voyage of Sinbad, The (1958)",3153,4.0,4,8
3,1,"Adventures of Baron Munchausen, The (1988)",4467,4.0,4,8
4,1,Alien (1979),1214,4.0,0,8
...,...,...,...,...,...,...
149996,994,What About Bob? (1991),3809,5.0,4,9
149997,994,What Women Want (2000),4018,4.0,4,9
149998,994,White Men Can't Jump (1992),3263,4.0,4,9
149999,994,White Oleander (2002),5668,4.0,7,9


In [24]:
print(refined_dataset['loc_clusters_movies'].value_counts())

0    19043
5    17122
1    15893
3    15543
2    15530
6    15171
4    14970
7    14732
9    12787
8     9210
Name: loc_clusters_movies, dtype: int64


In [25]:
input_movie = input("Enter a movieID: ")
input_movie = int(input_movie)
cluster_movies = refined_dataset.loc[refined_dataset['movieId'] == input_movie, 'loc_clusters_movies']
print(cluster_movies.value_counts())
cluster_movies = Counter(cluster_movies).most_common(1)[0] # 4, 6 times
print(cluster_movies)
cluster_movies[0]

Enter a movieID: 1214
5    31
0    29
1    24
6    22
3    21
7    20
2    18
4    15
9    15
8    14
Name: loc_clusters_movies, dtype: int64
(5, 31)


5

In [26]:
movies = refined_dataset.loc[refined_dataset['loc_clusters_movies'] == cluster_movies[0], 'movieId']
movies
for c in range(5):
    if movies.iloc[c] == int(input_movie):
        continue
    else:
       movie1 = refined_dataset.loc[refined_dataset['movieId'] == movies.iloc[c]]
       movie1 = movie1['title']
       print(movie1.iloc[1])

Addams Family Values (1993)
Adventures of Priscilla, Queen of the Desert, The (1994)
Aladdin (1992)
Apollo 13 (1995)
Blade Runner (1982)


### Export the model 

In [30]:
with open("../models/kmeans_movie.pkl", "wb") as f:
    pickle.dump(kmeans_movie, f)

In [31]:
with open("../models/kmeans_user.pkl", "wb") as f:
    pickle.dump(kmeans_user, f)