# Objective: Make a movie recommendation system that suggests relevant movies according to a user's interest and previously rated movies.

Recommendation systems improve the quality of search results and provide elements that are more relevant to the search item or that are related to the search history of the user. Recommendation systems are widely used to recommend movies, items, restaurants, places to visit, items to buy, etc.


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sp
import seaborn as sns
import itertools

In [2]:
movies=pd.read_csv("movies.csv")
rating=pd.read_csv("ratings.csv")

In [3]:
print(movies.describe())
print(movies.shape)

             movieId
count   10329.000000
mean    31924.282893
std     37734.741149
min         1.000000
25%      3240.000000
50%      7088.000000
75%     59900.000000
max    149532.000000
(10329, 3)


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
rating.shape

(105339, 4)

In [6]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [7]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [8]:
rating['timestamp']=pd.to_datetime(rating['timestamp'])
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   userId     105339 non-null  int64         
 1   movieId    105339 non-null  int64         
 2   rating     105339 non-null  float64       
 3   timestamp  105339 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 3.2 MB


In [9]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1970-01-01 00:00:01.217897793
1,1,24,1.5,1970-01-01 00:00:01.217895807
2,1,32,4.0,1970-01-01 00:00:01.217896246
3,1,47,4.0,1970-01-01 00:00:01.217896556
4,1,50,4.0,1970-01-01 00:00:01.217896523


In [10]:
movie_rating= pd.merge(movies, rating, how='left', on='movieId')

In [11]:
movie_rating.shape

(105343, 6)

In [12]:
movie_rating.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,5.0,1970-01-01 00:00:00.859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,1970-01-01 00:00:01.303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,5.0,1970-01-01 00:00:00.858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.0,1970-01-01 00:00:00.850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14.0,4.0,1970-01-01 00:00:00.851766286


In [13]:
movie_rating.tail()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
105338,148238,A Very Murray Christmas (2015),Comedy,475.0,3.0,1970-01-01 00:00:01.451213043
105339,148626,The Big Short (2015),Drama,458.0,4.0,1970-01-01 00:00:01.452014749
105340,148626,The Big Short (2015),Drama,576.0,4.5,1970-01-01 00:00:01.451687664
105341,148626,The Big Short (2015),Drama,668.0,4.5,1970-01-01 00:00:01.451148148
105342,149532,Marco Polo: One Hundred Eyes (2015),(no genres listed),475.0,4.0,1970-01-01 00:00:01.451223429


for i in movie_ratings.genres:
    if 

In [14]:
movie_rating.describe()

Unnamed: 0,movieId,userId,rating
count,105343.0,105339.0,105339.0
mean,13382.696373,364.924539,3.51685
std,26172.698128,197.486905,1.044872
min,1.0,1.0,0.5
25%,1073.0,192.0,3.0
50%,2497.0,383.0,3.5
75%,5991.0,557.0,4.0
max,149532.0,668.0,5.0


In [15]:
movie_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105343 entries, 0 to 105342
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   movieId    105343 non-null  int64         
 1   title      105343 non-null  object        
 2   genres     105343 non-null  object        
 3   userId     105339 non-null  float64       
 4   rating     105339 non-null  float64       
 5   timestamp  105339 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 5.6+ MB


In [16]:
movie_rating.isnull().sum()

movieId      0
title        0
genres       0
userId       4
rating       4
timestamp    4
dtype: int64

In [17]:
movie_rating.dropna(inplace=True)

In [18]:

movie_rating.isnull().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [19]:
movie_rating.shape

(105339, 6)

In [20]:
movie_rating.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')

In [21]:
movie_rating["movieId"].value_counts()

296       325
356       311
318       308
480       294
593       290
         ... 
26726       1
26717       1
26712       1
26703       1
149532      1
Name: movieId, Length: 10325, dtype: int64

In [22]:
movie_rating["rating"].value_counts()

4.0    28880
3.0    21729
5.0    14856
3.5    12237
4.5     8187
2.0     7943
2.5     5484
1.0     3258
1.5     1567
0.5     1198
Name: rating, dtype: int64

In [23]:
movierating = movie_rating.rating

In [24]:
movierating.head()

0    5.0
1    4.0
2    5.0
3    4.0
4    4.0
Name: rating, dtype: float64

In [25]:
#sns.countplot(x="rating",hue="title", data=movie_rating, palette="Set1")

In [26]:
movie_rating.corr()["movieId"]

movieId    1.000000
userId     0.089403
rating    -0.025446
Name: movieId, dtype: float64

In [27]:
#new=movie_rating.genres.str.split("|",expand=True)

In [28]:
#new.value_counts()

new[1].unique()
new[2].unique()
new[3].unique()


In [29]:
#new=new.assign(Animation =" ", Children=" ", Romance=" ", Drama=" ", Crime=" ", Adventure=" ", Horror=" ", Comedy=" ", Sci_Fi=" ", War==" ", Thriller=" ",Mystery=" ", Film_Noir=" ",Musical=" ", Fantasy=" ", Documentary=" ",Western=" ", IMAX=" ",Children=" ")


new["Animation"] = np.nan
new["Children"] = np.nan
new["Romance"] = np.nan
new["Drama"] = np.nan
new["Crime"] = np.nan
new["Horror"] = np.nan
new["Comedy"] = np.nan
new["Sci_Fi"] = np.nan
new["War"] = np.nan
new["Mystery"] = np.nan
new["Film_Noir"] = np.nan
new["Musical"] = np.nan
new["Fantasy"] = np.nan
new["Documentary"] = np.nan
new["Thriller"] = np.nan
new["Western"] = np.nan
new["IMAX"] = np.nan
new["IMAX"] = np.nan
new["Children"] = np.nan

new

In [45]:
x=movie_rating.drop(['rating',"timestamp","title","genres"], axis = 1)
y=movie_rating["rating"]

In [46]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

In [47]:
sc=StandardScaler()

In [48]:
x_scaled=sc.fit_transform(x)

In [49]:
x_scaled=pd.DataFrame(x_scaled)
x_scaled

Unnamed: 0,0,1
0,-0.511278,-1.837723
1,-0.511278,-1.822532
2,-0.511278,-1.807341
3,-0.511278,-1.792150
4,-0.511278,-1.776959
...,...,...
105334,5.153037,0.557384
105335,5.167863,0.471302
105336,5.167863,1.068812
105337,5.167863,1.534668


In [50]:
from sklearn.model_selection import train_test_split

In [51]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=0)

In [52]:
from sklearn.neighbors import KNeighborsClassifier

In [57]:
model=KNeighborsClassifier(n_neighbors=3,p=2,metric="euclidean")
model

KNeighborsClassifier(metric='euclidean', n_neighbors=3)

In [59]:
from sklearn import preprocessing
from sklearn import utils

#convert y values to categorical values
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

#view transformed values
print(y_transformed)


[9 7 9 ... 8 8 7]


In [61]:
#fit logistic regression model
classifier = KNeighborsClassifier()
classifier.fit(x, y_transformed)


KNeighborsClassifier()

In [71]:
x_train

Unnamed: 0,movieId,userId
87413,26306,475.0
89575,34405,575.0
103615,103339,599.0
89891,37720,403.0
72001,4478,88.0
...,...,...
21243,765,232.0
45891,2021,432.0
42613,1792,177.0
43567,1918,203.0


In [72]:
y_train

87413     3.5
89575     4.0
103615    4.0
89891     3.5
72001     4.0
         ... 
21243     4.0
45891     5.0
42613     3.0
43567     5.0
68268     1.0
Name: rating, Length: 73737, dtype: float64

In [73]:
model.fit(x_train,y_transformed)

ValueError: Found input variables with inconsistent numbers of samples: [73737, 105339]

In [74]:
y_pred=model.predict(x_test)

AttributeError: 'KNeighborsClassifier' object has no attribute 'n_samples_fit_'

In [75]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

In [76]:
print(classification_report(y_test,y_pred))

NameError: name 'y_pred' is not defined

In [77]:
d_prediction=model.predict(x_test)
print("KNN =",accuracy_score(d_prediction,y_test))

AttributeError: 'KNeighborsClassifier' object has no attribute 'n_samples_fit_'