# Build a Movie Recommendation System

## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Read the required files

In [2]:
movies=pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv')
tags=pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


## Preprocessing the data

### Sorting tags using movieId

In [6]:
tags = tags.sort_values('movieId')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
358938,110148,1,friendship,1422605756
42994,10616,1,animation,1277357417
42993,10616,1,animated,1277357419
42992,10616,1,adventure,1277357423
134542,35984,1,animation,1152405430


### Joining all tag values for each movie to create metadata

In [7]:
# Group tags by 'movieId' and aggregate tags as a space-separated string
tags_agg = tags.groupby('movieId')['tag'].agg(lambda x: ' '.join(x.astype(str))).reset_index()

# Merge the aggregated tags back to the 'movies' DataFrame
movies = movies.merge(tags_agg, how='left', left_on='movieId', right_on='movieId')

# Rename the 'tag' column in the merged DataFrame
movies.rename(columns={'tag': 'tags'}, inplace=True)

# Fill missing values in 'tags' column with an empty string
movies['tags'].fillna('', inplace=True)

# Display the movies DataFrame (metadata)
movies.head(10)

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,friendship animation animated adventure animat...
1,2,Jumanji (1995),Adventure|Children|Fantasy,time travel Robin Williams game board game Kir...
2,3,Grumpier Old Men (1995),Comedy|Romance,comedinha de velhinhos engraÃƒÂ§ada Jack Lemmo...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,characters CLV revenge chick flick chick flick...
4,5,Father of the Bride Part II (1995),Comedy,CLV wedding watched under duress Comedy it tho...
5,6,Heat (1995),Action|Crime|Thriller,philosophy realistic action tense suspense gre...
6,7,Sabrina (1995),Comedy|Romance,romance Nancy Marchand Harrison Ford great cas...
7,8,Tom and Huck (1995),Adventure|Children,adapted from:book based on a book author:Mark ...
8,9,Sudden Death (1995),Action,Peter Hyams Jean-Claude Van Damme Jean-Claude ...
9,10,GoldenEye (1995),Action|Adventure|Thriller,espionage funny good dialogue one-liners James...


### Creating ratings pivot table

In [8]:
# Considering only some part of data to avoid crashing issues
from sklearn.model_selection import train_test_split
train,test = train_test_split(ratings,test_size = 0.99, random_state = 42)
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
10266942,71011,1721,1.0,1161153938
6459144,44370,2716,3.0,998116576
7925187,54592,1653,3.0,961800904
13478872,93133,924,5.0,942435045
18858253,130586,1544,4.0,1043144736


In [9]:
# Extracting important columns
ratings_data = pd.pivot_table(train, index = 'movieId', columns = 'userId', values = 'rating', fill_value = 0)
ratings_data.head()

userId,1,2,3,5,7,8,9,10,11,14,...,138473,138474,138475,138477,138483,138484,138486,138487,138489,138493
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0.0,0.0
2,0.0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0.0,0.0
3,0.0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0.0,0.0
4,0.0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0.0,0.0
5,0.0,0,0,0,0,0,0,0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0.0,0.0


## Creating matrix for content filter method on the above metadata

In [10]:
# Build a Tfidf Vectorizer model
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['tags'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),index = movies.index.tolist())
tfidf_df.shape

(27278, 24111)

In [11]:
# Build a TruncatedSVD model
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components = 10)
latent_matrix = svd.fit_transform(tfidf_df)
latent_matrix1_df = pd.DataFrame(latent_matrix[:,0:10],index = movies['title'].tolist())
latent_matrix1_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Toy Story (1995),0.010493,0.008531,0.049432,-0.034828,0.019146,0.001657,0.054915,0.006238,0.064435,-0.103410
Jumanji (1995),0.008409,0.012993,0.070620,-0.050528,0.025463,0.003318,0.121662,-0.013381,0.016850,-0.015506
Grumpier Old Men (1995),0.004847,0.007098,0.025620,-0.013141,-0.000895,-0.002341,0.020708,0.006664,0.085621,-0.008607
Waiting to Exhale (1995),0.006970,0.009196,0.033035,-0.004887,-0.000399,-0.000772,0.029941,0.009429,0.178809,0.077264
Father of the Bride Part II (1995),0.006596,0.012793,0.042707,-0.020698,0.005849,0.000488,0.046059,0.010116,0.116127,-0.041424
...,...,...,...,...,...,...,...,...,...,...
Kein Bund für's Leben (2007),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
"Feuer, Eis & Dosenbier (2002)",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
The Pirates (2014),0.000625,0.000279,0.002372,-0.001612,0.000827,0.000251,0.001715,-0.000235,0.001513,-0.001772
Rentun Ruusu (2001),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Creating matrix for collaborative filter method on the above metadata

In [12]:
svd = TruncatedSVD(n_components = 15)
latent_matrix2 = svd.fit_transform(ratings_data)
latent_matrix2_df = pd.DataFrame(latent_matrix2, index = [movies.loc[(movies.movieId == i),'title'].values[0] for i in (train['movieId'].unique())])
latent_matrix2_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Titanic (1997),4.076304,3.796520,3.359235,2.094298,10.222334,0.816778,-0.967441,-5.192608,0.730604,-2.962878,9.989082,-22.576094,-15.080040,23.924699,38.270802
Ghostbusters (a.k.a. Ghost Busters) (1984),0.624433,0.505868,0.460656,0.612010,1.073580,-0.465257,-0.455791,-0.283094,-0.184912,0.309081,0.469686,-0.530059,-0.125805,0.239047,0.787473
Gattaca (1997),0.159688,0.127021,0.099917,0.109227,0.559596,-0.440823,0.444526,-0.001097,-0.194770,-0.115662,0.083887,0.010704,-0.094088,0.103693,0.149118
2001: A Space Odyssey (1968),0.081449,0.139021,-0.015949,0.031667,0.138419,-0.055170,-0.071706,0.014101,-0.097460,0.014291,0.135507,0.080914,0.001215,0.016496,0.106058
"Lost World: Jurassic Park, The (1997)",0.408608,0.160939,0.363371,0.329957,0.195333,-0.193060,-0.115022,0.082981,0.167440,-0.092724,0.000707,-0.092427,0.136541,0.027245,0.006787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Fragile (2005),0.001067,0.000901,0.000639,0.000359,0.001614,0.000152,-0.001023,0.001285,-0.000332,0.000101,0.000495,-0.000390,-0.000398,-0.000022,-0.000138
Tokyo Gore Police (Tôkyô zankoku keisatsu) (2008),0.005075,0.003436,0.002061,0.004215,0.007534,-0.002413,0.000152,-0.001162,-0.001886,-0.000568,0.002079,-0.009884,0.003668,0.001184,-0.003078
Ballast (2008),0.000935,0.000821,0.000504,0.000957,0.001789,-0.000720,-0.001177,0.000032,-0.000779,0.000079,-0.000004,-0.000700,0.000282,-0.000039,0.000622
Trespass (2011),0.000888,0.000718,0.000374,0.000708,0.002257,-0.000849,-0.000754,-0.000191,-0.000852,0.000085,0.000615,-0.000842,-0.000070,0.000508,0.000994


## Cleaning above matrices

### Checking for duplicates

In [13]:
latent_matrix1_df.duplicated()

Toy Story (1995)                      False
Jumanji (1995)                        False
Grumpier Old Men (1995)               False
Waiting to Exhale (1995)              False
Father of the Bride Part II (1995)    False
                                      ...  
Kein Bund für's Leben (2007)           True
Feuer, Eis & Dosenbier (2002)          True
The Pirates (2014)                    False
Rentun Ruusu (2001)                    True
Innocence (2014)                       True
Length: 27278, dtype: bool

In [14]:
latent_matrix2_df.duplicated()

Titanic (1997)                                       False
Ghostbusters (a.k.a. Ghost Busters) (1984)           False
Gattaca (1997)                                       False
2001: A Space Odyssey (1968)                         False
Lost World: Jurassic Park, The (1997)                False
                                                     ...  
Fragile (2005)                                       False
Tokyo Gore Police (Tôkyô zankoku keisatsu) (2008)     True
Ballast (2008)                                        True
Trespass (2011)                                      False
Merry Christmas Mr. Lawrence (1983)                  False
Length: 10363, dtype: bool

### Removing duplicate rows

In [15]:
latent_matrix1_df = latent_matrix1_df.drop_duplicates()
latent_matrix2_df = latent_matrix2_df.drop_duplicates()

### Making latent matrices similar

In [16]:
latent_matrix1_df.shape

(16878, 10)

In [17]:
latent_matrix2_df.shape

(10034, 15)

In [18]:
# Adding indexes for latent_matrix2_df
matrix = latent_matrix2_df.copy()
for i in latent_matrix1_df.index:
    if i not in matrix.index:
        matrix.loc[i] = np.zeros(15)
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Titanic (1997),4.076304,3.796520,3.359235,2.094298,10.222334,0.816778,-0.967441,-5.192608,0.730604,-2.962878,9.989082,-22.576094,-15.080040,23.924699,38.270802
Ghostbusters (a.k.a. Ghost Busters) (1984),0.624433,0.505868,0.460656,0.612010,1.073580,-0.465257,-0.455791,-0.283094,-0.184912,0.309081,0.469686,-0.530059,-0.125805,0.239047,0.787473
Gattaca (1997),0.159688,0.127021,0.099917,0.109227,0.559596,-0.440823,0.444526,-0.001097,-0.194770,-0.115662,0.083887,0.010704,-0.094088,0.103693,0.149118
2001: A Space Odyssey (1968),0.081449,0.139021,-0.015949,0.031667,0.138419,-0.055170,-0.071706,0.014101,-0.097460,0.014291,0.135507,0.080914,0.001215,0.016496,0.106058
"Lost World: Jurassic Park, The (1997)",0.408608,0.160939,0.363371,0.329957,0.195333,-0.193060,-0.115022,0.082981,0.167440,-0.092724,0.000707,-0.092427,0.136541,0.027245,0.006787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Execution Squad (1972),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Hellgate (2011),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
La liga no es cosa de hombres (1972),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Playground (2009),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [19]:
# Adding indexes in latent_matrix1_df
newmatrix = latent_matrix1_df.copy()
for i in matrix.index:
    if i not in newmatrix.index:
        newmatrix.loc[i] = np.zeros(10)
newmatrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Toy Story (1995),0.010493,0.008531,0.049432,-0.034828,0.019146,0.001657,0.054915,0.006238,0.064435,-0.103410
Jumanji (1995),0.008409,0.012993,0.070620,-0.050528,0.025463,0.003318,0.121662,-0.013381,0.016850,-0.015506
Grumpier Old Men (1995),0.004847,0.007098,0.025620,-0.013141,-0.000895,-0.002341,0.020708,0.006664,0.085621,-0.008607
Waiting to Exhale (1995),0.006970,0.009196,0.033035,-0.004887,-0.000399,-0.000772,0.029941,0.009429,0.178809,0.077264
Father of the Bride Part II (1995),0.006596,0.012793,0.042707,-0.020698,0.005849,0.000488,0.046059,0.010116,0.116127,-0.041424
...,...,...,...,...,...,...,...,...,...,...
"Drowning Pool, The (1975)",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Battle Hymn (1957),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
"Wife, The (1995)",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Kurt Cobain About a Son (2006),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### Checking for duplicate indexes in matrix and newmatrix

In [20]:
matrix.index[matrix.index.duplicated(keep = 'first')]

Index(['War of the Worlds (2005)', 'Hamlet (2000)'], dtype='object')

In [21]:
newmatrix.index[newmatrix.index.duplicated(keep = 'first')]

Index(['Emma (1996)', 'Men with Guns (1997)', 'War of the Worlds (2005)',
       'Hamlet (2000)', '20,000 Leagues Under the Sea (1997)',
       'Aladdin (1992)', 'Casanova (2005)', 'Johnny Express (2014)'],
      dtype='object')

### Removing duplicate indexes in matrix and newmatrix

In [22]:
matrix = matrix[~matrix.index.duplicated(keep='first')]
matrix.shape

(17935, 15)

In [23]:
newmatrix = newmatrix[~newmatrix.index.duplicated(keep='first')]
newmatrix.shape

(17935, 10)

## Creating content filtering, collaborative filtering and hybrid model

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def similar_movies(title):
    if title in newmatrix.index:
        newmatrix_reshaped = np.array(newmatrix.loc[title]).reshape(1,-1)
        content_rec_score = cosine_similarity(newmatrix, newmatrix_reshaped).reshape(-1)
    else:
        content_rec_score = 0
        
    if title in matrix.index:
        matrix_reshaped = np.array(matrix.loc[title]).reshape(1,-1)
        collab_rec_score = cosine_similarity(matrix, matrix_reshaped).reshape(-1)
    else:
        collab_rec_score = 0
    
    hybrid_rec_score = (content_rec_score + collab_rec_score) / 2

    scores = {'content':content_rec_score, 'collab':collab_rec_score, 'hybrid':hybrid_rec_score}
    movies = pd.DataFrame(scores, index = matrix.index)
    
    movies.sort_values('hybrid',ascending = False, inplace = True)
    
    return movies

## Finding similarity scores

In [25]:
similar_movies('Lost World: Jurassic Park, The (1997)')

Unnamed: 0,content,collab,hybrid
Enemy at the Gates (2001),0.960326,0.872188,0.916257
"Long Walk Home, The (1990)",0.931628,0.891912,0.911770
Ocean's Thirteen (2007),0.932604,0.875086,0.903845
Roger & Me (1989),0.970702,0.833387,0.902045
Instinct (1999),0.933944,0.863994,0.898969
...,...,...,...
Fall From Grace (2007),-0.206985,0.000000,-0.103493
Mirage (1995),0.025709,-0.233092,-0.103692
Heading South (Vers le sud) (2005),-0.215570,0.000000,-0.107785
New Tale of Zatoichi (Shin Zatôichi monogatari) (Zatôichi 3) (1963),-0.235691,0.000000,-0.117846


### Popularity method score is incorporated into hybrid score because ratings data is used for collab filter (ratings are based on popularity).

### Matrix factorization method score is incorporated into hybrid score because TruncatedSVD model (matrix factorization technique) is used for content and collab filter.

## Implementing collaborative filtering method using surprise library

In [26]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.99, random_state=42)

model = SVD()
model.fit(trainset)

# Making predictions for a specific userId
user_id = 100
user_movies = ratings[ratings['userId'] == user_id]['movieId']
unrated_movies = ratings[~ratings['movieId'].isin(user_movies)]['movieId']
user_unrated_movies = list(set([(user_id, movie_id, model.predict(user_id, movie_id).est) for movie_id in unrated_movies]))
sorted_recommendations = sorted(user_unrated_movies, key=lambda x: x[2], reverse=True)
for i, (user_id, movie_id, rating) in enumerate(sorted_recommendations[:5]):
    print(f"Rank {i+1}: Movie ID {movie_id}, Predicted Rating: {rating}")

Rank 1: Movie ID 858, Predicted Rating: 4.495552858645667
Rank 2: Movie ID 2019, Predicted Rating: 4.399289427247683
Rank 3: Movie ID 79132, Predicted Rating: 4.355021955363461
Rank 4: Movie ID 2858, Predicted Rating: 4.354748275454396
Rank 5: Movie ID 2920, Predicted Rating: 4.3459821061305615


### These top ranked movie IDs can now be easily mapped to their names using movies dataframe.