In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Ignore warnings :
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading ratings file
# Ignore unnecessary columns

ratings = pd.read_csv('ratings.csv', sep=',', usecols=['userId', 'movieId', 'rating'])
movies = pd.read_csv('movies.csv', sep=',', usecols=[ 'movieId','title', 'genres'])
links_map = pd.read_csv('links.csv', sep=',', usecols=[ 'movieId', 'tmdbId'])
tmdb_data = pd.read_csv('tmdb_data.csv', sep=',',index_col = False)

In [3]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [4]:
#tmdb_data.drop(['index'], axis=1,inplace=True)

In [5]:
tmdb_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'cast', 'crew', 'keywords',
       'cast_size', 'crew_size', 'director', 'Film_director', 'metadata'],
      dtype='object')

In [6]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(tmdb_data['metadata'])

In [7]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [8]:
tmdb_data = tmdb_data.reset_index()
titles = tmdb_data['title']
indices = pd.Series(tmdb_data.index, index=tmdb_data['title'])

In [9]:
links_map['tmdbId'] = links_map['tmdbId'].apply(convert_int)
links_map.columns = ['movieId', 'id']
links_map = links_map.set_index('id')
links_map = links_map.merge(tmdb_data[['title', 'id']], on='id').set_index('title')
index_map = links_map.set_index('id')

In [10]:
links_map.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3452 entries, Toy Story to Shin Godzilla
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       3452 non-null   float64
 1   movieId  3452 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 80.9+ KB


In [11]:
#reading in the range of ratings score
reader = Reader(rating_scale = (1,5))
#loading the dataframe into surprise
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [12]:
method = SVD()
cross_validate(algo=method, data=data, measures=['RMSE'], cv=3)

{'test_rmse': array([0.87983523, 0.88629467, 0.87503505]),
 'fit_time': (0.47231006622314453, 0.47971630096435547, 0.41410040855407715),
 'test_time': (0.17564988136291504, 0.10202479362487793, 0.10299539566040039)}

In [13]:
param_grid = {'n_factors' : [50, 75], 'lr_all' : [0.5, 0.05], 'reg_all' : [0.06, 0.04]}

grid = GridSearchCV(algo_class=SVD, measures=['RMSE'], param_grid=param_grid,n_jobs = -1)
grid.fit(data)

# Best RMSE score
print('Best Score :', round(grid.best_score['rmse'],2)*100)

# Combination of parameters that gave the best RMSE score
print('Best Parameters :', grid.best_params['rmse'])

Best Score : 86.0
Best Parameters : {'n_factors': 75, 'lr_all': 0.05, 'reg_all': 0.06}


In [14]:
dataset = data.build_full_trainset()

In [15]:
svd = SVD(n_factors= 100, n_epochs = 60, reg_all=0.1, lr_all=0.01)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21356a3f250>

In [16]:
svd.predict(1, 90)

Prediction(uid=1, iid=90, r_ui=None, est=4.278631390501736, details={'was_impossible': False})

In [None]:
tmdb_data.rename(columns = {'vote_count':'Total_votes','vote_average': 'Average_votes'}, inplace = True)
tmdb_data.columns

In [137]:
def recommendation(userId, title):
    idx = indices[title]
    tmdbId = links_map.loc[title]['id']
    movie_id = links_map.loc[title]['movieId']
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    movies = tmdb_data.iloc[movie_indices][['title', 'Total_votes', 'Average_votes', 'Film_director', 'id']]
    movies['rating'] = ratings['rating']
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, index_map.loc[x]['movieId']).est)
    movies = round(movies.sort_values('est', ascending=False),2)
    return movies.head(10)

In [142]:
recommendation(56, "Avatar")

Unnamed: 0,title,Total_votes,Average_votes,Film_director,id,rating,est
284,Aliens,3282.0,7.7,James Cameron,679,0.5,4.26
306,The Terminator,4208.0,7.4,James Cameron,218,4.0,4.21
154,Terminator 2: Judgment Day,4274.0,7.7,James Cameron,280,4.0,4.19
3186,X-Men: Days of Future Past,6155.0,7.5,Bryan Singer,127585,4.0,4.13
937,Predator,2129.0,7.3,John McTiernan,106,4.0,4.11
3065,Star Trek Into Darkness,4479.0,7.4,J.J. Abrams,54138,4.0,4.05
113,True Lies,1138.0,6.8,James Cameron,36955,4.0,4.0
386,The Fifth Element,3962.0,7.3,Luc Besson,18,4.0,4.0
722,Superman,1042.0,6.9,Richard Donner,1924,4.0,3.76
988,Titan A.E.,320.0,6.3,Gary Goldman,7450,4.5,3.66


In [147]:
recommendation(2, "The Abyss")

Unnamed: 0,title,Total_votes,Average_votes,Film_director,id,rating,est
154,Terminator 2: Judgment Day,4274.0,7.7,James Cameron,280,4.0,4.02
306,The Terminator,4208.0,7.4,James Cameron,218,4.0,3.96
284,Aliens,3282.0,7.7,James Cameron,679,0.5,3.89
2603,Avatar,12114.0,7.2,James Cameron,19995,3.0,3.77
822,Thunderball,572.0,6.5,Terence Young,660,4.0,3.7
113,True Lies,1138.0,6.8,James Cameron,36955,4.0,3.67
2597,Planet 51,573.0,5.6,Jorge Blanco,16866,1.0,3.65
977,Mad Max,1235.0,6.6,George Miller,9659,2.0,3.62
893,Pitch Black,1812.0,6.7,David Twohy,2787,3.0,3.54
2346,Doomsday,374.0,5.8,Neil Marshall,13460,3.0,3.53


In [89]:
def recommendation2(userId, title):
    idx = indices[title]
    tmdbId = links_map.loc[title]['id']
    movie_id = links_map.loc[title]['movieId']
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    movies = tmdb_data.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'release_date', 'Film_director', 'id']]
    movies['rating'] = ratings['rating']
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, index_map.loc[x]['movieId']).est)
    movies = round(movies.sort_values('est', ascending=False),2)
    return movies.head(10)

In [105]:
pd.set_option('display.max_columns', None)
tmdb_data.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director,Film_director,metadata,rating,userId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,"['tomhanks', 'timallen', 'donrickles']","[{'credit_id': '52fe4284c3a36847f8024f55', 'de...","['jealousi', 'toy', 'boy', 'friendship', 'frie...",13,106,"['johnlasseter', 'johnlasseter', 'johnlasseter']",John Lasseter,jealousi toy boy friendship friend rivalri new...,4.0,1


In [108]:
tmdb_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'cast', 'crew', 'keywords',
       'cast_size', 'crew_size', 'director', 'Film_director', 'metadata',
       'rating', 'userId'],
      dtype='object')

In [98]:
tmdb_data['rating'] = ratings['rating']
tmdb_data['userId'] = ratings['userId']

In [106]:
recommendation2(2, "Avatar")

Unnamed: 0,title,vote_count,vote_average,release_date,Film_director,id,rating,est
154,Terminator 2: Judgment Day,4274.0,7.7,1991-07-01,James Cameron,280,4.0,4.02
306,The Terminator,4208.0,7.4,1984-10-26,James Cameron,218,4.0,3.96
3186,X-Men: Days of Future Past,6155.0,7.5,2014-05-15,Bryan Singer,127585,4.0,3.96
386,The Fifth Element,3962.0,7.3,1997-05-07,Luc Besson,18,4.0,3.92
284,Aliens,3282.0,7.7,1986-07-18,James Cameron,679,0.5,3.89
3065,Star Trek Into Darkness,4479.0,7.4,2013-05-05,J.J. Abrams,54138,4.0,3.81
937,Predator,2129.0,7.3,1987-06-11,John McTiernan,106,4.0,3.8
113,True Lies,1138.0,6.8,1994-07-14,James Cameron,36955,4.0,3.67
722,Superman,1042.0,6.9,1978-12-13,Richard Donner,1924,4.0,3.64
988,Titan A.E.,320.0,6.3,2000-06-16,Gary Goldman,7450,4.5,3.63


In [107]:
pickle.dump(tmdb_data,open('tmdb_data.pkl','wb'))
pickle.dump(cosine_sim,open('similarity.pkl','wb'))

In [52]:
# saving the model
import pickle

# saving the columns
model_columns = list(tmdb_data.columns)
with open('model_columns.pkl','wb') as file:
    pickle.dump(model_columns, file)
    
    
pickle.dump(svd, open('svd.pkl', 'wb'))

In [21]:
import pickle
import pandas as pd
import streamlit as st
from streamlit import session_state as session
st.set_page_config(page_title="Movie Recommendation", layout="wide")
import streamlit.components.v1 as components

In [22]:
st.cache(allow_output_mutation=True)

<function streamlit.runtime.legacy_caching.caching.cache.<locals>.wrapper(f: ~F) -> ~F>

In [None]:
title = "Movie Recommendation Engine"
st.title(title)
st.write("Welcome! You can get recommendation of movies based on title of the movie")
st.markdown("##")
with st.container():
    col3 = st.columns((2,0.5,0.5,0.5))
    with col3:
        st.markdown("***Choose your movie:***")
        movie = st.title(
            "",
            title)

2023-03-29 20:17:32.301 
  command:

    streamlit run c:\Users\Manak\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


AttributeError: __enter__

In [None]:
streamlit run app.py

SyntaxError: invalid syntax (3737097518.py, line 1)