# Models 

In [2]:
#Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNWithMeans, SVD
from surprise.model_selection import cross_validate
import gc 

In [4]:
#Loading various datasets required for the model
ratings_df = pd.read_csv('/Downloads/ratings.csv')
links_small_df = pd.read_csv('/Downloads/links_small.csv')
credits_df = pd.read_csv('/Downloads/credits.csv')
keywords_df = pd.read_csv('/Downloads/keywords.csv')
metadata_df = pd.read_csv('/Downloads/movies_metadata.csv')
ratings_small_df = pd.read_csv('/Downloads/ratings_small.csv')
links_df = pd.read_csv('/Downloads/links.csv')

In [None]:
#Displaying shape and head of the all dataframes to understand its structure

In [3]:
print(ratings_df.shape)
ratings_df.head()

(26024289, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [4]:
print(links_small_df.shape)
links_small_df.head()

(9125, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
print(credits_df.shape)
credits_df.head()

(45476, 3)


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [6]:
print(keywords_df.shape)
keywords_df.head()

(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [7]:
credit_key_df = pd.merge(credits_df, keywords_df, on = 'id')
credit_key_df.head()

Unnamed: 0,cast,crew,id,keywords
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [8]:
metadata_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
metadata_df['imdb_id'] = metadata_df['imdb_id'].str.replace('tt0','')
metadata_df['imdb_id'] = metadata_df['imdb_id'].str.replace('tt','')

In [10]:
print(ratings_small_df.shape)

(100004, 4)


In [11]:
#There arethree date values that have been stored in the id column,so we are removing that and trying to replace the column as an int type
metadata_df = metadata_df[metadata_df['id']!= '1997-08-20']
metadata_df = metadata_df[metadata_df['id']!= '2012-09-29']
metadata_df = metadata_df[metadata_df['id']!= '2014-01-01']
metadata_df['id'] = metadata_df['id'].astype(int)
metadata_df.shape

(45463, 24)

In [12]:
metadata_credit_key_df = pd.merge(metadata_df , credit_key_df , on = 'id')

# Working on relevant datasets

In [13]:
#Function to check dataframes
def check_dt(dataframe):
    print("SHAPE".center(70, "-"))
    print(dataframe.shape)
    print("TYPE".center(70, "-"))
    print(dataframe.dtypes)
    print("INFO".center(70, "-"))
    print(dataframe.info())
    print("NA".center(70, "-"))
    print(dataframe.isnull().sum())
    print("DESCRIBE".center(70, "-"))
    print(dataframe.describe().T)
    print("NUNIQUE".center(70, "-"))
    print(dataframe.nunique())
check_dt(metadata_df)

--------------------------------SHAPE---------------------------------
(45463, 24)
---------------------------------TYPE---------------------------------
adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                         int64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object
---------------------------------INFO---

# Content Based Recommendation

## Using TF-IDF 

In [14]:
metadata_df["overview"] = metadata_df["overview"].fillna("")

In [15]:
#creating TF-IDF matrix
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(metadata_df["overview"])

In [16]:
print(tfidf_matrix)

  (0, 17764)	0.13483140468394997
  (0, 4388)	0.1474882244462112
  (0, 38030)	0.10142880926736773
  (0, 21887)	0.10438725114202559
  (0, 19641)	0.13281873425914947
  (0, 48558)	0.10339321363039604
  (0, 59519)	0.13008002839980845
  (0, 12490)	0.12544410597609326
  (0, 51108)	0.13434807786223
  (0, 29238)	0.10093878381737394
  (0, 50914)	0.09190750979278976
  (0, 39423)	0.11907100362134965
  (0, 1847)	0.1409117371499745
  (0, 58571)	0.1135589102041042
  (0, 38693)	0.2062797868291261
  (0, 9874)	0.5028044643301558
  (0, 9087)	0.10635340920377444
  (0, 7491)	0.1238053438144067
  (0, 56872)	0.11124821198451369
  (0, 28729)	0.13311511596337425
  (0, 39012)	0.0871863805054524
  (0, 67874)	0.14878287905973758
  (0, 3159)	0.41178344935492905
  (0, 73468)	0.48098311456983145
  (0, 38088)	0.10739672665533007
  :	:
  (45461, 26957)	0.07350932445482808
  (45461, 18919)	0.0927149738310257
  (45461, 18119)	0.07466602681357981
  (45461, 39012)	0.06829582617553882
  (45462, 16520)	0.32373390574646316
 

In [17]:
#computing cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.01504115, 0.        , ..., 0.        , 0.00595447,
        0.        ],
       [0.01504115, 1.        , 0.04681946, ..., 0.        , 0.02198632,
        0.00929394],
       [0.        , 0.04681946, 1.        , ..., 0.        , 0.01402544,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00595447, 0.02198632, 0.01402544, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00929394, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [18]:
movies= pd.Series(metadata_df.index ,index= metadata_df["title"])
movies = movies.drop_duplicates()
movies

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45461
Century of Birthing            45462
Betrayal                       45463
Satan Triumphant               45464
Queerama                       45465
Length: 45463, dtype: int64

In [19]:
#choosing a random movie
random_movie  = movies.sample(random_state = 32)
random_movie

title
Hidden Assassin    14518
dtype: int64

In [20]:
# creatingDataFrame of similarity scores and printingtop related movies
mov_index = movies["Hidden Assassin"]
cosine_sim[mov_index]
sim_score = pd.DataFrame(data= cosine_sim[mov_index], columns=["Score"])
sim_score
related_movies = sim_score.sort_values("Score", ascending=False).head(10)
related_movies_index = related_movies.index
metadata_df["title"].iloc[related_movies_index]

14518                       Hidden Assassin
4340                         Apartment Zero
36531                                  Khel
25374                             Gangsters
43278                       Love on the Run
26646                           Man on Fire
30995                                Tooken
26973                          Once a Thief
16756                  Background to Danger
28377    The Iguana with the Tongue of Fire
Name: title, dtype: object

In [21]:
related_movies

Unnamed: 0,Score
14518,1.0
4340,0.259579
36528,0.256268
25373,0.250795
43275,0.218851
26645,0.21412
30993,0.210298
26972,0.206063
16756,0.205012
28376,0.204894


# Item based Recommendation

In [22]:
ratings_df.drop('timestamp', axis =1 , inplace = True)

In [23]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [24]:
movie_title = metadata_df.drop(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'original_language',  'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'video','original_title',
       'vote_average', 'vote_count'],axis = 1 )

ratings_df.rename(columns = {'movieId':'id'},inplace = True)


In [25]:
final_df  = pd.merge(ratings_df , movie_title,on = 'id')
final_df.head()

Unnamed: 0,userId,id,rating,imdb_id,title
0,1,110,1.0,111495,Three Colors: Red
1,11,110,3.5,111495,Three Colors: Red
2,22,110,5.0,111495,Three Colors: Red
3,24,110,5.0,111495,Three Colors: Red
4,29,110,3.0,111495,Three Colors: Red


In [26]:
final_df.shape

(11437637, 5)

In [27]:
#Garbage collection
gc.collect()

0

In [29]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(final_df[["userId", "id", "rating"]], reader)
sim_options = {
    "name": "cosine",
    "user_based": False,
}
algo = KNNWithMeans(sim_options=sim_options)

In [35]:
#cross-validate KNNWithMeans model
cv_results = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8620  0.8615  0.8615  0.8632  0.8614  0.8619  0.0007  
Fit time          82.33   95.95   92.91   92.49   93.21   91.38   4.68    
Test time         414.10  415.43  418.65  412.51  481.94  428.53  26.78   


In [30]:
gc.collect()

29

In [41]:
#function to get top recommendations for a user
def get_top_n_recommendation_movies(user_id, n=10):
    all_movie_ids = final_df['id'].unique()
    unrated_movies = [movie_id for movie_id in all_movie_ids if movie_id not in final_df[final_df['userId'] == user_id]['id']]
    predictions = [algo.predict(user_id, movie_id) for movie_id in unrated_movies]
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    top_n_recommendations = [pred.iid for pred in sorted_predictions[:n]]
    return top_n_recommendations

In [42]:
get_top_n_recommendation_movies(1)

[82395, 164777, 160329, 137853, 146946, 95578, 172149, 106113, 165739, 149883]

In [33]:
gc.collect()

24

In [32]:
#SVD model and cross-validate SVD model
algo_2 = SVD(n_factors=100, n_epochs=20)
cv_results = cross_validate(algo_2, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8347  0.8334  0.8343  0.8356  0.8346  0.8345  0.0007  
Fit time          149.49  158.83  159.55  158.45  158.57  156.98  3.76    
Test time         40.22   40.76   35.52   40.74   41.58   39.76   2.16    
