**Load Dataset**

In [3]:
#import required libraries
import pandas as pd
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
import pandas as pd
# Load the dataset
movies_df = pd.read_csv('./dataset/movies_metadata.csv', low_memory=False)

In [5]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
#Check dataset info
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [7]:
#Check for null values in the dataset
movies_df.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

**Data Cleaning Process**

In [8]:
#In this dataset we are interested in the Title and Overview column
# Select the columns and remove missing values
movies_df = movies_df[['title', 'overview']].dropna()

In [9]:
movies_df.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [10]:
#Load spacy pipeline for text processing 
nlp  = spacy.load('en_core_web_lg')

In [11]:
#Function to perform preprocessing with spacy
def preprocess_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])

In [12]:
#Apply preprocessor function
movies_df['new_overview'] = np.vectorize(preprocess_text)(movies_df['overview'])


In [13]:
movies_df.head()

Unnamed: 0,title,overview,new_overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",lead woody andy toy live happily room andy bir...
1,Jumanji,When siblings Judy and Peter discover an encha...,sibling judy peter discover enchanted board ga...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,family wedding reignite ancient feud door neig...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",cheat mistreat step woman hold breath wait elu...
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,george banks recover daughter wedding receive ...


In [14]:
#Transform overviews to vectors
#write a function to transform to vectors
def convert_to_vector(text):
    doc = nlp(text)
    return doc.vector

In [15]:
#Apply the function
movies_df['vector'] = (movies_df['new_overview']).apply(convert_to_vector)


In [16]:
movies_df['vector'].iloc[0].shape

(300,)

**Building the Recommendation system**

In [17]:
#write a function for recommending
def recommend(input_desc, num_recommend=5):
    # Process the input description to get its vector
    input_vector = convert_to_vector(preprocess_text(input_desc))
    input_vector = input_vector.reshape(1, -1)  # Reshape for compatibility with cosine_similarity

    # Compute similarity with all movies
    similarity_scores = cosine_similarity(input_vector, list(movies_df['vector']))
    
    # Get top N similarities
    sim_scores_indices = similarity_scores[0].argsort()[-num_recommend-1:-1][::-1]  # Exclude the input movie itself
    
    # Get movie titles based on indices
    recommended_titles = movies_df['title'].iloc[sim_scores_indices].tolist()
    
    # Get the corresponding similarity scores
    recommended_scores = similarity_scores[0][sim_scores_indices].tolist()
    
    # Combine titles and scores into a list of tuples
    recommendations = list(zip(recommended_titles, recommended_scores))
    
    return recommendations

**Evaluation**

In [18]:
# Get recommendations based on a few movie descriptions
recommend("A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world.")

[("Lookin' Italian", 0.9030148386955261),
 ('Luster', 0.9007611274719238),
 ('Eagle Eye', 0.9004954099655151),
 ('Verbo', 0.89686119556427),
 ('The Code', 0.8960847854614258)]

In [19]:
recommend("A young lion prince is cast out of his pride by his cruel uncle, who claims he killed his father. While the uncle rules with an iron paw, the prince grows up beyond the Savannah, living by a philosophy: No worries for the rest of your days.")

[('The Merry Widow', 0.884589433670044),
 ('Chinese Odyssey 2002', 0.8802981376647949),
 ('Eye of the Eagle', 0.8796935081481934),
 ('Samson and Delilah', 0.8781486749649048),
 ('The Prince and the Pauper', 0.8767232298851013)]

**Save model to Pickle**

In [78]:
import dill,gzip,pickletools,pickle


dill.settings['recurse'] = True
filename = 'recomend_movie.pickle'

with open(filename,'wb') as f:
    dill.dump(recommend, f)
    

In [66]:
# import joblib,gzip,pickle
# filename = 'recomend_movie.joblib'
# with gzip.GzipFile(filename + '.gz', 'wb', compresslevel=3) as fo:
#     joblib.dump(recommend, fo)

In [71]:
#cloudpickle.dump(recommend,'recom.pickle','wb')

TypeError: 'str' object cannot be interpreted as an integer