In [1]:
import pandas as pd
import numpy as np
import ast 
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle as pkl
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

### Reading the datasets

In [2]:
def convert_to_forward_slash(file_path):
    return file_path.replace('\\', '/')

In [3]:
file_path_movies = r"C:\Users\Priyaranjan\Desktop\Kaggle datasets\TMDB Movies dataser\tmdb_5000_movies.csv"
file_path_credits = r"C:\Users\Priyaranjan\Desktop\Kaggle datasets\TMDB Movies dataser\tmdb_5000_credits.csv"

file_path_fslash_movies = convert_to_forward_slash(file_path_movies)
file_path_fslash_credits = convert_to_forward_slash(file_path_credits)

file_path_movies = file_path_fslash_movies
file_path_credits = file_path_fslash_credits

In [4]:
df_movies = pd.read_csv(file_path_movies)
df_credits = pd.read_csv(file_path_credits)

In [5]:
df_movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [6]:
df_credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

Merging both the columns based on 'movie_id'.

In [7]:
# Dropping title column from here as its present in the first table
df_credits = df_credits.drop(columns='title',axis=0)

In [8]:
df_movies.rename(columns={'id': 'movie_id'}, inplace=True)
df_movies.columns

Index(['budget', 'genres', 'homepage', 'movie_id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count'],
      dtype='object')

In [9]:
df_movies = df_movies.merge(df_credits, on='movie_id')
df_movies.shape

(4803, 22)

Now, lets pick the variables which we require for the recommendor system afterwards.

- genres
- movie_id
- keywords
- title
- overview
- cast
- crew


In [10]:
df_movies = df_movies[['movie_id','title', 'genres', 'overview', 'keywords', 'cast', 'crew']]

In [11]:
df_movies.head()

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### Data Preprocessing stage:

- The basic idea here is to get a cleaner df for our recommendation systems.
- We will first check the missing or null values in the columns.
- Then, we are going to merge the genres, overview, keywords, cast & crew details together into a single corpus.
    - Currently the info on keywords, cast & crew is in a list of dictionaries format where dictionary contains different information about a specific entity or person.
    - Hence, we will be extracting only the names of entities / persons here.
        - We will go with first 3 entities in 'cast' column
        - Also, for column 'crew' , we will go with the director name as director of the movie is important in recommending a movie to any user based on content based filtering.

In [12]:
df_movies.isnull().sum()

movie_id    0
title       0
genres      0
overview    3
keywords    0
cast        0
crew        0
dtype: int64

In [13]:
# Removing the 3 rows which has null values in overview as its an important variable in recommendor system.
df_movies.dropna(inplace=True)

In [14]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [15]:
def convert3(obj):
    count = 0
    L = []
    for i in ast.literal_eval(obj):
        if count < 4:
            L.append(i['name'])
            count+=1
        else:
            break
    return L

In [16]:
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [17]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    L = []
    for i in text.split():
        L.append(ps.stem(i))
    return " ".join(L)

In [18]:
df_movies['genres'] = df_movies['genres'].apply(convert)
df_movies['keywords'] = df_movies['keywords'].apply(convert)
df_movies['cast'] = df_movies['cast'].apply(convert3)
df_movies['crew'] = df_movies['crew'].apply(fetch_director)
df_movies['overview'] = df_movies['overview'].apply(lambda x:x.split())

df_movies.head()

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[John, Carter, is, a, war-weary,, former, mili...","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


Transformations we would need to apply now:
- We will be merging the last 4 columns into a single feature which will store tags for the respective movies.
- However, before that we need to apply some transformation on last 4 columns too.
    - We need to merge the words found in a single element which will be stored as comma-separated tags.

In [19]:
df_movies['genres'] = df_movies['genres'].apply(lambda x:[i.replace(' ','') for i in x])
df_movies['keywords'] = df_movies['keywords'].apply(lambda x:[i.replace(' ','') for i in x])
df_movies['cast'] = df_movies['cast'].apply(lambda x:[i.replace(' ','') for i in x])
df_movies['crew'] = df_movies['crew'].apply(lambda x:[i.replace(' ','') for i in x])

df_movies.head()

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[John, Carter, is, a, war-weary,, former, mili...","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


In [20]:
df_movies['tags'] = df_movies['overview'] + df_movies['genres'] + df_movies['keywords'] + df_movies['cast'] + df_movies['crew']

In [21]:
df_movies_new = df_movies[['movie_id', 'title', 'tags']]
df_movies_new

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4798,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4799,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4800,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4801,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [22]:
# converting the list into a string format
# converting to lowercase
df_movies_new['tags'] = df_movies_new['tags'].apply(lambda x: ' '.join(x))
df_movies_new['tags'] = df_movies_new['tags'].apply(lambda x: x.lower())
df_movies_new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


### Data Preparation

Now that we have our desired data, our next steps woudl be as follows.

- text cleaning: stemming & lemmatizing
- remove stopwords from tags
- apply word vectorization ( using CountVectorizer Class in scikit learn library )

In [23]:
df_movies_new['tags'] = df_movies_new['tags'].apply(stem)

In [24]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(df_movies_new['tags']).toarray()

In [25]:
print(cv.get_feature_names_out())

['000' '007' '10' ... 'zone' 'zoo' 'zooeydeschanel']


Post the vectorization of the movies, calculating the distances between 2 vectors (movies)
- We cannot use 'euclidean distance' as it doesnt perform well in higher dimensions ( Curse of dimensionality )
- Instead we can calculate the cosine distance ( angle between the vectors in that dimensional space)

Then , we can calculate similarity based on cosine distance (cosine similarity) as distance is inversely proportional to similarity.
- using cosine_similarity function from sklearn.metrics.pairwise

In [26]:
similarity = cosine_similarity(vectors)

In [27]:
similarity[0]

array([1.        , 0.08226127, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

### Recommendor Function

- Creating a recommendor function which will recommend 5 movies out when provided with a movie as input
    - when provided with a movie title as input, find the index position in the data.
    - using index of the input movie, fetch the cosine similarity vector for that index position.
        - fetch the top 5 movies ( first 5 similar movies from the descending sorted cosine simialrity vectors)

In [28]:
def recommendor(movie):
    movie_index = df_movies_new[df_movies_new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[movie_index])),reverse=True,key = lambda x: x[1])
  
    for i in distances[1:6]:
        print(df_movies_new.iloc[i[0]].title)

In [29]:
recommendor('The Avengers')

Avengers: Age of Ultron
Iron Man 3
Captain America: Civil War
Captain America: The First Avenger
Iron Man


Now that we have our recommendor function ready, dumping the data file used for this into a pickle file to be used by our app file further.

In [30]:
df_movies_new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [31]:
# Dumping the df to be used by app
directory = 'C:/Users/Priyaranjan/Desktop/My Projects/movies-recommendor-system'
filename = 'movies_list.pkl'

full_file_path = f'{directory}/{filename}'

with open(full_file_path,'wb') as file:
    pkl.dump(df_movies_new,file)
    
print('Pickle file saved to path:',directory)

Pickle file saved to path: C:/Users/Priyaranjan/Desktop/My Projects/movies-recommendor-system


In [32]:
# Dumping the similarity pickle file to be used by app
directory = 'C:/Users/Priyaranjan/Desktop/My Projects/movies-recommendor-system'
filename = 'similarity.pkl'

full_file_path = f'{directory}/{filename}'

with open(full_file_path,'wb') as file:
    pkl.dump(similarity,file)
    
print('Pickle file saved to path:',directory)

Pickle file saved to path: C:/Users/Priyaranjan/Desktop/My Projects/movies-recommendor-system
