In [2]:
import numpy as np
import pandas as pd

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
#merge the two data sets available to us into one single data set for our conveniance
movies = movies.merge(credits,on='title')

In [5]:
#we only want the columns that are actually required for the model, so we select only those
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [6]:
#remove missing data
movies.dropna(inplace=True) 

In [7]:
#our data is in the form of a dictionary, but for it to be usable we need to convert it into a list!
#convert string of list into a list

import ast

def convert(obj):
    L = []
    #Now convert string of list to list
    for i in ast.literal_eval(obj): 
        L.append(i['name'])
    return L

movies['genres'] = movies['genres'].apply(convert)

In [8]:
movies['keywords'] = movies['keywords'].apply(convert)

In [9]:
#there are so many people in a movie, so to keep our model light i will use only the name of first 3 cast members
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [10]:
movies['cast'] = movies['cast'].apply(convert)

In [11]:
#there are so many crew members defined here but we only want the directors, so let's extract that info
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj): #converts string of list to list
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L


In [12]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [13]:
#Overview a string, let's convert this also to a list, so that the data structure remains constant through out the model
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [14]:
#Now the next problem is that the lists might have elements with space, eg: space war, so at the end we will get two tags, one is "space" and the other is "war", so if someone wants to watch space war movie then pour model might recommend them a movie based on a normal war instead of specifically the space war, this problem would'nt have existed if there was nospace between "space" and "war". Therfore, we now remove the space between each element of the lists we have
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [15]:
#create tags for the model, put all the data in a single column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [16]:
#remove rest of the columns
new_df = movies[['movie_id','title','tags']]

In [17]:
#conert the tags into strings
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [18]:
#conert everything into lower case, it's recommended
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [19]:
#do text vectorisation
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [20]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [21]:
#apply stemming to remove same words but different verb form eg.: access, accessed, accesses, etc..
#Stemming
import nltk

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [22]:
#Let's now try to find the similarity between the movies, for that we will calculate the cosine distance of each movie from every other another.
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

#we should not loose the index of the cosine distances, since recommendation will be done based on the distance only 
sorted(list(enumerate(similarity[0])), reverse = True, key=lambda x:x[1])[1:11]

[(1920, 0.23473823893078552),
 (1216, 0.23294541397390256),
 (582, 0.2309782890611944),
 (539, 0.2252817784447915),
 (507, 0.21912524504463887),
 (1444, 0.21398024625545647),
 (3608, 0.21398024625545647),
 (4192, 0.2123976976214366),
 (260, 0.21170244960998524),
 (74, 0.20935894733965596)]

In [23]:
#Make a function to give/recommend 10 movies based on a given input movie
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0] #Getting the index of the movies
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key=lambda x:x[1])[1:11]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [26]:
recommend('Lockout')

Ender's Game
Lost in Space
Fortress
Armageddon
Moonraker
Automata
Gattaca
Gravity
Space Pirate Captain Harlock
Cloud Atlas


In [27]:
import pickle

In [29]:
#This file will be called by our web app to get the title of gthe movies
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [30]:
#This file will be called by our web app to get the cosine similarity for recommending the movies
pickle.dump(similarity, open('similarity.pkl','wb'))