Import Libraries

In [33]:
import pandas as pd
import ast
import nltk
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


Loading and preprocessing dataset


In [2]:
Movies_data = pd.read_csv("../Dataset/tmdb_5000_movies.csv")
Credits_data = pd.read_csv("../Dataset/tmdb_5000_credits.csv")

In [3]:
#merging the datasets
Movies_data=Movies_data.merge(Credits_data)

In [4]:
#selecting the relevant columns
Movies_data = Movies_data[['movie_id', 'cast', 'crew','keywords', 'title', 'genres', 'overview']]

In [6]:
#extracting the name of cast. Top 3
def cast(text):
    l=[]
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            l.append(i['name'])
        counter+=1
    return l

Movies_data['cast']=Movies_data['cast'].apply(cast)

In [8]:
#extracting the director name from the crew.
def name(text):
    l=[]
    for i in ast.literal_eval(text):
        if i['job']== 'Director':
            l.append(i['name'])
    return l

Movies_data['crew'] = Movies_data['crew'].apply(name)

In [10]:
#extracting genre and keywords.
def extract(text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i['name'])
    return l

Movies_data['genres'] = Movies_data['genres'].apply(extract)
Movies_data['keywords'] = Movies_data['keywords'].apply(extract)


In [12]:
#converting the text into list
Movies_data['overview'] = Movies_data['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])


In [14]:
#remove the whitespace.
def remove_whitespace(word):
    l=[]
    for i in word:
        l.append(i.replace(" ",""))
    return l

In [15]:
Movies_data['cast']=Movies_data['cast'].apply(remove_whitespace)
Movies_data['crew']=Movies_data['crew'].apply(remove_whitespace)
Movies_data['keywords']=Movies_data['keywords'].apply(remove_whitespace)
Movies_data['genres']=Movies_data['genres'].apply(remove_whitespace)

In [18]:
#creating a column of tags which will have all the words to match with.
Movies_data['tags'] = Movies_data['cast']+Movies_data['crew']+Movies_data['genres']+Movies_data['keywords']+Movies_data['overview']

In [20]:
#selecting relevant columns.
Movies_data=Movies_data[['movie_id', 'title', 'tags']]

In [23]:
Movies_data.head(2)


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,samworthington zoesaldana sigourneyweaver jame...
1,285,Pirates of the Caribbean: At World's End,johnnydepp orlandobloom keiraknightley gorever...


In [22]:
#converting the list of tags into words.
Movies_data['tags'] = Movies_data['tags'].apply(lambda x : " ".join(x))

#converting each tag to lowercase.
Movies_data['tags']=Movies_data['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Movies_data['tags'] = Movies_data['tags'].apply(lambda x : " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Movies_data['tags']=Movies_data['tags'].apply(lambda x: x.lower())


In [24]:
#stemming --- conevting words into its base form.
ps = LancasterStemmer()
def stems(text):
    l=[]
    for i in text.split():
        l.append(ps.stem(i))
    return " ".join(l)

In [25]:
Movies_data.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,samworthington zoesaldana sigourneyweaver jame...
1,285,Pirates of the Caribbean: At World's End,johnnydepp orlandobloom keiraknightley gorever...


In [26]:
Movies_data['tags']=Movies_data['tags'].apply(stems)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Movies_data['tags']=Movies_data['tags'].apply(stems)


In [28]:
#vectorization --- converting the tags into vectors.
cv=CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(Movies_data['tags']).toarray()


In [30]:
#perform cosine similarity --- to find the similar vectors.
similarity = cosine_similarity(vector)


In [31]:
#defining a function that will return the movies --- according to cosine similarity.
def recommender(movie):
    index = Movies_data[Movies_data['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])

    for i in distances[1:6]:
        print(Movies_data.iloc[i[0]].title)
        

In [32]:
recommender('Spider-Man')

Spider-Man 2
Spider-Man 3
21 Jump Street
The Amazing Spider-Man 2
The Amazing Spider-Man


Dumping the model 

In [35]:
pickle.dump(Movies_data, open("Movies.pkl", 'wb'))
pickle.dump(similarity, open("Similarity.pkl", 'wb'))