In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Data Preprocessing

In [3]:
# Merging the two datasets
movies = movies.merge(credits, on = 'title')

# Extracting relevant columns
movies = movies[['id','title','overview','genres','keywords','cast','crew']]

# Deleting Missing Values
movies.dropna(inplace = True)

In [4]:
def extract_names(obj):
    mylist = []
    for i in eval(obj):
        mylist.append(i['name'])
    return mylist

def extract_actors(obj):
    mylist = []
    counter = 0
    for i in eval(obj):
        if counter != 3:
            mylist.append(i['name'])
        else: break
    return mylist

def extract_director(obj):
    mylist = []
    for i in eval(obj):
        if i['job'] == 'Director':
            mylist.append(i['name'])
    return mylist

In [5]:
movies['genres'] = movies['genres'].apply(extract_names)
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['keywords'] = movies['keywords'].apply(extract_names)
movies['cast'] = movies['cast'].apply(extract_actors)
movies['crew'] = movies['crew'].apply(extract_director)

In [6]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(' ','') for i in x])
movies['overview'] = movies['overview'].apply(lambda x: [i.replace(' ','') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(' ','') for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(' ','') for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(' ','') for i in x])

In [7]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x).lower())

In [8]:
df = movies[['id','title','tags']]
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


## Model

In [9]:
cv = CountVectorizer(max_features=5000,stop_words='english')
vectors = cv.fit_transform(df['tags']).toarray()
similarity = cosine_similarity(vectors)

  ret = a @ b


In [10]:
def recommend(movie):
    index = df[df['title'] == movie].index[0]
    similar_movies = sorted(enumerate(similarity[index]), key=lambda x: x[1], reverse=True)[1:6]

    for i in similar_movies:
        print(df.iloc[i[0]].title)

recommend('Batman Begins')

The Dark Knight
The Dark Knight Rises
Amidst the Devil's Wings
Batman
Batman & Robin


In [12]:
import pickle

pickle.dump(df, open('data.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))