In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two CSV files this notebook works from (paths are relative to the
# notebook's working directory).
movies =  pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
# Preview the first rows of the movies frame.
movies.head()

In [None]:
# Preview the first rows of the credits frame.
credits.head()

In [None]:
# Column dtypes and non-null counts, useful for spotting missing data early.
movies.info()

In [None]:
credits.info()

# Data Preprocessing

In [None]:
movies = movies.merge(credits, on='title')

In [None]:
movies.shape

In [None]:
# movies['original_language'].value_counts()
# English-language movies make up almost 95% of the rows, so this column won't help the analysis

In [None]:
# Columns kept for building the recommender:
#   movie_id, title, overview, genres, keywords, cast, crew
#
# .copy() makes the selection an independent frame. Later cells assign to its
# columns and drop rows from it; doing that on a bare slice of the merged frame
# raises SettingWithCopyWarning and leaves it ambiguous which object is changed.
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']].copy()

In [None]:
movies.isnull().sum()

In [None]:
movies.dropna(inplace=True)

In [None]:
movies.isnull().sum()

In [None]:
movies.duplicated().sum()

In [None]:
# genres: look at the raw value stored for the first movie.

movies.iloc[0].genres

In [None]:
# Goal: turn a raw value such as
# '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
# into a plain list of names:
# ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

# The value is a string, not a list of dicts, so it has to be parsed first.
# ast.literal_eval evaluates Python literals only, so it is safe to use on data.
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [None]:
def convert(text):
    """Parse a stringified list of dicts and return each entry's ``'name'``.

    ``text`` is evaluated with ``ast.literal_eval``; the names are returned in
    the order the entries appear.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
movies['genres'] = movies['genres'].apply(convert)
movies.head()

In [None]:
# keywords
movies['keywords'] = movies['keywords'].apply(convert)
movies.head()

In [None]:
# cast - need top3 actors only
def convert3(text, limit=3):
    """Return the ``'name'`` of the first ``limit`` entries in a stringified list.

    Parameters
    ----------
    text : str
        String form of a list of dicts, each with a ``'name'`` key.
    limit : int, default 3
        Maximum number of leading entries to keep.

    Returns
    -------
    list
        Up to ``limit`` names, in their original order.
    """
    # Slice before extracting so entries past the limit are never visited.
    return [entry['name'] for entry in ast.literal_eval(text)[:limit]]

In [None]:
movies['cast'] = movies['cast'].apply(convert3)
movies.head()

In [None]:
# crew - we need only director
# we need the dictionary that has 'job = director'
movies['crew'][0]

In [None]:
def fetch_director(obj):
    """Return a one-element list with the first director's name, or ``[]``.

    ``obj`` is a stringified list of dicts; the first entry whose ``'job'`` is
    exactly ``'Director'`` wins and the rest are not examined.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [None]:
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head()

In [None]:
movies['cast'][0]

In [None]:
# overview is in 'string' we need it in a 'list'
# movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies.head()

In [None]:
# Sam Worthington -> SamWorthington
# Science Fiction -> ScienceFiction

def collapse(L):
    """Return a new list in which every string of ``L`` has its spaces removed."""
    return [item.replace(" ", "") for item in L]

In [None]:
# Strip inner spaces in every list-valued column so that a multi-word name
# becomes a single token.
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(collapse)
movies.head()

In [None]:
# Split the overview on whitespace so it becomes a list like the other feature columns.
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
# `+` between these columns concatenates the per-row lists, giving one combined
# token list per movie.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies.head()

In [None]:
# Keep only the remaining columns plus `tags`; drop() returns a new frame, so
# `movies` itself is left unchanged.
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new.head()

In [None]:
# tags is list -> string
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

In [None]:
new['tags'][0]

In [None]:
# lowercase
new['tags'] = new['tags'].apply(lambda x: x.lower())
new.head()

# Vectorization

In [None]:
!pip install nltk

In [None]:
# Stemming: create one shared Porter stemmer instance; the stem() helper
# defined below reads it as the module-level name `ps`.

import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated token of ``text`` with the shared stemmer.

    The stemmed tokens are re-joined with single spaces.
    """
    return ' '.join(ps.stem(token) for token in text.split())

In [None]:
# Replace every tag string with its stemmed form.
new['tags'] = new['tags'].apply(stem)

In [None]:
# Spot-check the stemmed tags of the first row.
new.iloc[0].tags

In [None]:
# Bag-of-words vectoriser: vocabulary capped at 5000 terms, with scikit-learn's
# built-in English stop-word list removed.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# fit_transform returns a sparse matrix; toarray() makes it a dense array
# with one row per movie.
vector = cv.fit_transform(new['tags']).toarray()

In [None]:
vector.shape

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old name only on
# releases that predate the new method.
cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()

In [None]:
# Pairwise cosine similarity between all rows of `vector`; the result is a
# square matrix with one row and one column per movie.
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [None]:
similarity.shape

In [None]:
# Similarity of the first movie to every movie (including itself).
similarity[0]

In [None]:
# Pair each score with its row position as (position, similarity) tuples, so the
# position survives sorting by score.
list(enumerate(similarity[0]))

In [None]:
def recommend(movie, top_n=10):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 10
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``new`` has the given title.

    Notes
    -----
    ``similarity`` is addressed by row position, while the labels in
    ``new.index`` are not guaranteed to equal row positions (for example after
    rows have been dropped). The lookup therefore works with positions only.
    """
    # Positions (not index labels) of the rows whose title matches exactly.
    matches = (new['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    position = int(matches[0])

    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query by position instead of assuming it sorts first: another
    # movie with identical tags also scores 1.0 and may come before it.
    neighbours = [pos for pos, _score in ranked if pos != position][:top_n]
    for pos in neighbours:
        print(new.iloc[pos].title)
        

In [None]:
# Example query: print the recommendations for one title.
recommend('Batman Begins')

In [None]:
# A second example query.
recommend('Gandhi')

In [None]:
import pickle

In [None]:
pickle.dump(new,open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [None]:
pickle.dump(new.to_dict(), open('movie_dict.pkl','wb'))

In [None]:
new.to_dict()