In [None]:
# importing useful libraries

import numpy as np
import pandas as pd
import csv
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [None]:
# Load the two TMDB 5000 CSV files: movie metadata first, then the credits table.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/tmdb_5000_movies.csv',
        '/content/tmdb_5000_credits.csv',
    )
)

In [None]:
# Join every movie to its credits row.
#
# The join key is the TMDB identifier rather than `title`: titles are not
# guaranteed to be unique, and merging on a non-unique key pairs every
# same-titled movie with every same-titled credits row, which yields extra rows
# carrying the wrong cast/crew.
# NOTE(review): assumes the standard TMDB 5000 schema, i.e. `id` in the movies
# file and `movie_id`, `title`, `cast`, `crew` in the credits file - confirm.
#
# `title` is dropped from the credits side so the result keeps a single `title`
# column, the same columns the title-based merge produced.
# `validate` makes pandas raise instead of silently duplicating rows should
# either key turn out not to be unique.

movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
    validate='one_to_one',
)

In [None]:
# Keep only the columns that carry information useful for recommending movies:
#   movie_id, title, overview, genres, keywords, cast, crew
#
# `.copy()` makes the result an independent DataFrame, so that later cleaning
# and column assignments operate on this frame itself rather than on a slice of
# the merged frame (which is what triggers pandas' SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [None]:
# remove missing data
# Show the number of missing values per column. `.sum` has to be *called*; the
# bare attribute reference computed nothing. print() is used because a notebook
# only auto-displays the last expression of a cell.
print(movies.isnull().sum())

# Drop incomplete rows and renumber the remaining rows 0..n-1, so that index
# labels keep matching row positions. The similarity matrix built later is
# ordered by row position, so label-based lookups would otherwise drift.
movies = movies.dropna().reset_index(drop=True)

# check for duplicates
movies.duplicated().sum()

0

In [None]:
# helper functions to convert genres,cast and crew into required format

def convert(obj):
    """Return the ``'name'`` value of every entry in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Python-literal text for a sequence of dicts that each have a ``'name'``
        key, e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    list
        The names, in the order the entries appear.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``obj`` is not a valid literal.
    KeyError
        If an entry has no ``'name'`` key.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

def fetch_cast(obj, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Python-literal text for a sequence of dicts that each have a ``'name'``
        key; the order of the sequence is preserved.
    limit : int, optional
        Maximum number of names to return. Defaults to 3, the value that was
        previously hard-coded.

    Returns
    -------
    list
        At most ``limit`` names; fewer when the cast list is shorter.

    Raises
    ------
    ValueError
        If ``limit`` is negative, or (from ``ast.literal_eval``) if ``obj`` is
        not a valid literal.
    KeyError
        If one of the first ``limit`` entries has no ``'name'`` key.
    """
    if limit < 0:
        # A negative bound would make the slice below drop entries from the
        # end instead of capping the result, so reject it explicitly.
        raise ValueError("limit must be non-negative")
    cast_members = ast.literal_eval(obj)
    # Slicing replaces the manual counter; entries past the limit are never read.
    return [member['name'] for member in cast_members[:limit]]

def fetch_crew(obj):
    """Return a one-element list holding the first director's name.

    Parameters
    ----------
    obj : str
        Python-literal text for a sequence of dicts that each have ``'job'``
        and ``'name'`` keys.

    Returns
    -------
    list
        ``[name]`` for the first entry whose ``'job'`` equals ``'Director'``,
        or an empty list when there is no such entry.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first director is kept; later entries are not inspected.
            return [member['name']]
    return []


In [None]:
# Turn the raw text columns into Python lists using the helper functions:
#   overview          -> list of whitespace-separated words
#   genres / keywords -> list of names (convert)
#   cast              -> leading cast names (fetch_cast)
#   crew              -> the first director, if any (fetch_crew)
# Note: this cell is not re-runnable on its own output, because the helpers
# expect the original string values.
movies['overview'] = movies['overview'].map(str.split)
movies['genres'] = movies['genres'].map(convert)
movies['keywords'] = movies['keywords'].map(convert)
movies['cast'] = movies['cast'].map(fetch_cast)
movies['crew'] = movies['crew'].map(fetch_crew)


In [None]:
# Remove the spaces inside every multi-word entry so that each name counts as a
# single word: two different people who share a first name (say "Sam ..." and
# another "Sam ...") must not end up with the common word "Sam" in their tags.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [None]:
# Concatenate all the necessary columns into a new column `tags`. Every one of
# these columns holds a list per row, so `+` joins the lists row by row.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# Create a new dataframe with only the necessary columns. `.copy()` makes it an
# independent frame, so the assignment to `tags` below modifies this frame
# directly instead of a slice (which would raise SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'tags']].copy()

# Convert each list of words into one lower-case, space-separated paragraph.
movies['tags'] = movies['tags'].apply(lambda words: " ".join(words).lower())



In [None]:
# Stem the tags so that inflected forms of one word (e.g. "action" and
# "actions") are not counted as different words.

ps = PorterStemmer()


# helper function for stemming
def stem(text):
    """Return ``text`` with each whitespace-separated word replaced by its stem.

    The stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


movies['tags'] = movies['tags'].map(stem)

In [None]:
# Convert each movie's tags into a vector (bag of words). The vocabulary is
# capped at 5000 features via `max_features`, and English stop words are ignored.
cv = CountVectorizer(stop_words='english', max_features=5000)

# fit_transform returns a sparse matrix; convert it into a dense numpy array.
tag_counts = cv.fit_transform(movies['tags'])
vectors = tag_counts.toarray()
#

In [None]:
# Compute the cosine similarity between every pair of movie vectors. A higher
# value means the two movies are more alike (equivalently, a smaller cosine
# distance).

similarity = cosine_similarity(vectors)

# Preview for the first movie: (row position, similarity) pairs sorted from most
# to least similar, keeping ranks 1..5. Rank 0 is skipped because it is normally
# the movie itself.
sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)[1:6]

In [None]:
def recommend(movie, top_n=9):
    """Print the titles of the movies most similar to ``movie``.

    Reads the module-level ``movies`` DataFrame and ``similarity`` matrix.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one is used.
    top_n : int, optional
        Number of titles to print. Defaults to 9, the number that was
        previously hard-coded. Expected to be non-negative.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Locate the movie by row *position*, not by index label: `similarity` is
    # indexed by row position, and labels stop matching positions as soon as
    # rows have been dropped from `movies` without resetting the index.
    positions = np.flatnonzero((movies['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie {movie!r} not found in movies['title']")
    movie_pos = int(positions[0])

    # (row position, similarity) pairs, most similar first.
    ranked = sorted(
        enumerate(similarity[movie_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the queried movie explicitly instead of assuming it is ranked
    # first; another movie with an identical score could otherwise displace it.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    for pos in neighbours:
        print(movies['title'].iloc[pos])

def recommend_movies(movie_list):
    """Print recommendations seeded by up to three movie titles.

    Reads the module-level ``movies`` DataFrame and ``similarity`` matrix.
    Only the first three titles of ``movie_list`` are used as seeds. For the
    first seed up to 3 new titles are printed, for each later seed up to 1.
    Candidates for a seed are its 9 most similar movies; a candidate is skipped
    when it is one of the titles in ``movie_list`` or was already printed.

    Parameters
    ----------
    movie_list : list of str
        Exact titles to look up in ``movies['title']``. The list is not
        modified.

    Raises
    ------
    IndexError
        If one of the seed titles is not present in ``movies``.
    """
    # Work on a private collection: appending recommendations to the caller's
    # list (as an alias would) changes the caller's data and, for short inputs,
    # turns printed recommendations into additional seeds.
    seen = set(movie_list)

    for seed_rank, seed_title in enumerate(movie_list[:3]):
        # Row *position* rather than index label, because `similarity` is
        # indexed by position and labels may have gaps after rows were dropped.
        positions = np.flatnonzero((movies['title'] == seed_title).to_numpy())
        if positions.size == 0:
            raise IndexError(f"movie {seed_title!r} not found in movies['title']")
        seed_pos = int(positions[0])

        # (row position, similarity) pairs, most similar first.
        ranked = sorted(
            enumerate(similarity[seed_pos]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        # The seed itself is excluded explicitly instead of assuming rank 0.
        candidates = [pos for pos, _ in ranked if pos != seed_pos][:9]

        # The first seed contributes up to three titles, later seeds one each.
        quota = 3 if seed_rank == 0 else 1
        picked = 0
        for pos in candidates:
            movie_name = movies['title'].iloc[pos]
            if movie_name in seen:
                continue
            print(movie_name)
            seen.add(movie_name)
            picked += 1
            if picked == quota:
                break

            



        


In [None]:
# Example run: recommendations seeded by three Spider-Man titles.
# (Single-title variant: recommend('Spider-Man 2'))
seed_titles = ['Spider-Man', 'Spider-Man 3', 'Spider-Man 2']
recommend_movies(seed_titles)

In [None]:
# Save the movies frame (as a plain dict) for later use. The `with` block closes
# the file as soon as the dump finishes, so the data is fully written to disk
# and the handle is not leaked.
with open('movies_dict.pkl', 'wb') as movies_file:
    pickle.dump(movies.to_dict(), movies_file)

In [None]:
# Save the similarity matrix for later use. The `with` block closes the file as
# soon as the dump finishes, so the data is fully written to disk and the handle
# is not leaked.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)