In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the movie metadata and the credits tables; both paths are relative, so the
# CSV files must sit in the notebook's working directory.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
#joins the two data-sets into one on the basis of the title, then keeps only the
#columns that are used later to build the recommendation tags
#NOTE(review): titles are not guaranteed to be unique, so a title join can pair a
#movie with another film's credits - consider joining on a unique id; confirm
#against the CSV schemas
movies = movies.merge(credits,on = "title")
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [4]:
#count the missing values in each column of the dataframe
movies.isna().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [5]:
#drop the rows that contain a null value (only `overview` has any) and renumber
#the index: later cells look rows up in the similarity matrix by position, so the
#index labels must stay equal to the row positions
movies = movies.dropna().reset_index(drop=True)

In [6]:
#count the rows that are exact duplicates of an earlier row (all columns equal)
movies.duplicated().sum()

0

In [7]:
#movies.iloc[0].genres

In [8]:
import ast

def format(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is a string holding a Python literal (a list of dicts that each have
    a 'name' key). It is parsed with ast.literal_eval, which only evaluates
    literals and never executes code.

    Note: this function shadows the built-in `format` for the rest of the
    notebook.
    """
    entries = ast.literal_eval(obj)  # turn the string into real Python objects
    return [entry['name'] for entry in entries]

#movies['genres'].apply(format)    

In [9]:
#replace each genres string with the list of genre names it contains
movies['genres'] = movies['genres'].apply(format)    
#movies.head()

In [10]:
#replace each keywords string with the list of keyword names it contains
movies['keywords'] = movies['keywords'].apply(format)


In [11]:
#movies.head()

In [12]:
def convert5(text, limit=5):
    """Return the 'name' of the first `limit` entries found in `text`.

    Parameters
    ----------
    text : str
        String holding a Python literal: a list of dicts that each have a
        'name' key. Parsed with ast.literal_eval.
    limit : int, optional
        Maximum number of names to return. Defaults to 5, which keeps the
        original "take 5 actors" behaviour when called with one argument.

    Returns
    -------
    list
        At most `limit` names, in their original order.
    """
    names = []
    for position, entry in enumerate(ast.literal_eval(text)):
        if position >= limit:
            break  # enough names collected; no need to walk the rest of the list
        names.append(entry['name'])
    return names

In [13]:
#keep only the names of the first five cast entries of each movie
movies['cast'] = movies['cast'].apply(convert5)

In [14]:
#movies.head()

In [15]:
#the director is the only crew member that is kept as a feature
def director(text):
    """Return the names of all crew entries whose 'job' is exactly 'Director'.

    `text` is a string holding a Python literal (a list of dicts with 'job'
    and 'name' keys), parsed with ast.literal_eval. The result is a list
    because a movie can credit more than one director; it is empty when no
    entry matches.
    """
    crew = ast.literal_eval(text)
    return [member['name'] for member in crew if member['job'] == 'Director']

In [16]:
#replace each crew string with the list of director names
movies['crew'] = movies['crew'].apply(director)

In [17]:
#movies.head()

In [18]:
#turns each overview string into a list of its whitespace-separated words
movies['overview'] = movies['overview'].apply(str.split)

In [19]:
#e.g. "Christopher Nolan" becomes "ChristopherNolan", so the full name is a single
#token and does not get confused with another Christopher

def remove_space(L):
    """Return a new list in which every space is removed from each string of `L`."""
    return [item.replace(" ", "") for item in L]

#strip the spaces inside every entry of the list-valued feature columns
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(remove_space)

In [20]:
#preview the first rows after all feature columns have been converted to lists
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


In [21]:
#concatenate the five list columns into one combined list of tags per movie
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)
#the source columns now live inside `tags`, so leave them out of the working frame
new_df = movies.drop(['overview', 'genres', 'keywords', 'cast', 'crew'], axis=1)


In [22]:
#collapse each list of tags into a single space-separated string
new_df['tags'] = new_df['tags'].apply(" ".join)
#new_df.head()

In [23]:
from nltk import PorterStemmer
#one shared stemmer instance, used by stem() for every word
ps = PorterStemmer()

def stem(text):
    """Stem every whitespace-separated word of `text` with the shared stemmer
    `ps` and return the stems joined back together with single spaces."""
    return " ".join(ps.stem(word) for word in text.split())
    

In [24]:
#reduce every word of every tags string to its stem
new_df['tags'] = new_df['tags'].apply(stem)

In [25]:
#converts everything to lower case
new_df['tags'] = new_df['tags'].apply(str.lower)

In [26]:
#new_df.head()

In [27]:
#Bag of words: every tags string becomes a vector of word counts, limited to at
#most 5000 vocabulary terms with English stop words removed
#NOTE(review): the tags are stemmed before this step, so stemmed forms of stop
#words may no longer match the stop-word list - confirm
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')


In [28]:
# learn the vocabulary and turn every tags string into a row of word counts
#fit_transform returns a sparse matrix (most counts are zero); .toarray() converts it to a dense numpy array
vectors = cv.fit_transform(new_df['tags']).toarray()


In [29]:
#cv.fit_transform(new_df['tags']).toarray().shape


In [30]:
#this gives us the stemmed words that were kept as features (the most frequent ones)
#cv.get_feature_names_out()  #called get_feature_names() in older scikit-learn versions

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
#for these non-negative count vectors the score ranges from 0 to 1: 0 is the least similar and 1 the most

In [32]:
#pairwise cosine similarity between all rows of `vectors`: similarity[i][j] compares row i with row j
similarity = cosine_similarity(vectors)

In [33]:
def recommend(movie, top_n=3):
    """Print the titles of the `top_n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in new_df['title']. If several rows share the
        title, the first one is used.
    top_n : int, optional
        Number of recommendations to print. Defaults to 3.

    Raises
    ------
    IndexError
        If no row of new_df has the given title.
    """
    # `similarity` is addressed by row position, so find the movie's position
    # rather than its index label: the two differ as soon as the frame's index
    # has gaps (for example after rows were dropped).
    matches = (new_df['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    position = int(matches[0])

    # sort all rows by similarity, highest first; enumerate keeps the row position
    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)

    # leave out the query movie explicitly instead of assuming it sorts first
    # (another row with an identical vector could tie with it)
    picks = [row for row, _ in ranked if row != position][:max(top_n, 0)]
    for row in picks:
        print(new_df['title'].iloc[row])

In [34]:
#example: print the titles most similar to 'Avatar'
recommend('Avatar')

Aliens vs Predator: Requiem
Independence Day
Falcon Rising
