In [21]:
#We will attempt to perform content based filtering using 1. overview and 2. Keywords, genres, cast and crew
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

meta = pd.read_csv("movies_metadata_preprocessed.csv")
print(meta['overview'].head())
meta['overview'] = meta['overview'].fillna('')

count = TfidfVectorizer(stop_words='english')
matrix = count.fit_transform(meta['overview'])
cosine_sim = linear_kernel(matrix, matrix)

indices = pd.Series(meta.index, index=meta['title']).drop_duplicates()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    Cheated on, mistreated and stepped on, the wom...
3    Just when George Banks has recovered from his ...
4    Obsessive master thief, Neil McCauley leads a ...
Name: overview, dtype: object


In [22]:
print(cosine_sim)

[[ 1.         0.0185791  0.        ...,  0.         0.         0.       ]
 [ 0.0185791  1.         0.        ...,  0.         0.         0.       ]
 [ 0.         0.         1.        ...,  0.         0.         0.       ]
 ..., 
 [ 0.         0.         0.        ...,  1.         0.         0.       ]
 [ 0.         0.         0.        ...,  0.         1.         0.       ]
 [ 0.         0.         0.        ...,  0.         0.         1.       ]]


In [23]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return meta['title'].iloc[movie_indices]


rec = get_recommendations("The Conjuring")
print(rec)

6974          The Conjuring 2
6085                    Ouija
5378           The Apparition
6539              Dark Places
4985                The Arbor
3466    The Amityville Horror
5051                 The Ward
194             Jason's Lyric
2785           Into the Night
5557               Dark Skies
Name: title, dtype: object


In [79]:
meta = pd.read_csv("movies_metadata_preprocessed.csv")
credits = pd.read_csv('credits.csv')
meta = meta.merge(credits, on = 'id')
meta = meta.dropna(subset = ['cast', 'crew'])
print(meta)

      adult                              belongs_to_collection     budget  \
0     False  {'id': 10194, 'name': 'Toy Story Collection', ...   30000000   
1     False                                                NaN   65000000   
2     False                                                NaN   16000000   
3     False  {'id': 96871, 'name': 'Father of the Bride Col...          0   
4     False                                                NaN   60000000   
5     False                                                NaN   35000000   
6     False  {'id': 645, 'name': 'James Bond Collection', '...   58000000   
7     False                                                NaN   62000000   
8     False  {'id': 117693, 'name': 'Balto Collection', 'po...          0   
9     False                                                NaN   44000000   
10    False                                                NaN   98000000   
11    False                                                NaN   52000000   

In [80]:
from ast import literal_eval
print(type(meta['cast'][0]))
print(type(meta['crew'][0]))
print(type(meta['keywords'][0]))
print(type(meta['genres'][0]))
meta = meta.dropna(subset = ['keywords'])
meta = meta.dropna(subset = ['genres'])

# for i,row in meta.iterrows():
# #     meta['keywords'][i] = literal_eval(meta['keywords'][i])
#     print(meta['keywords'][i])
#     print(type(meta['keywords'][i]))  

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [81]:
#Content based filtering using keywords, genres, cast and crew
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval


features = ['cast','crew','keywords', 'genres']

for feature in features:
    meta[feature] = meta[feature].apply(literal_eval)
    


In [97]:
import numpy as np
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i for i in x]
        if len(names) > 3:
            names = names[:3]
        return names
    
    return []
def get_list1(x):
    if isinstance(x, dict):
        names = [i['name'] for i in x]
        print(names)
        if len(names) > 3:
            names = names[:3]
        return names
    return []

def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

meta['director'] = meta['crew'].apply(get_director)

features = ['cast', 'keywords']

for feature in features:
    meta[feature] = meta[feature].apply(get_list)
    
meta['genres']=meta['genres'].apply(get_list1)
    
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    meta[feature] = meta[feature].apply(clean_data)
    print(feature)
    
def combine(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
meta['combined'] = meta.apply(combine, axis=1)


count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(meta['combined'])

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

meta = meta.reset_index()
indices = pd.Series(meta.index, index=meta['title'])

cast
keywords
director
genres


In [110]:
get_recommendations('The Conjuring', cosine_sim2)

4982                         Insidious
6351                               Pan
4034                            Joshua
4356    The Boy in the Striped Pyjamas
4586                     Up in the Air
5214                       Young Adult
6398                    Monkey Kingdom
39               Kicking and Screaming
1962                 Cannonball Run II
4992                    The Nutcracker
Name: title, dtype: object