In [1]:
#We will attempt to perform content based filtering using 1. overview and 2. Keywords, genres, cast and crew
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the preprocessed metadata and preview the raw overview text.
meta = pd.read_csv("movies_metadata_preprocessed.csv")
print('Sample overview\n')
print(meta['overview'].head())
# Replace missing overviews with empty strings so every row can be vectorised.
meta['overview'] = meta['overview'].fillna('')

# NOTE: despite the variable name, `count` is a TF-IDF vectorizer here.
count = TfidfVectorizer(stop_words='english')
matrix = count.fit_transform(meta['overview'])
# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity. Rows whose overview
# yields no terms are all-zero and therefore score 0 even against themselves.
cosine_sim_overview = linear_kernel(matrix, matrix)

# Title -> row label lookup. Series.drop_duplicates() would compare the
# *values* (row labels, already unique) and remove nothing, so duplicated
# titles are filtered on the index instead, keeping the first occurrence.
indices_overview = pd.Series(meta.index, index=meta['title'])
indices_overview = indices_overview[~indices_overview.index.duplicated(keep='first')]

Sample overview

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object


In [2]:
# Inspect the pairwise overview-similarity matrix (numpy elides the middle of large arrays).
print(cosine_sim_overview)

[[1.         0.01501598 0.         ... 0.         0.00593763 0.        ]
 [0.01501598 1.         0.04686889 ... 0.         0.02199034 0.00927971]
 [0.         0.04686889 1.         ... 0.         0.01404055 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.00593763 0.02199034 0.01404055 ... 0.         1.         0.        ]
 [0.         0.00927971 0.         ... 0.         0.         1.        ]]


In [3]:
def get_recommendations(title, indices, cosine_sim, df, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    indices : pandas.Series
        Maps a title to the row position of that movie in ``cosine_sim``
        and ``df``. If a title occurs several times, the first entry is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i][j]`` is the similarity
        between rows ``i`` and ``j``.
    df : pandas.DataFrame
        Frame with a ``'title'`` column, indexed positionally via ``iloc``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Duplicate titles yield a Series of positions; use the first one.
        idx = idx.iloc[0]

    # Drop the query movie by position instead of assuming it sorts first:
    # a movie with an all-zero feature vector has similarity 0 with itself,
    # and exact duplicates tie at the top, so "skip element 0" can discard a
    # real match and recommend the query movie to itself.
    candidates = (
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    )
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in ranked[:top_n]]
    return df['title'].iloc[movie_indices]

# Overview-based recommendations for a sample title.
print(
    get_recommendations(
        title="The Conjuring",
        indices=indices_overview,
        cosine_sim=cosine_sim_overview,
        df=meta,
    )
)

38020         The Conjuring 2
8087     The Boston Strangler
41622      Ghosts of Darkness
24885         The Borderlands
24041                   Ouija
39228       Chasing the Devil
19110          The Apparition
39115          Something Evil
16755               The Arbor
37937        8213: Gacy House
Name: title, dtype: object


In [5]:
# Second copy of the metadata, joined with the credits table on the movie id.
meta2 = pd.read_csv("movies_metadata_preprocessed.csv")
credits = pd.read_csv('../credits.csv')
meta2 = meta2.merge(credits, on = 'id')
# Rows without cast or crew information are discarded.
meta2 = meta2.dropna(subset = ['cast', 'crew'])

# Title -> row label lookup. Series.drop_duplicates() would compare the
# *values* (row labels, already unique) and remove nothing, so duplicated
# titles are filtered on the index instead, keeping the first occurrence.
# NOTE: these are index labels as of this point; they equal row positions
# only while no further rows are dropped from meta2.
indices_credits = pd.Series(meta2.index, index=meta2['title'])
indices_credits = indices_credits[~indices_credits.index.duplicated(keep='first')]

In [6]:
from ast import literal_eval
# print(type(meta2['cast'][0]))
# print(type(meta2['crew'][0]))
# print(type(meta2['keywords'][0]))
# print(type(meta2['genres'][0]))
# Discard every row in which keywords or genres is missing.
meta2 = meta2.dropna(subset=['keywords', 'genres'])

In [7]:
#Content based filtering using keywords, genres, cast and crew
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval


# Columns stored in the CSV as stringified Python literals.
features = ['cast','crew','keywords', 'genres']

for feature in features:
    # literal_eval only accepts strings (or AST nodes) and raises ValueError
    # for values that are already parsed, so non-string cells are passed
    # through unchanged. This keeps the cell safe to run more than once.
    meta2.loc[:,feature] = meta2[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )


In [None]:
import numpy as np
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries; each entry is expected to carry 'job' and 'name' keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when ``x`` is not a list, when no
        entry has the job 'Director', or when the matching entry has no name.
    """
    # Same guard as get_list: missing values (e.g. NaN) are not iterable.
    if not isinstance(x, list):
        return np.nan
    for member in x:
        # .get() tolerates entries that lack a 'job' key instead of raising KeyError.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

def get_list(x,feature):
    """Return at most the first three entries of ``x`` as a new list.

    For the 'keywords' feature the elements are taken as they are; for any
    other feature each element is indexed with ``['name']``. Anything that
    is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []
    if feature == 'keywords':
        names = list(x)
    else:
        names = [entry['name'] for entry in x]
    # Slicing already copes with lists shorter than three items.
    return names[:3]



def clean_data(x):
    """Lower-case ``x`` and remove its spaces.

    A list is processed element by element and returned as a new list, a
    string is returned as a string, and any other value becomes ''.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

# Extract the director from the crew column before the other columns are reduced.
meta2['director'] = meta2['crew'].apply(get_director)

# Trim each list-valued column with get_list (first three entries per row).
features = ['cast', 'keywords','genres']
for feature in features:
    meta2.loc[:, feature] = meta2[feature].apply(
        lambda entries: get_list(entries, feature)
    )

# Lower-case and strip spaces from every column that feeds the combined text.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    meta2[feature] = meta2[feature].apply(clean_data)
    
def combine(x):
    """Build one space-separated string from a row's keywords, cast, director and genres."""
    parts = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(parts)
# One combined text string per movie (keywords, cast, director, genres).
meta2.loc[:,'combined'] = meta2.apply(combine, axis=1)


# Plain term counts over the combined text, then pairwise cosine similarity.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(meta2['combined'])

cosine_sim_credits = cosine_similarity(count_matrix, count_matrix)

# Renumber rows 0..n-1; the previous labels are kept in an 'index' column.
meta2 = meta2.reset_index()

# Rebuild the title lookup from the final frame. get_recommendations uses the
# looked-up value as a row *position* in cosine_sim_credits and in
# meta2.iloc, but the earlier lookup was created before later cells dropped
# rows, so its labels need not match positions any more. Duplicated titles
# keep their first occurrence.
indices_credits = pd.Series(meta2.index, index=meta2['title'])
indices_credits = indices_credits[~indices_credits.index.duplicated(keep='first')]


In [9]:
# Metadata-based (keywords, cast, director, genres) recommendations for a sample title.
print(
    get_recommendations(
        title='The Conjuring',
        indices=indices_credits,
        cosine_sim=cosine_sim_credits,
        df=meta2,
    )
)

16732                                        Insidious
21320                             Insidious: Chapter 2
11909                                           Joshua
33367                                              Saw
11616                            The Hills Have Eyes 2
15149                                         The Tomb
18185                                 Gorilla at Large
28579    The Last Will and Testament of Rosalind Leigh
34572                                  Severed Footage
184                                       The Prophecy
Name: title, dtype: object


In [10]:
# Metadata-based recommendations for a second sample title.
print(
    get_recommendations(
        title='Iron Man',
        indices=indices_credits,
        cosine_sim=cosine_sim_credits,
        df=meta2,
    )
)

15033                                 Iron Man 2
16140                               TRON: Legacy
20562                                 Iron Man 3
26139                    Avengers: Age of Ultron
26145                 Captain America: Civil War
1970                                        Tron
17633                               The Avengers
34864                                Slow Action
177      Mighty Morphin Power Rangers: The Movie
5001                            The Time Machine
Name: title, dtype: object
