## TF-IDF Based Content Filtering
To filter out similar movies from a given title

In [172]:
import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [194]:
credits = pd.read_csv("../Datasets/tmdb_5000_credits.csv")
movies = pd.read_csv("../Datasets/tmdb_5000_movies.csv")

In [206]:
credits.rename(columns={'movie_id':'id'}, inplace=True)
movies_merged_df = movies.merge(credits,on='id')
movies_features = movies_merged_df[['id','original_title','overview','genres','cast']]

movies_features.loc[:,'genres'] = pd.Series([[j['name'] for j in json.loads(movies_features['genres'][i])] for i in range(len(movies))])
movies_features.loc[:,'genres_string'] = pd.Series([" ".join([elem for elem in movies_features['genres'][i]]) for i in range(len(movies))])

movies_features.loc[:,'characters'] = [[i['character'] for i in json.loads(movies_features['cast'][j])[:5]] for j in range(len(movies_features))]

movies_features.loc[:,'characters_string'] = [", ".join([character for character in movies_features['characters'][j]]) for j in range(len(movies_features))]
movies_features.loc[:,'overview'] = movies_features['overview'].fillna("")
movies_features.loc[:,'genres_string'] = movies_features['genres_string'].fillna("")
movies_features.loc[:,'characters_string'] = movies_features['characters_string'].fillna("")
movies_features.loc[:,'genres_overview_characters'] = movies_features['genres_string'] + 3*movies_features['overview'] + 5*movies_features['characters_string']
# movies_features[movies_features['genres_overview_characters'].isna()]

In [196]:
tfv = TfidfVectorizer(min_df=1,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')
movies_features['genres_overview_characters'].fillna(" ",inplace=True)
movies_features = movies_features.append({"original_title":"New Movie", "genres_overview_characters": "Jake Sully Batman"}, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [197]:
tfv_matrix = tfv.fit_transform(movies_features['genres_overview_characters'])

In [198]:
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
indices = pd.Series(movies_features.index, index=movies_features['original_title']).drop_duplicates()

In [199]:
def give_rec(title, sig=sig):
    # Get the index corresponding to original_title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the movies 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    
    # Scores of the 10 most similar movies
    sig_scores = sig_scores[1:11]
    print(sig_scores)
    
    # Movie indices
    movie_indices = [i[0] for i in sig_scores]

    # Top 10 most similar movies
    return movies_summaries['original_title'].iloc[movie_indices]

In [200]:
give_rec('New Movie')

[(428, 0.7615942635661447), (0, 0.7615942356982714), (4519, 0.7615942339088927), (4139, 0.7615942332579542), (3153, 0.761594233200953), (3, 0.7615942323334496), (119, 0.7615942233896763), (299, 0.7615942217487373), (1753, 0.7615942157977913), (397, 0.7615942141069796)]


428            Batman Returns
0                      Avatar
4519           To Save A Life
4139                Nine Dead
3153    Into the Grizzly Maze
3       The Dark Knight Rises
119             Batman Begins
299            Batman Forever
1753       The Blues Brothers
397          It's Complicated
Name: original_title, dtype: object