In [1]:
import codecs
import json
import pickle

import jsonpickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

# Load the raw movie metadata; the later cells all derive from this frame.
METADATA_CSV = '../Data/movie_metadata.csv'
data = pd.read_csv(METADATA_CSV)

In [6]:
# Expose each row's position as an explicit `movie_id` column: reset_index()
# moves the default index into a column named `index`, which is then renamed.
#
# The guard makes the cell safe to re-run. Without it, a second execution
# would insert another `index` column and rename it too, leaving the frame
# with two `movie_id` columns. The result is re-assigned instead of using
# `inplace=True` so the cell reads as a single expression.
if 'movie_id' not in data.columns:
    data = data.reset_index(level=0).rename(columns={'index': 'movie_id'})

In [7]:
# Missing keyword strings become '' so the vectorizer only ever sees text.
data['plot_keywords'] = data['plot_keywords'].fillna('')

# TF-IDF weights for the plot keywords, ignoring English stop words
# such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['plot_keywords'])

# Pairwise similarity of every movie against every other movie.
# With TfidfVectorizer's default L2 row normalisation the plain dot product
# computed by linear_kernel equals the cosine similarity; rows that have no
# keywords stay all-zero and therefore score 0 even against themselves.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
def _strip_if_str(value):
    """Return `value` without surrounding whitespace if it is a str, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Trim surrounding whitespace from every string cell; other cells pass through.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; it is kept here so the cell still runs on older pandas.
data = data.applymap(_strip_if_str)
data

Unnamed: 0,movie_id,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,4,,Doug Walker,,,131.0,,Rob Walker,131.0,,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
5039,5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
5040,5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
5041,5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [10]:
# Persist the metadata frame as JSON in pandas' "split" layout
# (separate `columns`, `index` and `data` entries).
data.to_json(
    '../Data/data_cast_plot.json',
    orient='split',
    compression='infer',
)

In [9]:
# Title <-> id lookup table: just the two columns needed to translate between
# a movie title and its `movie_id`, with string cells whitespace-trimmed.
title_and_id = data[['movie_title', 'movie_id']]
reverse_map = title_and_id.applymap(
    lambda cell: cell if not isinstance(cell, str) else cell.strip())

In [11]:
# Persist the title/id lookup table as JSON in pandas' "split" layout.
reverse_map.to_json(
    '../Data/reverse_map_cast_plot.json',
    orient='split',
    compression='infer',
)

In [12]:
def clean_data(x):
    """Normalise a name (or a list of names) into lower-case, space-free form.

    Parameters
    ----------
    x : str, list of str, or anything else
        A single name, a list of names, or a missing/other value.

    Returns
    -------
    str or list of str
        For a str: the text with every ' ' removed and lower-cased.
        For a list: a new list with each element normalised the same way.
        For any other value (e.g. NaN/None for a missing name): ''.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    # Anything that is neither text nor a list counts as "no value".
    return ''

In [13]:
# Name columns that are combined into each movie's cast/crew "tags" string.
features = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']

# Work on an independent copy holding only the id, the title and the names.
df2 = data[['movie_id', 'movie_title'] + features].copy()

# Normalise every name with clean_data (lower-case, spaces removed, '' when
# the value is missing) so that each full name becomes one space-free string.
df2 = df2.assign(**{name: df2[name].apply(clean_data) for name in features})

# Join the four cleaned names into one space-separated string per movie.
tags = df2[features[0]]
for name in features[1:]:
    tags = tags + ' ' + df2[name]
df2['tags'] = tags

# Keep only movie_id, movie_title and tags for the vectorisation step.
new = df2.drop(columns=features)

In [15]:
# Bag-of-words counts over the cast/crew tags: English stop words are dropped
# and the vocabulary is capped at 5000 terms.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(new['tags'])

# Dense (n_movies x n_terms) count matrix.
vector = tag_counts.toarray()

# Pairwise cosine similarity of every movie against every other movie.
cosine_sim2 = cosine_similarity(vector)

In [16]:
# Trim surrounding whitespace from every string cell of `new`;
# non-string cells are returned unchanged.
new = new.applymap(
    lambda cell: cell if not isinstance(cell, str) else cell.strip())

In [19]:
# Display the similarity matrix computed from the TF-IDF of `plot_keywords`.
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [23]:
# Persist both similarity matrices: the TF-IDF/plot-keyword one and the
# count-vectorizer/cast one.
# Each file is opened in a `with` block so it is flushed and closed as soon as
# the dump finishes (the bare open(...) handles were never closed explicitly).
with open("../Data/tfidf.pickle", "wb") as tfidf_file:
    pickle.dump(cosine_sim, tfidf_file)

with open("../Data/cv.pickle", "wb") as cv_file:
    pickle.dump(cosine_sim2, cv_file)

In [22]:
# Reload the pickled TF-IDF similarity matrix and display it (presumably as a
# round-trip check of the file written by this notebook).
# The `with` block closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("../Data/tfidf.pickle", "rb") as tfidf_file:
    selector = pickle.load(tfidf_file)
selector

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])