In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
# from surprise import Reader, Dataset, SVD, evaluate
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pickle

import warnings; warnings.simplefilter('ignore')

In [4]:
# Movie metadata that accompanies the TF-IDF features.
smd = pd.read_csv('./dataset/tfid_smd.csv')

In [5]:
smd

Unnamed: 0.1,Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,description
0,0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ..."
1,1,1,False,,65000000,"['Adventure', 'Fantasy', 'Family']",,8844,tt0113497,en,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...
2,2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"['Romance', 'Comedy']",,15602,tt0113228,en,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,A family wedding reignites the ancient feud be...
3,3,3,False,,16000000,"['Comedy', 'Drama', 'Romance']",,31357,tt0114885,en,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom..."
4,4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,['Comedy'],,11862,tt0113041,en,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9094,9094,40224,False,,15000000,"['Action', 'Adventure', 'Drama', 'Horror', 'Sc...",,315011,tt4262980,ja,...,120.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,A god incarnate. A city doomed.,Shin Godzilla,False,6.6,152.0,2016,From the mind behind Evangelion comes a hit la...
9095,9095,40503,False,,0,"['Documentary', 'Music']",http://www.thebeatlesliveproject.com/,391698,tt2531318,en,...,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The band you know. The story you don't.,The Beatles: Eight Days a Week - The Touring Y...,False,7.6,92.0,2016,"The band stormed Europe in 1963, and, in 1964,..."
9096,9096,44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000,"['Adventure', 'Fantasy', 'Animation', 'Action'...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,...,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0,2000,When Molly Hale's sadness of her father's disa...
9097,9097,44826,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"['Adventure', 'Fantasy', 'Animation', 'Science...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,...,75.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,2001,"All your favorite Pokémon characters are back,..."


We have **9099** movies available in our small movies metadata dataset, which is 5 times smaller than our original dataset of 45000 movies.

### Movie Description Based Recommender

Let us first try to build a recommender using movie descriptions and taglines. We do not have a quantitative metric to judge our machine's performance so this will have to be done qualitatively.

In [7]:
# Testing phase
# Load the pickled object stored in tfidf1.pkl (used below as the input to
# linear_kernel). The context manager closes the file handle as soon as the
# object has been read.
# NOTE(review): this is an absolute, machine-specific path - move the file
# next to the notebook (e.g. under ./dataset/) so the notebook runs elsewhere.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("/home/convergytics/Desktop/tfidf1.pkl", 'rb') as tfidf_file:
    tf1 = pickle.load(tfidf_file)

In [None]:
# Pairwise dot products between every pair of rows of tf1.
# NOTE(review): this equals cosine similarity only if the rows of tf1 are
# L2-normalised (the TfidfVectorizer default) - confirm how tf1 was built.
cosine_sim = linear_kernel(tf1, tf1)

# Primary Prediction

#### Cosine Similarity

I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two movies. Mathematically, it is defined as follows:

$cosine(x,y) = \frac{x \cdot y^\intercal}{\|x\| \cdot \|y\|}$

Since we have used the TF-IDF Vectorizer, which L2-normalises every document vector by default, calculating the dot product will directly give us the cosine similarity score. Therefore, we will use sklearn's **linear_kernel** instead of **cosine_similarity**, since it is much faster.

In [None]:
cosine_sim[0]

In [None]:
# cosine_sim_2[0]

We now have a pairwise cosine similarity matrix for all the movies in our dataset. The next step is to write a function that returns the 30 most similar movies based on the cosine similarity score.

In [None]:
# Renumber the rows 0..n-1 so that row positions line up with the rows of the
# similarity matrix. drop=True discards the old index instead of inserting it
# as a new column: the frame already has an 'index' column, so a plain
# reset_index() would add 'level_0' and raise ValueError when the cell is re-run.
smd = smd.reset_index(drop=True)
# Row position -> title, used to turn ranked positions back into titles.
titles = smd['title']
# Title -> row position. Titles are not guaranteed to be unique, so a lookup
# may return more than one position.
indices = pd.Series(smd.index, index=smd['title'])

In [None]:
def get_recommendations(title, top_n=30):
    """Return the titles most similar to ``title``, best match first.

    Reads the notebook-level globals ``indices`` (title -> row position),
    ``cosine_sim`` (pairwise similarity matrix) and ``titles`` (row position
    -> title), so the result always reflects whichever similarity matrix was
    assigned to ``cosine_sim`` most recently.

    Parameters
    ----------
    title : str
        Exact movie title to look up in ``indices``.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles`` ordered by descending
        similarity, never including the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series of
    # positions; use the first match so that cosine_sim[idx] is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: tied scores keep their row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried movie by position rather than assuming it sorts
    # first; another movie with an equal score could otherwise displace it.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]

In [None]:
# def get_recommendations2(title):
#     idx = indices[title]
#     sim_scores = list(enumerate(cosine_sim_2[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:31]
#     movie_indices = [i[0] for i in sim_scores]
#     return titles.iloc[movie_indices]

We're all set. Let us now try and get the top recommendations for a few movies and see how good the recommendations are.

In [None]:
get_recommendations('The Godfather').head(10)

In [None]:
get_recommendations('The Dark Knight').head(10)

# Second Prediction

In [None]:
# Movie metadata that accompanies the count-vectorizer features; rebinds smd.
smd = pd.read_csv('./dataset/count_smd.csv')

In [None]:
# Testing phase
# Load the pickled object stored in count1.pkl (used below as the input to
# cosine_similarity). The context manager closes the file handle as soon as
# the object has been read.
# NOTE(review): this is an absolute, machine-specific path - move the file
# next to the notebook (e.g. under ./dataset/) so the notebook runs elsewhere.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("/home/convergytics/Desktop/count1.pkl", 'rb') as count_file:
    count1 = pickle.load(count_file)

In [None]:
# Rebinds the global cosine_sim that get_recommendations reads, so every
# later call ranks movies with this matrix instead of the earlier one.
cosine_sim = cosine_similarity(count1, count1)

In [None]:
# Renumber the rows 0..n-1 so that row positions line up with the rows of the
# similarity matrix. drop=True discards the old index instead of inserting it
# as a new column: the frame already has an 'index' column, so a plain
# reset_index() would add 'level_0' and raise ValueError when the cell is re-run.
smd = smd.reset_index(drop=True)
# Row position -> title, used to turn ranked positions back into titles.
titles = smd['title']
# Title -> row position. Titles are not guaranteed to be unique, so a lookup
# may return more than one position.
indices = pd.Series(smd.index, index=smd['title'])

We will reuse the get_recommendations function that we had written earlier. Since our cosine similarity scores have changed, we expect it to give us different (and probably better) results. Let us check for **The Dark Knight** again and see what recommendations I get this time around.

In [None]:
get_recommendations('The Dark Knight').head(10)

In [None]:
# smd['title'].to_csv('titles.txt',index=False)

In [None]:
# crimefile = open("titles.txt", "r")
# yourResult = [line.split('\n') for line in crimefile.readlines()]
# # list_of_lists = []
# # for line in a_file:
# #   stripped_line = line.strip()
# #   line_list = stripped_line.split()
# #   list_of_lists.append(line_list)

# # a_file.close()

# # print(list_of_lists)
# print(yourResult)

In [None]:
# import json
# with open("titles.txt", "r") as grilled_cheese:
#     lines = grilled_cheese.readlines()
#     quantities = []
#     ingredients = []
#     for l in lines:
#         as_list = l.split(", ")
#         ingredients.append(as_list[0].replace("\n",""))
#         # ingredients.append(as_list.replace("\n", ""))
# 			    #  quantities.append(as_list[0])
#                 # ingredients.append(as_list.replace("\n", ""))
#                            # 
#     print(as_list)
#     print(ingredients)
#     file = open('title_var.txt', 'w')
#     json.dump(ingredients, file)
#     file.close()    