In [87]:
import warnings

# Suppress warning messages
warnings.filterwarnings('ignore')

import seaborn as sns
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [88]:
processed_imdb = pd.read_csv(r"C:/Users/z003cu8m/Desktop/Data Sets/ADS508/week four/processed_imdb.csv")
processed_netflix = pd.read_csv(r"C:/Users/z003cu8m/Desktop/Data Sets/ADS508/week four/processed_netflix.csv")
match_list = pd.read_csv(r"C:/Users/z003cu8m/Desktop/Data Sets/ADS508/week four/match_list.csv")

In [89]:
processed_netflix.head()

Unnamed: 0,show_id,type,Netflix Title,director,cast,country,date_added,Release Year,rating,duration,...,Semantic 759,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,...,-0.538659,-0.052227,0.035186,0.29815,0.059797,0.250116,-0.421163,-0.263913,0.135168,-0.042835
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,...,-0.452894,-0.675485,-0.166111,-0.106259,-0.3691,0.534542,-0.391329,-0.505261,0.065416,0.252679
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,...,0.315896,0.162501,-0.071881,0.016402,0.187876,0.083418,-0.242689,-0.252881,0.035388,-0.425325
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,...,-0.554449,0.001212,-0.010456,0.130359,0.025739,-0.212267,-0.560338,-0.059075,0.367698,-0.012123
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,...,-0.309526,0.231067,-0.285924,0.005509,-0.116425,0.131404,0.59998,0.25297,0.534538,-0.150926


In [90]:
# using TfidfVectorizer function to transform the data into feature vectors which makes it a good input estimator.

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(processed_imdb['genres'])

In [91]:
# using Cosine Similarity

from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [92]:
# creating a single dimensional array with movie titles

titles = processed_imdb['IMDB Title']
indices = pd.Series(processed_imdb.index, index=processed_imdb['IMDB Title'])

# creating a function to get movie recommendations based on the cosine similarity score of movie genres

def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [93]:
# testing the recommendation

genre_recommendations('Iron Man').head(10)

70          Ingeborg Holm
86             Hypocrites
88     Judith of Bethulia
103                Carmen
108      A Fool There Was
112           The Italian
117           After Death
125          Civilization
127    The Devil's Needle
139             King Lear
Name: IMDB Title, dtype: object

**2nd Recommendation Engine**

In [94]:
# creating a new column named ‘combined_info’ by joining data from type, Netflix Title, director, cast, type and country columns.

cols = ['Netflix Title','type', 'director', 'cast', 'country']
overall_infos = processed_netflix[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
df = pd.DataFrame(overall_infos)
df.columns =['combined_info']

In [95]:
processed_netflix.head()

Unnamed: 0,show_id,type,Netflix Title,director,cast,country,date_added,Release Year,rating,duration,...,Semantic 759,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,...,-0.538659,-0.052227,0.035186,0.29815,0.059797,0.250116,-0.421163,-0.263913,0.135168,-0.042835
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,...,-0.452894,-0.675485,-0.166111,-0.106259,-0.3691,0.534542,-0.391329,-0.505261,0.065416,0.252679
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,...,0.315896,0.162501,-0.071881,0.016402,0.187876,0.083418,-0.242689,-0.252881,0.035388,-0.425325
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,...,-0.554449,0.001212,-0.010456,0.130359,0.025739,-0.212267,-0.560338,-0.059075,0.367698,-0.012123
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,...,-0.309526,0.231067,-0.285924,0.005509,-0.116425,0.131404,0.59998,0.25297,0.534538,-0.150926


In [96]:
processed_netflix_new = processed_netflix.append(df, ignore_index = True)

In [97]:
processed_netflix_new= processed_netflix_new.dropna(subset=['cast','director'], axis = 0)
processed_netflix_new['combined_info'] = processed_netflix_new['combined_info'].fillna("Unknown")
processed_netflix_new = processed_netflix_new.reset_index( drop=True)


processed_netflix_new.head()

Unnamed: 0,show_id,type,Netflix Title,director,cast,country,date_added,Release Year,rating,duration,...,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768,combined_info
0,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008.0,TV-MA,143.0,...,-0.675485,-0.166111,-0.106259,-0.3691,0.534542,-0.391329,-0.505261,0.065416,0.252679,Unknown
1,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016.0,TV-PG,124.0,...,0.162501,-0.071881,0.016402,0.187876,0.083418,-0.242689,-0.252881,0.035388,-0.425325,Unknown
2,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016.0,R,90.0,...,0.001212,-0.010456,0.130359,0.025739,-0.212267,-0.560338,-0.059075,0.367698,-0.012123,Unknown
3,s1002,Movie,Blue Ruin,Jeremy Saulnier,"Macon Blair, Devin Ratray, Amy Hargreaves, Kev...","United States, France",25-Feb-19,2013.0,R,90.0,...,-0.037822,0.148676,-0.381409,0.249032,0.096513,-0.298137,0.006479,0.117457,-0.186952,Unknown
4,s1003,Movie,Blue Streak,Les Mayfield,"Martin Lawrence, Luke Wilson, Peter Greene, Da...","Germany, United States",1-Jan-21,1999.0,PG-13,94.0,...,-0.315207,-0.170112,-0.228944,0.210249,0.250542,-0.250159,-0.010726,-0.263616,-0.034928,Unknown


In [98]:
# applying cosine similarity

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer()
converted_matrix = cv.fit_transform(processed_netflix_new['combined_info'])
movie_id = np.array([[converted_matrix]])
movie_id = list(enumerate(converted_matrix))

cosine_similarity = cosine_similarity(converted_matrix)

In [99]:
# this function will get the id of the movie and check the similarity between it and other movies

title = 'Blue Streak'

movie_id = processed_netflix[processed_netflix['Netflix Title'] == title]['show_id'].values[0]
#score = list(enumerate(cosine_similarity[movie_id]))
#sorted_score = sorted(score, key=lambda x:x[1], reverse= True)

#sorted_score = sorted_score[1:]
#sorted_score[0:10]
#i = 0
#for item in sorted_score:
 #   movie_title = processed_netflix[processed_netflix['show_id'] == item[0]]['Netflix Title'].values[0]
  #  print(i+1,movie_title)
   # i = i+1
    #if i > 4:
     #   break
print(movie_id)

s1003


**3rd Recommendation Engine**

In [100]:
processed_imdb.head()

Unnamed: 0,IMDB ID,titleType,IMDB Title,originalTitle,isAdult,Release Year,runtimeMinutes,genres,averageRating,numVotes,...,Semantic 759,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768
0,tt0000001,short,Carmencita,Carmencita,0,1894,1,"Documentary,Short",5.6,1694,...,-1.042062,0.214003,-0.083587,-0.006674,-0.194486,-0.150251,-0.094577,-0.605529,0.283749,-0.15579
1,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance",6.5,1441,...,-0.066071,-0.28478,0.224536,-0.515511,-0.381292,0.395341,-0.145168,-0.427048,0.292864,0.346208
2,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,"Comedy,Short",6.1,2240,...,-0.749765,-0.348932,-0.229298,0.192425,-0.00643,0.314851,-0.285497,0.380123,-0.002542,-0.434679
3,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,1,"Documentary,Short",5.4,1870,...,-0.382631,-0.429746,-0.322701,-0.019224,0.153636,0.045824,0.086952,-0.154917,0.119396,0.572872
4,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,1,"Documentary,Short",6.9,6271,...,-0.18542,-0.400863,-0.277769,0.097244,-0.335937,0.090837,-0.210895,-0.066578,-0.059184,-0.056082


In [101]:
# creating a new dataframe where each column would represent the unique IMBD ID and each row represents the unique IMBD Title.

final_dataset = processed_imdb.pivot_table(index='IMDB Title',columns='IMDB ID',values='averageRating').fillna(0)
final_dataset.head()

IMDB ID,tt0000001,tt0000003,tt0000005,tt0000008,tt0000010,tt0000012,tt0000014,tt0000023,tt0000029,tt0000070,...,tt0032149,tt0032152,tt0032153,tt0032155,tt0032156,tt0032157,tt0032158,tt0032181,tt0032186,tt0032194
IMDB Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'49-'17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'G' Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"20,000 Leagues Under the Sea",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"20,000 Years in Sing Sing",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# removing any sparsity

sample = np.array([[0,0,3,0,0],[4,0,0,0,2],[0,0,0,0,1]])
sparsity = 1.0 - ( np.count_nonzero(sample) / float(sample.size) )
print(sparsity)

0.7333333333333334


In [103]:
from scipy.sparse import csr_matrix

csr_sample = csr_matrix(sample)
print(csr_sample)

  (0, 2)	3
  (1, 0)	4
  (1, 4)	2
  (2, 4)	1


In [104]:
csr_data = csr_matrix(final_dataset.values)
final_dataset.reset_index(inplace=True)

In [105]:
# creating the recommendation system using knn

knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [106]:
# creating the recommendation function

def get_movie_recommendation(movie_name):
    n_movies_to_reccomend = 10
    movie_list = processed_imdb[processed_imdb['IMDB Title'].str.contains(movie_name)]  
    if len(movie_list):        
        movie_idx= movie_list.iloc[0]['IMDB Title']
        movie_idx = final_dataset[final_dataset['IMDB Title'] == movie_idx].index[0]
        distances , indices = knn.kneighbors(csr_data[movie_idx],n_neighbors=n_movies_to_reccomend+1)    
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
        recommend_frame = []
        for val in rec_movie_indices:
            movie_idx = final_dataset.iloc[val[0]]['IMDB Title']
            idx = processed_imdb[processed_imdb['IMDB Title'] == movie_idx].index
            recommend_frame.append({'Title':processed_imdb.iloc[idx]['IMDB Title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
        return df
    else:
        return "No movies found. Please check your input"

In [107]:
get_movie_recommendation('Iron Man')

Unnamed: 0,Title,Distance
1,The Conquerors,1.0
2,The Cossacks,1.0
3,The Count,1.0
4,The Cocoanuts,1.0
5,The Childhood of Maxim Gorky,1.0
6,The Clairvoyant,1.0
7,The City Without Jews,1.0
8,The Citadel,1.0
9,The Circus,1.0
10,The Conjuring of a Woman at the House of Rober...,1.0
