In [244]:
import warnings

# Suppress warning messages
warnings.filterwarnings('ignore')

import seaborn as sns
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [245]:
processed_imdb = pd.read_csv(r"C:/Users/z003cu8m/Desktop/Data Sets/ADS508/week four/processed_imdb.csv")
processed_netflix = pd.read_csv(r"C:/Users/z003cu8m/Desktop/Data Sets/ADS508/week four/processed_netflix.csv")
match_list = pd.read_csv(r"C:/Users/z003cu8m/Desktop/Data Sets/ADS508/week four/match_list.csv")

In [246]:
processed_netflix.head()

Unnamed: 0,show_id,type,Netflix Title,director,cast,country,date_added,Release Year,rating,duration,...,Semantic 759,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,...,-0.539,-0.052,0.035,0.298,0.06,0.25,-0.421,-0.264,0.135,-0.043
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,...,-0.453,-0.675,-0.166,-0.106,-0.369,0.535,-0.391,-0.505,0.065,0.253
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,...,0.316,0.163,-0.072,0.016,0.188,0.083,-0.243,-0.253,0.035,-0.425
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,...,-0.554,0.001,-0.01,0.13,0.026,-0.212,-0.56,-0.059,0.368,-0.012
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,...,-0.31,0.231,-0.286,0.006,-0.116,0.131,0.6,0.253,0.535,-0.151


In [247]:
# using TfidfVectorizer function to transform the data into feature vectors which makes it a good input estimator.

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(processed_imdb['genres'])

In [248]:
# using Cosine Similarity

from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [249]:
# creating a single dimensional array with movie titles

titles = processed_imdb['IMDB Title']
indices = pd.Series(processed_imdb.index, index=processed_imdb['IMDB Title'])

# creating a function to get movie recommendations based on the cosine similarity score of movie genres

def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [250]:
# testing the recommendation

genre_recommendations('By the Sea').head(10)

6      The Waterer Watered
19      Let Me Dream Again
46        Those Awful Hats
53      Her Crowning Glory
84      His Musical Career
85      His New Profession
89            The Knockout
90            Laughing Gas
91      Mabel at the Wheel
92    Mabel's Married Life
Name: IMDB Title, dtype: object

**2nd Recommendation Engine**

In [251]:
# creating a new column named ‘combined_info’ by joining data from type, Netflix Title, director, cast, type and country columns.

cols = ['Netflix Title','type', 'director', 'cast', 'country']
overall_infos = processed_netflix[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
df = pd.DataFrame(overall_infos)
df.columns =['combined_info']

In [270]:
processed_netflix.head()

Unnamed: 0,show_id,type,Netflix Title,director,cast,country,date_added,Release Year,rating,duration,...,Semantic 759,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,...,-0.539,-0.052,0.035,0.298,0.06,0.25,-0.421,-0.264,0.135,-0.043
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,...,-0.453,-0.675,-0.166,-0.106,-0.369,0.535,-0.391,-0.505,0.065,0.253
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,...,0.316,0.163,-0.072,0.016,0.188,0.083,-0.243,-0.253,0.035,-0.425
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,...,-0.554,0.001,-0.01,0.13,0.026,-0.212,-0.56,-0.059,0.368,-0.012
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,...,-0.31,0.231,-0.286,0.006,-0.116,0.131,0.6,0.253,0.535,-0.151


In [253]:
processed_netflix_new = processed_netflix.append(df, ignore_index = True)

In [255]:
processed_netflix_new= processed_netflix_new.dropna(subset=['cast','director'], axis = 0)
processed_netflix_new['combined_info'] = processed_netflix_new['combined_info'].fillna("Unknown")
processed_netflix_new = processed_netflix_new.reset_index( drop=True)


processed_netflix_new.head()

Unnamed: 0,show_id,type,Netflix Title,director,cast,country,date_added,Release Year,rating,duration,...,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768,combined_info
0,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008.0,TV-MA,143.0,...,-0.675,-0.166,-0.106,-0.369,0.535,-0.391,-0.505,0.065,0.253,Unknown
1,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016.0,TV-PG,124.0,...,0.163,-0.072,0.016,0.188,0.083,-0.243,-0.253,0.035,-0.425,Unknown
2,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016.0,R,90.0,...,0.001,-0.01,0.13,0.026,-0.212,-0.56,-0.059,0.368,-0.012,Unknown
3,s1002,Movie,Blue Ruin,Jeremy Saulnier,"Macon Blair, Devin Ratray, Amy Hargreaves, Kev...","United States, France",25-Feb-19,2013.0,R,90.0,...,-0.038,0.149,-0.381,0.249,0.097,-0.298,0.006,0.117,-0.187,Unknown
4,s1003,Movie,Blue Streak,Les Mayfield,"Martin Lawrence, Luke Wilson, Peter Greene, Da...","Germany, United States",1-Jan-21,1999.0,PG-13,94.0,...,-0.315,-0.17,-0.229,0.21,0.251,-0.25,-0.011,-0.264,-0.035,Unknown


In [269]:
# applying cosine similarity

cv = CountVectorizer()
converted_matrix = cv.fit_transform(processed_netflix_new['combined_info'])
cosine_similarity = cosine_similarity(converted_matrix)

TypeError: 'numpy.ndarray' object is not callable

In [258]:
# this function will get the id of the movie and check the similarity between it and other movies

title = 'Iron Man'

movie_id = processed_netflix[processed_netflix['Netflix Title'] == title]['show_id'].values[0]
score = list(enumerate(cosine_similarity[movie_id]))
sorted_score = sorted(score, key=lambda x:x[1], reverse= True)

sorted_score = sorted_score[1:]
sorted_score[0:10]
i = 0
for item in sorted_score:
    movie_title = processed_netflix[processed_netflix['show_id'] == item[0]]['Netflix Title'].values[0]
    print(i+1,movie_title)
    i = i+1
    if i > 4:
        break

**3rd Recommendation Engine**

In [139]:
final_dataset = popular_movies.pivot_table(index='IMDB Title',columns='IMDB ID',values='averageRating').fillna(0)
final_dataset.head()

IMDB ID,tt0000001,tt0000003,tt0000005,tt0000008,tt0000010,tt0000012,tt0000014,tt0000023,tt0000029,tt0000070,...,tt0032149,tt0032152,tt0032153,tt0032155,tt0032156,tt0032157,tt0032158,tt0032181,tt0032186,tt0032194
IMDB Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'49-'17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'G' Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"20,000 Leagues Under the Sea",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"20,000 Years in Sing Sing",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
sample = np.array([[0,0,3,0,0],[4,0,0,0,2],[0,0,0,0,1]])
sparsity = 1.0 - ( np.count_nonzero(sample) / float(sample.size) )
print(sparsity)

0.7333333333333334


In [141]:
csr_sample = csr_matrix(sample)
print(csr_sample)

  (0, 2)	3
  (1, 0)	4
  (1, 4)	2
  (2, 4)	1


In [142]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [143]:
def get_movie_recommendation(movie_name):
    n_movies_to_reccomend = 10
    movie_list = processed_imdb[processed_imdb['IMDB Title'].str.contains(movie_name)]  
    if len(movie_list):        
        movie_idx= movie_list.iloc[0]['IMDB Title']
        movie_idx = final_dataset[final_dataset['IMDB Title'] == movie_idx].index[0]
        distances , indices = knn.kneighbors(csr_data[movie_idx],n_neighbors=n_movies_to_reccomend+1)    
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
        recommend_frame = []
        for val in rec_movie_indices:
            movie_idx = final_dataset.iloc[val[0]]['IMDB Title']
            idx = processed_imdb[processed_imdb['IMDB Title'] == movie_idx].index
            recommend_frame.append({'Title':processed_imdb.iloc[idx]['IMDB Title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
        return df
    else:
        return "No movies found. Please check your input"

In [105]:
get_movie_recommendation('Iron Man')

Unnamed: 0,Title,Distance
1,The Conquest of the Pole,1.0
2,The Count,1.0
3,The Count of Monte Cristo,1.0
4,The Conjuring of a Woman at the House of Rober...,1.0
5,The Cigarette Girl of Mosselprom,1.0
6,The Cocoanuts,1.0
7,The Clairvoyant,1.0
8,The City Without Jews,1.0
9,The Citadel,1.0
10,The Conquerors,1.0
