In [1]:
import numpy as np 
import pandas as pd
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
#importing movie metadata and keep necessary columns
meta = pd.read_csv("movies_metadata.csv")
meta = meta[['id', 'original_title', 'original_language',
             'revenue', 'vote_average', 'vote_count', 'popularity', 'genres']]
meta = meta.rename(columns={'id':'movieId'})
meta = meta[meta['original_language']== 'en']
meta.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,"[{'id': 35, 'name': 'Comedy'}]"


In [3]:
meta.genres = [list(map(int, re.findall('\d+', x))) for x in meta.genres]
meta.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[16, 35, 10751]"
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[12, 14, 10751]"
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[10749, 35]"
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[35, 18, 10749]"
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,[35]


In [4]:
max_length = len(max(meta.genres, key = len))
print('Max # of Genres: ', max_length)

def padarray(A, size):
    t = size - len(A)
    return np.pad(A, pad_width=(0, t), mode='constant')

meta.genres = [padarray(x, max_length) for x in meta.genres]
meta.head()

Max # of Genres:  8


Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[16, 35, 10751, 0, 0, 0, 0, 0]"
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[12, 14, 10751, 0, 0, 0, 0, 0]"
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[10749, 35, 0, 0, 0, 0, 0, 0]"
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[35, 18, 10749, 0, 0, 0, 0, 0]"
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,"[35, 0, 0, 0, 0, 0, 0, 0]"


In [5]:
ref = pd.read_csv("movies_metadata.csv")
ref = ref[['id', 'original_title', 'original_language',
           'revenue', 'vote_average', 'vote_count', 'popularity', 'genres']]
ref = ref.rename(columns={'id':'movieId'})
ref = ref[ref['original_language']== 'en']
ref.genres = [list(map(int, re.findall('\d+', x))) for x in ref.genres]
ref.genres = [padarray(x, max_length) for x in ref.genres]
ref.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[16, 35, 10751, 0, 0, 0, 0, 0]"
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[12, 14, 10751, 0, 0, 0, 0, 0]"
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[10749, 35, 0, 0, 0, 0, 0, 0]"
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[35, 18, 10749, 0, 0, 0, 0, 0]"
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,"[35, 0, 0, 0, 0, 0, 0, 0]"


In [6]:
for n in range(0, max_length):
    meta['genre'+str(n+1)] = meta.genres.apply(lambda x: int(x[n]))

meta.drop('genres', axis=1, inplace=True)
meta.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre8
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,16,35,10751,0,0,0,0,0
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,12,14,10751,0,0,0,0,0
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,10749,35,0,0,0,0,0,0
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,35,18,10749,0,0,0,0,0
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,35,0,0,0,0,0,0,0


In [7]:
#importing movie ratings and keep necessary columns
ratings = pd.read_csv("ratings.csv")
ratings = ratings[['userId', 'movieId', 'rating']]

# taking a 2.5MM sample because it can take too long to pivot data later on
ratings = ratings.head(2500000)

#convert data types before merging
meta.movieId = pd.to_numeric(meta.movieId, errors = 'coerce')
ratings.movieId = pd.to_numeric(ratings.movieId, errors = 'coerce')

#merge the 2 datasets, so that we can have the labels for the movie titles
data= pd.merge(ratings, meta, on = 'movieId', how = 'inner')
data.head()

Unnamed: 0,userId,movieId,rating,original_title,original_language,revenue,vote_average,vote_count,popularity,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre8
0,1,858,5.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
1,3,858,4.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
2,5,858,5.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
3,12,858,4.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
4,20,858,4.5,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0


In [8]:
#pivot the table so that rows = users and columns = movies and the content is the ratings
matrix= data.pivot_table(index='userId', columns='original_title', values='rating')
matrix.head(10)

original_title,!Women Art Revolution,$5 a Day,'Gator Bait,'R Xmas,'Twas the Night Before Christmas,(A)Sexual,...And the Pursuit of Happiness,10 Items or Less,10 Things I Hate About You,"10,000 BC",...,Æon Flux,Бабник,Грозовые ворота,Дневник его жены,Мой сводный брат Франкенштейн,"Цирк сгорел, и клоуны разбежались",به امید دیدار,مارمولک,რამინი,黑太陽731
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Check to see if the columns are not empty
print('Total ratings score of userId 1: ', matrix.iloc[0].sum())
print('Mean ratings score of userId 1: ', matrix.iloc[0].mean())
print('Ratings Count of userId 1: ', matrix.iloc[0].count())

Total ratings score of userId 1:  30.0
Mean ratings score of userId 1:  4.285714285714286
Ratings Count of userId 1:  7


In [10]:
# Pearson Correlation
def pearsonR(s1, s2):
    s1_c = s1-s1.mean()
    s2_c = s2-s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c**2) * np.sum(s2_c**2))

In [11]:
# Create watched list based on userId.
def has_watched(M, userid):
    watched = []
    t = M[M.index==userid].notnull()
    for c in t.columns:
        if t[c].values[0] == True:
            watched.append(c)
    return watched

In [12]:
# Return the score of a recently watched movie
def returnscore(movie, userid, data_ref):
    rs = data_ref.loc[(data_ref.userId == userid) & (data_ref.original_title == movie)].reset_index(drop=True)
    rs.drop(['original_title', 'original_language', 'revenue', 'target'], axis=1, inplace=True)
    rating = rs.iloc[0]['rating']
    if rating < 4:
        s = 0
    else:
        s = 1
    return s, rating

In [13]:
def getx(movie, userid, data_ref):
    newx = data_ref.loc[data_ref.original_title == movie].reset_index(drop=True)
    newx.drop(['userId', 'rating', 'original_title', 'original_language', 'revenue', 'target'], axis=1, inplace=True)
    newx = newx[:1]

    idx = 0
    new_col = [userid]  
    newx.insert(loc=idx, column='userId', value=new_col)
    return newx

In [14]:
def findcommong(movie1, movie2, ref):
    list1 = ref[ref.original_title == movie1].genres.values
    list1 = list1[0]
    list2 = ref[ref.original_title == movie2].genres.values
    list2 = list2[0]
    common = [i for i in list1 if i in list2 if i != 0]
    return common

In [15]:
def findallcommon(list1, list2, ref):
    all_common = []
    watched_genre = []
    rec_genre = []
    mov_list1 = list1.tolist()
    mov_list2 = list2.tolist()
    genre_matched=[]
    
    for n, title in enumerate(mov_list1):
        
        m1 = ref[ref.original_title == mov_list1[n]].genres.values
        m1 = m1[0]
        m2 = ref[ref.original_title == mov_list2[n]].genres.values
        m2 = m2[0]
        
        watched_genre.append(m1)
        rec_genre.append(m2)
        all_common.append(findcommong(mov_list1[n], mov_list2[n], ref))
    ln = np.concatenate(all_common).ravel().tolist()
    df = pd.DataFrame()
    df['watched_title'] = list1
    df['watched_genre'] = watched_genre
    df['recommended_title'] = list2
    df['recommended_genre'] = rec_genre
    df['in_common_genre'] = all_common
    
    for c in df.in_common_genre:
        if c == []:
            genre_matched.append(0)
        else:
            genre_matched.append(1)
    df['matched_genre'] = genre_matched
    return df, len(ln)

In [16]:
def countcommon(list1):
    ct = 0
    for l in list1:
        if l != []:
            ct += 1
    return ct 

In [17]:
def print_metrics(r, l):
    print('Accuracy Score: ', accuracy_score(r.target, r.pred))
    print('Average Similarity Score: ', r.similarity_score.mean())
    print('Average Probability Score: ', r.probability_of_pred.mean(), '\n')

    com_hlist, l_hcom = findallcommon(r.watched_title, r.recommended_title, ref)

    print('Out of a total of {} recommendations.'.format(len(r)))
    print("{} recommendations had at least one matching genre.".format(countcommon(com_hlist.in_common_genre)))
    print('{0:.0f}% of watched and recommended genres matched.'.format(countcommon(com_hlist.in_common_genre) / len(r) * 100), '\n')
    print(com_hlist[:l])

In [18]:
data_ref = data
data_ref['target'] = np.where(data_ref.rating < 4, 0, 1)
data_ref['popularity'] = data_ref.popularity.astype(float)

gbc = joblib.load('gbc60000.pkl') 

In [19]:
# The parameters here are: matrix name, number of similar users, and userID.
def recommend(movie, M, n_user, n_rec, userid, data_ref, model, base=20, matching_genre='default'):
    
    
    # A function to make N recommendations based on Pearson Correlation.
    users=[]
    for u in range(0, len(M)):
        if u == userid - 1:
            continue
        
        cor = pearsonR(M.iloc[userid - 1], M.iloc[u])
        if np.isnan(cor):
            continue
        else:
            users.append((u, cor))
    
    # Sort the table of users descending by similarity
    users.sort(key= lambda tup: tup[1], reverse=True)
    usr = pd.DataFrame(users[:n_user], columns=['User', 'Score'])
    
    # Create new matrix with just the similar user to our user
    M2 = M[M.index.isin(usr.User.values)]
    
    # A function called to create watched list based on userID & append recently watched movie
    watched = has_watched(M, userid)
    
    # A function to make N recommendations based on Pearson Correlation.
    reviews=[]
    score, rating = returnscore(movie, userid, data_ref)
    for title in M2.columns:
        if title in watched:
            continue
        
        cor = pearsonR(M2[movie], M2[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((userid,
                            movie,
                            rating,
                            score,
                            title,
                            cor))
    
    # Sort the table of movies descending by similarity
    rev = pd.DataFrame(reviews, columns=['userId',
                                         'watched_title',
                                         'rating',
                                         'target',
                                         'recommended_title',
                                         'similarity_score'])
    rev.sort_values(by='similarity_score', ascending=False, inplace=True)
    rev.reset_index(drop=True, inplace=True)
    rev = rev[:base]
    ypred=[]
    proba=[]
    for title in rev.recommended_title:
        X = getx(title, userid, data_ref)
        pred = model.predict(X)
        prob = model.predict_proba(X)
        ypred.append(int(pred))
        proba.append(float(prob[:,int(pred)]))
    rev['probability_of_pred'] = proba
    rev['pred'] = ypred
    rev = rev[rev.target == rev.pred].reset_index(drop=True)
    
    if matching_genre == 'all':
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
        rev = rev[rev.matched_genre == 1].reset_index(drop=True)
    elif matching_genre == 'none':
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
        rev = rev[rev.matched_genre == 0].reset_index(drop=True)
    else:
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
    
    rev = rev[:n_rec]
    rev
    
    return usr, rev

In [20]:
from datetime import datetime
start=datetime.now()
u, r = recommend('License to Wed', matrix, 750, 5, 1, data_ref, gbc)
print('Runtime: ', datetime.now()-start)
r

Runtime:  0:00:26.860589


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,License to Wed,4.0,1,The Ewok Adventure,0.322721,0.999783,1,0
1,1,License to Wed,4.0,1,Warlords of the 21st Century,0.264543,0.554028,1,0
2,1,License to Wed,4.0,1,Minority Report,0.247967,0.907353,1,0
3,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
4,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1


In [21]:
r

Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,License to Wed,4.0,1,The Ewok Adventure,0.322721,0.999783,1,0
1,1,License to Wed,4.0,1,Warlords of the 21st Century,0.264543,0.554028,1,0
2,1,License to Wed,4.0,1,Minority Report,0.247967,0.907353,1,0
3,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
4,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1


In [22]:
print_metrics(r, 10)

Accuracy Score:  1.0
Average Similarity Score:  0.2653219869680198
Average Probability Score:  0.8772485944369233 

Out of a total of 5 recommendations.
2 recommendations had at least one matching genre.
40% of watched and recommended genres matched. 

    watched_title              watched_genre             recommended_title  \
0  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]            The Ewok Adventure   
1  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]  Warlords of the 21st Century   
2  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]               Minority Report   
3  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]                Holy Matrimony   
4  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]                   42nd Street   

                      recommended_genre in_common_genre  matched_genre  
0  [12, 10751, 14, 878, 10770, 0, 0, 0]              []              0  
1            [878, 0, 0, 0, 0, 0, 0, 0]              []              0  
2       [28, 53, 878, 9648, 0, 0, 0, 0]              []    

In [23]:
from datetime import datetime
start=datetime.now()
u, r = recommend('License to Wed', matrix, 750, 5, 1, data_ref, gbc, matching_genre='all')
print('Runtime: ', datetime.now()-start)
r

Runtime:  0:00:27.036754


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
1,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1
2,1,License to Wed,4.0,1,Open Season,0.241851,0.976978,1,1
3,1,License to Wed,4.0,1,The Dukes of Hazzard,0.241851,0.64943,1,1
4,1,License to Wed,4.0,1,Blondie of the Follies,0.241293,0.994447,1,1


In [24]:
print_metrics(r, 10)

Accuracy Score:  1.0
Average Similarity Score:  0.24327462795806382
Average Probability Score:  0.9091869779918949 

Out of a total of 5 recommendations.
5 recommendations had at least one matching genre.
100% of watched and recommended genres matched. 

    watched_title              watched_genre       recommended_title  \
0  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]          Holy Matrimony   
1  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]             42nd Street   
2  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]             Open Season   
3  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]    The Dukes of Hazzard   
4  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]  Blondie of the Follies   

                   recommended_genre in_common_genre  matched_genre  
0          [35, 0, 0, 0, 0, 0, 0, 0]            [35]              1  
1  [10402, 35, 10749, 0, 0, 0, 0, 0]            [35]              1  
2          [35, 0, 0, 0, 0, 0, 0, 0]            [35]              1  
3        [28, 12, 35, 0, 0, 0, 0

In [25]:
from datetime import datetime
start=datetime.now()
u, r = recommend('License to Wed', matrix, 750, 5, 1, data_ref, gbc, matching_genre='none')
print('Runtime: ', datetime.now()-start)
r

Runtime:  0:00:27.004768


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,License to Wed,4.0,1,The Ewok Adventure,0.322721,0.999783,1,0
1,1,License to Wed,4.0,1,Warlords of the 21st Century,0.264543,0.554028,1,0
2,1,License to Wed,4.0,1,Minority Report,0.247967,0.907353,1,0
3,1,License to Wed,4.0,1,The Man Who Knew Too Much,0.241851,0.999996,1,0
4,1,License to Wed,4.0,1,Under Siege 2: Dark Territory,0.23193,0.841238,1,0


In [26]:
print_metrics(r, 10)

Accuracy Score:  1.0
Average Similarity Score:  0.26180265878685405
Average Probability Score:  0.8604795119329316 

Out of a total of 5 recommendations.
0 recommendations had at least one matching genre.
0% of watched and recommended genres matched. 

    watched_title              watched_genre              recommended_title  \
0  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]             The Ewok Adventure   
1  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]   Warlords of the 21st Century   
2  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]                Minority Report   
3  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]      The Man Who Knew Too Much   
4  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]  Under Siege 2: Dark Territory   

                      recommended_genre in_common_genre  matched_genre  
0  [12, 10751, 14, 878, 10770, 0, 0, 0]              []              0  
1            [878, 0, 0, 0, 0, 0, 0, 0]              []              0  
2       [28, 53, 878, 9648, 0, 0, 0, 0]              

## Improving Runtime Test #1

In [27]:
def sim_user(M, n_user, userid):
    
    # A function to make N recommendations based on Pearson Correlation.
    users=[]
    for u in range(0, len(M)):
        if u == userid - 1:
            continue
        
        cor = pearsonR(M.iloc[userid - 1], M.iloc[u])
        if np.isnan(cor):
            continue
        else:
            users.append((u, cor))
    
    # Sort the table of users descending by similarity
    users.sort(key= lambda tup: tup[1], reverse=True)
    usr = pd.DataFrame(users[:n_user], columns=['User', 'Score'])
    
    # A function called to create watched list based on userID & append recently watched movie
    watched = has_watched(M, userid)

    
    return usr, M[M.index.isin(usr.User.values)], watched

In [28]:
def en_recommend(movie, watched, M2, n_rec, userid, data_ref, model, base=20, matching_genre='default'):
    
    # A function to make N recommendations based on Pearson Correlation.
    reviews=[]
    score, rating = returnscore(movie, userid, data_ref)
    for title in M2.columns:
        if title in watched:
            continue
        
        cor = pearsonR(M2[movie], M2[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((userid,
                            movie,
                            rating,
                            score,
                            title,
                            cor))
    
    # Sort the table of movies descending by similarity
    rev = pd.DataFrame(reviews, columns=['userId',
                                         'watched_title',
                                         'rating',
                                         'target',
                                         'recommended_title',
                                         'similarity_score'])
    rev.sort_values(by='similarity_score', ascending=False, inplace=True)
    rev.reset_index(drop=True, inplace=True)
    rev = rev[:base]
    ypred=[]
    proba=[]
    for title in rev.recommended_title:
        X = getx(title, userid, data_ref)
        pred = model.predict(X)
        prob = model.predict_proba(X)
        ypred.append(int(pred))
        proba.append(float(prob[:,int(pred)]))
    rev['probability_of_pred'] = proba
    rev['pred'] = ypred
    rev = rev[rev.target == rev.pred].reset_index(drop=True)
    
    if matching_genre == 'all':
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
        rev = rev[rev.matched_genre == 1].reset_index(drop=True)
    elif matching_genre == 'none':
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
        rev = rev[rev.matched_genre == 0].reset_index(drop=True)
    else:
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
    
    rev = rev[:n_rec]
    rev
    
    return rev

In [29]:
from datetime import datetime
start=datetime.now()
u, matrix2, watched = sim_user(matrix, 750, 1)
print(datetime.now()-start)

0:00:21.359820


In [30]:
from datetime import datetime
start=datetime.now()
r = en_recommend('License to Wed', watched, matrix2, 5, 1, data_ref, gbc)
print('Runtime: ', datetime.now()-start)
r

Runtime:  0:00:05.795889


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,License to Wed,4.0,1,The Ewok Adventure,0.322721,0.999783,1,0
1,1,License to Wed,4.0,1,Warlords of the 21st Century,0.264543,0.554028,1,0
2,1,License to Wed,4.0,1,Minority Report,0.247967,0.907353,1,0
3,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
4,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1


In [31]:
print_metrics(r, 10)

Accuracy Score:  1.0
Average Similarity Score:  0.2653219869680198
Average Probability Score:  0.8772485944369233 

Out of a total of 5 recommendations.
2 recommendations had at least one matching genre.
40% of watched and recommended genres matched. 

    watched_title              watched_genre             recommended_title  \
0  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]            The Ewok Adventure   
1  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]  Warlords of the 21st Century   
2  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]               Minority Report   
3  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]                Holy Matrimony   
4  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]                   42nd Street   

                      recommended_genre in_common_genre  matched_genre  
0  [12, 10751, 14, 878, 10770, 0, 0, 0]              []              0  
1            [878, 0, 0, 0, 0, 0, 0, 0]              []              0  
2       [28, 53, 878, 9648, 0, 0, 0, 0]              []    

In [32]:
from datetime import datetime
start=datetime.now()
r = en_recommend('License to Wed', watched, matrix2, 5, 1, data_ref, gbc, matching_genre='all')
print('Runtime: ', datetime.now()-start)
r

Runtime:  0:00:05.463655


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
1,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1
2,1,License to Wed,4.0,1,Open Season,0.241851,0.976978,1,1
3,1,License to Wed,4.0,1,The Dukes of Hazzard,0.241851,0.64943,1,1
4,1,License to Wed,4.0,1,Blondie of the Follies,0.241293,0.994447,1,1


In [33]:
print_metrics(r, 10)

Accuracy Score:  1.0
Average Similarity Score:  0.24327462795806382
Average Probability Score:  0.9091869779918949 

Out of a total of 5 recommendations.
5 recommendations had at least one matching genre.
100% of watched and recommended genres matched. 

    watched_title              watched_genre       recommended_title  \
0  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]          Holy Matrimony   
1  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]             42nd Street   
2  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]             Open Season   
3  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]    The Dukes of Hazzard   
4  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]  Blondie of the Follies   

                   recommended_genre in_common_genre  matched_genre  
0          [35, 0, 0, 0, 0, 0, 0, 0]            [35]              1  
1  [10402, 35, 10749, 0, 0, 0, 0, 0]            [35]              1  
2          [35, 0, 0, 0, 0, 0, 0, 0]            [35]              1  
3        [28, 12, 35, 0, 0, 0, 0

In [34]:
from datetime import datetime
start=datetime.now()
r = en_recommend('License to Wed', watched, matrix2, 5, 1, data_ref, gbc, matching_genre='none')
print('Runtime: ', datetime.now()-start)
r

Runtime:  0:00:05.479075


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,License to Wed,4.0,1,The Ewok Adventure,0.322721,0.999783,1,0
1,1,License to Wed,4.0,1,Warlords of the 21st Century,0.264543,0.554028,1,0
2,1,License to Wed,4.0,1,Minority Report,0.247967,0.907353,1,0
3,1,License to Wed,4.0,1,The Man Who Knew Too Much,0.241851,0.999996,1,0
4,1,License to Wed,4.0,1,Under Siege 2: Dark Territory,0.23193,0.841238,1,0


In [35]:
print_metrics(r, 10)

Accuracy Score:  1.0
Average Similarity Score:  0.26180265878685405
Average Probability Score:  0.8604795119329316 

Out of a total of 5 recommendations.
0 recommendations had at least one matching genre.
0% of watched and recommended genres matched. 

    watched_title              watched_genre              recommended_title  \
0  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]             The Ewok Adventure   
1  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]   Warlords of the 21st Century   
2  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]                Minority Report   
3  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]      The Man Who Knew Too Much   
4  License to Wed  [35, 0, 0, 0, 0, 0, 0, 0]  Under Siege 2: Dark Territory   

                      recommended_genre in_common_genre  matched_genre  
0  [12, 10751, 14, 878, 10770, 0, 0, 0]              []              0  
1            [878, 0, 0, 0, 0, 0, 0, 0]              []              0  
2       [28, 53, 878, 9648, 0, 0, 0, 0]              

## Improve runtime Test #2

In [36]:
def get_sim_user(M, n_user, userid):
    
    # A function to make N recommendations based on Pearson Correlation.
    users=[]
    for u in range(0, len(M)):
        if u == userid - 1:
            continue
        
        cor = pearsonR(M.iloc[userid - 1], M.iloc[u])
        if np.isnan(cor):
            continue
        else:
            users.append((u, cor))
    
    # Sort the table of users descending by similarity
    users.sort(key= lambda tup: tup[1], reverse=True)
    usr = pd.DataFrame(users[:n_user], columns=['User', 'Score'])
    
    return usr, M[M.index.isin(usr.User.values)]

In [37]:
def get_recommend(movie, watched, M2, n_rec, userid, data_ref, model, base=50, matching_genre='default'):
    
    # A function to make N recommendations based on Pearson Correlation.
    reviews=[]
    score, rating = returnscore(movie, userid, data_ref)
    for title in M2.columns:
        if title in watched:
            continue
        
        cor = pearsonR(M2[movie], M2[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((userid,
                            movie,
                            rating,
                            score,
                            title,
                            cor))
    
    # Sort the table of movies descending by similarity
    rev = pd.DataFrame(reviews, columns=['userId',
                                         'watched_title',
                                         'rating',
                                         'target',
                                         'recommended_title',
                                         'similarity_score'])
    rev.sort_values(by='similarity_score', ascending=False, inplace=True)
    rev.reset_index(drop=True, inplace=True)
    
    rev = rev[:base]
    
    ypred=[]
    proba=[]
    for title in rev.recommended_title:
        X = getx(title, userid, data_ref)
        pred = model.predict(X)
        prob = model.predict_proba(X)
        ypred.append(int(pred))
        proba.append(float(prob[:,int(pred)]))
    rev['probability_of_pred'] = proba
    rev['pred'] = ypred
    rev = rev[rev.target == rev.pred].reset_index(drop=True)
    
    if matching_genre == 'all':
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
        rev = rev[rev.matched_genre == 1].reset_index(drop=True)
    elif matching_genre == 'none':
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
        rev = rev[rev.matched_genre == 0].reset_index(drop=True)
    elif matching_genre == 'default':
        genre_df, num_matched = findallcommon(rev.watched_title, rev.recommended_title, ref)
        rev['matched_genre'] = genre_df.matched_genre
    
    rev = rev[:n_rec]
    rev
    
    return rev

In [38]:
def return_liked_watch(M, userid, data_ref):
    watched=[]
    watch = has_watched(M, userid)
    for mov in watch:
        score, rating = returnscore(mov, userid, data_ref)
        if score == 1:
            watched.append(mov)
    return watched

In [39]:
def recommend_hist(matrix, userid, n_user, n_rec, data_ref, model, based=20, match_genre='default'):
    final_df = pd.DataFrame()
    watched = return_liked_watch(matrix, userid, data_ref)
    u, matrix2 = get_sim_user(matrix, n_user, userid)
    for mov in watched:
        r = get_recommend(mov, watched, matrix2, n_rec, userid, data_ref, model, base=based, matching_genre=match_genre)
        final_df = pd.concat([final_df, r]).reset_index(drop=True)
    return final_df

In [40]:
from datetime import datetime
start=datetime.now()
rec_df = recommend_hist(matrix, 1, 750, 5, data_ref, gbc, based=20, match_genre='all')
print('Runtime: ', datetime.now()-start)
rec_df

Runtime:  0:00:56.719189


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,Fools Rush In,4.0,1,Music Box,0.360087,0.68144,1,1
1,1,Fools Rush In,4.0,1,Shakespeare in Love,0.331499,0.934195,1,1
2,1,Fools Rush In,4.0,1,Hulk,0.328091,0.782082,1,1
3,1,Fools Rush In,4.0,1,Nell,0.307196,0.997465,1,1
4,1,Fools Rush In,4.0,1,M. Butterfly,0.279146,0.993716,1,1
5,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
6,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1
7,1,License to Wed,4.0,1,Open Season,0.241851,0.976978,1,1
8,1,License to Wed,4.0,1,The Dukes of Hazzard,0.241851,0.64943,1,1
9,1,License to Wed,4.0,1,Blondie of the Follies,0.241293,0.994447,1,1


In [41]:
user = 1
w = return_liked_watch(matrix, 1, data_ref)
print('User {} has watched and liked {} movies, they are: {}'.format(user, len(w), w), '\n')
print_metrics(rec_df, 10)

User 1 has watched and liked 6 movies, they are: ['Fools Rush In', 'License to Wed', 'Rocky Balboa', 'Shriek If You Know What I Did Last Friday the Thirteenth', 'Sleepless in Seattle', 'Young and Innocent'] 

Accuracy Score:  1.0
Average Similarity Score:  0.26518304718346364
Average Probability Score:  0.903129944113182 

Out of a total of 29 recommendations.
29 recommendations had at least one matching genre.
100% of watched and recommended genres matched. 

    watched_title                   watched_genre       recommended_title  \
0   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]               Music Box   
1   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]     Shakespeare in Love   
2   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]                    Hulk   
3   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]                    Nell   
4   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]            M. Butterfly   
5  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]          Holy Matrimony

In [42]:
from datetime import datetime
start=datetime.now()
rec_df = recommend_hist(matrix, 1, 5000, 5, data_ref, gbc, based=20, match_genre='all')
print('Runtime: ', datetime.now()-start)
rec_df

Runtime:  0:01:04.372898


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,Fools Rush In,4.0,1,One Night at McCool's,0.199628,0.956518,1,1
1,1,Fools Rush In,4.0,1,Notes on a Scandal,0.198608,0.999906,1,1
2,1,Fools Rush In,4.0,1,The Time Machine,0.166665,0.98727,1,1
3,1,Fools Rush In,4.0,1,Gods and Monsters,0.142228,0.998965,1,1
4,1,Fools Rush In,4.0,1,My Name Is Bruce,0.134637,0.749316,1,1
5,1,License to Wed,4.0,1,Beetlejuice,0.158864,0.992277,1,1
6,1,License to Wed,4.0,1,Don't Make Waves,0.116873,0.996745,1,1
7,1,License to Wed,4.0,1,Human Nature,0.104257,0.993945,1,1
8,1,Rocky Balboa,5.0,1,Say Anything...,0.170296,0.999424,1,1
9,1,Rocky Balboa,5.0,1,Prime,0.152465,0.976416,1,1


In [43]:
user = 1
w = return_liked_watch(matrix, 1, data_ref)
print('User {} has watched and liked {} movies, they are: {}'.format(user, len(w), w), '\n')
print_metrics(rec_df, 10)

User 1 has watched and liked 6 movies, they are: ['Fools Rush In', 'License to Wed', 'Rocky Balboa', 'Shriek If You Know What I Did Last Friday the Thirteenth', 'Sleepless in Seattle', 'Young and Innocent'] 

Accuracy Score:  1.0
Average Similarity Score:  0.15735299198851507
Average Probability Score:  0.9656699270266486 

Out of a total of 27 recommendations.
27 recommendations had at least one matching genre.
100% of watched and recommended genres matched. 

    watched_title                   watched_genre      recommended_title  \
0   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]  One Night at McCool's   
1   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]     Notes on a Scandal   
2   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]       The Time Machine   
3   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]      Gods and Monsters   
4   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]       My Name Is Bruce   
5  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]            Beetlejuice   
6 

In [44]:
from datetime import datetime
start=datetime.now()
rec_df = recommend_hist(matrix, 1, 750, 5, data_ref, gbc, based=50, match_genre='all')
print('Runtime: ', datetime.now()-start)
rec_df

Runtime:  0:01:20.505492


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,Fools Rush In,4.0,1,Music Box,0.360087,0.68144,1,1
1,1,Fools Rush In,4.0,1,Shakespeare in Love,0.331499,0.934195,1,1
2,1,Fools Rush In,4.0,1,Hulk,0.328091,0.782082,1,1
3,1,Fools Rush In,4.0,1,Nell,0.307196,0.997465,1,1
4,1,Fools Rush In,4.0,1,M. Butterfly,0.279146,0.993716,1,1
5,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
6,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1
7,1,License to Wed,4.0,1,Open Season,0.241851,0.976978,1,1
8,1,License to Wed,4.0,1,The Dukes of Hazzard,0.241851,0.64943,1,1
9,1,License to Wed,4.0,1,Blondie of the Follies,0.241293,0.994447,1,1


In [45]:
user = 1
w = return_liked_watch(matrix, 1, data_ref)
print('User {} has watched and liked {} movies, they are: {}'.format(user, len(w), w), '\n')
print_metrics(rec_df, 10)

User 1 has watched and liked 6 movies, they are: ['Fools Rush In', 'License to Wed', 'Rocky Balboa', 'Shriek If You Know What I Did Last Friday the Thirteenth', 'Sleepless in Seattle', 'Young and Innocent'] 

Accuracy Score:  1.0
Average Similarity Score:  0.26173152726538035
Average Probability Score:  0.9062504296185881 

Out of a total of 30 recommendations.
30 recommendations had at least one matching genre.
100% of watched and recommended genres matched. 

    watched_title                   watched_genre       recommended_title  \
0   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]               Music Box   
1   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]     Shakespeare in Love   
2   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]                    Hulk   
3   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]                    Nell   
4   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]            M. Butterfly   
5  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]          Holy Matrimon

## match genre Test #1

In [46]:
from datetime import datetime
start=datetime.now()
rec_df = recommend_hist(matrix, 1, 750, 5, data_ref, gbc, based=50, match_genre='default')
print('Runtime: ', datetime.now()-start)
rec_df

Runtime:  0:01:20.134483


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,Fools Rush In,4.0,1,Music Box,0.360087,0.68144,1,1
1,1,Fools Rush In,4.0,1,Shakespeare in Love,0.331499,0.934195,1,1
2,1,Fools Rush In,4.0,1,Hulk,0.328091,0.782082,1,1
3,1,Fools Rush In,4.0,1,Le Professionnel,0.311649,0.934083,1,0
4,1,Fools Rush In,4.0,1,Nell,0.307196,0.997465,1,1
5,1,License to Wed,4.0,1,The Ewok Adventure,0.322721,0.999783,1,0
6,1,License to Wed,4.0,1,Warlords of the 21st Century,0.264543,0.554028,1,0
7,1,License to Wed,4.0,1,Minority Report,0.247967,0.907353,1,0
8,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
9,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1


In [47]:
user = 1
w = return_liked_watch(matrix, 1, data_ref)
print('User {} has watched and liked {} movies, they are: {}'.format(user, len(w), w), '\n')
print_metrics(rec_df, 10)

User 1 has watched and liked 6 movies, they are: ['Fools Rush In', 'License to Wed', 'Rocky Balboa', 'Shriek If You Know What I Did Last Friday the Thirteenth', 'Sleepless in Seattle', 'Young and Innocent'] 

Accuracy Score:  1.0
Average Similarity Score:  0.2947643916586803
Average Probability Score:  0.9176621638474877 

Out of a total of 30 recommendations.
17 recommendations had at least one matching genre.
57% of watched and recommended genres matched. 

    watched_title                   watched_genre  \
0   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
1   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
2   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
3   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
4   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
5  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
6  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
7  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
8  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
9  Lic

In [48]:
from datetime import datetime
start=datetime.now()
rec_df = recommend_hist(matrix, 1, 750, 5, data_ref, gbc, based=25, match_genre='default')
print('Runtime: ', datetime.now()-start)
rec_df

Runtime:  0:01:02.369730


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred,matched_genre
0,1,Fools Rush In,4.0,1,Music Box,0.360087,0.68144,1,1
1,1,Fools Rush In,4.0,1,Shakespeare in Love,0.331499,0.934195,1,1
2,1,Fools Rush In,4.0,1,Hulk,0.328091,0.782082,1,1
3,1,Fools Rush In,4.0,1,Le Professionnel,0.311649,0.934083,1,0
4,1,Fools Rush In,4.0,1,Nell,0.307196,0.997465,1,1
5,1,License to Wed,4.0,1,The Ewok Adventure,0.322721,0.999783,1,0
6,1,License to Wed,4.0,1,Warlords of the 21st Century,0.264543,0.554028,1,0
7,1,License to Wed,4.0,1,Minority Report,0.247967,0.907353,1,0
8,1,License to Wed,4.0,1,Holy Matrimony,0.245785,0.991041,1,1
9,1,License to Wed,4.0,1,42nd Street,0.245593,0.934039,1,1


In [49]:
user = 1
w = return_liked_watch(matrix, 1, data_ref)
print('User {} has watched and liked {} movies, they are: {}'.format(user, len(w), w), '\n')
print_metrics(rec_df, 10)

User 1 has watched and liked 6 movies, they are: ['Fools Rush In', 'License to Wed', 'Rocky Balboa', 'Shriek If You Know What I Did Last Friday the Thirteenth', 'Sleepless in Seattle', 'Young and Innocent'] 

Accuracy Score:  1.0
Average Similarity Score:  0.2947643916586803
Average Probability Score:  0.9176621638474877 

Out of a total of 30 recommendations.
17 recommendations had at least one matching genre.
57% of watched and recommended genres matched. 

    watched_title                   watched_genre  \
0   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
1   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
2   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
3   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
4   Fools Rush In  [18, 35, 10749, 0, 0, 0, 0, 0]   
5  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
6  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
7  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
8  License to Wed       [35, 0, 0, 0, 0, 0, 0, 0]   
9  Lic