In [1]:
import re
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; fall back to the standalone joblib package on newer installations.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

warnings.filterwarnings("ignore")

In [2]:
# Load the movie metadata, keep only the columns used downstream, expose the
# `id` column under the name `movieId`, and keep English-language rows only.
meta = (
    pd.read_csv("movies_metadata.csv")[
        ['id', 'original_title', 'original_language',
         'revenue', 'vote_average', 'vote_count', 'popularity', 'genres']
    ]
    .rename(columns={'id': 'movieId'})
    .loc[lambda frame: frame['original_language'] == 'en']
)
meta.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,"[{'id': 35, 'name': 'Comedy'}]"


In [3]:
# Replace each raw genre string with the list of integers it contains
# (every run of digits in the string is converted with int()).
# The pattern is a raw string: '\d' inside a normal literal is an invalid
# escape sequence, which newer Python versions warn about.
meta['genres'] = [list(map(int, re.findall(r'\d+', x))) for x in meta.genres]
meta.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[16, 35, 10751]"
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[12, 14, 10751]"
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[10749, 35]"
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[35, 18, 10749]"
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,[35]


In [4]:
# Length of the longest genre list; used below as the fixed padded width.
max_length = max(len(genre_ids) for genre_ids in meta.genres)
print('Max # of Genres: ', max_length)

def padarray(A, size):
    """Right-pad the 1-D sequence ``A`` with zeros until it has ``size`` items.

    Parameters
    ----------
    A : sequence or array
        Values to pad; its length is obtained with ``len``.
    size : int
        Target length. The pad width passed to ``np.pad`` is ``size - len(A)``.

    Returns
    -------
    numpy.ndarray
        The result of ``np.pad`` in ``'constant'`` mode.
    """
    missing = size - len(A)
    return np.pad(A, (0, missing), mode='constant')

# Give every movie a genre vector of identical length (zero-padded on the right).
meta['genres'] = [padarray(genre_ids, max_length) for genre_ids in meta['genres']]
meta.head()

Max # of Genres:  8


Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[16, 35, 10751, 0, 0, 0, 0, 0]"
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[12, 14, 10751, 0, 0, 0, 0, 0]"
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[10749, 35, 0, 0, 0, 0, 0, 0]"
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[35, 18, 10749, 0, 0, 0, 0, 0]"
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,"[35, 0, 0, 0, 0, 0, 0, 0]"


In [5]:
# Build `ref`: the same filtered metadata as `meta`, read again from disk so it
# keeps the padded `genres` vector column (the helpers below look genres up in
# `ref` by title).
ref = pd.read_csv("movies_metadata.csv")
ref = ref[['id', 'original_title', 'original_language',
           'revenue', 'vote_average', 'vote_count', 'popularity', 'genres']]
ref = ref.rename(columns={'id': 'movieId'})
ref = ref[ref['original_language'] == 'en']
# Parse and pad in one pass. The regex is a raw string because '\d' in a normal
# literal is an invalid escape sequence.
ref['genres'] = [padarray(list(map(int, re.findall(r'\d+', x))), max_length)
                 for x in ref.genres]
ref.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genres
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,"[16, 35, 10751, 0, 0, 0, 0, 0]"
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,"[12, 14, 10751, 0, 0, 0, 0, 0]"
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,"[10749, 35, 0, 0, 0, 0, 0, 0]"
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,"[35, 18, 10749, 0, 0, 0, 0, 0]"
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,"[35, 0, 0, 0, 0, 0, 0, 0]"


In [6]:
# Spread the padded genre vector over one integer column per position
# (genre1 .. genre<max_length>), then discard the vector column.
for position in range(max_length):
    column = 'genre' + str(position + 1)
    meta[column] = [int(vector[position]) for vector in meta['genres']]

meta = meta.drop('genres', axis=1)
meta.head()

Unnamed: 0,movieId,original_title,original_language,revenue,vote_average,vote_count,popularity,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre8
0,862,Toy Story,en,373554033.0,7.7,5415.0,21.9469,16,35,10751,0,0,0,0,0
1,8844,Jumanji,en,262797249.0,6.9,2413.0,17.0155,12,14,10751,0,0,0,0,0
2,15602,Grumpier Old Men,en,0.0,6.5,92.0,11.7129,10749,35,0,0,0,0,0,0
3,31357,Waiting to Exhale,en,81452156.0,6.1,34.0,3.85949,35,18,10749,0,0,0,0,0
4,11862,Father of the Bride Part II,en,76578911.0,5.7,173.0,8.38752,35,0,0,0,0,0,0,0


In [7]:
#importing movie ratings and keep necessary columns
ratings = pd.read_csv("ratings.csv")
ratings = ratings[['userId', 'movieId', 'rating']]

# taking a 2.5MM sample because it can take too long to pivot data later on
ratings = ratings.head(2500000)

#convert data types before merging
meta.movieId = pd.to_numeric(meta.movieId, errors = 'coerce')
ratings.movieId = pd.to_numeric(ratings.movieId, errors = 'coerce')

#merge the 2 datasets, so that we can have the labels for the movie titles
data= pd.merge(ratings, meta, on = 'movieId', how = 'inner')
data.head()

Unnamed: 0,userId,movieId,rating,original_title,original_language,revenue,vote_average,vote_count,popularity,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre8
0,1,858,5.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
1,3,858,4.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
2,5,858,5.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
3,12,858,4.0,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0
4,20,858,4.5,Sleepless in Seattle,en,227799884.0,6.5,630.0,10.2349,35,18,10749,0,0,0,0,0


In [8]:
#pivot the table so that rows = users and columns = movies and the content is the ratings
matrix= data.pivot_table(index='userId', columns='original_title', values='rating').fillna(0)
matrix.head(10)

original_title,!Women Art Revolution,$5 a Day,'Gator Bait,'R Xmas,'Twas the Night Before Christmas,(A)Sexual,...And the Pursuit of Happiness,10 Items or Less,10 Things I Hate About You,"10,000 BC",...,Æon Flux,Бабник,Грозовые ворота,Дневник его жены,Мой сводный брат Франкенштейн,"Цирк сгорел, и клоуны разбежались",به امید دیدار,مارمولک,რამინი,黑太陽731
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Check to see if the columns are not empty
print('Total ratings score of userId 1: ', matrix.iloc[0].sum())
print('Mean ratings score of userId 1: ', matrix.iloc[0].mean())
print('Ratings Count of userId 1: ', matrix.iloc[0].count())

Total ratings score of userId 1:  30.0
Mean ratings score of userId 1:  0.007354743809757294
Ratings Count of userId 1:  4079


In [10]:
# Create reference table
data_ref = data
data_ref['target'] = np.where(data_ref.rating < 4, 0, 1)
data_ref['popularity'] = data_ref.popularity.astype(float)

# Load Model
gbc = joblib.load('gbc60000.pkl') 

## Helper Functions

In [11]:
# Pearson Correlation
def pearsonR(s1, s2):
    """Pearson correlation coefficient between two equally long numeric vectors.

    Both inputs must support ``.mean()`` and element-wise arithmetic
    (e.g. pandas Series or numpy arrays). Each vector is centred on its own
    mean; the result is the sum of the products of the centred values divided
    by the square root of the product of their sums of squares.
    """
    dev1 = s1 - s1.mean()
    dev2 = s2 - s2.mean()
    cross = np.sum(dev1 * dev2)
    scale = np.sqrt(np.sum(dev1 ** 2) * np.sum(dev2 ** 2))
    return cross / scale

In [12]:
# Create watched list based on userId.
def has_watched(M, userid):
    """Return the column labels of ``M`` that hold a non-zero value for ``userid``.

    Parameters
    ----------
    M : pandas.DataFrame
        Matrix indexed by user id, one column per title.
    userid : hashable
        Index label of the user to inspect.

    Returns
    -------
    list
        Column labels, in column order, whose entry in the user's row is not 0.
        An empty list is returned when ``userid`` is not in the index.
    """
    user_rows = M[M.index == userid]
    if user_rows.empty:
        return []
    # One vectorised comparison over the row instead of a Python loop that
    # indexes the DataFrame once per column.
    row = user_rows.iloc[0]
    return row[row != 0].index.tolist()

In [13]:
# Return the score of a recently watched movie
def returnscore(movie, userid, data_ref):
    """Return ``(score, rating)`` for the rating ``userid`` gave to ``movie``.

    Parameters
    ----------
    movie : str
        Value to match against ``data_ref.original_title``.
    userid : hashable
        Value to match against ``data_ref.userId``.
    data_ref : pandas.DataFrame
        Needs the columns ``userId``, ``original_title`` and ``rating``.

    Returns
    -------
    tuple
        ``score`` is 0 when the rating is below 4 and 1 otherwise; ``rating`` is
        the rating of the first matching row.

    Raises
    ------
    IndexError
        If no row matches the user/title pair.
    """
    mask = (data_ref.userId == userid) & (data_ref.original_title == movie)
    # Only the rating is needed, so select that column directly instead of
    # copying the matching rows and dropping unrelated columns first.
    matches = data_ref.loc[mask, 'rating']
    if matches.empty:
        raise IndexError(
            'no rating found for user {!r} and movie {!r}'.format(userid, movie))
    rating = matches.iloc[0]
    score = 0 if rating < 4 else 1
    return score, rating

In [14]:
def findcommong(movie1, movie2, ref):
    """Return the non-zero genre ids of ``movie1`` that also occur for ``movie2``.

    Genres are read from ``ref.genres`` for the first row whose
    ``original_title`` equals each title. The result keeps the order of
    ``movie1``'s genre vector; zeros (padding) are never reported.
    """
    first_genres = ref.loc[ref.original_title == movie1, 'genres'].values[0]
    second_genres = ref.loc[ref.original_title == movie2, 'genres'].values[0]

    shared = []
    for genre_id in first_genres:
        if genre_id != 0 and genre_id in second_genres:
            shared.append(genre_id)
    return shared

In [15]:
def findallcommon(list1, list2, ref):
    """Compare genres pairwise between watched and recommended titles.

    Parameters
    ----------
    list1 : pandas.Series
        Watched titles.
    list2 : pandas.Series
        Recommended titles, paired with ``list1`` by position.
    ref : pandas.DataFrame
        Needs ``original_title`` and ``genres`` (padded genre-id vectors); the
        first row matching a title is used.

    Returns
    -------
    tuple
        ``(df, n_common)``. ``df`` has the columns ``watched_title``,
        ``watched_genre``, ``recommended_title``, ``recommended_genre``,
        ``in_common_genre`` (non-zero ids of the watched title that also occur
        for the recommended one) and ``matched_genre`` (1 when that list is
        non-empty, else 0). ``n_common`` is the total number of shared genre ids
        over all pairs; it is 0 for empty input.
    """
    watched_genre = []
    rec_genre = []
    all_common = []

    for watched_title, rec_title in zip(list1.tolist(), list2.tolist()):
        m1 = ref[ref.original_title == watched_title].genres.values[0]
        m2 = ref[ref.original_title == rec_title].genres.values[0]

        watched_genre.append(m1)
        rec_genre.append(m2)
        # Reuse the vectors fetched above rather than looking both titles up a
        # second time just to intersect them.
        all_common.append([g for g in m1 if g in m2 if g != 0])

    df = pd.DataFrame()
    df['watched_title'] = list1
    df['watched_genre'] = watched_genre
    df['recommended_title'] = list2
    df['recommended_genre'] = rec_genre
    df['in_common_genre'] = all_common
    df['matched_genre'] = [1 if common else 0 for common in all_common]

    # Summing the lengths also works when there are no pairs at all, where
    # np.concatenate would raise.
    n_common = sum(len(common) for common in all_common)
    return df, n_common

In [16]:
def countcommon(list1):
    """Count the entries of ``list1`` that are not equal to the empty list."""
    return sum(1 for entry in list1 if entry != [])

In [17]:
def print_metrics(r, start=0, end=9):
    """Print summary metrics for a recommendation frame and return the genre table.

    ``r`` must provide the columns ``target``, ``pred``, ``similarity_score``,
    ``probability_of_pred``, ``watched_title`` and ``recommended_title``.
    Genres are looked up in the module-level ``ref`` table. The returned frame
    is the one produced by ``findallcommon``.

    ``start`` and ``end`` are accepted but not read by anything below.
    """
    end = end + 1

    accuracy = accuracy_score(r.target, r.pred)
    print('Accuracy Score: ', accuracy)
    mean_similarity = r.similarity_score.mean()
    print('Average Similarity Score: ', mean_similarity)
    mean_probability = r.probability_of_pred.mean()
    print('Average Probability Score: ', mean_probability, '\n')

    com_hlist, l_hcom = findallcommon(r.watched_title, r.recommended_title, ref)

    total = len(r)
    print('Out of a total of {} recommendations.'.format(total))
    matched = countcommon(com_hlist.in_common_genre)
    print("{} recommendations had at least one matching genre.".format(matched))
    print('{0:.0f}% of watched and recommended genres matched.'.format(matched / total * 100), '\n')

    return com_hlist

## Transform X function

In [18]:
def getx(movie, userid, data_ref):
    """Build the single-row model input for ``userid`` and ``movie``.

    The first row of ``data_ref`` whose ``original_title`` equals ``movie`` is
    taken, the columns ``userId``, ``rating``, ``original_title``,
    ``original_language``, ``revenue`` and ``target`` are removed, and a
    ``userId`` column holding ``userid`` is inserted at position 0.
    """
    removed = ['userId', 'rating', 'original_title',
               'original_language', 'revenue', 'target']
    matches = data_ref[data_ref.original_title == movie].reset_index(drop=True)
    features = matches.drop(removed, axis=1)[:1]
    features.insert(0, 'userId', [userid])
    return features

## Main Recommendation Structure

In [19]:
def return_liked_watch(M, userid, data_ref):
    """Return the titles ``userid`` has watched for which ``returnscore`` gives 1.

    The candidate titles come from ``has_watched(M, userid)``; each is kept when
    the score component returned by ``returnscore`` equals 1.
    """
    return [title
            for title in has_watched(M, userid)
            if returnscore(title, userid, data_ref)[0] == 1]

In [20]:
def get_sim_user(M, n_user, userid):
    """Find the ``n_user`` users most similar to ``userid`` by Pearson correlation.

    Parameters
    ----------
    M : pandas.DataFrame
        Matrix indexed by user id, one column per title.
    n_user : int
        Number of most-similar users to keep.
    userid : hashable
        Index label of the user to compare everyone else against.

    Returns
    -------
    tuple
        ``(usr, sub_matrix)``: ``usr`` is a DataFrame with columns ``User`` and
        ``Score`` sorted by descending score; ``sub_matrix`` contains the rows
        of ``M`` belonging to those users.
    """
    # The target row does not change inside the loop, so fetch it once.
    target_row = M.loc[userid]

    users = []
    for u, row in M.iterrows():
        if u == userid:
            continue
        users.append((u, pearsonR(target_row, row)))

    # Sort users descending by similarity. A NaN score (zero variance on either
    # side) compares False against everything and would leave the order
    # undefined, so it is ranked below every real score.
    users.sort(key=lambda pair: -np.inf if np.isnan(pair[1]) else pair[1],
               reverse=True)
    usr = pd.DataFrame(users[:n_user], columns=['User', 'Score'])

    return usr, M[M.index.isin(usr.User.values)]

In [21]:
def get_recommendation_cf(movie, watched, M2, n_rec, userid, data_ref, ref, model, matching_genre='default'):
    """Recommend up to ``n_rec`` titles for one movie the user rated.

    Every column of ``M2`` not listed in ``watched`` is scored by its Pearson
    correlation with the ``movie`` column. Candidates are visited in descending
    similarity order and kept when ``model`` predicts class 1 for the
    user/title features built by ``getx`` and the genre filter passes.

    Parameters
    ----------
    movie : str
        Title the recommendations are based on; must be a column of ``M2``.
    watched : iterable of str
        Titles to skip as candidates.
    M2 : pandas.DataFrame
        Rating matrix (users x titles) restricted to the similar users.
    n_rec : int
        The loop stops as soon as the number of kept recommendations equals it.
    userid : hashable
        User the recommendations are generated for.
    data_ref : pandas.DataFrame
        Reference table used by ``returnscore`` and ``getx``.
    ref : pandas.DataFrame
        Reference table used by ``findcommong`` for genre comparison.
    model : estimator with ``predict`` and ``predict_proba``
        Classifier applied to the single-row feature frame.
    matching_genre : {'default', 'all', 'none'}
        'default' applies no genre filter; 'all' keeps only titles sharing at
        least one genre with ``movie``; 'none' keeps only titles sharing none.

    Returns
    -------
    pandas.DataFrame
        Columns ``userId``, ``watched_title``, ``rating``, ``target``,
        ``recommended_title``, ``similarity_score``, ``probability_of_pred``
        and ``pred``.

    Raises
    ------
    ValueError
        If ``matching_genre`` is not one of the three supported options.
    """
    # None -> no genre filter, True -> overlap required, False -> overlap forbidden.
    overlap_rule = {'default': None, 'all': True, 'none': False}
    if matching_genre not in overlap_rule:
        # Fail fast: an unrecognised mode would otherwise run the model on every
        # candidate and return nothing.
        raise ValueError(
            "matching_genre must be 'all', 'none' or 'default', got {!r}".format(matching_genre))
    want_overlap = overlap_rule[matching_genre]

    score, rating = returnscore(movie, userid, data_ref)

    # Similarity between the watched movie and every candidate title.
    skipped = set(watched)
    movie_ratings = M2[movie]
    reviews = [(userid, movie, rating, score, title, pearsonR(movie_ratings, M2[title]))
               for title in M2.columns
               if title not in skipped]

    # Sort the table of movies descending by similarity.
    rev = pd.DataFrame(reviews, columns=['userId',
                                         'watched_title',
                                         'rating',
                                         'target',
                                         'recommended_title',
                                         'similarity_score'])
    rev = rev.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)

    out_columns = ['userId', 'watched_title', 'rating', 'target',
                   'recommended_title', 'similarity_score',
                   'probability_of_pred', 'pred']
    kept = {name: [] for name in out_columns}
    rec_count = 0

    for title, similarity in zip(rev.recommended_title, rev.similarity_score):
        X = getx(title, userid, data_ref)
        # predict() returns a one-element array; index it explicitly instead of
        # relying on implicit array-to-scalar conversion.
        label = int(model.predict(X)[0])

        if label == 1:
            if want_overlap is None:
                accepted = True
            else:
                # The genre lookup is only needed for the filtering modes.
                accepted = bool(findcommong(movie, title, ref)) == want_overlap

            if accepted:
                # Probability of the predicted class, computed only for kept rows.
                prob = model.predict_proba(X)
                kept['userId'].append(userid)
                kept['watched_title'].append(movie)
                kept['rating'].append(rating)
                kept['target'].append(score)
                kept['recommended_title'].append(title)
                kept['similarity_score'].append(similarity)
                kept['probability_of_pred'].append(float(prob[0, label]))
                kept['pred'].append(label)
                rec_count = rec_count + 1

        if rec_count == n_rec:
            break

    t_df = pd.DataFrame(kept)
    return t_df[out_columns]

In [22]:
def recommend_from_history(matrix, userids, n_user, n_rec, data_ref, ref, model, matching_genre='default'):
    """Generate recommendations for every liked movie of every user in ``userids``.

    For each user the liked titles come from ``return_liked_watch``, the rating
    matrix is restricted to the ``n_user`` most similar users via
    ``get_sim_user``, and ``get_recommendation_cf`` is called once per liked
    title. All per-title result frames are stacked into one DataFrame with a
    fresh 0..n-1 index. An empty DataFrame is returned when nothing was
    produced.

    NOTE(review): the ``watched`` list handed to ``get_recommendation_cf`` is
    the liked-only list from ``return_liked_watch``, so titles the user rated
    below 4 are not excluded from the candidates -- confirm this is intended.
    """
    frames = []
    for userid in tqdm(userids):
        watched = return_liked_watch(matrix, userid, data_ref)
        if not watched:
            # Nothing to base recommendations on; skip the similarity scan.
            continue
        _, matrix2 = get_sim_user(matrix, n_user, userid)
        for mov in watched:
            frames.append(
                get_recommendation_cf(mov, watched, matrix2, n_rec, userid,
                                      data_ref, ref, model,
                                      matching_genre=matching_genre))

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end; growing a DataFrame with pd.concat inside the
    # loop re-copies all earlier rows on every iteration.
    return pd.concat(frames).reset_index(drop=True)

In [23]:
def whatismissing(df1, df2, no_of_rec):
    """List the users that have fewer recommendation groups than rows in ``df1``.

    ``df1`` is counted per ``userId`` (``real_count``). ``df2`` is counted per
    ``userId`` and divided by ``no_of_rec`` (``count``); users absent from
    ``df2`` get 0. Only rows where ``real_count - count`` is positive are
    returned, with that difference in the ``diff`` column.
    """
    delivered = df2.groupby('userId').size().reset_index(name='count')
    delivered['count'] = delivered['count'] / no_of_rec

    expected = df1.groupby('userId').size().reset_index(name='real_count')

    combined = expected.merge(delivered, how='left', on='userId').fillna(value=0)
    combined['diff'] = combined['real_count'] - combined['count']

    return combined[combined['diff'] > 0]

In [24]:
# First 500 distinct userIds found in the reference table.
comp_user_list = data_ref.userId.unique()[:500]

# Keep the users who have between 5 and 10 non-zero entries in the matrix.
user_list_under16 = []
for u in comp_user_list:
    hw = has_watched(matrix, u)
    if 4 < len(hw) < 11:
        user_list_under16.append(u)

# Work with at most 50 of them.
trunc_user_list = user_list_under16[:50]
print('Length of list between 5-10: ', len(user_list_under16))
print('Truncated list between 5-10: ', trunc_user_list)

Length of list between 5-10:  50
Truncated list between 5-10:  [1, 3, 5, 28, 50, 138, 146, 184, 204, 206, 448, 502, 643, 812, 840, 862, 915, 959, 1182, 1202, 1206, 1303, 1317, 1347, 1377, 1387, 1442, 1446, 1474, 1610, 1632, 1642, 1690, 1835, 1854, 1913, 1936, 1942, 1969, 1984, 2028, 2031, 2134, 2161, 2176, 2197, 2225, 2246, 2271, 2274]


In [25]:
# Number of recommendations requested for each liked movie.
no_of_rec = 3

# One (userId, likedMovie) row for every title each selected user liked.
uid = []
mov = []
for userid in trunc_user_list:
    m = return_liked_watch(matrix, userid, data_ref)
    uid.extend([userid] * len(m))
    mov.extend(m)

m_df = pd.DataFrame({'userId': uid, 'likedMovie': mov})
print(m_df.head(10))

n_users = len(m_df.userId.unique())
n_liked = len(m_df)
print('''
From a total of {} users, here are a total of {} liked movies.
There are {} recommendation for each movie watched, therefore
there should be a total of {} recommedations.'''.format(n_users,
                                                        n_liked,
                                                        no_of_rec,
                                                        n_liked * no_of_rec))

   userId                                         likedMovie
0       1                                      Fools Rush In
1       1                                     License to Wed
2       1                                       Rocky Balboa
3       1  Shriek If You Know What I Did Last Friday the ...
4       1                               Sleepless in Seattle
5       1                                 Young and Innocent
6       3                                 Once Were Warriors
7       3                               Sleepless in Seattle
8       5                                    Murder She Said
9       5                                               Nell

From a total of 50 users, here are a total of 255 liked movies.
Therefore, there should be a total of 765 recommedations.


In [26]:
# Run 1: no genre filtering.
# matching_genre accepts 3 options:
#   'all'     - returns only recommendations whose genre matches
#   'none'    - returns only recommendations whose genre does not match
#   'default' - does not filter for genres and prioritizes higher similarity score
rec_df1 = recommend_from_history(
    matrix=matrix,               # original rating matrix
    userids=trunc_user_list,     # list of userId
    n_user=750,                  # size of the cluster of similar users
    n_rec=no_of_rec,             # number of recommendations for each liked movie
    data_ref=data_ref,           # reference table used to transform X
    ref=ref,                     # reference table used to check matched genre
    model=gbc,                   # model used to predict y
    matching_genre='default',
)
rec_df1

100%|██████████| 50/50 [47:17<00:00, 56.75s/it]


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred
0,1,Fools Rush In,4.0,1,The Bourne Supremacy,0.260526,0.971244,1
1,1,Fools Rush In,4.0,1,Jacob's Ladder,0.253193,0.998288,1
2,1,Fools Rush In,4.0,1,Bridge to Terabithia,0.252134,0.994812,1
3,1,License to Wed,4.0,1,Terminator 3: Rise of the Machines,0.441820,0.989955,1
4,1,License to Wed,4.0,1,The Million Dollar Hotel,0.439499,0.999883,1
5,1,License to Wed,4.0,1,5 Card Stud,0.367420,0.998627,1
6,1,Rocky Balboa,5.0,1,My Name Is Bruce,0.387532,0.749316,1
7,1,Rocky Balboa,5.0,1,48 Hrs.,0.297214,0.994897,1
8,1,Rocky Balboa,5.0,1,Boogie Nights,0.289882,0.999184,1
9,1,Shriek If You Know What I Did Last Friday the ...,4.0,1,Terminator 3: Rise of the Machines,0.511706,0.989955,1


In [27]:
# Print the summary metrics for run 1 and keep the per-recommendation genre table.
met1 = print_metrics(r=rec_df1, start=0, end=9)
met1.head(10)

Accuracy Score:  1.0
Average Similarity Score:  0.3033289857101142
Average Probability Score:  0.7950068141840606 

Out of a total of 765 recommendations.
441 recommendations had at least one matching genre.
58% of watched and recommended genres matched. 



Unnamed: 0,watched_title,watched_genre,recommended_title,recommended_genre,in_common_genre,matched_genre
0,Fools Rush In,"[18, 35, 10749, 0, 0, 0, 0, 0]",The Bourne Supremacy,"[28, 18, 53, 0, 0, 0, 0, 0]",[18],1
1,Fools Rush In,"[18, 35, 10749, 0, 0, 0, 0, 0]",Jacob's Ladder,"[18, 9648, 27, 0, 0, 0, 0, 0]",[18],1
2,Fools Rush In,"[18, 35, 10749, 0, 0, 0, 0, 0]",Bridge to Terabithia,"[12, 18, 10751, 0, 0, 0, 0, 0]",[18],1
3,License to Wed,"[35, 0, 0, 0, 0, 0, 0, 0]",Terminator 3: Rise of the Machines,"[28, 53, 878, 0, 0, 0, 0, 0]",[],0
4,License to Wed,"[35, 0, 0, 0, 0, 0, 0, 0]",The Million Dollar Hotel,"[18, 53, 0, 0, 0, 0, 0, 0]",[],0
5,License to Wed,"[35, 0, 0, 0, 0, 0, 0, 0]",5 Card Stud,"[28, 37, 53, 0, 0, 0, 0, 0]",[],0
6,Rocky Balboa,"[18, 0, 0, 0, 0, 0, 0, 0]",My Name Is Bruce,"[35, 27, 0, 0, 0, 0, 0, 0]",[],0
7,Rocky Balboa,"[18, 0, 0, 0, 0, 0, 0, 0]",48 Hrs.,"[53, 28, 35, 80, 18, 0, 0, 0]",[18],1
8,Rocky Balboa,"[18, 0, 0, 0, 0, 0, 0, 0]",Boogie Nights,"[18, 0, 0, 0, 0, 0, 0, 0]",[18],1
9,Shriek If You Know What I Did Last Friday the ...,"[35, 0, 0, 0, 0, 0, 0, 0]",Terminator 3: Rise of the Machines,"[28, 53, 878, 0, 0, 0, 0, 0]",[],0


In [28]:
# How often each title was recommended in run 1, most frequent first.
m_rec1 = (
    rec_df1.groupby('recommended_title')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
m_rec1.head(10)

Unnamed: 0,recommended_title,count
0,Terminator 3: Rise of the Machines,68
1,License to Wed,58
2,The Million Dollar Hotel,37
3,5 Card Stud,36
4,Point Break,31
5,Once Were Warriors,31
6,Shriek If You Know What I Did Last Friday the ...,30
7,The Talented Mr. Ripley,27
8,Say Anything...,27
9,The 39 Steps,17


In [29]:
# Users for whom run 1 produced fewer recommendation groups than liked movies.
q = whatismissing(df1=m_df, df2=rec_df1, no_of_rec=no_of_rec)
q

Unnamed: 0,userId,real_count,count,diff


In [30]:
# Run 2: larger similar-user cluster, keeping only genre-matching recommendations.
# matching_genre accepts 3 options:
#   'all'     - returns only recommendations whose genre matches
#   'none'    - returns only recommendations whose genre does not match
#   'default' - does not filter for genres and prioritizes higher similarity score
rec_df2 = recommend_from_history(
    matrix=matrix,               # original rating matrix
    userids=trunc_user_list,     # list of userId
    n_user=2500,                 # size of the cluster of similar users
    n_rec=no_of_rec,             # number of recommendations for each liked movie
    data_ref=data_ref,           # reference table used to transform X
    ref=ref,                     # reference table used to check matched genre
    model=gbc,                   # model used to predict y
    matching_genre='all',
)
rec_df2

100%|██████████| 50/50 [54:50<00:00, 65.81s/it]


Unnamed: 0,userId,watched_title,rating,target,recommended_title,similarity_score,probability_of_pred,pred
0,1,Fools Rush In,4.0,1,Arlington Road,0.337116,0.999337,1
1,1,Fools Rush In,4.0,1,Rebecca,0.331302,0.999962,1
2,1,Fools Rush In,4.0,1,Bridge to Terabithia,0.328964,0.994812,1
3,1,License to Wed,4.0,1,Beetlejuice,0.322750,0.992277,1
4,1,License to Wed,4.0,1,Loose Screws,0.282484,0.520918,1
5,1,License to Wed,4.0,1,Men in Black II,0.233956,0.992412,1
6,1,Rocky Balboa,5.0,1,Boogie Nights,0.334969,0.999184,1
7,1,Rocky Balboa,5.0,1,48 Hrs.,0.318785,0.994897,1
8,1,Rocky Balboa,5.0,1,Bridge to Terabithia,0.310905,0.994812,1
9,1,Shriek If You Know What I Did Last Friday the ...,4.0,1,Beetlejuice,0.374441,0.992277,1


In [31]:
# Print the summary metrics for run 2 and keep the per-recommendation genre table.
met2 = print_metrics(r=rec_df2, start=0, end=9)
met2.head(10)

Accuracy Score:  1.0
Average Similarity Score:  0.29018024840778106
Average Probability Score:  0.785041913736807 

Out of a total of 765 recommendations.
765 recommendations had at least one matching genre.
100% of watched and recommended genres matched. 



Unnamed: 0,watched_title,watched_genre,recommended_title,recommended_genre,in_common_genre,matched_genre
0,Fools Rush In,"[18, 35, 10749, 0, 0, 0, 0, 0]",Arlington Road,"[18, 53, 9648, 0, 0, 0, 0, 0]",[18],1
1,Fools Rush In,"[18, 35, 10749, 0, 0, 0, 0, 0]",Rebecca,"[18, 9648, 0, 0, 0, 0, 0, 0]",[18],1
2,Fools Rush In,"[18, 35, 10749, 0, 0, 0, 0, 0]",Bridge to Terabithia,"[12, 18, 10751, 0, 0, 0, 0, 0]",[18],1
3,License to Wed,"[35, 0, 0, 0, 0, 0, 0, 0]",Beetlejuice,"[14, 35, 0, 0, 0, 0, 0, 0]",[35],1
4,License to Wed,"[35, 0, 0, 0, 0, 0, 0, 0]",Loose Screws,"[35, 0, 0, 0, 0, 0, 0, 0]",[35],1
5,License to Wed,"[35, 0, 0, 0, 0, 0, 0, 0]",Men in Black II,"[28, 12, 35, 878, 0, 0, 0, 0]",[35],1
6,Rocky Balboa,"[18, 0, 0, 0, 0, 0, 0, 0]",Boogie Nights,"[18, 0, 0, 0, 0, 0, 0, 0]",[18],1
7,Rocky Balboa,"[18, 0, 0, 0, 0, 0, 0, 0]",48 Hrs.,"[53, 28, 35, 80, 18, 0, 0, 0]",[18],1
8,Rocky Balboa,"[18, 0, 0, 0, 0, 0, 0, 0]",Bridge to Terabithia,"[12, 18, 10751, 0, 0, 0, 0, 0]",[18],1
9,Shriek If You Know What I Did Last Friday the ...,"[35, 0, 0, 0, 0, 0, 0, 0]",Beetlejuice,"[14, 35, 0, 0, 0, 0, 0, 0]",[35],1


In [32]:
# How often each title was recommended in run 2, most frequent first.
m_rec2 = (
    rec_df2.groupby('recommended_title')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
m_rec2.head(10)

Unnamed: 0,recommended_title,count
0,The Talented Mr. Ripley,55
1,Terminator 3: Rise of the Machines,40
2,Say Anything...,32
3,Once Were Warriors,31
4,The Million Dollar Hotel,30
5,Men in Black II,30
6,Scarface,29
7,5 Card Stud,24
8,License to Wed,23
9,Point Break,23


In [33]:
# Users for whom run 2 produced fewer recommendation groups than liked movies.
v = whatismissing(df1=m_df, df2=rec_df2, no_of_rec=no_of_rec)
v

Unnamed: 0,userId,real_count,count,diff
