In [46]:
#importing required libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from ast import literal_eval
import scipy.sparse as sp
from flask import Flask,request,render_template

# Data Exploration and Preprocessing

In [9]:
#the csv file containing the users data
users = 'Datasets/ratings.csv'

In [10]:
#csv file to dataframe
users_df = pd.read_csv(users)

In [11]:
#users data frame columns
users_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [12]:
#Taking a look at first 5 rows of the data
users_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [13]:
#dropping not needed features
users_df = users_df.drop(['timestamp'],axis=1)

In [14]:
#describing the data
users_df.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [15]:
#number of users
users_df.userId.nunique()

610

In [17]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [150]:
#information of the data frame
users_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [426]:
#csv file containing the movies data
movies = 'Datasets/movies.csv'

In [427]:
#csv file to dataframe
movies_df = pd.read_csv(movies)

In [428]:
#movies data frame columns
movies_df.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [429]:
#A look at first 5 rows
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [430]:
#cleaning genres feature (to list)
movies_df['genres'] = movies_df['genres'].str.split('|')

In [431]:
movies_df.genres

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [432]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [433]:
#Removing the year from the title
movies_df.title = movies_df.title.str[:-7]

In [434]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men,"[Comedy, Romance]"
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II,[Comedy]


In [435]:
#loading the movies_dataset from kaggle to get the content of the movies
content_df = pd.read_csv('Datasets_kaggle/movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [436]:
content_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [437]:
#extracting the needed columns
content_df = content_df[['title','overview','id']]

In [438]:
content_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     45460 non-null  object
 1   overview  44512 non-null  object
 2   id        45466 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [439]:
content_df.head()

Unnamed: 0,title,overview,id
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,11862


In [440]:
#merginng movies and content data frame to get the plot of the movie
movies_df = pd.merge(movies_df, content_df, left_on='title', right_on='title')

In [441]:
movies_df = movies_df[['movieId','title','overview','genres','id']]

In [442]:
#dropping the first 5 rows
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8070 entries, 0 to 8069
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   8070 non-null   int64 
 1   title     8070 non-null   object
 2   overview  8042 non-null   object
 3   genres    8070 non-null   object
 4   id        8070 non-null   object
dtypes: int64(1), object(4)
memory usage: 378.3+ KB


In [443]:
movies_df.head()

Unnamed: 0,movieId,title,overview,genres,id
0,1,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Adventure, Animation, Children, Comedy, Fantasy]",862
1,2,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Children, Fantasy]",8844
2,3,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Comedy, Romance]",15602
3,4,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",31357
4,5,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],11862


In [444]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8070 entries, 0 to 8069
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   8070 non-null   int64 
 1   title     8070 non-null   object
 2   overview  8042 non-null   object
 3   genres    8070 non-null   object
 4   id        8070 non-null   object
dtypes: int64(1), object(4)
memory usage: 378.3+ KB


In [445]:
#function to convert the ids to int 
def int_ids(x):
    try:
        return int(x)
    except:
        return np.nan

In [446]:
movies_df['id'] = movies_df['id'].apply(int_ids)

In [447]:
movies_df['title'] = movies_df['title'].drop_duplicates()

In [448]:
movies_df = movies_df.dropna()

In [449]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6316 entries, 0 to 8069
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   6316 non-null   int64 
 1   title     6316 non-null   object
 2   overview  6316 non-null   object
 3   genres    6316 non-null   object
 4   id        6316 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 296.1+ KB


In [450]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69481 entries, 0 to 69480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   index    69481 non-null  int64  
 1   userId   69481 non-null  float64
 2   movieId  69481 non-null  int64  
 3   rating   69481 non-null  float64
dtypes: float64(2), int64(2)
memory usage: 2.1 MB


In [451]:
#merging movies and users to match the movies in the movies data frame
users_df = users_df.merge(movies_df,indicator=True,how='outer')

In [452]:
users_df = users_df.dropna()

In [453]:
users_df = users_df[['userId','movieId','rating']]

In [454]:
users_df = users_df.reset_index()

In [455]:
users_df.userId.nunique()

610

# Item-based Collaborative Filtering

In [456]:
item_matrix = users_df.pivot_table(values='rating', index='movieId', columns='userId')

In [457]:
item_matrix.head()

userId,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,601.0,602.0,603.0,604.0,605.0,606.0,607.0,608.0,609.0,610.0
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [458]:
item_matrix_filled = item_matrix.fillna(item_matrix.mean(axis=0))

In [459]:
cos_sim = cosine_similarity(item_matrix_filled, item_matrix_filled)

In [460]:
item_sim_matrix = pd.DataFrame(cos_sim, index=item_matrix.index, columns=item_matrix.index)

In [461]:
#similarity matrix
item_sim_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,175743,175781,176051,176423,176601,177765,180263,183199,186587,188797
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.98861,0.989508,0.990723,0.987893,0.988025,0.988191,0.990996,0.990194,0.986399,...,0.991433,0.991433,0.991432,0.991446,0.991362,0.990733,0.991237,0.991254,0.991405,0.991451
2,0.98861,1.0,0.994205,0.995534,0.99448,0.992069,0.993632,0.996057,0.995664,0.99081,...,0.996473,0.996473,0.99648,0.996427,0.996419,0.995767,0.996378,0.996454,0.996475,0.996449
3,0.989508,0.994205,1.0,0.996734,0.995452,0.993272,0.994749,0.996807,0.996606,0.99219,...,0.99744,0.99744,0.997446,0.997472,0.997393,0.996689,0.997343,0.997421,0.997441,0.997416
4,0.990723,0.995534,0.996734,1.0,0.99692,0.995147,0.996321,0.998899,0.998608,0.995201,...,0.999383,0.999383,0.999388,0.999364,0.999335,0.99863,0.999286,0.999363,0.999383,0.999358
5,0.987893,0.99448,0.995452,0.99692,1.0,0.993132,0.99522,0.997008,0.996992,0.993106,...,0.997529,0.997529,0.997534,0.997404,0.997484,0.996777,0.997427,0.997511,0.997528,0.997506


In [462]:
#A function to get the movies similar to what the user watched
def user_watched(user_id):
    item = list(item_matrix[item_matrix[user_id]>0].index)
    df = pd.DataFrame()
    for i in item:
        df = df.append(item_sim_matrix[i][:])
    df = df.nlargest(15,user_id)
    ind = list(df.index)
    watched_df = pd.DataFrame() 
    for i in ind:
        watched_df = watched_df.append(movies_df[movies_df['movieId'] == i]) 
    return watched_df[['movieId','title']]

# Content-based recommendation

In [463]:
#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movies_df['overview'] = movies_df['overview'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(movies_df['overview'])

In [464]:
#Output the shape of tfidf_matrix
tfidf_matrix.shape

(6316, 24389)

In [465]:
#cosine similarity matrix
cosine_sim1 = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [466]:
indices = pd.Series(movies_df.index, index=movies_df['title'])

In [467]:
indices

title
Toy Story                         0
Jumanji                           1
Grumpier Old Men                  2
Waiting to Exhale                 3
Father of the Bride Part II       4
                               ... 
Coco                           8063
The Shining                    8064
Quest                          8065
Rampage                        8066
Tag                            8069
Length: 6316, dtype: int64

In [160]:
# Function that takes in movie title as input and gives recommendations 
def content_recommender(df, title, cosine_sim, indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda df: df[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return pd.DataFrame(df[['title','genres']].iloc[movie_indices])

In [311]:
#Extracting a single genre for metric evaluation
def get_genre(x):
    y = []
    for i in range(len(x)):
        if x[i] != '':
            y.append(x[i][0])
        else:
            y.append(' ')
    return y

In [312]:
movies_df['genre'] = get_genre(list(movies_df['genres']))

In [162]:
#A function to get the recommend movies from content based recommender
def result(titles):
    final_df = pd.DataFrame()
    for i in titles:
        final_df = final_df.append(content_recommender(movies_df, i, cosine_sim1, indices))
    return final_df

In [163]:
#A function to implement the hybrid recommender
def recommend_movies(user_id):
    
    sim_mov = user_watched(user_id)
    
    titles = list(sim_mov['title'])
    
    movies = result(titles)
    
    return movies.head()

# Flask for the web app

In [494]:
app = Flask(__name__, template_folder='templates')

In [495]:
all_users = list(users_df.userId.unique())

In [496]:
@app.route('/')
def main():
    return render_template('webapp.html')
@app.route('/recommendations',methods=['GET','POST'])
def recommendations():
    if request.method=="POST":
        u_id = int(request.form['uid'])
        #m_name = m_name.title()
        if u_id in all_users:
            result_final = recommend_movies(u_id)
            names = list(result_final['title'])
    return render_template('recommendations.html',movie_names=names,search_id=u_id)

if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [27/Jul/2020 05:13:28] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [27/Jul/2020 05:13:28] "[33mGET /projector.jpg HTTP/1.1[0m" 404 -
127.0.0.1 - - [27/Jul/2020 05:13:35] "[37mPOST /recommendations HTTP/1.1[0m" 200 -
127.0.0.1 - - [27/Jul/2020 05:13:35] "[33mGET /recommendations.jpg HTTP/1.1[0m" 404 -


# Benchmark Models

In [377]:
df = pd.read_csv('Datasets_kaggle/movies_metadata.csv')

df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]

df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [378]:
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [379]:
def convert_int(x):
    try:
        return int(x)
    except:
        return 0

In [380]:
df['year'] = df['year'].apply(convert_int)

In [381]:
df = df.drop('release_date', axis=1)

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [382]:
df = pd.merge(movies_df, df, left_on='title', right_on='title')

In [383]:
df.columns

Index(['movieId', 'title', 'overview', 'genres_x', 'id', 'genre', 'genres_y',
       'runtime', 'vote_average', 'vote_count', 'year'],
      dtype='object')

In [384]:
df['genres'] = df['genres_x']

In [385]:
df = df[['movieId','title','genre','runtime','vote_average','vote_count','year','id']]

In [386]:
df.head()

Unnamed: 0,movieId,title,genre,runtime,vote_average,vote_count,year,id
0,1,Toy Story,Adventure,81.0,7.7,5415.0,1995,862
1,2,Jumanji,Adventure,104.0,6.9,2413.0,1995,8844
2,3,Grumpier Old Men,Comedy,101.0,6.5,92.0,1995,15602
3,4,Waiting to Exhale,Comedy,127.0,6.1,34.0,1995,31357
4,5,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,11862


In [391]:
def knowledge_based(df, genre, low_time, high_time, low_year, high_year, percentile=0.8):
    
    movies = df.copy()
    
    movies = movies[(movies['genre'] == genre) & 
                    (movies['runtime'] >= low_time) |
                    (movies['runtime'] <= high_time) & 
                    (movies['year'] >= low_year) & 
                    (movies['year'] <= high_year)]
    
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)

    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    
    return q_movies.head()

In [392]:
knowledge_based(df,'adventure',90,120,1990,2010)

Unnamed: 0,movieId,title,genre,runtime,vote_average,vote_count,year,id
0,1,Toy Story,Adventure,81.0,7.7,5415.0,1995,862
1,2,Jumanji,Adventure,104.0,6.9,2413.0,1995,8844
14,13,Balto,Adventure,78.0,7.1,423.0,1995,21032
22,18,Four Rooms,Comedy,98.0,6.5,539.0,1995,5
23,19,Ace Ventura: When Nature Calls,Comedy,90.0,6.1,1128.0,1995,9273


In [393]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7449 entries, 0 to 7448
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       7449 non-null   int64  
 1   title         7449 non-null   object 
 2   genre         7449 non-null   object 
 3   runtime       7447 non-null   float64
 4   vote_average  7449 non-null   float64
 5   vote_count    7449 non-null   float64
 6   year          7449 non-null   int64  
 7   id            7449 non-null   int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 523.8+ KB


A simple Recommender

In [394]:
vc = df['vote_count'].quantile(0.8)

In [395]:
va = df['vote_average'].mean()

In [396]:
#computing the weighted score
def weighted_score(df):
    v = df['vote_count']
    R = df['vote_average']
    # Computing the weighted score
    return (v/(v+vc) * R) + (vc/(vc+v) * va)

In [397]:
df['score'] = df.apply(weighted_score, axis=1)

In [398]:
df = df.sort_values('score', ascending=False)

In [399]:
df[['movieId','title', 'vote_count', 'vote_average', 'score', 'runtime']].head()

Unnamed: 0,movieId,title,vote_count,vote_average,score,runtime
1821,2959,Fight Club,9678.0,8.3,8.196327,139.0
240,296,Pulp Fiction,8670.0,8.3,8.18493,154.0
397,527,Schindler's List,4436.0,8.3,8.086213,195.0
272,356,Forrest Gump,8147.0,8.2,8.083746,142.0
6313,112552,Whiplash,4376.0,8.3,8.083582,105.0


Content-based Recommender

In [414]:
content_recommender(movies_df, 'Toy Story', cosine_sim1, indices).head()

Unnamed: 0,title,genres
6145,Toy Story 3,"[Adventure, Animation, Children, Comedy, Fanta..."
2265,Toy Story 2,"[Adventure, Animation, Children, Comedy, Fantasy]"
904,Rebel Without a Cause,[Drama]
2292,Man on the Moon,"[Comedy, Drama]"
5416,For Your Consideration,[Comedy]


# Evaluation of Hybrid recommender system

In [None]:
#getting the titles similar to what the user watched
titles_1 = user_watched(1)

In [97]:
titles_610 = user_watched(610)

In [103]:
titles_5 = user_watched(5)

In [470]:
titles_42 = user_watched(42)

In [418]:
titles_6 = user_watched(6)

In [104]:
#getting the recommended titles
user_1 = result(sorted(list(titles_1['title'])))

In [105]:
user_610 = result(sorted(list(titles_610['title'])))

In [422]:
user_5 = result(sorted(list(titles_5['title'])))

In [471]:
user_42 = result(sorted(list(titles_42['title'])))

In [472]:
user_6 = result(sorted(list(titles_6['title'])))

In [107]:
x = [user_1.index,user_610.index,user_5.index]

In [473]:
y = [user_42.index,user_6.index]

In [476]:
z = [user_1.index,user_6.index,user_42.index,user_610.index]

In [477]:
#personalization metric function
def personalization(predicted):
    
    users_rec = pd.DataFrame(predicted)
    
    users_binary_rec = pd.get_dummies(users_rec,columns=users_rec.columns)

    similarity = cosine_similarity(X=users_binary_rec, dense_output=False)

    upper_right = np.triu_indices(similarity.shape[0], k=1)

    personalization = np.mean(similarity[upper_right])
    
    return 1-personalization

In [479]:
personalization(x)

1.0

In [481]:
personalization(y)

1.0

In [482]:
personalization(z)

1.0

In [483]:
#intra list similarity function
def intra_list_similarity(predicted, user):

    user['genre'] = get_genre(list(user['genres']))
    
    feature_df = pd.get_dummies(user['genre'],columns=user.index)

    recs_content = feature_df.loc[predicted]

    similarity = cosine_similarity(X=recs_content.values, dense_output=False)

    upper_right = np.triu_indices(similarity.shape[0], k=1)

    ils_single_user = np.mean(similarity[upper_right])
    
    return ils_single_user

In [486]:
intra_list_similarity(user_42.index,user_42)

0.20706935123042505

In [490]:
intra_list_similarity(user_1.index,user_1)

0.1485709306378529

In [489]:
intra_list_similarity(user_5.index,user_5)

0.1804245283018868