In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

# WBSFLIX Data

In [407]:
links = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/links.csv')
movies = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/movies.csv')
ratings = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/ratings.csv')
tags = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/tags.csv')


## Data exploration and preprocessing

In [4]:
links.head(3)
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [283]:
movies.head()
# movies.info()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [412]:
# assign new column for year of movies, and exratc the yeatr from title 
movies= movies.assign(year = lambda df_ : df_['title'].replace(r'(.*)\((\d{4})\)', r'\2', regex= True) )

# movies= movies.assign(title = lambda df_ : df_['title'].replace(r'(.*)\((\d{4})\)', r'\1', regex= True).str.strip())

#there sre some movies have no year in their title, i fill the year by 0
movies= movies.assign(year = lambda df_ : np.where(df_['year'].str.len() <=5 , df_['year'], 0))
#convert the year column to int
movies['year']= movies['year'].astype(int)
movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,2017
9739,193585,Flint (2017),Drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,2018


In [290]:
movies.loc[lambda df_ : df_['year'] == 0]

Unnamed: 0,movieId,title,genres,year
6059,40697,Babylon 5,Sci-Fi,0
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,0
9091,143410,Hyena Road,(no genres listed),0
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),0
9179,149334,Nocturnal Animals,Drama|Thriller,0
9259,156605,Paterson,(no genres listed),0
9367,162414,Moonlight,Drama,0
9448,167570,The OA,(no genres listed),0
9514,171495,Cosmos,(no genres listed),0
9515,171631,Maria Bamford: Old Baby,(no genres listed),0


In [409]:
ratings.head()
ratings.info()
#convert timestamp to datetime format
ratings['datetime'] = ratings['timestamp'].apply(datetime.fromtimestamp)
#drop the timestamp column
ratings.drop(columns=['timestamp'], inplace=True)
ratings

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


Unnamed: 0,userId,movieId,rating,datetime
0,1,1,4.0,2000-07-30 20:45:03
1,1,3,4.0,2000-07-30 20:20:47
2,1,6,4.0,2000-07-30 20:37:04
3,1,47,5.0,2000-07-30 21:03:35
4,1,50,5.0,2000-07-30 20:48:51
...,...,...,...,...
100831,610,166534,4.0,2017-05-03 23:53:22
100832,610,168248,5.0,2017-05-04 00:21:31
100833,610,168250,5.0,2017-05-08 21:50:47
100834,610,168252,5.0,2017-05-03 23:19:12


In [7]:
tags.head(3)
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


## Popularity/Quality based recommmender system

In [8]:
ratings.groupby(by='movieId').mean().sort_values("rating", ascending=False).head(300)

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
88448,483.000000,5.000000,1.315438e+09
100556,462.000000,5.000000,1.456151e+09
143031,89.000000,5.000000,1.520409e+09
143511,105.000000,5.000000,1.526207e+09
143559,89.000000,5.000000,1.520410e+09
...,...,...,...
5328,414.000000,5.000000,1.047914e+09
6460,284.800000,4.900000,1.029904e+09
26810,475.666667,4.833333,1.218214e+09
115122,422.000000,4.833333,1.471733e+09


In [40]:
# define a function that get n as number of movies and genre and return top_popular_n movies
def popular_n_movies(n, genre):
    popular_n = (
    ratings
            .groupby(by='movieId')
            .agg(rating_mean=('rating', 'mean'), rating_count=('movieId', 'count'), datetime=('datetime','mean'))
            .sort_values(['rating_mean','rating_count','datetime'], ascending= False)
            .loc[lambda df_ :df_['rating_count'] >= (df_['rating_count'].mean()+df_['rating_count'].median())/2]
            .reset_index()
    )['movieId'].to_list()
    result = movies.loc[lambda df_ : df_['movieId'].isin(popular_n)]
    if genre != 'all_genres':
            result = result.loc[lambda df_ : df_['genres'].str.contains(genre)]
    return result.head(n).reset_index()

# i will write another version of this function can manage time period of movies too

In [41]:
popular_n_movies(20, 'Drama')

Unnamed: 0,index,movieId,title,genres
0,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
1,10,11,"American President, The (1995)",Comedy|Drama|Romance
2,13,14,Nixon (1995),Drama
3,15,16,Casino (1995),Crime|Drama
4,16,17,Sense and Sensibility (1995),Drama|Romance
5,19,20,Money Train (1995),Action|Comedy|Crime|Drama|Thriller
6,21,22,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller
7,23,24,Powder (1995),Drama|Sci-Fi
8,24,25,Leaving Las Vegas (1995),Drama|Romance
9,25,26,Othello (1995),Drama


In [43]:
#find all genres
all_genres = set()
for genres in movies["genres"].unique():
    genres_2 = genres.split(r"|")
    for genre in genres_2:
        all_genres.add(genre)
sorted(all_genres)

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [426]:
#finding best match just by all other watchers (rating count, rating mean)
def best_match_popular(movie_name , n):
    movie_Id = movies.loc[lambda df_ : df_['title'].str.lower() == movie_name.lower(), 'movieId']
    if movie_Id.empty: 
        return pd.DataFrame(columns= ['movieId', 'title', 'genres', 'year'])
    movie_Id = int(movie_Id)
    movie_raters = ratings.loc[lambda df_ : df_['movieId'] ==  movie_Id]
    similar_raters_movies = ratings.loc[lambda df_ : df_['userId'].isin(movie_raters['userId'])]
    similar_raters_movies
    best_match_p= (
        similar_raters_movies
                .groupby('movieId')
                .agg(rating_mean=('rating', 'mean'), rating_count=('userId', 'count'), datetime=('datetime','mean'))
                .sort_values(['rating_count','rating_mean'], ascending= False)
                .loc[lambda df_ :df_['rating_mean'] >= 4]
                .reset_index()
    ).head(n)
    return movies.loc[lambda df_ : df_['movieId'].isin(best_match_p['movieId'])]

In [429]:
#finding best match just by sorting them by popularity (rating count, rating mean)
def best_match_match(movie_name , n):
    movie_Id = movies.loc[lambda df_ : df_['title'].str.lower() == movie_name.lower(), 'movieId']
    if movie_Id.empty: 
        return pd.DataFrame(columns= ['movieId', 'title', 'genres', 'year'])
    movie_Id = int(movie_Id)
    movie_raters = ratings.loc[lambda df_ : df_['movieId'] ==  movie_Id]
    similar_raters_movies = ratings.loc[lambda df_ : df_['userId'].isin(movie_raters['userId'])]
    similar_raters_movies
    best_match_m =(
        similar_raters_movies
                .groupby('movieId')
                .agg(rating_mean=('rating', 'mean'), rating_count=('userId', 'count'), datetime=('datetime','mean'))
                .sort_values(['rating_mean','rating_count'], ascending= False)
                .loc[lambda df_ :df_['rating_count'] >= (df_['rating_count'].mean()+df_['rating_count'].median())/2]
                .reset_index()
    ).head(n)
    return movies.loc[lambda df_ : df_['movieId'].isin(best_match_m['movieId'])]

In [571]:
best_match_match('Toy Story (1995)', 10 )

Unnamed: 0,movieId,title,genres,year
76,85,Angels and Insects (1995),Drama|Romance,1995
731,951,His Girl Friday (1940),Comedy|Romance,1940
796,1041,Secrets & Lies (1996),Drama,1996
841,1104,"Streetcar Named Desire, A (1951)",Drama,1951
1649,2202,Lifeboat (1944),Drama|War,1944
1664,2239,Swept Away (Travolti da un insolito destino ne...,Comedy|Drama,1975
1762,2360,"Celebration, The (Festen) (1998)",Drama,1998
2411,3201,Five Easy Pieces (1970),Drama,1970
2582,3451,Guess Who's Coming to Dinner (1967),Drama,1967
7815,92535,Louis C.K.: Live at the Beacon Theater (2011),Comedy,2011


## Recommendations Based on Correlation

In [573]:
def top_n_similar_v0(target_name , n ):
    #check the movie input
    target_Id = movies.loc[lambda df_ : df_['title'].str.lower() == target_name.lower(), 'movieId']
    if target_Id.empty: 
        return pd.DataFrame(columns= ['movieId', 'title', 'genres', 'year'])
    target_Id = int(target_Id)
    
    #make cross dataframe
    movies_crosstab = pd.pivot_table(data=ratings, values='rating', index='userId', columns='movieId')
    #choose target ratings
    target_ratings = movies_crosstab[target_Id]
    #caalculate pearson colleratin with the target movie 
    similar_to_target = movies_crosstab.corrwith(target_ratings)
    #convert colleration matrix to dataframe
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    #drop na s
    corr_target.dropna(inplace=True)
    
    corr_target.drop(target_Id, inplace=True) # drop target
    # sorting the colleration data frame and choose n top
    corr_target = corr_target.sort_values('PearsonR', ascending=False).head(n).reset_index(drop= True)
    # merge the result with movies dataframe, drop the PearsonR column and return the top n simillar movies
    return corr_target.merge(movies, left_index=True, right_on="movieId").drop(columns=['PearsonR'])


In [572]:
def top_n_similar_v1(target_name , n ):
    #check the movie input
    target_Id = movies.loc[lambda df_ : df_['title'].str.lower() == target_name.lower(), 'movieId']
    if target_Id.empty: 
        return pd.DataFrame(columns= ['movieId', 'title', 'genres', 'year'])
    target_Id = int(target_Id)
    
    #make cross dataframe
    movies_crosstab = pd.pivot_table(data=ratings, values='rating', index='userId', columns='movieId')
    #choose target ratings
    target_ratings = movies_crosstab[target_Id]
    #caalculate pearson colleratin with the target movie 
    similar_to_target = movies_crosstab.corrwith(target_ratings)
    #convert colleration matrix to dataframe
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    #drop na s
    corr_target.dropna(inplace=True)
    #count the numer of ratings for each movie
    rating_n =(
                ratings
                    .groupby(by='movieId')
                    .agg(rating_count=('userId', 'count'))
        )
    #merge colleration dataframe with ratings count dataframe
    target_corr_summary = corr_target.join(rating_n['rating_count'])
    
    target_corr_summary.drop(target_Id, inplace=True) # drop the target
    #filter at least 10 time raited movies, sort them and pick top n
    top_n = target_corr_summary[target_corr_summary['rating_count']>=10].sort_values('PearsonR', ascending=False).head(n)
    # merge the result with movies dataframe, drop the PearsonR column and return the top n simillar movies
    return top_n.merge(movies, left_index=True, right_on="movieId").drop(columns=['PearsonR'])


In [575]:
top_n_similar_v1('Toy Story (1995)', 10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,PearsonR,rating_count,movieId,title,genres,year
8890,0.983092,10,134393,Trainwreck (2015),Comedy|Romance,2015
9286,0.968694,15,158238,The Nice Guys (2016),Crime|Mystery|Thriller,2016
2800,0.958373,10,3742,Battleship Potemkin (1925),Drama|War,1925
8693,0.942264,13,122912,Avengers: Infinity War - Part I (2018),Action|Adventure|Sci-Fi,2018
8533,0.936586,10,114935,Predestination (2014),Action|Mystery|Sci-Fi|Thriller,2014
1318,0.935897,12,1772,Blues Brothers 2000 (1998),Action|Comedy|Musical,1998
6947,0.931695,10,65514,Ip Man (2008),Action|Drama|War,2008
2448,0.922331,11,3261,Singles (1992),Comedy|Drama|Romance,1992
8449,0.913282,19,112138,22 Jump Street (2014),Action|Comedy|Crime,2014
4885,0.903757,13,7318,"Passion of the Christ, The (2004)",Drama,2004


In [438]:
def top_n_similar_v2(target_name , n ):
    target_Id=int(movies.loc[lambda df_ : df_['title'] == target_name, 'movieId'])
    movies_crosstab = pd.pivot_table(data=ratings, values='rating', index='userId', columns='movieId')
    target_ratings = movies_crosstab[target_Id]
    movies_crosstab
    similar_to_target = movies_crosstab.corrwith(target_ratings)
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    corr_target.dropna(inplace=True)
    rating_n =(
                ratings
                    .groupby(by='movieId')
                    .agg(rating_count=('movieId', 'count'))
                    .reset_index()
        )
    target_raters = ratings.loc[lambda df_ : df_['movieId'] == target_Id]
    def num_same_raters(movie_Id):
        return ratings.loc[lambda df_ : (df_['movieId'] == movie_Id & df_['userId'].isin(target_raters['userId']))]['userId'].count()
    rating_n['n_same_raters'] = rating_n['movieId'].apply(num_same_raters)
    target_corr_summary = corr_target.join(rating_n[['rating_count', 'n_same_raters']])
    target_corr_summary.dropna(inplace=True)
    target_corr_summary.drop(target_Id, inplace=True) # drop Tortas Locas itself
    top_n = target_corr_summary[((target_corr_summary['rating_count']>=10) & (target_corr_summary['n_same_raters'] >=3))].sort_values('PearsonR', ascending=False).head(n)
    
    
    return top_n.merge(movies, left_index=True, right_on="movieId")
    # return top_n

In [441]:
top_n_similar_v0('Toy Story (1995)', 10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,PearsonR,movieId,title,genres,year
0,1.0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,0.330978,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,0.487109,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,1.0,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,0.310971,5,Father of the Bride Part II (1995),Comedy,1995
5,0.106465,6,Heat (1995),Action|Crime|Thriller,1995
6,0.208402,7,Sabrina (1995),Comedy|Romance,1995
7,0.968246,8,Tom and Huck (1995),Adventure|Children,1995
8,0.095913,9,Sudden Death (1995),Action,1995
9,-0.021409,10,GoldenEye (1995),Action|Adventure|Thriller,1995


In [433]:
top_n_similar_v2('Sudden Death (1995)', 10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,PearsonR,rating_count,n_same_raters,movieId,title,genres,year
1286,1.0,16.0,12.0,1711,Midnight in the Garden of Good and Evil (1997),Crime|Drama|Mystery,1997
1283,1.0,80.0,12.0,1703,For Richer or Poorer (1997),Comedy,1997
308,1.0,12.0,12.0,350,"Client, The (1994)",Drama|Mystery|Thriller,1994
315,1.0,26.0,12.0,357,Four Weddings and a Funeral (1994),Comedy|Romance,1994
1135,1.0,87.0,12.0,1485,Liar Liar (1997),Comedy,1997
329,1.0,12.0,12.0,371,"Paper, The (1994)",Comedy|Drama,1994
1102,1.0,14.0,12.0,1431,Beverly Hills Ninja (1997),Action|Comedy,1997
1058,1.0,12.0,12.0,1375,Star Trek III: The Search for Spock (1984),Action|Adventure|Sci-Fi,1984
363,1.0,19.0,12.0,419,"Beverly Hillbillies, The (1993)",Comedy,1993
1056,1.0,24.0,12.0,1373,Star Trek V: The Final Frontier (1989),Action|Sci-Fi,1989


## User-based Recomendation

In [494]:
from sklearn.metrics.pairwise import cosine_similarity

def top_n_similar_v3(user_id , n ):
    if user_id not in ratings["userId"]: 
        return pd.DataFrame(columns= ['movieId', 'title', 'genres', 'year'])
    
    users_items = pd.pivot_table(data=ratings, 
                                 values='rating', 
                                 index='userId', 
                                 columns='movieId')
    users_items.fillna(0, inplace=True)
    user_similarities = pd.DataFrame(cosine_similarity(users_items),
                                 columns=users_items.index, 
                                 index=users_items.index)
    weights = (
    user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id])
          )
    not_watched_movies = users_items.loc[users_items.index!=user_id, users_items.loc[user_id,:]==0]
    weighted_averages = pd.DataFrame(not_watched_movies.T.dot(weights), columns=["predicted_rating"])
    recommendations = weighted_averages.merge(movies, left_index=True, right_on="movieId").sort_values("predicted_rating", ascending=False).head(n)
    return recommendations
    # return top_n

In [495]:
top_n_similar_v3(1, 10)

Unnamed: 0,predicted_rating,movieId,title,genres,year
277,2.654727,318,"Shawshank Redemption, The (1994)",Crime|Drama,1994
507,2.087327,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,1991
659,1.859548,858,"Godfather, The (1972)",Crime|Drama,1972
2078,1.663564,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery,1999
3638,1.62482,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001
123,1.585826,150,Apollo 13 (1995),Adventure|Drama|IMAX,1995
31,1.583809,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,1995
4800,1.502235,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,2003
4137,1.483288,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,2002
506,1.477032,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,1992


In [570]:
from sklearn.metrics.pairwise import cosine_similarity

def top_n_similar_v4(user_id , n ):
    if user_id not in ratings["userId"]: 
        return pd.DataFrame(columns= ['movieId', 'title', 'genres', 'year'])
    
    users_items = pd.pivot_table(data=ratings, 
                                 values='rating', 
                                 index='userId', 
                                 columns='movieId')
    users_items.fillna(0, inplace=True)
    user_similarities = pd.DataFrame(cosine_similarity(users_items),
                                 columns=users_items.index, 
                                 index=users_items.index)
    weights = (
    user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id])
          )
    
    
    new_userids = weights.sort_values(ascending=False).head(100).index.tolist()
    new_userids.append(user_id)
    new_ratings = ratings.loc[lambda df_ : df_['userId'].isin(new_userids)]
    new_users_items = pd.pivot_table(data=new_ratings, 
                                 values='rating', 
                                 index='userId', 
                                 columns='movieId')
    new_users_items.fillna(0, inplace=True)
    new_user_similarities = pd.DataFrame(cosine_similarity(new_users_items),
                                 columns=new_users_items.index, 
                                 index=new_users_items.index)
    new_weights = (
    new_user_similarities.query("userId!=@user_id")[user_id] / sum(new_user_similarities.query("userId!=@user_id")[user_id])
          )
    
    not_watched_movies = new_users_items.loc[new_users_items.index!=user_id, new_users_items.loc[user_id,:]==0]
    weighted_averages = pd.DataFrame(not_watched_movies.T.dot(new_weights), columns=["predicted_rating"])
    recommendations = weighted_averages.merge(movies, left_index=True, right_on="movieId").sort_values("predicted_rating", ascending=False).head(n)
    return recommendations
    # return top_n

In [569]:
top_n_similar_v4(1, 10)

Unnamed: 0,predicted_rating,movieId,title,genres,year
507,3.82679,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,1991
659,3.772675,858,"Godfather, The (1972)",Crime|Drama,1972
902,3.487462,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi,1986
793,3.387974,1036,Die Hard (1988),Action|Crime|Thriller,1988
2078,3.235424,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery,1999
31,3.079436,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,1995
277,3.046273,318,"Shawshank Redemption, The (1994)",Crime|Drama,1994
922,3.023148,1221,"Godfather: Part II, The (1974)",Crime|Drama,1974
1445,2.948112,1968,"Breakfast Club, The (1985)",Comedy|Drama,1985
1158,2.905115,1527,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi,1997
