In [2]:
from __future__ import annotations

%load_ext autoreload
%autoreload 2

In [3]:
from typing import Union

from data_import import load_data
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np

from surprise import KNNWithMeans,SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

In [4]:
# import data
df_user, df_movie = load_data()

Data cleaning

In [5]:
df_movie = df_movie[df_movie['Avg_rating']<=10]

concat = lambda df, col1, col2: df[col1].astype(str) + "_" + df[col2].astype(str)

df_movie['Title'] = concat(df_movie,'Title','Year')
df_user['Title'] = concat(df_user,'Title','Year')
df_user = df_user.drop(columns='Year')

Feature engineering

In [6]:
# percentage difference between user rating and average rating (user-avg)

df_user_temp = df_user.merge(df_movie.groupby(['Title'])['Avg_rating'].max().reset_index(), on=['Title'], how='left')

df_user['Avg_user_rating_diff'] = (df_user_temp['Rating'] - df_user_temp['Avg_rating'])

df_user = df_user.dropna()

In [7]:
# Mapping users to numbers
user_dict = {user: i for i, user in enumerate(df_user.User.unique())}
movie_dict = {title: i for i, title in enumerate(df_movie.Title.unique())}

df_user[['User','Title']] = df_user[['User','Title']].agg({'User': lambda x: user_dict[x], 'Title': lambda x: movie_dict[x]})
df_movie[['Title']] = df_movie[['Title']].agg({'Title': lambda x: movie_dict[x]})

In [8]:
# TODO: change that to get only subset of the movies.

val_user_list = [user_dict[it] for it in ['sokoly35', 'honorciak', 'piotrr99']]
df_user_validation = df_user[df_user['User'].isin(val_user_list)]

# exclude us from df_user
df_user = df_user[~df_user.index.isin(df_user_validation.index)]

In [9]:
df_user_validation

Unnamed: 0,User,Title,Rating,Avg_user_rating_diff
9900,82,5744,9.0,0.2
9901,82,7433,10.0,1.4
9902,82,318,6.0,-0.5
9903,82,2148,3.0,-4.6
9904,82,1171,7.0,0.1
...,...,...,...,...
32486,261,6264,8.0,0.4
32487,261,2469,7.0,-0.6
32488,261,2409,7.0,-0.4
32489,261,7447,5.0,-2.2


# 0. model

In [10]:
class BaseRecommender(ABC):
    def __init__(self, df_user: pd.DataFrame, df_movie: pd.DataFrame):
        self.df_user = df_user
        self.df_movie = df_movie

    """ Base abstract class for recommendation techniques."""
    def train(self, df_user: Union[pd.DataFrame, None], **kwargs):
        raise NotImplementedError
    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        raise NotImplementedError

In [11]:
# decode users and movies
def decode(df):
    user_dict_decoder = {val: key for key, val in user_dict.items()}
    movie_dict_decoder = {val: key for key, val in movie_dict.items()}
    df_cols = df.columns
    if 'User' in df_cols:
        df['User'] = df['User'].agg(lambda x: user_dict_decoder[x])
    if 'Title' in df_cols:
        df['Title'] = df['Title'].agg(lambda x: movie_dict_decoder[x])
    return df

# 1. User cold start
* recommendation based on popularity and overall rating of the movie for users with not many rated movies
* we have no data about the user (age, gender, ...) so we don't use it as a feature in collaborative filtering.

In [12]:
class UCSRecommender(BaseRecommender):
    """ user cold start recommender.
    Take n_top_movies and recommend them in order based on Avg_rating."""
    recommendation_table = None

    def train(self, df_user: Union[pd.DataFrame, None], n_top_movies: int = 20, **kwargs):
        self.recommendation_table = df_movie.groupby(['Title'])[['Avg_rating', 'Number_of_ratings']].min().reset_index().sort_values('Number_of_ratings', ascending=False).head(n_top_movies).sort_values('Avg_rating', ascending=False)

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        recommendations = pd.DataFrame(columns=['User', 'Title'])
        for user in df_user['User'].unique():
            recommendations_for_user = self.recommendation_table.head(num_of_recomendations)[['Title']]
            recommendations_for_user['User'] = user
            recommendations = pd.concat([recommendations, recommendations_for_user])
        return recommendations

In [14]:
# example of usage
ucs_recommender = UCSRecommender(df_user, df_movie)
ucs_recommender.train(df_user)
recommendations = ucs_recommender.predict(df_user=df_user_validation, num_of_recomendations=10)

decode(recommendations)

Unnamed: 0,User,Title
5744,honorciak,Skazani na Shawshank_1994
7433,honorciak,Zielona mila_1999
4163,honorciak,Nietykalni_2011
1867,honorciak,Forrest Gump_1994
5264,honorciak,Pulp Fiction_1994
3621,honorciak,Milczenie owiec_1991
3258,honorciak,Leon zawodowiec_1994
6077,honorciak,Szeregowiec Ryan_1998
2006,honorciak,Gladiator_2000
7177,honorciak,Władca Pierścieni: Drużyna Pierścienia_2001


# 2. Collaborative filtering
* user similarity

In [15]:
class CFRecommender(BaseRecommender):
    def __init__(self, df_user, df_movie):
        super(CFRecommender, self).__init__(df_user, df_movie)
        self.algo_SVD = SVD()
        # rating scale
        # reader = Reader(rating_scale = (1,10))
        # self.rating_df = Dataset.load_from_df(df_user[['User','Title', 'Rating']], reader)

    def train(self, df_user: Union[pd.DataFrame, None], n_top_movies: int = 20, **kwargs):
         reader = Reader(rating_scale = (1,10))
         self.rating_df = Dataset.load_from_df(df_user[['User','Title', 'Rating']], reader)

         self.algo_SVD.fit(self.rating_df.build_full_trainset())

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        recommendations = pd.DataFrame(columns = ['User', 'Title', 'Rating'])
        for user in df_user['User'].unique():

            user_movies = df_user[df_user['User'] == user]['Title'].unique()

            # predict value for each movie in dataset.
            pred_list = []
            for movie in range(1, len(df_movie.Title.unique())):
                rating = self.algo_SVD.predict(user, movie).est
                pred_list.append([user, movie, rating])

            recommendations_for_user = pd.DataFrame(pred_list, columns = ['User', 'Title', 'Rating'])

            # remove already watched movies from recommendations
            recommendations_for_user = recommendations_for_user[~ recommendations_for_user['Title'].isin(user_movies)].sort_values('Rating',ascending=False).head(num_of_recomendations)
            recommendations = pd.concat([recommendations, recommendations_for_user])
        return recommendations

In [16]:
cf_recommender = CFRecommender(pd.concat([df_user, df_user_validation]), df_movie)
cf_recommender.train(pd.concat([df_user, df_user_validation]))
recommendations = cf_recommender.predict(df_user_validation, 5)
decode(recommendations)

Unnamed: 0,User,Title,Rating
1535,honorciak,Dwunastu gniewnych ludzi_1957,8.567738
3297,honorciak,Lista Schindlera_1993,8.471069
4162,honorciak,Nietykalni_2011,8.406124
5263,honorciak,Pulp Fiction_1994,8.239491
2229,honorciak,Harry Potter i Insygnia Śmierci: Część II_2011,8.178782
6076,piotrr99,Szeregowiec Ryan_1998,8.682948
6495,piotrr99,"Trzy billboardy za Ebbing, Missouri_2017",8.53237
3297,piotrr99,Lista Schindlera_1993,8.523958
2005,piotrr99,Gladiator_2000,8.441833
6423,piotrr99,Top Gun: Maverick_2022,8.401216


# 3. Content based
* movie similarity

In [None]:
# TODO:

# 4. Final model