In [1]:
from __future__ import annotations

%load_ext autoreload
%autoreload 2

In [2]:
from typing import Union

from data_import import load_data
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from surprise import KNNWithMeans, SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

In [3]:
# import data
df_user, df_movie = load_data()

Data cleaning

In [4]:
df_movie = df_movie[df_movie["Avg_rating"] <= 10]

concat = lambda df, col1, col2: df[col1].astype(str) + "_" + df[col2].astype(str)

df_movie["Title"] = concat(df_movie, "Title", "Year")
df_user["Title"] = concat(df_user, "Title", "Year")
df_user = df_user.drop(columns="Year")

Feature engineering

In [5]:
# percentage difference between user rating and average rating (user-avg)

df_user_temp = df_user.merge(
    df_movie.groupby(["Title"])["Avg_rating"].max().reset_index(),
    on=["Title"],
    how="left",
)

df_user["Avg_user_rating_diff"] = df_user_temp["Rating"] - df_user_temp["Avg_rating"]

df_user = df_user.dropna()

In [6]:
# Mapping users to numbers
user_dict = {user: i for i, user in enumerate(df_user.User.unique())}
movie_dict = {title: i for i, title in enumerate(df_movie.Title.unique())}

df_user[["User", "Title"]] = df_user[["User", "Title"]].agg(
    {"User": lambda x: user_dict[x], "Title": lambda x: movie_dict[x]}
)
df_movie[["Title"]] = df_movie[["Title"]].agg({"Title": lambda x: movie_dict[x]})

In [7]:
# TODO: change that to get only subset of the movies.

val_user_list = [user_dict[it] for it in ["sokoly35", "honorciak", "piotrr99"]]
df_user_validation = df_user[df_user["User"].isin(val_user_list)]

# exclude us from df_user
df_user = df_user[~df_user.index.isin(df_user_validation.index)]

In [8]:
df_user_validation

Unnamed: 0,User,Title,Rating,Avg_user_rating_diff
9900,82,5744,9.0,0.2
9901,82,7433,10.0,1.4
9902,82,318,6.0,-0.5
9903,82,2148,3.0,-4.6
9904,82,1171,7.0,0.1
...,...,...,...,...
32486,261,6264,8.0,0.4
32487,261,2469,7.0,-0.6
32488,261,2409,7.0,-0.4
32489,261,7447,5.0,-2.2


# 0. model

In [9]:
class BaseRecommender(ABC):
    def __init__(self, df_user: pd.DataFrame, df_movie: pd.DataFrame):
        self.df_user = df_user
        self.df_movie = df_movie

    """ Base abstract class for recommendation techniques."""

    def train(self, df_user: Union[pd.DataFrame, None], **kwargs):
        raise NotImplementedError

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        raise NotImplementedError

In [10]:
# decode users and movies
def decode(df):
    user_dict_decoder = {val: key for key, val in user_dict.items()}
    movie_dict_decoder = {val: key for key, val in movie_dict.items()}
    df_cols = df.columns
    if "User" in df_cols:
        df["User"] = df["User"].agg(lambda x: user_dict_decoder[x])
    if "Title" in df_cols:
        df["Title"] = df["Title"].agg(lambda x: movie_dict_decoder[x])
    return df

# 1. User cold start
* recommendation based on popularity and overall rating of the movie for users with not many rated movies
* we have no data about the user (age, gender, ...) so we don't use it as a feature in collaborative filtering.

In [11]:
class UCSRecommender(BaseRecommender):
    """user cold start recommender.
    Take n_top_movies and recommend them in order based on Avg_rating."""

    recommendation_table = None

    def train(
        self, df_user: Union[pd.DataFrame, None], n_top_movies: int = 20, **kwargs
    ):
        self.recommendation_table = (
            df_movie.groupby(["Title"])[["Avg_rating", "Number_of_ratings"]]
            .min()
            .reset_index()
            .sort_values("Number_of_ratings", ascending=False)
            .head(n_top_movies)
            .sort_values("Avg_rating", ascending=False)
        )

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        recommendations = pd.DataFrame(columns=["User", "Title"])
        for user in df_user["User"].unique():
            recommendations_for_user = self.recommendation_table.head(
                num_of_recomendations
            )[["Title"]]
            recommendations_for_user["User"] = user
            recommendations = pd.concat([recommendations, recommendations_for_user])
        return recommendations

In [12]:
# example of usage
ucs_recommender = UCSRecommender(df_user, df_movie)
ucs_recommender.train(df_user)
recommendations = ucs_recommender.predict(
    df_user=df_user_validation, num_of_recomendations=10
)

decode(recommendations)

Unnamed: 0,User,Title
5744,honorciak,Skazani na Shawshank_1994
7433,honorciak,Zielona mila_1999
4163,honorciak,Nietykalni_2011
1867,honorciak,Forrest Gump_1994
5264,honorciak,Pulp Fiction_1994
3621,honorciak,Milczenie owiec_1991
3258,honorciak,Leon zawodowiec_1994
6077,honorciak,Szeregowiec Ryan_1998
2006,honorciak,Gladiator_2000
7177,honorciak,Władca Pierścieni: Drużyna Pierścienia_2001


# 2. Collaborative filtering
* user similarity

In [13]:
class CFRecommender(BaseRecommender):
    def __init__(self, df_user, df_movie):
        super(CFRecommender, self).__init__(df_user, df_movie)
        self.algo_SVD = SVD()
        # rating scale
        # reader = Reader(rating_scale = (1,10))
        # self.rating_df = Dataset.load_from_df(df_user[['User','Title', 'Rating']], reader)

    def train(
        self, df_user: Union[pd.DataFrame, None], n_top_movies: int = 20, **kwargs
    ):
        reader = Reader(rating_scale=(1, 10))
        self.rating_df = Dataset.load_from_df(
            df_user[["User", "Title", "Rating"]], reader
        )

        self.algo_SVD.fit(self.rating_df.build_full_trainset())

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        recommendations = pd.DataFrame(columns=["User", "Title", "Rating"])
        for user in df_user["User"].unique():

            user_movies = df_user[df_user["User"] == user]["Title"].unique()

            # predict value for each movie in dataset.
            pred_list = []
            for movie in range(1, len(df_movie.Title.unique())):
                rating = self.algo_SVD.predict(user, movie).est
                pred_list.append([user, movie, rating])

            recommendations_for_user = pd.DataFrame(
                pred_list, columns=["User", "Title", "Rating"]
            )

            # remove already watched movies from recommendations
            recommendations_for_user = (
                recommendations_for_user[
                    ~recommendations_for_user["Title"].isin(user_movies)
                ]
                .sort_values("Rating", ascending=False)
                .head(num_of_recomendations)
            )
            recommendations = pd.concat([recommendations, recommendations_for_user])
        return recommendations

In [14]:
cf_recommender = CFRecommender(pd.concat([df_user, df_user_validation]), df_movie)
cf_recommender.train(pd.concat([df_user, df_user_validation]))
recommendations = cf_recommender.predict(df_user_validation, 5)
decode(recommendations)

Unnamed: 0,User,Title,Rating
1535,honorciak,Dwunastu gniewnych ludzi_1957,8.588125
5263,honorciak,Pulp Fiction_1994,8.456092
3297,honorciak,Lista Schindlera_1993,8.408468
6076,honorciak,Szeregowiec Ryan_1998,8.343446
5849,honorciak,Spider-Man Uniwersum_2018,8.312519
3297,piotrr99,Lista Schindlera_1993,8.740565
6076,piotrr99,Szeregowiec Ryan_1998,8.589543
4712,piotrr99,Pianista_2002,8.588165
654,piotrr99,Blade Runner 2049_2017,8.493203
944,piotrr99,Chłopcy z ferajny_1990,8.476726


# 3. Content based
* movie similarity

In [15]:
class CBRecommender(BaseRecommender):
    def __init__(self, df_movie):
        self.movie_items = self.create_movie_items(df_movie)

    def create_movie_items(self, df_movie):
        """
        Function prepares movie_items matrix which:
        - In each row is single movie
        - In each column is movie's asset (mostly one hot encoding of actors and genres)
        """
        # Create columns assignment ('Gnere/Actor: {genre/actor}')
        # Its preparation for one hot encoding
        temp = df_movie.copy()
        temp["Genre_name"] = "Genre: " + temp["Genre"]
        temp["Actor_name"] = "Actor: " + temp["Actor"]

        # Create movie_items
        movie_items = (
            temp
            # returns Title and for different rows either actors and genre
            .melt("Title", ["Genre_name", "Actor_name"], value_name="Class")
            # Assign a ghost column with 1 which will turn to one hot encodng in pivot table
            .assign(value=1)
            # Sometimes for the same title occured multiple same genres... which occured after melting
            # For pivoting we have to discard them
            .drop_duplicates()
            # Pivot table => matrix of 0/1 with n_title rows x n_{actor/genre} columns
            .pivot(index="Title", columns="Class")["value"].fillna(0)
        )

        # We will concat the columns of average rating and number of ratings to movie_items
        temp2 = temp.groupby("Title").agg({"Avg_rating": min, "Number_of_ratings": min})

        # Because we will base our analysis on the value of dot product let's normalize new columns which
        # have values greater than 1
        scaler = MinMaxScaler()

        movie_items[["Avg_rating", "Number_of_ratings"]] = scaler.fit_transform(temp2)
        return movie_items

    def predict_for_single_user(self, df_user, user_id, n_recommendations):
        """Function recommends provided number of movies to user"""
        # Take user data
        user_data = df_user[df_user["User"] == user_id]

        # Find watched movies
        watched_movies = user_data.Title.values

        # Create user_items => select movie's items for user watched movies from our database
        user_items = self.movie_items.loc[watched_movies]

        # Take user diff ratings as an appropriate weight for dot product
        ratings = user_data["Avg_user_rating_diff"].values

        # Calculate dot product accross watched movies and those from db
        # Weight the solution by rating diff of user
        similarity_matrix = (self.movie_items @ user_items.T) * ratings
        # For watched movies assign value 0
        similarity_matrix.loc[watched_movies] = 0

        recommendations = list(
            similarity_matrix
            # Take max across row
            .max(axis=1)
            # Sort in descending way
            .sort_values(ascending=False)
            # Take n recommendations
            .iloc[:n_recommendations].index.values
        )
        return recommendations

    def train(
        self, df_user: Union[pd.DataFrame, None], n_top_movies: int = 20, **kwargs
    ):
        pass

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        """Predict movie recomendations for provided users with their data"""
        # Find all unique users
        user_ids = np.unique(df_user["User"])

        # Generate recommendations
        recommendations = pd.DataFrame(
            [
                (user_id, movie)
                for user_id in user_ids
                # Generate recommendations for each user
                for movie in self.predict_for_single_user(
                    df_user, user_id, num_of_recomendations
                )
            ],
            columns=["User", "Title"],
        )
        return recommendations

In [16]:
cb_recommender = CBRecommender(df_movie)
recommendations = cb_recommender.predict(
    df_user=df_user_validation, num_of_recomendations=10
)
decode(recommendations)

Unnamed: 0,User,Title
0,honorciak,Zakochany bez pamięci_2004
1,honorciak,Brazil_1985
2,honorciak,Nie patrz w górę_2021
3,honorciak,Człowiek z księżyca_1999
4,honorciak,Gorączka_2021
5,honorciak,Walka żywiołów_2005
6,honorciak,I Love You Phillip Morris_2009
7,honorciak,Niania w Nowym Jorku_2007
8,honorciak,Zakochany geniusz_2018
9,honorciak,Pomniejszenie_2017


# 4. Final model