# analysis, dirty code etc

In [1]:
from __future__ import annotations

%load_ext autoreload
%autoreload 2

In [2]:
from typing import Union

from scrap_data.data_import import load_data
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np

from surprise import KNNWithMeans, SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

In [3]:
# import data
df_user, df_movie = load_data()

Data cleaning

In [4]:
df_movie = df_movie[df_movie["Avg_rating"] <= 10]

concat = lambda df, col1, col2: df[col1].astype(str) + "_" + df[col2].astype(str)

df_movie["Title"] = concat(df_movie, "Title", "Year")
df_user["Title"] = concat(df_user, "Title", "Year")
df_user = df_user.drop(columns="Year")

Feature engineering

In [5]:
# percentage difference between user rating and average rating (user-avg)

df_user_temp = df_user.merge(
    df_movie.groupby(["Title"])["Avg_rating"].max().reset_index(),
    on=["Title"],
    how="left",
)

df_user["Avg_user_rating_diff"] = df_user_temp["Rating"] - df_user_temp["Avg_rating"]

df_user = df_user.dropna()

In [6]:
# Mapping users to numbers
user_dict = {user: i for i, user in enumerate(df_user.User.unique())}
movie_dict = {title: i for i, title in enumerate(df_movie.Title.unique())}

df_user[["User", "Title"]] = df_user[["User", "Title"]].agg(
    {"User": lambda x: user_dict[x], "Title": lambda x: movie_dict[x]}
)
df_movie[["Title"]] = df_movie[["Title"]].agg({"Title": lambda x: movie_dict[x]})

Preparing final users to validate. Us.

In [7]:
val_user_list = [user_dict[it] for it in ["sokoly35", "honorciak", "piotrr99"]]
df_user_validation = df_user[df_user["User"].isin(val_user_list)]

# exclude us from df_user
df_user = df_user[~df_user.index.isin(df_user_validation.index)]

In [8]:
df_user_validation

Unnamed: 0,User,Title,Rating,Avg_user_rating_diff
9900,82,5744,9.0,0.2
9901,82,7433,10.0,1.4
9902,82,318,6.0,-0.5
9903,82,2148,3.0,-4.6
9904,82,1171,7.0,0.1
...,...,...,...,...
32486,261,6264,8.0,0.4
32487,261,2469,7.0,-0.6
32488,261,2409,7.0,-0.4
32489,261,7447,5.0,-2.2


# 0. model

# 1. User cold start
* recommendation based on popularity and overall rating of the movie for users with not many rated movies
* we have no data about the user (age, gender, ...) so we don't use it as a feature in collaborative filtering.

In [11]:
class UCSRecommender(BaseRecommender):
    """user cold start recommender.
    Take n_top_movies and recommend them in order based on Avg_rating."""

    recommendation_table = None

    def train(
        self, df_user: Union[pd.DataFrame, None], n_top_movies: int = 20, **kwargs
    ):
        self.recommendation_table = (
            df_movie.groupby(["Title"])[["Avg_rating", "Number_of_ratings"]]
            .min()
            .reset_index()
            .sort_values("Number_of_ratings", ascending=False)
            .head(n_top_movies)
            .sort_values("Avg_rating", ascending=False)
        )

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        recommendations = pd.DataFrame(columns=["User", "Title"])
        for user in df_user["User"].unique():
            recommendations_for_user = self.recommendation_table.head(
                num_of_recomendations
            )[["Title"]]
            recommendations_for_user["User"] = user
            recommendations = pd.concat([recommendations, recommendations_for_user])
        return recommendations

In [12]:
# example of usage
ucs_recommender = UCSRecommender(df_user, df_movie)
ucs_recommender.train(df_user)
recommendations = ucs_recommender.fit(
# recommendations = ucs_recommender.train(
    df_user=df_user_validation, num_of_recomendations=10
)

decode(recommendations, user_dict, movie_dict)

AttributeError: 'NoneType' object has no attribute 'columns'

# 2. Collaborative filtering
* user similarity

In [None]:
# TODO:

# KNN
similarity = {
    "name": "cosine",
    "user_based": False,  # item-based similarity
}
algo_KNN = KNNWithMeans(sim_options=similarity)

# SVD
algo_SVD = SVD()

In [None]:
reader = Reader(rating_scale=(1, 10))
rating_df = Dataset.load_from_df(df_user[["User", "Title", "Rating"]], reader)

# from surprise.model_selection import cross_validate
# cross_validate_KNN = cross_validate(algo_KNN, rating_df, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# cross_validate_SVD = cross_validate(algo_SVD, rating_df, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# define train test function
def train_test_algo(algo, label):
    training_set, testing_set = train_test_split(rating_df, test_size=0.2)
    algo.fit(training_set)
    test_output = algo.test(testing_set)
    test_df = pd.DataFrame(test_output)

    print("RMSE -", label, accuracy.rmse(test_output, verbose=False))
    print("MAE -", label, accuracy.mae(test_output, verbose=False))
    print("MSE -", label, accuracy.mse(test_output, verbose=False))

    return test_df

In [None]:
train_test_KNN = train_test_algo(algo_KNN, "algo_KNN")
print(train_test_KNN.head())
train_test_SVD = train_test_algo(algo_SVD, "algo_SVD")
print(train_test_SVD.head())

# as we can see SVD does better job, so we process with SVD.

ValueError: test_size=0 should be strictly greater than 0

In [None]:
def prediction(algo, users_K):
    pred_list = []
    for userId in range(1, users_K):
        for movieId in range(1, len(df_movie.Title.unique())):
            rating = algo.predict(userId, movieId).est
            pred_list.append([userId, movieId, rating])
    pred_df = pd.DataFrame(pred_list, columns=["userId", "movieId", "rating"])
    return pred_df

In [None]:
# def top_recommendations(pred_df, top_N):
#     link_movie = pd.merge(pred_df, links_df, how='inner', left_on='movieId', right_on='movieId')
#     recommended_movie = pd.merge(link_movie, movie_df, how='left', left_on='imdbId', right_on='imdb_id')[['userId', 'movieId', 'rating', 'movieId','imdb_id','title']]
#     sorted_df = recommended_movie.groupby(('userId'), as_index = False).apply(lambda x: x.sort_values(['rating'], ascending = False)).reset_index(drop=True)
#     top_recommended_movies = sorted_df.groupby('userId').head(top_N)
#     return sorted_df, top_recommended_movies

In [None]:
# KNN predictions
pred_KNN = prediction(algo_KNN, 10)
pred_KNN
# recommended_movies_KNN, top_recommended_movies_KNN = top_recommendations(pred_KNN, 3)
# ## SVD predictions
# pred_SVD = prediction(algo_SVD, 10)
# recommended_movies_SVD, top_recommended_movies_SVD = top_recommendations(pred_SVD, 3)

Unnamed: 0,userId,movieId,rating
0,1,1,1.000000
1,1,2,7.166667
2,1,3,7.098803
3,1,4,5.566667
4,1,5,4.733333
...,...,...,...
69592,9,7729,5.969287
69593,9,7730,7.969287
69594,9,7731,8.168812
69595,9,7732,9.135745


In [None]:
class CFRecommender(BaseRecommender):
    def __init__(self, df_user, df_movie):
        super(CFRecommender, self).__init__(df_user, df_movie)
        self.algo_SVD = SVD()
        # rating scale
        # reader = Reader(rating_scale = (1,10))
        # self.rating_df = Dataset.load_from_df(df_user[['User','Title', 'Rating']], reader)

    def train(
        self, df_user: Union[pd.DataFrame, None], n_top_movies: int = 20, **kwargs
    ):
        reader = Reader(rating_scale=(1, 10))
        self.rating_df = Dataset.load_from_df(
            df_user[["User", "Title", "Rating"]], reader
        )

        self.algo_SVD.fit(self.rating_df.build_full_trainset())

    def predict(self, df_user: pd.DataFrame, num_of_recomendations: int, **kwargs):
        recommendations = pd.DataFrame(columns=["User", "Title", "Rating"])
        for user in df_user["User"].unique():

            user_movies = df_user[df_user["User"] == user]["Title"].unique()

            # predict value for each movie in dataset.
            pred_list = []
            for movie in range(1, len(df_movie.Title.unique())):
                rating = self.algo_SVD.predict(user, movie).est
                pred_list.append([user, movie, rating])

            recommendations_for_user = pd.DataFrame(
                pred_list, columns=["User", "Title", "Rating"]
            )

            # remove already watched movies from recommendations
            recommendations_for_user = (
                recommendations_for_user[
                    ~recommendations_for_user["Title"].isin(user_movies)
                ]
                .sort_values("Rating", ascending=False)
                .head(num_of_recomendations)
            )
            recommendations = pd.concat([recommendations, recommendations_for_user])
        return recommendations

In [None]:
cf_recommender = CFRecommender(pd.concat([df_user, df_user_validation]), df_movie)
cf_recommender.train(pd.concat([df_user, df_user_validation]))
recommendations = cf_recommender.predict(df_user_validation, 5)
decode(recommendations, user_dict, movie_dict)

Unnamed: 0,User,Title,Rating
5263,honorciak,Pulp Fiction_1994,8.519361
1535,honorciak,Dwunastu gniewnych ludzi_1957,8.440207
3297,honorciak,Lista Schindlera_1993,8.437749
4162,honorciak,Nietykalni_2011,8.372387
3338,honorciak,Lot nad kukułczym gniazdem_1975,8.34205
5857,piotrr99,Spirited Away: W krainie Bogów_2001,8.672691
2005,piotrr99,Gladiator_2000,8.481566
6848,piotrr99,Whiplash_2014,8.465827
3297,piotrr99,Lista Schindlera_1993,8.431235
6076,piotrr99,Szeregowiec Ryan_1998,8.403267


# 3. Content based
* movie similarity

In [None]:
# TODO:

# 4. Final model