# 0. Configuration

In [306]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [307]:
import numpy as np
import pandas as pd

from itertools import islice, cycle, product

import warnings
warnings.filterwarnings('ignore')

## 1. 1. Helper functions to avoid copy paste

In [308]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [309]:
def compute_popularity(df: pd.DataFrame, item_id: str, max_candidates: int):
    """
    calculates mean rating to define popular titles
    """
    popular_titles = df.groupby(item_id).agg({'rating': np.mean})\
                     .sort_values(['rating'], ascending=False).head(max_candidates).index.values

    return popular_titles

# 2. Data

## 2. 1. Load data

In [310]:
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)

In [311]:
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)

## 2.2 Data preparation

In [312]:
interactions['movieId'] = interactions['movieId'].astype(str)
movies_metadata.rename(columns = {'id': 'movieId'}, inplace = True)

In [313]:
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['movieId'])]

In [314]:
item_name_mapper = dict(zip(movies_metadata['movieId'], movies_metadata['original_title']))

In [315]:
users = interactions[['userId']].drop_duplicates().reset_index(drop = True)

# 3. Model

Let's define our baseline popularity recommender BaselineRecommender - top rated titles based on average rating with possibility to get by any group(s)

The pipeline will be similar to most python ML modules -- it will have two methods in the end: fit() and recommend()
1. The logic of fit() as follow:
- Initiate recommendation based on median rating from all observations recomm_common;
- Prepare list of interacted items by users
- If we set groups - we get recommendations i.e. calculate movie ratings by groups:
    - If we get NaN, we fill with base recommendations 
    - If we get less than required number of candidates, we populate from base recommendations

2. The logic of recommend():
- Return base recommendations if users data is not set;
- In case of category wise requirement -- we get results of our fit

## 3.1. Fit

In [316]:
MAX_CANDIDATES = 20
ITEM_COLUMN = 'movieId'
USER_COLUMN = 'userId'

In [317]:
base_recommendations = compute_popularity(interactions_filtered, ITEM_COLUMN, MAX_CANDIDATES)

In [318]:
known_items = interactions_filtered.groupby(USER_COLUMN)[ITEM_COLUMN].apply(list).to_dict()

In [319]:
group = [np.random.random_integers(2) for x in range(len(users))]
users['group'] = group

In [320]:
# old
data = pd.merge(interactions_filtered, users, how='left', on = USER_COLUMN)
group_recommendations = data.groupby('group').apply(compute_popularity, ITEM_COLUMN, MAX_CANDIDATES)
group_recommendations.head()

group
1    [3576, 55063, 2267, 54328, 5460, 667, 1543, 11...
2    [43267, 93855, 65216, 3112, 33138, 1903, 26791...
dtype: object

In [321]:
# new
data = pd.merge(interactions_filtered, users, how='left', on = USER_COLUMN)
group_recommendations = data.groupby('group').apply(compute_popularity, ITEM_COLUMN, len(movies_metadata))
group_recommendations.head()

group
1    [3576, 55063, 2267, 54328, 5460, 667, 1543, 11...
2    [43267, 93855, 65216, 3112, 33138, 1903, 26791...
dtype: object

## 3. 2. Recommend

In [322]:
recs = list(islice(cycle([base_recommendations]), len(users['userId'])))
users['rekkos'] = recs
users.head()

Unnamed: 0,userId,group,rekkos
0,1,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
1,2,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
2,3,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
3,4,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
4,5,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."


In [323]:
group_recommendations = group_recommendations.reset_index()
group_rekkos = pd.merge(users, group_recommendations, how = 'left', on = 'group')
group_rekkos.rename(columns = {0: 'rekkos'}, inplace = True)
group_rekkos.head()

Unnamed: 0,userId,group,rekkos,rekkos.1
0,1,1,"[74727, 128846, 702, 127728, 65216, 43267, 867...","[3576, 55063, 2267, 54328, 5460, 667, 1543, 11..."
1,2,2,"[74727, 128846, 702, 127728, 65216, 43267, 867...","[43267, 93855, 65216, 3112, 33138, 1903, 26791..."
2,3,2,"[74727, 128846, 702, 127728, 65216, 43267, 867...","[43267, 93855, 65216, 3112, 33138, 1903, 26791..."
3,4,1,"[74727, 128846, 702, 127728, 65216, 43267, 867...","[3576, 55063, 2267, 54328, 5460, 667, 1543, 11..."
4,5,1,"[74727, 128846, 702, 127728, 65216, 43267, 867...","[3576, 55063, 2267, 54328, 5460, 667, 1543, 11..."


## 3.3. Wrap everything into pretty functions

### 3.3.1 Fit part

In [324]:
def fit(
    data: pd.DataFrame,
    item_col: str, groups: list = None,
    max_candidates: int = movies_metadata.movieId.nunique()
    ) -> pd.DataFrame:
    """
    function runs all pipeline to generate recommendations based on given group
    :data: dataframe of interactions
    :item_col: item column name
    :groups: optional, list of groups column names to get recommendations
    :max_candidates: number of recommendations to return
    """
    
    if groups is not None:
        recommendations = data.groupby(groups).apply(compute_popularity, item_col, max_candidates)
    else:
        recommendations = compute_popularity(data, item_col, max_candidates)

    return recommendations

In [325]:
fit(data, item_col=ITEM_COLUMN)

array(['74727', '128846', '702', ..., '48591', '27376', '8859'],
      dtype=object)

In [326]:
fit(data, item_col=ITEM_COLUMN, groups=['group'])

group
1    [3576, 55063, 2267, 54328, 5460, 667, 1543, 11...
2    [43267, 93855, 65216, 3112, 33138, 1903, 26791...
dtype: object

### 3.3.2 Recommend part

In [327]:
def recommend(
    users: pd.DataFrame,
    recommendations: pd.DataFrame,
    groups: list = None,
    K: int = 10) -> pd.DataFrame:
    """
    recommends items for a given list of users
    :users: series / list of users to recommend
    :recommendations: output of fit() function
    :groups: optional, list of groups column names to get recommendations
    :K: number of items to recommend (not always we want to show dozens of items instantly)
    """
    if groups is not None:
        output = pd.merge(users, recommendations.reset_index(), how = 'left', on = 'group')

    else:
        output = users.copy(deep = True)
        recs = list(islice(cycle([recommendations]), len(users['userId'])))
        output['rekkos'] = recs
    
    output.columns = ['userId', 'group', 'rekkos']

    return output


In [328]:
recs = fit(data, item_col=ITEM_COLUMN)
check_recs = recommend(users[['userId', 'group']], recs)
check_recs.head()

Unnamed: 0,userId,group,rekkos
0,1,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
1,2,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
2,3,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
3,4,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
4,5,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."


In [329]:
recs = fit(data, item_col=ITEM_COLUMN, groups = ['group'])
check_recs = recommend(users[['userId', 'group']], recs, ['group'])
check_recs.head()

Unnamed: 0,userId,group,rekkos
0,1,1,"[3576, 55063, 2267, 54328, 5460, 667, 1543, 11..."
1,2,2,"[43267, 93855, 65216, 3112, 33138, 1903, 26791..."
2,3,2,"[43267, 93855, 65216, 3112, 33138, 1903, 26791..."
3,4,1,"[3576, 55063, 2267, 54328, 5460, 667, 1543, 11..."
4,5,1,"[3576, 55063, 2267, 54328, 5460, 667, 1543, 11..."


Congrats! Your first basic recommender system is ready!!

# TODO
- Add filtration of watched items to pipeline
- Also, consider cases when you fitler watched ones and you have less items in recommendations than required i.e. number of recommendations < MAX_CANDIDATES

# Solution
- Rewrite `fit` function to generate maximum amount of recommendations (done above)
- Add `personalize` function, which will clean user recommendations from watched films and keep only top 20 films
- Add `prettify` function to replace movieIds with titles

In [342]:
def personalize(
    recommendations: pd.DataFrame, 
    watched: dict = None,
    max_candidates: int = 20) -> pd.DataFrame:
    """
    Filter by watched items and return top 20 recommendations
    :recommendations: data frame with recomendations 
    :watched: dictionary of watched movies for every user
    :max_candidates: number of recommendations to return
    """
    
    if watched is not None:
        for user in recommendations['userId'].unique():
            watched_movies = watched.get(user, [])
            recommendations.loc[recommendations['userId'] == user, 'rekkos'] = \
                recommendations.loc[recommendations['userId'] == user, 'rekkos'].apply(
                    lambda x: [movie for movie in x if movie not in watched_movies][:20]
                )
    
    return recommendations

In [343]:
def prettify(
    df: pd.DataFrame,
    metadata: pd.DataFrame = movies_metadata) -> pd.DataFrame:
    """
    Replaces movie IDs in recommendations DataFrame with corresponding movie titles from movies_metadata DataFrame
    :df: dataframe with movie codes but not names
    """
    id_to_title = dict(zip(movies_metadata['movieId'], movies_metadata['original_title']))
    df['rekkos'] = df['rekkos'].apply(lambda x: [id_to_title.get(movieId, movieId) for movieId in x])
    
    return df


In [344]:
prettify(check_recs)

Unnamed: 0,userId,group,rekkos
0,1,1,"[Calling Dr. Gillespie, Still Bill, The Last M..."
1,2,2,"[29th Street, Posti in piedi in paradiso, Bloo..."
2,3,2,"[29th Street, Posti in piedi in paradiso, Bloo..."
3,4,1,"[Calling Dr. Gillespie, Still Bill, The Last M..."
4,5,1,"[Calling Dr. Gillespie, Still Bill, The Last M..."
...,...,...,...
666,667,1,"[Calling Dr. Gillespie, Still Bill, The Last M..."
667,668,2,"[29th Street, Posti in piedi in paradiso, Bloo..."
668,669,1,"[Calling Dr. Gillespie, Still Bill, The Last M..."
669,670,2,"[29th Street, Posti in piedi in paradiso, Bloo..."
