In [10]:
import json
from pathlib import Path
from typing import Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares

from catboost import (
    CatBoostClassifier, 
    Pool
)
from catboost.metrics import (
    BalancedAccuracy, 
    Logloss
)

### Read data

In [7]:
# Directory that holds the CSV/JSON inputs loaded in this cell.
data_path = Path('data') / 'hse'

def meta_to_df(meta_raw):
    """Flatten the raw catalogue mapping into a DataFrame.

    Parameters
    ----------
    meta_raw : mapping
        Maps an element id (anything ``int()`` accepts) to a record that
        provides at least the ``'duration'`` and ``'type'`` entries.

    Returns
    -------
    pd.DataFrame
        One row per catalogue entry, in the mapping's iteration order, with
        columns ``element_uid`` (int), ``duration`` (source value converted
        to float and multiplied by 60) and ``type``.
    """
    entries = list(meta_raw.items())
    return pd.DataFrame({
        'element_uid': [int(uid) for uid, _ in entries],
        # NOTE(review): the factor 60 presumably converts minutes to seconds
        # so that it is comparable with `watched_time` -- confirm.
        'duration': [float(info['duration']) * 60 for _, info in entries],
        'type': [info['type'] for _, info in entries],
    })

# Interaction logs: one CSV per interaction kind, all under `data_path`.
ratings = pd.read_csv(data_path / 'train_ratings.csv')
bookmarks = pd.read_csv(data_path / 'train_bookmarks.csv')
transactions = pd.read_csv(data_path / 'train_transactions.csv')

# Item catalogue: read the JSON first, then flatten it once the file is closed.
with (data_path / 'catalogue.json').open('r') as f:
    meta_raw = json.load(f)
meta = meta_to_df(meta_raw)

### Load CatBoost model

In [11]:
# Second-stage ranker: a CatBoost classifier restored from a saved model file.
# The result of `load_model` is what gets bound to `catboost_ranker`.
catboost_ranker = CatBoostClassifier().load_model('data/catboost_full.cbm')

### Generator code

In [4]:
def add_meta_to_transactions(transactions, meta):
    """Attach catalogue metadata and an implicit score to every transaction.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain ``element_uid`` and ``watched_time``.
    meta : pd.DataFrame
        Must contain ``element_uid``, ``duration`` and ``type``.

    Returns
    -------
    pd.DataFrame
        ``transactions`` left-joined with ``duration``/``type`` plus two new
        columns:

        * ``watched_ratio`` = ``watched_time / duration``;
        * ``score`` (int64 in ``[1, 10]``), computed as follows:
            - rows without a positive ``duration`` (missing or zero) get 1;
            - ``type == 'movie'`` and ratio > 0.3 -> ``floor(ratio * 9) + 1``;
            - any other type and ratio > 1 -> ``floor(ratio / 2) + 1``;
            - everything else gets 1; the result is capped at 10.

    Notes
    -----
    The score is computed with vectorised NumPy operations instead of a
    row-wise ``DataFrame.apply``; the rules are unchanged, but the function is
    much faster on large logs and also works on an empty frame.
    """
    transactions_with_meta = transactions.merge(
        meta[['element_uid', 'duration', 'type']],
        on='element_uid',
        how='left',
    )

    transactions_with_meta['watched_ratio'] = (
        transactions_with_meta['watched_time'] /
        transactions_with_meta['duration']
    )

    ratio = transactions_with_meta['watched_ratio'].to_numpy(dtype='float64')
    # NaN durations compare False, so unmatched catalogue rows fall to score 1.
    has_duration = (transactions_with_meta['duration'] > 0).to_numpy(dtype=bool)
    is_movie = (transactions_with_meta['type'] == 'movie').to_numpy(dtype=bool)

    movie_rule = has_duration & is_movie & (ratio > 0.3)
    other_rule = has_duration & ~is_movie & (ratio > 1)

    raw_score = np.select(
        [movie_rule, other_rule],
        # Ratios are positive wherever a rule fires, so floor == int() truncation.
        [np.floor(ratio * 9) + 1, np.floor(ratio / 2) + 1],
        default=1.0,
    )
    # Cap before casting so that extreme ratios cannot overflow the int cast.
    transactions_with_meta['score'] = np.minimum(raw_score, 10).astype('int64')

    return transactions_with_meta


def add_ratings_to_transactions(transactions, ratings):
    """Outer-join explicit ratings onto transactions and back-fill the score.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain ``user_uid``, ``element_uid`` and ``score``.
    ratings : pd.DataFrame
        Must contain ``user_uid``, ``element_uid`` and ``rating``.

    Returns
    -------
    pd.DataFrame
        The outer join on ``(user_uid, element_uid)``. Rows that exist only in
        ``ratings`` have no ``score`` after the join, so their ``score`` is
        taken from ``rating``; existing scores are left untouched.
    """
    transactions_with_ratings = transactions.merge(
        ratings[['user_uid', 'element_uid', 'rating']],
        on=['user_uid', 'element_uid'],
        how='outer',
    )

    # Assign the filled column back explicitly. The chained
    # `df['score'].fillna(..., inplace=True)` form works on a temporary under
    # pandas Copy-on-Write and would leave the frame unchanged.
    transactions_with_ratings['score'] = (
        transactions_with_ratings['score']
        .fillna(transactions_with_ratings['rating'])
    )

    return transactions_with_ratings


def encode_tfidf_coo(transactions: pd.DataFrame) -> pd.DataFrame:
    """Compute a TF-IDF style weight for every (user, element) row.

    For each row of ``transactions``:

    * tf  = ``score`` / (sum of ``score`` over the row's user);
    * idf = log(number of rows in ``transactions`` /
      number of rows that share the row's element).

    Note that the idf numerator is the total row count, not the number of
    distinct users.

    Returns
    -------
    pd.DataFrame
        Columns ``user_uid``, ``element_uid`` and ``value`` (= tf * idf), one
        row per input row and in the same order.
    """
    user_score_total = transactions.groupby('user_uid')['score'].transform('sum')
    element_row_count = transactions.groupby('element_uid')['user_uid'].transform('size')

    n_rows = len(user_score_total)
    term_frequency = transactions['score'].values / user_score_total.values
    inverse_frequency = np.log(n_rows / element_row_count.values)

    return (
        transactions[['user_uid', 'element_uid']]
        .assign(value=term_frequency * inverse_frequency)
    )


def encode_tfidf(transactions: pd.DataFrame) -> Tuple[LabelEncoder, LabelEncoder, csr_matrix]:
    """Build the sparse user x element TF-IDF matrix and its id encoders.

    Returns
    -------
    (user_encoder, element_encoder, tfidf_csr)
        Two ``LabelEncoder`` objects fitted on ``user_uid`` / ``element_uid``
        (they map raw ids to row / column positions) and a float32 CSR matrix
        holding the weights produced by ``encode_tfidf_coo``.
    """
    weights = encode_tfidf_coo(transactions)

    # Fit one encoder per axis; the encoded positions index the matrix.
    user_encoder, element_encoder = LabelEncoder(), LabelEncoder()
    row_positions = user_encoder.fit_transform(transactions['user_uid'].values)
    col_positions = element_encoder.fit_transform(transactions['element_uid'].values)

    matrix_shape = (
        weights['user_uid'].nunique(),
        weights['element_uid'].nunique(),
    )
    values = weights['value'].astype('float32').values
    tfidf_csr = csr_matrix(
        (values, (row_positions, col_positions)),
        shape=matrix_shape,
    )

    return user_encoder, element_encoder, tfidf_csr


def als_fit_predict(transactions_csr: csr_matrix, random_state=None):
    """Fit ALS on the weighted interaction matrix and recommend for all users.

    Parameters
    ----------
    transactions_csr : csr_matrix
        User x element weight matrix. It is NOT modified: the confidence
        transform ``1 + 40.0 * value`` is applied to a copy, so calling this
        function again with the same matrix does not compound the scaling.
    random_state : optional
        Forwarded to ``AlternatingLeastSquares``. The default ``None`` keeps
        the library's default behaviour; pass a value for reproducible runs.

    Returns
    -------
    (recommendations_matrix, recommendations_scores)
        The two values returned by ``als.recommend`` when it is called for
        every row index of the matrix with ``N=200`` and
        ``filter_already_liked_items=True``.
    """
    # Work on a copy so the caller's matrix keeps its original weights.
    confidence_csr = transactions_csr.copy()
    confidence_csr.data = 1 + 40.0 * confidence_csr.data

    als = AlternatingLeastSquares(
        factors=128,
        iterations=30,
        calculate_training_loss=True,
        random_state=random_state,
    )
    als.fit(confidence_csr)

    recommendations_matrix, recommendations_scores = als.recommend(
        np.arange(0, confidence_csr.shape[0]),
        confidence_csr,
        N=200,
        filter_already_liked_items=True,
    )

    return recommendations_matrix, recommendations_scores


def als_recommendations_to_df(
    recommendations_matrix: np.ndarray, 
    recommendations_scores: np.ndarray,
    user_encoder: LabelEncoder, 
    item_encoder: LabelEncoder,
    user_key = 'user_id',
    item_key = 'item_id'
) -> pd.DataFrame:
    """Turn per-user recommendation arrays into a long (user, item, score) frame.

    Row ``i`` of ``recommendations_matrix`` / ``recommendations_scores`` is
    treated as the recommendations of the user at encoder position ``i``.
    Positions are translated back to raw ids through the encoders'
    ``classes_`` arrays.

    Returns
    -------
    pd.DataFrame
        One row per (user, recommended item) with columns ``score``,
        ``user_key`` and ``item_key``.
    """
    # One row per user; each cell holds that user's whole recommendation array.
    per_user = pd.DataFrame({
        'user_index': np.arange(0, len(recommendations_matrix)),
        'item_index': list(recommendations_matrix),
        'score': list(recommendations_scores),
    })

    # Lookup tables: encoder position -> raw id.
    user_lookup = pd.DataFrame({
        'user_index': np.arange(0, len(user_encoder.classes_)),
        user_key: user_encoder.classes_,
    })
    item_lookup = pd.DataFrame({
        'item_index': np.arange(0, len(item_encoder.classes_)),
        item_key: item_encoder.classes_,
    })

    with_user_ids = (
        per_user
        .merge(user_lookup, on='user_index', how='left')
        .drop(columns=['user_index'])
    )
    # Unnest the parallel item/score arrays into one row per recommendation.
    long_form = with_user_ids.explode(['item_index', 'score'], ignore_index=True)
    with_item_ids = long_form.merge(item_lookup, on='item_index', how='left')

    return with_item_ids.drop(columns=['item_index'])


def run_als(
    transactions: pd.DataFrame, 
    meta: pd.DataFrame, 
    ratings: pd.DataFrame
) -> pd.DataFrame:
    """Run the candidate-generation stage end to end.

    Steps: score transactions with catalogue metadata, merge in explicit
    ratings, encode to a TF-IDF sparse matrix, fit ALS and recommend, then
    convert the result to a long DataFrame keyed by ``user_uid`` /
    ``element_uid``. Progress is reported with ``print``.
    """
    print('Preprocess transactions')
    scored = add_meta_to_transactions(transactions, meta)
    scored = add_ratings_to_transactions(scored, ratings)

    print('Compute TF-IDF')
    user_encoder, element_encoder, weights_csr = encode_tfidf(scored)

    print('Run ALS')
    item_indices, item_scores = als_fit_predict(weights_csr)

    print('Postprocess ALS prediction')
    return als_recommendations_to_df(
        item_indices,
        item_scores,
        user_encoder,
        element_encoder,
        user_key='user_uid',
        item_key='element_uid',
    )

### Ranker code

In [19]:
def merge_item_feature(item_features, new_feature):
    """Left-join ``new_feature`` onto ``item_features`` by ``element_uid``.

    Every row of ``item_features`` is kept; items missing from
    ``new_feature`` get NaN in the added columns. Returns a new frame.
    """
    return item_features.merge(new_feature, on='element_uid', how='left')


def merge_user_feature(iser_features, new_feature):
    """Left-join ``new_feature`` onto the user feature table by ``user_uid``.

    Every row of the first argument is kept; users missing from
    ``new_feature`` get NaN in the added columns. Returns a new frame.

    The first parameter is spelled ``iser_features`` (sic); the name is left
    as is so that keyword callers keep working.
    """
    return iser_features.merge(new_feature, on='user_uid', how='left')


def add_item_popularity_feature(item_features: pd.DataFrame, transactions: pd.DataFrame):
    """Add an ``element_popularity`` column to ``item_features``.

    Popularity is the number of transaction rows of an element divided by the
    number of distinct elements in ``transactions`` (not by the number of
    rows or users). The column is left-joined onto ``item_features``.
    """
    n_distinct_elements = transactions['element_uid'].nunique()

    popularity = (
        transactions
        .groupby('element_uid')
        .size()
        .div(n_distinct_elements)
        .reset_index(name='element_popularity')
    )

    return merge_item_feature(item_features, popularity)


def add_item_bookmark_count_feature(item_features: pd.DataFrame, bookmarks: pd.DataFrame):
    """Add an ``element_bookmark_count`` column to ``item_features``.

    The value is the number of rows in ``bookmarks`` for each ``element_uid``;
    items that never appear in ``bookmarks`` get 0.

    Returns
    -------
    pd.DataFrame
        A new frame: ``item_features`` left-joined with the count column.
    """
    bookmarks_per_item = (
        bookmarks
        .groupby('element_uid')
        .size()
        .reset_index(name='element_bookmark_count')
    )

    item_features = merge_item_feature(item_features, bookmarks_per_item)
    # Assign the filled column back explicitly. The chained
    # `df[col].fillna(0, inplace=True)` form operates on a temporary under
    # pandas Copy-on-Write and would leave the NaNs in place.
    item_features['element_bookmark_count'] = (
        item_features['element_bookmark_count'].fillna(0)
    )

    return item_features


def add_user_watch_count_feature(user_features: pd.DataFrame, transactions: pd.DataFrame):
    """Add ``user_watch_count``: the number of transaction rows per user.

    The counts are left-joined onto ``user_features`` by ``user_uid``.
    """
    watch_counts = (
        transactions
        .groupby('user_uid')
        .size()
        .rename('user_watch_count')
        .reset_index()
    )
    return merge_user_feature(user_features, watch_counts)


def add_user_watch_time_std(user_features: pd.DataFrame, transactions: pd.DataFrame):
    """Add ``user_watch_time_std``: per-user std of ``watched_time``.

    Uses the pandas default for ``GroupBy.std``. The values are left-joined
    onto ``user_features`` by ``user_uid``.
    """
    watch_time_spread = (
        transactions
        .groupby('user_uid')['watched_time']
        .std()
        .rename('user_watch_time_std')
        .reset_index()
    )
    return merge_user_feature(user_features, watch_time_spread)


def generate_item_features(transactions: pd.DataFrame, bookmarks: pd.DataFrame) -> pd.DataFrame:
    """Build the item feature table.

    Starts from the distinct ``element_uid`` values seen in ``transactions``
    and adds, in order, the popularity and bookmark-count features.
    """
    distinct_items = pd.DataFrame({
        'element_uid': transactions['element_uid'].unique()
    })
    return (
        distinct_items
        .pipe(add_item_popularity_feature, transactions)
        .pipe(add_item_bookmark_count_feature, bookmarks)
    )


def generate_user_features(transactions: pd.DataFrame) -> pd.DataFrame:
    """Build the user feature table.

    Starts from the distinct ``user_uid`` values seen in ``transactions`` and
    adds, in order, the watch-count and watch-time-std features.
    """
    distinct_users = pd.DataFrame({
        'user_uid': transactions['user_uid'].unique()
    })
    return (
        distinct_users
        .pipe(add_user_watch_count_feature, transactions)
        .pipe(add_user_watch_time_std, transactions)
    )


def enrich_interactions(
    interactions: pd.DataFrame, 
    transactions: pd.DataFrame, 
    bookmarks: pd.DataFrame
) -> pd.DataFrame:
    """Attach item and user features to candidate (user, element) pairs.

    Item features are joined on ``element_uid`` first, then user features on
    ``user_uid``; both are left joins, so every row of ``interactions`` is
    kept. Progress is reported with ``print``.
    """
    print('Generating item features')
    item_table = generate_item_features(transactions, bookmarks)

    print('Generating user features')
    user_table = generate_user_features(transactions)

    print('Merging features to interactions')
    with_item_features = interactions.merge(item_table, on='element_uid', how='left')
    with_all_features = with_item_features.merge(user_table, on='user_uid', how='left')

    return with_all_features


def score_interactions(
    interactions_featurized: pd.DataFrame,
    catboost_model: CatBoostClassifier, 
) -> pd.DataFrame:
    """Score every featurized (user, element) pair with the CatBoost model.

    All columns except ``user_uid`` and ``element_uid`` are passed to the
    model as features. The score is column 1 of ``predict_proba``.

    Returns
    -------
    pd.DataFrame
        Columns ``user_uid``, ``element_uid`` and ``catboost_score``.
    """
    key_columns = ['user_uid', 'element_uid']

    feature_frame = interactions_featurized.drop(columns=key_columns)
    class_one_proba = catboost_model.predict_proba(feature_frame)[:, 1].flatten()

    scored = interactions_featurized[key_columns].copy()
    scored['catboost_score'] = class_one_proba
    return scored


def run_catboost(
    interactions: pd.DataFrame, 
    transactions: pd.DataFrame, 
    bookmarks: pd.DataFrame,
    catboost_model: CatBoostClassifier
) -> pd.DataFrame:
    """Run the ranking stage: featurize the candidates, then score them.

    Returns the frame produced by ``score_interactions`` (one
    ``catboost_score`` per candidate pair).
    """
    featurized = enrich_interactions(interactions, transactions, bookmarks)

    print('Running CatBoost scoring')
    return score_interactions(featurized, catboost_model)

### Combine all together

In [8]:
# Stage 1: generate ALS candidates for every user from the full training data.
als_prediction_full = run_als(
    transactions=transactions,
    meta=meta,
    ratings=ratings,
)

Preprocess transactions
Compute TF-IDF
Run ALS


  0%|          | 0/30 [00:00<?, ?it/s]

Postprocess ALS prediction


In [21]:
# Stage 2: re-score the ALS candidates with the loaded CatBoost ranker.
catboost_prediction_full = run_catboost(
    interactions=als_prediction_full,
    transactions=transactions,
    bookmarks=bookmarks,
    catboost_model=catboost_ranker,
)

Generating item features
Generating user features
Merging features to interactions
Running CatBoost scoring


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [25]:
# Keep the 10 highest-scored candidates per user: sort everything by the
# CatBoost score (descending), then take the first 10 rows of each user group.
final_recommendations = (
    catboost_prediction_full
    .sort_values(by='catboost_score', ascending=False, ignore_index=True)
    .groupby('user_uid')
    .head(n=10)
)

### Evaluate recommendations on Kaggle test set

In [30]:
# Each row of the solution file stores its element ids as one space-separated
# string; split it and unnest to one (row, element_uid) pair per line.
solution = pd.read_csv('data/solution.csv')
solution['element_uid'] = solution['element_uid'].map(lambda ids: ids.split(' '))
kaggle_test = solution.explode('element_uid', ignore_index=True)
# The exploded ids are still strings; cast them to integers.
kaggle_test['element_uid'] = kaggle_test['element_uid'].astype('int32')

#### ALS-only recommendations

In [31]:
# `compute_recsys_metrics` comes from the project-local `metrics` module.
from metrics import compute_recsys_metrics

# Baseline: evaluate the raw ALS candidates against the Kaggle test pairs at k=10.
# NOTE(review): `als_prediction_full` holds up to 200 candidates per user;
# presumably the helper selects the top-k itself -- confirm in `metrics`.
compute_recsys_metrics(
    als_prediction_full, 
    kaggle_test, 
    k=10, 
    user_key='user_uid', 
    item_key='element_uid'
)

{'recall': 0.07297968601014716, 'map': 0.029372680504801876}

#### Two-stage model: ALS + CatBoost

In [32]:
# `compute_recsys_metrics` comes from the project-local `metrics` module.
from metrics import compute_recsys_metrics

# Two-stage result: evaluate the CatBoost-reranked top-10 lists against the
# same Kaggle test pairs and with the same k as the ALS-only baseline.
compute_recsys_metrics(
    final_recommendations, 
    kaggle_test, 
    k=10, 
    user_key='user_uid', 
    item_key='element_uid'
)

{'recall': 0.08903546806147954, 'map': 0.036405124927073235}

Benefit of the two-stage model (ALS + CatBoost) over ALS-only:
- Recall@10 improvement ~22%
- MAP@10 improvement ~24%