## Installs

In [None]:
#Install package Bayesian Factorization Machine
!pip install myfm

## Imports

In [1]:
# To store the data
import pandas as pd

# To do linear algebra
import numpy as np

# To apply Factorization Machines
import myfm
from myfm import RelationBlock
from myfm.utils.encoders import(DataFrameEncoder,MultipleValuesToSparseEncoder,CategoryValueToSparseEncoder)

# To do train-test split for evaluation
from sklearn.model_selection import train_test_split

## Download Data

In [None]:
!pip install kaggle

!mkdir ~/.kaggle

import json

kaggle_username = "yuvalnis" #@param {type:"string"}
kaggle_api_key = "1800d5a286834f0416c338c7bd7f6dee" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2022

!unzip -n cil-collaborative-filtering-2022.zip

## Load Data

In [2]:
#Function to Extract users, movies and ratings from raw data
def extract_users_items_predictions(data_pd):
    """Split ``Id`` strings of the form ``r<number>_c<number>`` into index arrays.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Frame with a string column ``Id`` and a column ``Prediction``.

    Returns
    -------
    users : numpy.ndarray
        1-D integer array: the number following ``r`` in each ``Id``, minus one.
    movies : numpy.ndarray
        1-D integer array: the number following ``c`` in each ``Id``, minus one.
    predictions : numpy.ndarray
        ``data_pd.Prediction.values``, unchanged.
    """
    # Raw string literal: '\d' inside a normal string is an invalid escape sequence.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing always yields 1-D arrays, even for a single-row frame
    # (np.squeeze on the split columns collapsed that case to 0-d arrays).
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

# Read the raw training ratings (one row per Id / Prediction pair)
data_pd = pd.read_csv('data_train.csv')

# Hold out 10% of the rows for validation; the seed fixes the split
train, val = train_test_split(data_pd, test_size=0.1, random_state=42)

# Parse user index, movie index and rating for the full set and for both splits
total_users, total_movies, total_pred = extract_users_items_predictions(data_pd)
train_users, train_movies, train_pred = extract_users_items_predictions(train)
val_users, val_movies, val_pred = extract_users_items_predictions(val)

# Full data as a (userID, movieID, rating) frame
ratings_dict_total = dict(userID=total_users, movieID=total_movies, rating=total_pred)
df_total = pd.DataFrame(ratings_dict_total)

# Training split as a (userID, movieID, rating) frame
ratings_dict_train = dict(userID=train_users, movieID=train_movies, rating=train_pred)
df_train = pd.DataFrame(ratings_dict_train)

# Validation split as a (userID, movieID, rating) frame (named df_test here)
ratings_dict_test = dict(userID=val_users, movieID=val_movies, rating=val_pred)
df_test = pd.DataFrame(ratings_dict_test)

## Get implicit user/movie Features
- The code in this section is based on the myFM example notebook: https://github.com/tohtsky/myFM/blob/main/examples/ml-100k-extended.ipynb

In [3]:
#Utility function to convert list to str
def string_agg(int_list) -> str:
    """Convert every element of ``int_list`` with ``str`` and join them with commas."""
    return ','.join(map(str, int_list))

# Build per-user and per-movie side features from the training split only.
# Every feature is a comma-joined string produced by string_agg; ids that have
# no row in df_train end up with an empty string (reindex -> NaN -> fillna).
user_f = pd.DataFrame({'userID': np.arange(10000)})
train_by_user = df_train.groupby('userID')
user_f['user_implicit_features'] = (
    train_by_user.movieID.agg(string_agg).reindex(user_f.index).fillna('')
)
user_f['user_rating'] = (
    train_by_user.rating.agg(string_agg).reindex(user_f.index).fillna('')
)

movie_f = pd.DataFrame({'movieID': np.arange(1000)})
train_by_movie = df_train.groupby('movieID')
movie_f['movie_implicit_features'] = (
    train_by_movie.userID.agg(string_agg).reindex(movie_f.index).fillna('')
)
movie_f['movie_rating'] = (
    train_by_movie.rating.agg(string_agg).reindex(movie_f.index).fillna('')
)

## Transform Data into myFM Format

In [4]:
# User encoder: the id column plus the two comma-separated feature columns.
# add_column hands back the encoder, so the three registrations are chained.
user_encoder = (
    DataFrameEncoder()
    .add_column('userID', CategoryValueToSparseEncoder(user_f.userID))
    .add_column(
        'user_implicit_features',
        MultipleValuesToSparseEncoder(user_f.user_implicit_features, normalize=True),
    )
    .add_column(
        'user_rating',
        MultipleValuesToSparseEncoder(user_f.user_rating, normalize=True),
    )
)

# Movie encoder: same layout as the user encoder, built from movie_f.
movie_encoder = (
    DataFrameEncoder()
    .add_column('movieID', CategoryValueToSparseEncoder(movie_f.movieID))
    .add_column(
        'movie_implicit_features',
        MultipleValuesToSparseEncoder(movie_f.movie_implicit_features, normalize=True),
    )
    .add_column(
        'movie_rating',
        MultipleValuesToSparseEncoder(movie_f.movie_rating, normalize=True),
    )
)


def augment_user_id(user_ids):
    """Encode the ``user_f`` rows selected by ``user_ids`` with ``user_encoder``.

    ``user_f`` and ``user_encoder`` are module-level names that are looked up
    when the function is called, so rebinding either of them changes the result.
    """
    selected_rows = user_f.reindex(user_ids).reset_index()
    return user_encoder.encode_df(selected_rows)

def augment_movie_id(movie_ids):
    """Encode the ``movie_f`` rows selected by ``movie_ids`` with ``movie_encoder``.

    ``movie_f`` and ``movie_encoder`` are module-level names that are looked up
    when the function is called, so rebinding either of them changes the result.
    """
    selected_rows = movie_f.reindex(movie_ids).reset_index()
    return movie_encoder.encode_df(selected_rows)


# Build one user block and one movie block per split (train, then validation)
train_blocks = []
test_blocks = []
for source, target in ((df_train, train_blocks), (df_test, test_blocks)):
    # np.unique returns the sorted distinct ids and, for every row, the
    # position of its id inside that sorted array.
    unique_users, user_map = np.unique(source.userID, return_inverse=True)
    unique_movies, movie_map = np.unique(source.movieID, return_inverse=True)
    # Keep the order: user block first, movie block second
    target.extend([
        RelationBlock(user_map, augment_user_id(unique_users)),
        RelationBlock(movie_map, augment_movie_id(unique_movies)),
    ])

## Evaluation and Parameter Tuning

In [5]:
# Return root mean square error metric
def RMSE(x: np.ndarray, y: np.ndarray) -> float:
    """Root-mean-square error: square root of the unweighted average of ``(x - y) ** 2``."""
    residual = x - y
    return np.sqrt(np.average(residual * residual))

# Function to do parameter tuning
def validation(user_encoder, movie_encoder, df_train, train_blocks, test_blocks, val_predictions,
               rank=(10, 12, 14, 16),
               n_iter=(100, 200, 300, 400, 500),
               n_samples=(95, 195, 295, 395, 495)):
    """Grid-search ``MyFMOrderedProbit`` settings and return the best one by validation RMSE.

    For every value in ``rank`` and every paired ``(n_iter[i], n_samples[i])``
    a fresh model is fitted on ``train_blocks`` and scored on ``test_blocks``.
    Each new best RMSE is printed as soon as it is found.

    Parameters
    ----------
    user_encoder, movie_encoder
        Encoders whose ``encoder_shapes`` are concatenated into ``group_shapes``.
    df_train : pandas.DataFrame
        Training frame; ``rating - 1`` is used as the ordinal target.
    train_blocks, test_blocks : list
        Relation blocks passed as ``X_rel`` to ``fit`` / ``predict_proba``.
    val_predictions : numpy.ndarray
        True ratings of the validation rows.
    rank : sequence of int, optional
        Ranks to try.
    n_iter, n_samples : sequence of int, optional
        Paired element-wise: ``n_iter[i]`` iterations with ``n_samples[i]``
        kept samples. Must have the same length.

    Returns
    -------
    tuple
        ``(best_rmse, best_r, best_n, best_s)``. If no configuration was
        evaluated (empty grid) this is ``(inf, None, None, None)``.

    Raises
    ------
    ValueError
        If ``n_iter`` and ``n_samples`` differ in length.
    """
    if len(n_iter) != len(n_samples):
        raise ValueError("n_iter and n_samples must have the same length")

    group_shapes = user_encoder.encoder_shapes + movie_encoder.encoder_shapes

    # Start from +inf so the first evaluated configuration is always recorded.
    # A fixed starting value of 1 returned rank/n_iter/n_samples of 0 whenever
    # no configuration reached an RMSE below 1.
    best_rmse = float('inf')
    best_r = None
    best_n = None
    best_s = None

    for r in rank:
        for iters, kept in zip(n_iter, n_samples):
            fm_probit = myfm.MyFMOrderedProbit(rank=r, random_seed=1234)
            fm_probit.fit(None, df_train.rating.values - 1, X_rel=train_blocks,
                          group_shapes=group_shapes, n_iter=iters, n_kept_samples=kept)
            test_prediction_ordered_prob = fm_probit.predict_proba(None, test_blocks)
            # Expected rating under the predicted class distribution; class 0 => rating 1 shift
            test_prediction_ordered_mean = 1 + test_prediction_ordered_prob.dot(np.arange(5))
            rmse = RMSE(val_predictions, test_prediction_ordered_mean)
            if rmse < best_rmse:
                best_rmse = rmse
                best_r = r
                best_n = iters
                best_s = kept
                print(best_rmse)

    return best_rmse, best_r, best_n, best_s

# Run the grid search on the held-out split and keep the winning settings
best_rmse, best_r, best_n, best_s = validation(
    user_encoder=user_encoder,
    movie_encoder=movie_encoder,
    df_train=df_train,
    train_blocks=train_blocks,
    test_blocks=test_blocks,
    val_predictions=val_pred,
)

w0 = -0.13, cutpoint = ['-2.578', '-1.837', '-0.868', '0.009'] : 100%|█| 100/100


0.9709255431942436


w0 = -0.09, cutpoint = ['-2.605', '-1.865', '-0.897', '-0.019'] : 100%|█| 200/20


0.970385255381208


w0 = -0.09, cutpoint = ['-2.653', '-1.915', '-0.945', '-0.067'] : 100%|█| 300/30


0.970202443866509


w0 = -0.07, cutpoint = ['-2.665', '-1.926', '-0.958', '-0.082'] : 100%|█| 400/40


0.9700261975878915


w0 = -0.08, cutpoint = ['-2.684', '-1.936', '-0.968', '-0.090'] : 100%|█| 500/50


0.9699207241404016


w0 = -0.03, cutpoint = ['-2.404', '-1.654', '-0.682', '0.201'] : 100%|█| 100/100
w0 = -0.08, cutpoint = ['-2.445', '-1.698', '-0.723', '0.159'] : 100%|█| 200/200


0.969453234942143


w0 = -0.07, cutpoint = ['-2.483', '-1.739', '-0.762', '0.117'] : 100%|█| 300/300


0.9691976371341182


w0 = -0.05, cutpoint = ['-2.511', '-1.767', '-0.793', '0.090'] : 100%|█| 400/400


0.9690981868690504


w0 = -0.05, cutpoint = ['-2.537', '-1.791', '-0.817', '0.067'] : 100%|█| 500/500


0.9690871527710165


w0 = -0.06, cutpoint = ['-2.245', '-1.496', '-0.515', '0.374'] : 100%|█| 100/100
w0 = -0.05, cutpoint = ['-2.272', '-1.520', '-0.540', '0.347'] : 100%|█| 200/200
w0 = -0.02, cutpoint = ['-2.311', '-1.564', '-0.584', '0.302'] : 100%|█| 300/300
w0 = -0.04, cutpoint = ['-2.340', '-1.593', '-0.611', '0.274'] : 100%|█| 400/400


0.969014609144093


w0 = -0.04, cutpoint = ['-2.354', '-1.604', '-0.623', '0.263'] : 100%|█| 500/500


0.9688954999851386


w0 = 0.13, cutpoint = ['-2.153', '-1.404', '-0.419', '0.471'] : 100%|█| 100/100 
w0 = 0.14, cutpoint = ['-2.231', '-1.474', '-0.486', '0.406'] : 100%|█| 200/200 
w0 = 0.15, cutpoint = ['-2.275', '-1.518', '-0.530', '0.361'] : 100%|█| 300/300 
w0 = 0.14, cutpoint = ['-2.313', '-1.555', '-0.569', '0.321'] : 100%|█| 400/400 


0.9687395403658107


w0 = 0.15, cutpoint = ['-2.339', '-1.583', '-0.596', '0.294'] : 100%|█| 500/500 


0.9685790059989303


In [6]:
# Report the best validation RMSE found by the grid search
print(f"Validation RMSE using model is {best_rmse!s}")

Validation RMSE using model is 0.9685790059989303


## Generate Features & Train the Model on Total Data

In [7]:
# Rebuild the side features from df_total (all rows of data_train.csv).
# Note: user_f, movie_f and both encoders are rebound in this cell, and
# augment_user_id / augment_movie_id read those module-level names when they
# are called, so the blocks built below use the total-data features.
user_f = pd.DataFrame({'userID': np.arange(10000)})
total_by_user = df_total.groupby('userID')
user_f['user_implicit_features'] = (
    total_by_user.movieID.agg(string_agg).reindex(user_f.index).fillna('')
)
user_f['user_rating'] = (
    total_by_user.rating.agg(string_agg).reindex(user_f.index).fillna('')
)

movie_f = pd.DataFrame({'movieID': np.arange(1000)})
total_by_movie = df_total.groupby('movieID')
movie_f['movie_implicit_features'] = (
    total_by_movie.userID.agg(string_agg).reindex(movie_f.index).fillna('')
)
movie_f['movie_rating'] = (
    total_by_movie.rating.agg(string_agg).reindex(movie_f.index).fillna('')
)

# User encoder for total data (add_column hands back the encoder, so calls are chained)
user_encoder = (
    DataFrameEncoder()
    .add_column('userID', CategoryValueToSparseEncoder(user_f.userID))
    .add_column(
        'user_implicit_features',
        MultipleValuesToSparseEncoder(user_f.user_implicit_features, normalize=True),
    )
    .add_column(
        'user_rating',
        MultipleValuesToSparseEncoder(user_f.user_rating, normalize=True),
    )
)

# Movie encoder for total data
movie_encoder = (
    DataFrameEncoder()
    .add_column('movieID', CategoryValueToSparseEncoder(movie_f.movieID))
    .add_column(
        'movie_implicit_features',
        MultipleValuesToSparseEncoder(movie_f.movie_implicit_features, normalize=True),
    )
    .add_column(
        'movie_rating',
        MultipleValuesToSparseEncoder(movie_f.movie_rating, normalize=True),
    )
)

# Feature-group sizes: user groups first, then movie groups
group_shapes = user_encoder.encoder_shapes + movie_encoder.encoder_shapes

# Relation blocks for the total data: user block first, movie block second
total_train_blocks = []
for source, target in ((df_total, total_train_blocks),):
    unique_users, user_map = np.unique(source.userID, return_inverse=True)
    unique_movies, movie_map = np.unique(source.movieID, return_inverse=True)
    target.extend([
        RelationBlock(user_map, augment_user_id(unique_users)),
        RelationBlock(movie_map, augment_movie_id(unique_movies)),
    ])


# Fit the final model with the settings selected by the validation run
#optimal params: best_r = 16, best_n=500, best_s=495
fm_probit = myfm.MyFMOrderedProbit(rank=best_r, random_seed=1234)
fm_probit.fit(
    None,
    df_total.rating.values - 1,  # ratings shifted so that rating 1 becomes class 0
    X_rel=total_train_blocks,
    group_shapes=group_shapes,
    n_iter=best_n,
    n_kept_samples=best_s,
)

w0 = 0.18, cutpoint = ['-2.262', '-1.509', '-0.524', '0.366'] : 100%|█| 500/500 


<myfm.gibbs.MyFMOrderedProbit at 0x1268bb9a0>

## Generate predictions for Submission

In [8]:
# Read the sample submission; its Id column lists the (user, movie) pairs to predict
sub_pd = pd.read_csv('sampleSubmission.csv')
sub_users, sub_movies, sub_pred = extract_users_items_predictions(sub_pd)
sub_test_ratings_dict = dict(userID=sub_users, movieID=sub_movies, rating=sub_pred)
sub_df = pd.DataFrame(sub_test_ratings_dict)

# Relation blocks for the submission rows: user block first, movie block second
sub_test_blocks = []
for source, target in ((sub_df, sub_test_blocks),):
    unique_users, user_map = np.unique(source.userID, return_inverse=True)
    unique_movies, movie_map = np.unique(source.movieID, return_inverse=True)
    target.extend([
        RelationBlock(user_map, augment_user_id(unique_users)),
        RelationBlock(movie_map, augment_movie_id(unique_movies)),
    ])

# Expected rating per row: class probabilities weighted by class index 0..4, shifted by 1
test_prediction_ordered_prob = fm_probit.predict_proba(None, sub_test_blocks)
class_indices = np.arange(5)
test_prediction_ordered_mean = 1 + test_prediction_ordered_prob.dot(class_indices)

# Overwrite the placeholder predictions and write the file with Id as the index column
sub_pd['Prediction'] = test_prediction_ordered_mean
sub_pd = sub_pd.set_index("Id")
sub_pd.to_csv("submission_bayesian_svdpp_flipped_rating_ordered_probit.csv")