## Installs

In [1]:
#Install package to apply Bayesian Factorization Machine
!pip install myfm



## Imports

In [2]:
# To store the data
import pandas as pd

# To do linear algebra
import numpy as np

# To apply Factorization Machines
import myfm
from sklearn.preprocessing import OneHotEncoder

# To do train-test split for evaluation
from sklearn.model_selection import train_test_split

## Load Data

In [3]:
#Function to Extract users, movies and ratings from raw data
def extract_users_items_predictions(data_pd):
    """Split raw rating rows into zero-based user indices, movie indices and ratings.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Frame with an ``Id`` column of strings containing ``r<number>_c<number>``
        and a ``Prediction`` column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(users, movies, predictions)``. ``users`` holds the number after ``r``
        minus one, ``movies`` the number after ``c`` minus one, both as 1-D
        integer arrays with one entry per row. ``predictions`` is
        ``data_pd.Prediction.values``.

    Notes
    -----
    An ``Id`` that does not match the pattern is extracted as NaN, which makes
    the integer cast fail.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    indices = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing always yields 1-D arrays, even for a single-row frame
    # (np.squeeze would collapse that case to 0-d arrays).
    users, movies = indices[:, 0], indices[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

# Read the raw ratings file
data_pd = pd.read_csv('data/data_train.csv')

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible
train, val = train_test_split(data_pd, test_size=0.1, random_state=42)

# Parse user indices, movie indices and ratings for the full data and for each split
total_users, total_movies, total_pred = extract_users_items_predictions(data_pd)
train_users, train_movies, train_pred = extract_users_items_predictions(train)
val_users, val_movies, val_pred = extract_users_items_predictions(val)

# Full data set as a (userID, movieID, rating) frame
ratings_dict_total = dict(userID=total_users, movieID=total_movies, rating=total_pred)
df_total = pd.DataFrame(ratings_dict_total)

# Training split as a (userID, movieID, rating) frame
ratings_dict_train = dict(userID=train_users, movieID=train_movies, rating=train_pred)
df_train = pd.DataFrame(ratings_dict_train)

# Validation split as a (userID, movieID, rating) frame
ratings_dict_test = dict(userID=val_users, movieID=val_movies, rating=val_pred)
df_test = pd.DataFrame(ratings_dict_test)

## Apply Bayesian Factorization Machine

In [4]:
# Code adapted from https://github.com/tohtsky/myFM
def factorization_machine(df_train, df_test=None, rank=12, grouping=None, n_iter=500, samples=495):
    """Fit a myFM regressor on one-hot encoded (userID, movieID) pairs.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data with ``userID``, ``movieID`` and ``rating`` columns.
    df_test : pandas.DataFrame or pandas.Series, optional
        Rows to predict; needs ``userID`` and ``movieID`` columns. ``None`` or
        an empty object skips prediction.
    rank : int
        Forwarded to ``myfm.MyFMRegressor(rank=...)``.
    grouping : bool, optional
        When truthy, the number of categories of each encoded column is passed
        to ``fit`` as ``group_shapes``; otherwise ``group_shapes`` is ``None``.
    n_iter : int
        Forwarded to ``fit`` as ``n_iter``.
    samples : int
        Forwarded to ``fit`` as ``n_kept_samples``.

    Returns
    -------
    tuple
        ``(fm, prediction, ohe)``: the fitted regressor, the predictions for
        ``df_test`` (``None`` when prediction was skipped) and the fitted
        ``OneHotEncoder``.
    """
    explanation_columns = ["userID", "movieID"]
    # handle_unknown="ignore": categories not seen during fit do not raise at transform time
    ohe = OneHotEncoder(handle_unknown="ignore")
    X_train = ohe.fit_transform(df_train[explanation_columns])
    y_train = df_train.rating.values
    fm = myfm.MyFMRegressor(rank=rank, random_seed=1234)

    if grouping:
        # One group per encoded column (users, movies); widths must add up to
        # the total number of one-hot features.
        group_shapes = [len(category) for category in ohe.categories_]
        assert sum(group_shapes) == X_train.shape[1]
    else:
        group_shapes = None

    fm.fit(
        X_train,
        y_train,
        group_shapes=group_shapes,
        n_iter=n_iter,
        n_kept_samples=samples,
    )

    prediction = None
    # A None default avoids building a pandas object at definition time (and the
    # empty-Series dtype warning); explicit empty inputs are still accepted.
    if df_test is not None and not df_test.empty:
        X_test = ohe.transform(df_test[explanation_columns])
        prediction = fm.predict(X_test)
    return fm, prediction, ohe

## Evaluation and Parameter Tuning

In [5]:
# Return root mean square error metric
def RMSE(x: np.ndarray, y: np.ndarray) -> float:
    """Return the root mean square error between ``x`` and ``y``.

    The element-wise differences are squared, averaged over all elements,
    and the square root of that average is returned.
    """
    squared_errors = (x - y) ** 2
    mean_squared_error = np.average(squared_errors)
    return np.sqrt(mean_squared_error)

# Function to do parameter tuning
def validation(df_train, df_test, val_predictions,
               ranks=(8, 10, 12, 14),
               n_iters=(100, 200, 300, 400, 500),
               n_samples=(95, 195, 295, 395, 495)):
    """Search rank / iteration settings and return the best validation RMSE.

    Every rank in ``ranks`` is combined with each ``(n_iter, n_kept_samples)``
    pair formed by zipping ``n_iters`` with ``n_samples``; the two lists are
    paired position by position, not crossed.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Forwarded unchanged to ``factorization_machine``.
    val_predictions : numpy.ndarray
        Ratings that the predictions for ``df_test`` are scored against.
    ranks : sequence of int, optional
        Ranks to try.
    n_iters : sequence of int, optional
        Iteration counts to try.
    n_samples : sequence of int, optional
        Kept-sample counts; must have the same length as ``n_iters``.

    Returns
    -------
    tuple
        ``(best_rmse, best_r, best_n, best_s)``: the lowest RMSE and the rank,
        iteration count and sample count that produced it. If a grid is empty
        nothing is evaluated and ``(inf, 0, 0, 0)`` is returned.

    Raises
    ------
    ValueError
        If ``n_iters`` and ``n_samples`` differ in length.
    """
    if len(n_iters) != len(n_samples):
        raise ValueError("n_iters and n_samples must have the same length")

    # Start from +inf so the best configuration is recorded even when no run
    # reaches an RMSE below 1; a start value of 1 would return zeros in that case.
    best_rmse = float("inf")
    best_r = 0
    best_n = 0
    best_s = 0

    for r in ranks:
        for n, s in zip(n_iters, n_samples):
            _, test_predictions, _ = factorization_machine(df_train, df_test, r, True, n, s)
            rmse = RMSE(val_predictions, test_predictions)
            if rmse < best_rmse:
                best_rmse, best_r, best_n, best_s = rmse, r, n, s
                # Report each new best score as the search progresses
                print(best_rmse)

    return best_rmse, best_r, best_n, best_s

# Run the hyper-parameter search, scoring each fit on the held-out validation split
best_rmse, best_r, best_n, best_s = validation(df_train, df_test, val_pred)

alpha = 1.12 w0 = 3.78 : 100%|████████████████| 100/100 [00:18<00:00,  5.44it/s]


0.9786862959741647


alpha = 1.12 w0 = 3.78 : 100%|████████████████| 200/200 [00:37<00:00,  5.34it/s]


0.9780474944790564


alpha = 1.12 w0 = 3.79 : 100%|████████████████| 300/300 [00:56<00:00,  5.29it/s]


0.978031974607306


alpha = 1.12 w0 = 3.80 : 100%|████████████████| 400/400 [01:15<00:00,  5.28it/s]


0.9780192776453689


alpha = 1.12 w0 = 3.84 : 100%|████████████████| 500/500 [01:34<00:00,  5.29it/s]


0.9780132791681007


alpha = 1.13 w0 = 3.77 : 100%|████████████████| 100/100 [00:23<00:00,  4.28it/s]
alpha = 1.13 w0 = 3.79 : 100%|████████████████| 200/200 [00:46<00:00,  4.29it/s]


0.9777867974646192


alpha = 1.13 w0 = 3.81 : 100%|████████████████| 300/300 [01:10<00:00,  4.29it/s]


0.9776467145136033


alpha = 1.13 w0 = 3.83 : 100%|████████████████| 400/400 [01:33<00:00,  4.29it/s]


0.9776108507449198


alpha = 1.13 w0 = 3.84 : 100%|████████████████| 500/500 [01:56<00:00,  4.30it/s]


0.9775757919279762


alpha = 1.15 w0 = 3.78 : 100%|████████████████| 100/100 [00:27<00:00,  3.62it/s]
alpha = 1.15 w0 = 3.79 : 100%|████████████████| 200/200 [00:55<00:00,  3.62it/s]
alpha = 1.15 w0 = 3.81 : 100%|████████████████| 300/300 [01:22<00:00,  3.64it/s]
alpha = 1.15 w0 = 3.84 : 100%|████████████████| 400/400 [01:55<00:00,  3.45it/s]
alpha = 1.15 w0 = 3.86 : 100%|████████████████| 500/500 [02:14<00:00,  3.73it/s]
alpha = 1.16 w0 = 3.77 : 100%|████████████████| 100/100 [00:31<00:00,  3.21it/s]
alpha = 1.17 w0 = 3.81 : 100%|████████████████| 200/200 [01:03<00:00,  3.16it/s]
alpha = 1.16 w0 = 3.84 : 100%|████████████████| 300/300 [01:34<00:00,  3.16it/s]
alpha = 1.16 w0 = 3.88 : 100%|████████████████| 400/400 [02:10<00:00,  3.07it/s]
alpha = 1.16 w0 = 3.90 : 100%|████████████████| 500/500 [02:43<00:00,  3.06it/s]


In [6]:
# Report the best validation score found by the search
print(f"Validation RMSE using model is {best_rmse!s}")

Validation RMSE using model is 0.9775757919279762


## Train the model on Total Data

In [8]:
# Refit on the complete data set with the best hyper-parameters from validation.
# No rows need predicting here: an empty DataFrame makes factorization_machine
# skip prediction (test_predictions is None) and, unlike pd.Series([]), does not
# trigger the empty-Series default-dtype warning.
fm, test_predictions, ohe = factorization_machine(df_total, pd.DataFrame(), best_r, True, best_n, best_s)

  fm,test_predictions,ohe = factorization_machine(df_total,pd.Series([]),best_r,True,best_n,best_s)
alpha = 1.13 w0 = 3.84 : 100%|████████████████| 500/500 [01:20<00:00,  6.24it/s]


## Generate predictions for Submission

In [None]:
# Read the sample submission and parse the (user, movie) pairs from its Id column
sub_pd = pd.read_csv('data/sampleSubmission.csv')
sub_users, sub_movies, sub_pred = extract_users_items_predictions(sub_pd)
sub_test_ratings_dict = dict(userID=sub_users, movieID=sub_movies, rating=sub_pred)
sub_df = pd.DataFrame(sub_test_ratings_dict)

# Encode with the encoder fitted alongside the model, then predict
X_test = ohe.transform(sub_df[["userID", "movieID"]])
predictions = fm.predict(X_test)

# Overwrite the placeholder predictions and write the file indexed by Id
sub_pd['Prediction'] = predictions
sub_pd = sub_pd.set_index("Id")
sub_pd.to_csv("submission_bayesian_factorization_machine.csv")