# Train SVD using data from split_data.ipynb


# Imports


In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from sklearn.metrics import mean_squared_error

# Settings


In [2]:
# Number of cross-validation folds; the training loop iterates fold ids
# 0 .. N_FOLDS - 1 and matches them against the `fold` column of the train data.
N_FOLDS: int = 5

# Seed handed to the SVD model (`random_state`) so every fold's fit is repeatable.
SEED: int = 42

# Load Train/Test Data


In [3]:
# Training ratings (user_id, song_id, rating) together with the `fold` id that
# the K-fold loop below uses to separate fit rows from held-out rows.
train_csv_path = "./data/train_ratings.csv"
df_train = pd.read_csv(train_csv_path)
df_train

Unnamed: 0,user_id,song_id,rating,fold
0,7dde080207001e844690f67c9357e015a0491fc3,SORAKQP12A58A7D699,4,0
1,99c483a27234281d69511eb2321267d77430aa6d,SOMPCSO12A8AE47351,1,0
2,a4da0992cd5c0982d05161c82d6aa0b3a5873b54,SOGOKAV12A8C138521,3,0
3,1582c00c3e53b891a5a1d18da8b635967eabe61b,SOLWRZI12A6D4FC4F0,1,0
4,9d1b38a741ce012762918760b39c072d3e8cfc36,SOPRFNT12AB017F8E9,5,0
...,...,...,...,...
2915257,a6e0d54bdbbe00ad6d4945469671594b23531a64,SOSXLTC12AF72A7F54,5,4
2915258,1fa325996b6103facae6deca7a4a4589a39eabc6,SOVUFMS12AB0186822,2,4
2915259,c4a2dbac8aad122c374c0cdcaaddd300ae7850eb,SOPLVNE12A58A7AC5A,1,4
2915260,accc0e129e50d1cf7fab26e04b21cdbbbe5f08db,SOJUYXY12A8C143472,4,4


In [4]:
# Held-out test ratings (user_id, song_id, rating); every fold's model is
# scored against this same frame.
test_csv_path = "./data/test_ratings.csv"
df_test = pd.read_csv(test_csv_path)
df_test

Unnamed: 0,user_id,song_id,rating
0,e4dea4adcf6ddb7799bbd4c39de312401ca903f6,SOUSOOB12A8C13371F,1
1,d20772a4da25f18e07f699d9caad97d6ce29c087,SOMAKIT12A58A7E292,1
2,14f5804fda727f975f2db17d9fe982173cf5be6a,SOAXGDH12A8C13F8A1,5
3,bdd7e12da4453b3194bf4821483113d0e9f18679,SOTKYBW12A8C13C3EA,5
4,a33a4287b68da98239be727bc4ad8c75f8e8d457,SORFXJO12A6D4FB614,1
...,...,...,...
323913,6ecf0e508a0ac41184c59d7268550feb1ebc13c6,SOYAIPB12A8C143D84,3
323914,f2449d4b7e58856b38ff0f5384176abd1a61bb5f,SOLVRLL12A67020D7F,4
323915,dc61155e20289dcd089ac40181cb88f042404602,SOKUAEP12A8C13BE19,4
323916,6531ff6155ae897a0cab6318df972ed814384554,SOKCORQ12A58A7C74D,5


# Train K-Fold Models


In [5]:
# Baseline: RMSE obtained by predicting the mean test rating for every test row.
# Any trained model should score below this.
baseline_preds = np.full(len(df_test), df_test.rating.mean())

# Take the square root explicitly instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# sqrt(MSE) works on every version.
mean_rmse = np.sqrt(mean_squared_error(df_test.rating, baseline_preds))
print("Score to beat:", mean_rmse)

Score to beat: 1.4131236504465372


In [6]:
def pred(model, row):
    """Return the model's estimated rating for a single (user, song) row.

    Args:
        model: Object exposing ``predict(user_id, song_id)`` whose result has
            an ``est`` attribute (here: a fitted Surprise algorithm).
        row: Mapping-like record with ``"user_id"`` and ``"song_id"`` keys,
            e.g. one row of a ratings DataFrame.

    Returns:
        The ``est`` attribute of the prediction returned by ``model.predict``.
    """
    user_id = row["user_id"]
    song_id = row["song_id"]
    prediction = model.predict(user_id, song_id)
    return prediction.est

In [7]:
def _rmse(y_true, y_pred):
    """Root-mean-squared error between true and predicted ratings.

    The square root is taken explicitly rather than via ``squared=False``,
    because that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


def _predict_frame(model, frame):
    """Estimate a rating for every (user_id, song_id) pair of ``frame``.

    Iterates the two id columns directly instead of a row-wise
    ``DataFrame.apply(axis=1)``, which materialises a Series per row and is
    very slow on millions of rows. Returns a 1-D array in ``frame`` row order.
    """
    return np.array(
        [
            model.predict(user_id, song_id).est
            for user_id, song_id in zip(frame["user_id"], frame["song_id"])
        ]
    )


train_scores = []
valid_scores = []
test_scores = []

# Out-of-fold predictions are written into this column fold by fold. It starts
# as a float column (0.0, not the int literal 0) so that assigning float
# estimates below does not trigger pandas' incompatible-dtype warning.
df_train["svd"] = 0.0
all_test_preds = []

# The rating scale and the (user, item, rating) column order are identical for
# every fold, so they are set up once outside the loop.
reader = Reader(rating_scale=(1, 5))
rating_cols = ["user_id", "song_id", "rating"]

for fold in range(N_FOLDS):
    # Get fold train/valid samples: rows tagged with this fold are held out
    in_fold = df_train.fold == fold
    train_samples = df_train.loc[~in_fold, rating_cols]
    valid_samples = df_train.loc[in_fold, rating_cols]

    # Convert to Surprise dataset. Only the fit rows need a trainset; the
    # validation and test rows are scored through model.predict.
    train_dataset = Dataset.load_from_df(train_samples, reader).build_full_trainset()

    # Train model
    # NOTE(review): with n_factors=0 there are no latent factors, so the model
    # presumably reduces to global mean + user/item biases - confirm this is
    # the tuned value and not a typo.
    model = SVD(n_factors=0, n_epochs=40, lr_all=0.008, reg_all=0.02, random_state=SEED)
    model.fit(train_dataset)

    # Predict
    train_preds = _predict_frame(model, train_samples)
    valid_preds = _predict_frame(model, valid_samples)
    test_preds = _predict_frame(model, df_test)

    # Calculate RMSE
    train_score = _rmse(train_samples.rating, train_preds)
    valid_score = _rmse(valid_samples.rating, valid_preds)
    test_score = _rmse(df_test.rating, test_preds)

    # To aggregate
    train_scores.append(train_score)
    valid_scores.append(valid_score)
    test_scores.append(test_score)

    # Save preds for stacking: each training row receives the estimate from
    # the one model that did not see it during fitting.
    df_train.loc[in_fold, "svd"] = valid_preds
    all_test_preds.append(test_preds)

    # Print fold stats
    print(f"========== FOLD {fold} ==========")
    print("Train RMSE:", train_score)
    print("Valid RMSE:", valid_score)
    print("Test  RMSE:", test_score)

# Print mean stats
print("============================")
print("Mean Train RMSE:", sum(train_scores) / N_FOLDS)
print("Mean Valid RMSE:", sum(valid_scores) / N_FOLDS)
print("Mean Test  RMSE:", sum(test_scores) / N_FOLDS)

# Save valid preds (out-of-fold estimates in the "svd" column)
df_train.to_csv("./data/svd_valid.csv", index=False)

# Save test preds: the test estimate is the average over the fold models
df_test["svd"] = np.mean(all_test_preds, axis=0)
df_test.to_csv("./data/svd_test.csv", index=False)

Train RMSE: 0.9966579447287514
Valid RMSE: 1.1657533407527165
Test  RMSE: 1.1656076966148992
Train RMSE: 0.997304225468195
Valid RMSE: 1.1658908018097607
Test  RMSE: 1.165240651729801
Train RMSE: 0.9964047934856475
Valid RMSE: 1.165395722923106
Test  RMSE: 1.165595355562452
Train RMSE: 0.996412488834803
Valid RMSE: 1.1671588011279193
Test  RMSE: 1.1652275489483992
Train RMSE: 0.9966830299357912
Valid RMSE: 1.1655303668357881
Test  RMSE: 1.1654523299652795
Mean Train RMSE: 0.9966924964906376
Mean Valid RMSE: 1.1659458066898583
Mean Test  RMSE: 1.1654247165641662
