In [1]:
import os

import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVDpp, KNNWithMeans, BaselineOnly
from surprise.model_selection import cross_validate

from sklearn.model_selection import StratifiedGroupKFold 

In [2]:
EXP_NAME = "somefilter"
class configs:
    OUTPUT_DIR = os.path.join("/workspace", "working", EXP_NAME)
    INPUT_DIR = os.path.join("/workspace", "input", "atmaCup15_dataset")
    TRAIN_CSV = os.path.join(INPUT_DIR, "train_stratifiedgroupkfold.csv")
    ANIME_CSV = os.path.join(INPUT_DIR, "anime.csv")
    TEST_CSV = os.path.join(INPUT_DIR, "test.csv")
    SAMPLE_SUB_CSV = os.path.join(INPUT_DIR, "sample_submission.csv")
    SEED = 42

## SVDpp

In [3]:
# Load the dataset
train_df = pd.read_csv(os.path.join(configs.INPUT_DIR, "train_stratifiedkfold.csv"))
reader = Reader(rating_scale=(1, 10))

In [4]:
feature_name = "svdpp"

models = []
for fold in sorted(train_df["fold"].unique()):
    print("fold", fold)
    train_df_ = train_df[train_df["fold"] != fold].reset_index(drop=True)
    train_data = Dataset.load_from_df(train_df_[['user_id', 'anime_id', 'score']], reader)
    model = SVDpp()
    model.fit(train_data.build_full_trainset())
    models.append(model)

oof_df = pd.DataFrame()
for fold, model in enumerate(models):
    test_df_ = train_df[train_df["fold"] == fold].reset_index(drop=True)
    test_data = Dataset.load_from_df(test_df_[['user_id', 'anime_id', 'score']], reader)
    oof_pred = model.test(test_data.build_full_trainset().build_testset())
    pred = [pred.est for pred in oof_pred]
    # predictions.extend(oof_pred)
    if len(oof_df)==0:
        test_df_[feature_name] = pred
        oof_df = test_df_
    else:
        test_df_[feature_name] = pred
        oof_df = pd.concat([oof_df, test_df_], axis=0)
        print(len(oof_df))
        
test_df = pd.read_csv('/workspace/input/atmaCup15_dataset/test.csv')
test_df['score'] = 0

# Convert the test dataset to the surprise format
test_set = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

predictions = []
for model in models:
    pred = model.test(test_set)
    pred_ = [pred.est for pred in pred]
    predictions.append(pred_)

pred_mean_svdpp = np.mean(predictions, axis=0)
# test_df[feature_name] = pred_mean

fold 0
fold 1
fold 2
fold 3
fold 4
54561
81841
109121
136401


In [5]:
feature_name = "knn_means"

models = []
for fold in sorted(train_df["fold"].unique()):
    print("fold", fold)
    train_df_ = train_df[train_df["fold"] != fold].reset_index(drop=True)
    train_data = Dataset.load_from_df(train_df_[['user_id', 'anime_id', 'score']], reader)
    model = KNNWithMeans()
    model.fit(train_data.build_full_trainset())
    models.append(model)

oof_df_knn = pd.DataFrame()
for fold, model in enumerate(models):
    test_df_ = train_df[train_df["fold"] == fold].reset_index(drop=True)
    test_data = Dataset.load_from_df(test_df_[['user_id', 'anime_id', 'score']], reader)
    oof_pred = model.test(test_data.build_full_trainset().build_testset())
    pred = [pred.est for pred in oof_pred]
    # predictions.extend(oof_pred)
    if len(oof_df_knn)==0:
        test_df_[feature_name] = pred
        oof_df_knn = test_df_
    else:
        test_df_[feature_name] = pred
        oof_df_knn = pd.concat([oof_df_knn, test_df_], axis=0)
        print(len(oof_df_knn))
        
test_df = pd.read_csv('/workspace/input/atmaCup15_dataset/test.csv')
test_df['score'] = 0

# Convert the test dataset to the surprise format
test_set = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

predictions = []
for model in models:
    pred = model.test(test_set)
    pred_ = [pred.est for pred in pred]
    predictions.append(pred_)

pred_mean_knnmeans = np.mean(predictions, axis=0)
# test_df[feature_name] = pred_mean

fold 0
Computing the msd similarity matrix...
Done computing similarity matrix.
fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
54561
81841
109121
136401


In [6]:
feature_name = "baseline_only"

models = []
for fold in sorted(train_df["fold"].unique()):
    print("fold", fold)
    train_df_ = train_df[train_df["fold"] != fold].reset_index(drop=True)
    train_data = Dataset.load_from_df(train_df_[['user_id', 'anime_id', 'score']], reader)
    model = BaselineOnly()
    model.fit(train_data.build_full_trainset())
    models.append(model)

oof_df_b = pd.DataFrame()
for fold, model in enumerate(models):
    test_df_ = train_df[train_df["fold"] == fold].reset_index(drop=True)
    test_data = Dataset.load_from_df(test_df_[['user_id', 'anime_id', 'score']], reader)
    oof_pred = model.test(test_data.build_full_trainset().build_testset())
    pred = [pred.est for pred in oof_pred]
    # predictions.extend(oof_pred)
    if len(oof_df_b)==0:
        test_df_[feature_name] = pred
        oof_df_b = test_df_
    else:
        test_df_[feature_name] = pred
        oof_df_b = pd.concat([oof_df_b, test_df_], axis=0)
        print(len(oof_df_b))
        
test_df = pd.read_csv('/workspace/input/atmaCup15_dataset/test.csv')
test_df['score'] = 0

# Convert the test dataset to the surprise format
test_set = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

predictions = []
for model in models:
    pred = model.test(test_set)
    pred_ = [pred.est for pred in pred]
    predictions.append(pred_)

pred_mean_bonly = np.mean(predictions, axis=0)
# test_df[feature_name] = pred_mean

fold 0
Estimating biases using als...
fold 1
Estimating biases using als...
fold 2
Estimating biases using als...
fold 3
Estimating biases using als...
fold 4
Estimating biases using als...
54561
81841
109121
136401


# make csv

In [15]:
train_oof = pd.merge(oof_df, oof_df_knn[['user_id', 'anime_id', 'knn_means']], on=['user_id', 'anime_id'], how='left')
train_oof = pd.merge(train_oof, oof_df_b[['user_id', 'anime_id', 'baseline_only']], on=['user_id', 'anime_id'], how='left')

In [16]:
train_oof

Unnamed: 0,user_id,anime_id,score,fold,svdpp,knn_means,baseline_only
0,0008e10fb39e55447333,2290175205d55e81b197,8,0,7.611082,7.469772,7.796142
1,0008e10fb39e55447333,68241332330cbefa2b9e,9,0,7.128258,6.932347,7.226003
2,0008e10fb39e55447333,6c7ce24e0e4f56e0aac0,10,0,8.186384,8.226092,8.145180
3,0008e10fb39e55447333,741d2cba7471560b418a,9,0,7.305001,7.260977,7.505218
4,0008e10fb39e55447333,a1b5af1838c1af21fb99,2,0,4.159800,5.337420,5.663686
...,...,...,...,...,...,...,...
136396,ffa6ff8006f8630f3d11,35d87d3bdeed620ef335,8,4,8.102052,7.890833,8.289222
136397,ffa6ff8006f8630f3d11,963d6e728205d1311231,7,4,6.464810,7.198767,7.291282
136398,ffa6ff8006f8630f3d11,b22bd52bd814b6ed3ac4,7,4,7.590194,7.066377,7.399700
136399,ffa6ff8006f8630f3d11,d922517dbe7fc6d6ff64,7,4,6.767352,7.240048,7.479816


In [19]:
train_oof.to_csv(os.path.join(configs.OUTPUT_DIR, 'train_oof_somefilters.csv'), index=False)

## make test csv

In [22]:
test_df["svdpp"] = pred_mean_svdpp
test_df["knn_means"] = pred_mean_knnmeans
test_df["baseline_only"] = pred_mean_bonly


In [25]:
test_df

Unnamed: 0,user_id,anime_id,score,svdpp,knn_means,baseline_only
0,0008e10fb39e55447333,04068820a73e52dc3b32,0,8.122872,8.021013,8.156319
1,0008e10fb39e55447333,04a3d0b122b24965e909,0,8.384359,8.499189,8.244415
2,0008e10fb39e55447333,1447fe1f10b59912d6a8,0,6.229163,6.744100,6.923504
3,0008e10fb39e55447333,2622632598c68682afd5,0,7.553361,7.747333,7.687429
4,0008e10fb39e55447333,2701850c7216516fec46,0,4.891277,6.184961,6.035873
...,...,...,...,...,...,...
117671,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0,0,8.131427,7.768770,8.484037
117672,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79,0,7.643945,7.768770,7.637740
117673,ffe85a36cd20500faa58,f6c208226b6b69948053,0,7.444523,7.768770,7.432282
117674,ffe85a36cd20500faa58,fe67592c312fc1e17745,0,6.967214,7.768770,7.033105


In [26]:
test_df.to_csv(os.path.join(configs.OUTPUT_DIR, "test_somefilters.csv"), index=False)