# Split Data Into Train/Test/K-Fold


# Imports


In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

# Settings


In [2]:
# Tunable parameters for the whole notebook.
SEED: int = 42              # random_state used when shuffling the data
TRAIN_PERCENT: float = 0.9  # fraction (0-1) of rows assigned to the train set
N_FOLDS: int = 5            # number of stratified folds built from the train set

# Load Filtered Triplets With Ratings


In [3]:
# Read the filtered (user, song, play_count, rating) triplets from disk.
df = pd.read_csv(
    "./data/train_triplets_filtered_ratings.csv",
)
df

Unnamed: 0,user_id,song_id,play_count,rating
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,1
...,...,...,...,...
32525884,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2,2
32525885,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1,1
32525886,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1,1
32525887,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3,2


# Remove Play Count


In [4]:
# Keep only the identifier columns and the rating; play_count is dropped
# by simply not selecting it.
columns_to_keep = ["user_id", "song_id", "rating"]
df = df[columns_to_keep]
df

Unnamed: 0,user_id,song_id,rating
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
32525884,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
32525885,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
32525886,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
32525887,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,2


# Undersampling


In [5]:
# Balance the rating classes: every rating in 1..5 is randomly undersampled
# down to the size of the rarest class.
X = df[["user_id", "song_id"]]
y = df["rating"]

# Size of the smallest class; each class is reduced to this many rows.
min_count = y.value_counts().min()
# NOTE(review): the strategy hard-codes labels 1..5 — assumes all five
# ratings occur in the data.
sampling_strategy = {i: min_count for i in range(1, 6)}
# Seed the sampler so the same rows are retained on every run. Without a
# random_state the selection changes between executions, which makes the
# saved train/test/fold files irreproducible despite the seeded shuffle.
undersampler = RandomUnderSampler(
    sampling_strategy=sampling_strategy,
    random_state=SEED,
)
X, y = undersampler.fit_resample(X, y)

# Re-attach the resampled labels to the resampled features.
df = X
df["rating"] = y

display(df["rating"].value_counts())
display(df)

1    647836
2    647836
3    647836
4    647836
5    647836
Name: rating, dtype: int64

Unnamed: 0,user_id,song_id,rating
0,e8607279b0f9dd6bad157c49c305ce1ed37e69a6,SOFPDCU12A6D4FD0DC,1
1,a97fd9b804624f3848c6182cd00244909e12415a,SOTRDTL12A58A78705,1
2,9c6fccdd38cf14b3023fced02ef5de5015c63a69,SOBGDOL12A6D4F6F21,1
3,9bd3d0e4dceadda0e697ed0988c0b67d33180622,SOHWFUW12A8C13ECCA,1
4,c2ad7a62d65a14fda6a867e56c1544b8215c4545,SOXHSLO12A6D4FCFF1,1
...,...,...,...
3239175,72188c977cf8dea39f7ca7289afc578644b3950f,SOIROON12A6701E0B8,5
3239176,a69fa52fd2c1e1509b4a6240cd059563a515f204,SOBJYFB12AB018372D,5
3239177,12072daab5401fc81062204d76d267215dd46127,SOSCWDQ12AC4688B77,5
3239178,3b7558e7112c0bfb4aac36ffde287158f9f00e89,SOXAZQK12A8C13FD84,5


# Shuffle Data


In [6]:
# Shuffle all rows (frac=1 keeps every row) with a fixed seed. The original
# index labels are kept, which is why the displayed index looks scrambled.
df = df.sample(
    random_state=SEED,
    frac=1,
)
df

Unnamed: 0,user_id,song_id,rating
2063916,7dde080207001e844690f67c9357e015a0491fc3,SORAKQP12A58A7D699,4
414696,99c483a27234281d69511eb2321267d77430aa6d,SOMPCSO12A8AE47351,1
1709923,a4da0992cd5c0982d05161c82d6aa0b3a5873b54,SOGOKAV12A8C138521,3
323813,1582c00c3e53b891a5a1d18da8b635967eabe61b,SOLWRZI12A6D4FC4F0,1
2642416,9d1b38a741ce012762918760b39c072d3e8cfc36,SOPRFNT12AB017F8E9,5
...,...,...,...
1692743,6ecf0e508a0ac41184c59d7268550feb1ebc13c6,SOYAIPB12A8C143D84,3
2356330,f2449d4b7e58856b38ff0f5384176abd1a61bb5f,SOLVRLL12A67020D7F,4
2229084,dc61155e20289dcd089ac40181cb88f042404602,SOKUAEP12A8C13BE19,4
2768307,6531ff6155ae897a0cab6318df972ed814384554,SOKCORQ12A58A7C74D,5


# Split Into Train/Test Sets


In [7]:
# Work out how many rows go to each side of the train/test split.
n = df.shape[0]
# int() truncates, so any fractional remainder ends up in the test set.
n_train_samples = int(TRAIN_PERCENT * n)
n_test_samples = n - n_train_samples

for label, count in (
    ("samples:", n),
    ("train samples:", n_train_samples),
    ("test samples:", n_test_samples),
):
    print(label, count)

samples: 3239180
train samples: 2915262
test samples: 323918


In [8]:
# The first n_train_samples rows (by position) of the shuffled frame form
# the training set; the index is renumbered from 0.
df_train = df.iloc[:n_train_samples].reset_index(drop=True)
df_train

Unnamed: 0,user_id,song_id,rating
0,7dde080207001e844690f67c9357e015a0491fc3,SORAKQP12A58A7D699,4
1,99c483a27234281d69511eb2321267d77430aa6d,SOMPCSO12A8AE47351,1
2,a4da0992cd5c0982d05161c82d6aa0b3a5873b54,SOGOKAV12A8C138521,3
3,1582c00c3e53b891a5a1d18da8b635967eabe61b,SOLWRZI12A6D4FC4F0,1
4,9d1b38a741ce012762918760b39c072d3e8cfc36,SOPRFNT12AB017F8E9,5
...,...,...,...
2915257,a6e0d54bdbbe00ad6d4945469671594b23531a64,SOSXLTC12AF72A7F54,5
2915258,1fa325996b6103facae6deca7a4a4589a39eabc6,SOVUFMS12AB0186822,2
2915259,c4a2dbac8aad122c374c0cdcaaddd300ae7850eb,SOPLVNE12A58A7AC5A,1
2915260,accc0e129e50d1cf7fab26e04b21cdbbbe5f08db,SOJUYXY12A8C143472,4


In [9]:
# Every row after the training slice (by position) forms the test set;
# the index is renumbered from 0.
df_test = df.iloc[n_train_samples:].reset_index(drop=True)
df_test

Unnamed: 0,user_id,song_id,rating
0,e4dea4adcf6ddb7799bbd4c39de312401ca903f6,SOUSOOB12A8C13371F,1
1,d20772a4da25f18e07f699d9caad97d6ce29c087,SOMAKIT12A58A7E292,1
2,14f5804fda727f975f2db17d9fe982173cf5be6a,SOAXGDH12A8C13F8A1,5
3,bdd7e12da4453b3194bf4821483113d0e9f18679,SOTKYBW12A8C13C3EA,5
4,a33a4287b68da98239be727bc4ad8c75f8e8d457,SORFXJO12A6D4FB614,1
...,...,...,...
323913,6ecf0e508a0ac41184c59d7268550feb1ebc13c6,SOYAIPB12A8C143D84,3
323914,f2449d4b7e58856b38ff0f5384176abd1a61bb5f,SOLVRLL12A67020D7F,4
323915,dc61155e20289dcd089ac40181cb88f042404602,SOKUAEP12A8C13BE19,4
323916,6531ff6155ae897a0cab6318df972ed814384554,SOKCORQ12A58A7C74D,5


# Split Train Into K-Fold


In [10]:
# Tag each training row with the index of the fold in which it serves as
# validation data, stratifying on the rating column.
skf = StratifiedKFold(n_splits=N_FOLDS)
fold_splits = skf.split(df_train, df_train["rating"])

for i, (train_idxs, val_idxs) in enumerate(fold_splits):
    print(train_idxs.shape, val_idxs.shape)
    # val_idxs are positions; map them to index labels before using .loc.
    val_labels = df_train.index[val_idxs]
    df_train.loc[val_labels, "fold"] = i

# The column is filled in piece by piece inside the loop; cast it to int
# once every row has been assigned a fold.
df_train["fold"] = df_train["fold"].astype(int)
df_train

(2332209,) (583053,)
(2332209,) (583053,)
(2332210,) (583052,)
(2332210,) (583052,)
(2332210,) (583052,)


Unnamed: 0,user_id,song_id,rating,fold
0,7dde080207001e844690f67c9357e015a0491fc3,SORAKQP12A58A7D699,4,0
1,99c483a27234281d69511eb2321267d77430aa6d,SOMPCSO12A8AE47351,1,0
2,a4da0992cd5c0982d05161c82d6aa0b3a5873b54,SOGOKAV12A8C138521,3,0
3,1582c00c3e53b891a5a1d18da8b635967eabe61b,SOLWRZI12A6D4FC4F0,1,0
4,9d1b38a741ce012762918760b39c072d3e8cfc36,SOPRFNT12AB017F8E9,5,0
...,...,...,...,...
2915257,a6e0d54bdbbe00ad6d4945469671594b23531a64,SOSXLTC12AF72A7F54,5,4
2915258,1fa325996b6103facae6deca7a4a4589a39eabc6,SOVUFMS12AB0186822,2,4
2915259,c4a2dbac8aad122c374c0cdcaaddd300ae7850eb,SOPLVNE12A58A7AC5A,1,4
2915260,accc0e129e50d1cf7fab26e04b21cdbbbe5f08db,SOJUYXY12A8C143472,4,4


# Save Datasets


In [11]:
# Write both splits to CSV without the index column (df_train carries the
# extra "fold" column).
for frame, out_path in (
    (df_train, "./data/train_ratings.csv"),
    (df_test, "./data/test_ratings.csv"),
):
    frame.to_csv(out_path, index=False)