In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [2]:
# Load the raw train ratings, test ratings and survey answers. All three files
# are tab-separated and have no header row, so pandas labels the columns with
# integers starting at 0.
train_table_raw = pd.read_csv(
    "ydata-ymusic-rating-study-v1_0-train.txt",
    sep="\t",
    header=None,
)
test_table_raw = pd.read_csv(
    "ydata-ymusic-rating-study-v1_0-test.txt",
    sep="\t",
    header=None,
)
user_feature = pd.read_csv(
    "ydata-ymusic-rating-study-v1_0-survey-answers.txt",
    sep="\t",
    header=None,
)

In [3]:
# Number of rows in the survey table. Later cells treat this as the number of
# users that have features and use it as an upper bound on user_id.
n_users_with_feat = len(user_feature)
n_users_with_feat

5400

In [4]:
# Convert each loaded table to a plain NumPy array, in the order
# train ratings, test ratings, survey answers.
train_array, test_array, user_feature_array = (
    table.to_numpy() for table in (train_table_raw, test_table_raw, user_feature)
)

In [5]:
# Wrap the two rating arrays in DataFrames with named columns. The frame built
# from the test array is stored under the name `random_df`.
train_df, random_df = (
    pd.DataFrame(ratings, columns=["user_id", "item_id", "rating"])
    for ratings in (train_array, test_array)
)

In [6]:
# Peek at the training ratings as loaded; the IDs are still the raw values
# from the file (the next cell shifts user_id and item_id down by one).
train_df

Unnamed: 0,user_id,item_id,rating
0,1,14,5
1,1,35,1
2,1,46,1
3,1,83,1
4,1,93,1
...,...,...,...
311699,15400,564,5
311700,15400,578,1
311701,15400,637,5
311702,15400,884,1


In [7]:
# Shift the user and item IDs down by one so that the raw IDs (which start at 1
# in the displayed data) start at 0.
#
# Both frames are rebuilt from the source arrays and shifted with `assign`
# rather than modified with `-=`. The in-place form subtracted one more every
# time this cell was re-run, and, depending on the pandas version and its
# copy-on-write setting, could also write through to `train_array` /
# `test_array`, because a frame built from an ndarray may share its memory.
# `assign` works on a copy, so the arrays stay untouched and the cell is
# idempotent.
train_df = pd.DataFrame(train_array, columns=["user_id", "item_id", "rating"]).assign(
    user_id=lambda ratings: ratings["user_id"] - 1,
    item_id=lambda ratings: ratings["item_id"] - 1,
)
random_df = pd.DataFrame(test_array, columns=["user_id", "item_id", "rating"]).assign(
    user_id=lambda ratings: ratings["user_id"] - 1,
    item_id=lambda ratings: ratings["item_id"] - 1,
)

In [8]:
# Keep only the ratings whose user_id is below the number of survey rows.
# NOTE(review): this assumes survey row i belongs to user_id i after the
# shift to 0-based IDs; nothing in this notebook verifies that.
# The original row index is kept (no reset_index).
train_has_feat = train_df["user_id"].lt(n_users_with_feat)
train_df = train_df[train_has_feat]

random_has_feat = random_df["user_id"].lt(n_users_with_feat)
random_df = random_df[random_has_feat]

In [9]:
# Re-inspect the training ratings after the ID shift and the user filter.
train_df

Unnamed: 0,user_id,item_id,rating
0,0,13,5
1,0,34,1
2,0,45,1
3,0,82,1
4,0,92,1
...,...,...,...
129174,5399,818,5
129175,5399,827,5
129176,5399,930,5
129177,5399,966,5


In [10]:
# Persist both filtered rating tables as CSV. `index=False` drops the row
# index, so only user_id, item_id and rating are written.
train_df.to_csv(
    path_or_buf="train.csv",
    index=False,
)
random_df.to_csv(
    path_or_buf="random.csv",
    index=False,
)

In [11]:
# Read the file written by the previous cell back in as a sanity check; the
# result is only displayed, not stored in a variable.
pd.read_csv("random.csv")

Unnamed: 0,user_id,item_id,rating
0,0,48,1
1,0,125,1
2,0,137,1
3,0,140,1
4,0,176,1
...,...,...,...
53995,5399,631,1
53996,5399,682,3
53997,5399,685,1
53998,5399,783,4


In [12]:
# Show the raw survey answers. The columns carry integer labels because the
# file was read with header=None.
user_feature

Unnamed: 0,0,1,2,3,4,5,6
0,5,5,4,3,5,5,2
1,5,4,4,4,5,5,1
2,5,5,5,5,5,5,1
3,5,5,5,5,5,5,2
4,5,5,5,5,5,5,2
...,...,...,...,...,...,...,...
5395,4,5,5,5,5,5,1
5396,5,5,5,5,5,5,2
5397,5,5,5,5,5,5,1
5398,5,5,5,4,5,5,1


In [13]:
# Start the one-hot table with the indicator columns of the first survey
# column (label 0); the following cell appends the remaining columns.
first_question = user_feature.loc[:, 0]
user_feat_onehot = pd.get_dummies(first_question)

In [14]:
# One-hot encode every survey column and place the indicator blocks side by
# side in column order (column 0 first).
#
# The table is built in a single concat directly from `user_feature` instead
# of appending to `user_feat_onehot` inside a loop:
#   * re-running the cell no longer stacks another copy of the columns onto
#     the previous result;
#   * the frame is not re-copied on every iteration;
#   * the set of columns comes from the frame rather than a hard-coded
#     range(1, 7).
# On a fresh top-to-bottom run the result is identical to the looped version.
# NOTE: the indicator columns are labelled by answer value, so the same labels
# repeat across survey columns, exactly as before.
user_feat_onehot = pd.concat(
    [pd.get_dummies(user_feature[question]) for question in user_feature.columns],
    axis=1,
)

In [15]:
# Inspect the combined one-hot table built from the survey columns.
user_feat_onehot

Unnamed: 0,1,2,3,4,5,1.1,2.1,3.1,4.1,5.1,...,3.2,4.2,5.2,1.2,2.2,3.3,4.3,5.3,1.3,2.3
0,False,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,True
1,False,False,False,False,True,False,False,False,True,False,...,False,False,True,False,False,False,False,True,True,False
2,False,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,True,False
3,False,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,True
4,False,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5395,False,False,False,True,False,False,False,False,False,True,...,False,False,True,False,False,False,False,True,True,False
5396,False,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,True
5397,False,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,True,False
5398,False,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,True,False


In [16]:
# Write the one-hot survey features without the row index.
user_feat_onehot.to_csv(
    "user_feat_onehot.csv",
    index=False,
)

# Write the survey answers as labels: every value is reduced by one before
# saving, again without the row index.
user_feat_label = user_feature.sub(1)
user_feat_label.to_csv(
    "user_feat_label.csv",
    index=False,
)