In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Directory that holds the raw datasets and receives the train/test .npz files.
# Built with os.path.join so the separator matches the host OS
# (yields '.\data' on Windows, './data' elsewhere).
datapath = os.path.join('.', 'data')

## Slice Localization data split

In [3]:
# Load the data: slice_localization
# os.path.join avoids the invalid '\s' escape of the old string literal and
# keeps the path separator OS-independent.
df = pd.read_csv(os.path.join(datapath, 'slice_localization_data.csv'))

In [4]:
# Preview the first rows of the slice-localization frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,patientId,value0,value1,value2,value3,value4,value5,value6,value7,value8,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.980381,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.803851
1,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.745726
2,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.6876
3,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.629474
4,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.976833,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.571348


In [5]:
# Slice data train/test split: rows of one patient must never be spread over
# both sets, so the split is made on patient ids and then mapped back to rows.
u_pid = np.unique(df['patientId'])
print(u_pid, len(u_pid))

# ~80/20 split on patients: hold out every 5th entry of the sorted unique ids.
# test_idx holds POSITIONS into u_pid (4, 9, 14, ...), not patient ids.
test_idx = np.arange(4, len(u_pid), 5)
print(test_idx)

# Translate positions into actual patient ids before matching rows. Matching
# rows against the positions directly is only correct when the ids happen to
# be exactly 0..N-1.
test_pid = u_pid[test_idx]
is_test_row = np.isin(df['patientId'], test_pid)
test = np.flatnonzero(is_test_row)
train = np.flatnonzero(~is_test_row)

# Every row must end up in exactly one of the two sets.
assert len(test) + len(train) == len(df)
print(len(test), len(train))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96] 97
[ 4  9 14 19 24 29 34 39 44 49 54 59 64 69 74 79 84 89 94]
7675 45825


In [8]:
# Positional column layout used for the slice data: columns 1..384 are taken
# as features and column 385 as the regression target.
# NOTE(review): presumably column 0 is patientId and column 385 is 'reference',
# judging by the frame preview; confirm against the CSV.
FEATURE_COLS = slice(1, 385)
TARGET_COL = 385

X_train = df.iloc[train, FEATURE_COLS].to_numpy()
y_train = df.iloc[train, TARGET_COL].to_numpy()
X_test = df.iloc[test, FEATURE_COLS].to_numpy()
y_test = df.iloc[test, TARGET_COL].to_numpy()
print(X_train.shape, X_test.shape)

(45825, 384) (7675, 384)


In [None]:
# Save the train and test splits as .npz archives; each archive stores the
# feature matrix under 'features' and the targets under 'labels'.
# os.path.join avoids the invalid '\s' escape of the old string literals.
np.savez(os.path.join(datapath, 'slice_localization_tr.npz'), features=X_train, labels=y_train)
np.savez(os.path.join(datapath, 'slice_localization_te.npz'), features=X_test, labels=y_test)

## Music Year Prediction data split

In [9]:
# Load the data: YearsPrediction
# The file is read with header=None, so columns get integer labels 0, 1, 2, ...
# os.path.join avoids the invalid '\Y' escape of the old string literal.
df = pd.read_csv(os.path.join(datapath, 'YearPredictionMSD.txt'), header=None)

In [10]:
# YearsPrediction data: column 0 is used as the target, every remaining
# column as a feature.
target = df[0]
data = df.iloc[:, 1:]

# Preview the first five feature rows.
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
0,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,-2.46783,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,4.5921,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,1.39518,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,-6.36304,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,0.93609,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [11]:
# YearsPrediction split: the first `cut` rows form the training set and the
# remaining rows form the test set (no shuffling).
cut = 463715  # From the website

X_train = data.iloc[:cut].to_numpy()
y_train = target.iloc[:cut].to_numpy()
X_test = data.iloc[cut:].to_numpy()
y_test = target.iloc[cut:].to_numpy()

print(X_train.shape, X_test.shape)


(463715, 90) (51630, 90)


In [12]:
# Save the train and test splits as .npz archives; each archive stores the
# feature matrix under 'features' and the targets under 'labels'.
# os.path.join avoids the invalid '\Y' escape of the old string literals.
np.savez(os.path.join(datapath, 'YearPredictionMSD_tr.npz'), features=X_train, labels=y_train)
np.savez(os.path.join(datapath, 'YearPredictionMSD_te.npz'), features=X_test, labels=y_test)