### Generate data files

In [1]:
import numpy as np
import pandas as pd
import h5py
from scipy import stats
import pickle

### Read rating matrix

In [2]:
# Load the training rating DataFrame from its HDF5 store.
# The store is opened read-only inside a `with` block so the file handle is
# closed as soon as the frame is in memory; a bare pd.HDFStore(...) call leaves
# the file open (in append mode) for the rest of the kernel session.
with pd.HDFStore('ratingDF_tr.h5', mode='r') as store:
    df_rating_tr = store['df_rating_tr']

In [3]:
# Load the validation rating DataFrame from its HDF5 store.
# Read-only context manager: the file handle is released right after the read
# instead of staying open (in append mode) for the rest of the kernel session.
with pd.HDFStore('ratingDF_val.h5', mode='r') as store:
    df_rating_val = store['df_rating_val']

In [4]:
# Show the (rows, columns) shape of the training rating DataFrame.
df_rating_tr.shape

(1167015, 3)

In [5]:
# Show the (rows, columns) shape of the validation rating DataFrame.
df_rating_val.shape

(988707, 3)

### Read the matrix form of the rating matrix generated by 'rating_matrix.py'

In [7]:
# Load the training rating matrix stored under the key 'rating_mat'.
# Read-only context manager so the HDF5 file is closed once the matrix is
# loaded; the bare pd.HDFStore(...) call never closed it.
with pd.HDFStore('rating_mat_tr.h5', mode='r') as store:
    rating_tr = store['rating_mat']

In [8]:
# Load the validation rating matrix stored under the key 'rating_mat'.
# Read-only context manager so the HDF5 file is closed once the matrix is
# loaded; the bare pd.HDFStore(...) call never closed it.
with pd.HDFStore('rating_mat_val.h5', mode='r') as store:
    rating_val = store['rating_mat']

In [9]:
# Show the (rows, columns) shape of the training rating matrix.
rating_tr.shape

(551605, 24)

In [10]:
# Show the (rows, columns) shape of the validation rating matrix.
rating_val.shape

(551605, 24)

### Generate a small subset

In [11]:
# Small training subset: the first 50000 rows of the training rating matrix.
rating_tr_short = rating_tr.iloc[:50000]

In [12]:
# Small validation subset: the first 50000 rows of the validation rating matrix.
rating_val_short = rating_val.iloc[:50000]

### Store the subsets in HDF5 files

In [13]:
# Write the training subset's underlying array to an HDF5 dataset named "rating".
# Mode 'w' creates the file, overwriting any existing one.
with h5py.File('rating_tr_numpy.h5', mode='w') as hf:
    hf.create_dataset(name="rating", data=rating_tr_short.values)

In [14]:
# Write the validation subset's underlying array to an HDF5 dataset named "rating".
# Mode 'w' creates the file, overwriting any existing one.
with h5py.File('rating_val_numpy.h5', mode='w') as hf:
    hf.create_dataset(name="rating", data=rating_val_short.values)

### Generate the encoding of user information

In [15]:
# Load the user-information frame stored under the key 'df_autoenc'.
# Read-only context manager so the HDF5 file is closed once the frame is
# loaded; the bare pd.HDFStore(...) call never closed it.
with pd.HDFStore('autoenc_inp.h5', mode='r') as store:
    df_autoenc = store['df_autoenc']

In [16]:
# Row labels of the training subset, copied into a plain NumPy array.
# They are matched against 'ncodpers' below, so they are presumably user ids.
index = np.array(rating_tr_short.index)
# Keep only the user-information rows whose 'ncodpers' appears in that array.
df = df_autoenc[df_autoenc['ncodpers'].isin(index)]

In [17]:
# Order the user-information rows by ascending 'ncodpers' so they can be
# compared position by position with `index`.
df = df.sort_values(by='ncodpers')

In [18]:
# Element-wise check that the sorted 'ncodpers' values in `df` line up with
# `index`. The lengths are compared first: with mismatched shapes `==` does not
# produce an element-wise result (depending on the NumPy version it yields a
# single False or raises), so fail early with an explicit message instead.
if len(df) != len(index):
    raise ValueError(
        "df has %d rows but index has %d entries; 'ncodpers' cannot be aligned"
        % (len(df), len(index))
    )
judge = (df['ncodpers'].values == index)

In [19]:
# Show every False entry of `judge`; an empty array means all positions matched.
judge[~judge]

array([], dtype=bool)

In [20]:
# Number of columns in `xtrain`, i.e. the width of one user encoding
# (presumably the input-layer size of the downstream model — confirm).
INPUT_LAYER = 314
xtrain = np.zeros((df.shape[0], INPUT_LAYER), dtype=np.int64)
# Materialise the frame as an array once. Evaluating `df.values` inside the
# loop can rebuild the whole array on every iteration, which makes the loop
# quadratic in the number of rows.
df_rows = df.values
for i in range(df.shape[0]):
    # Column 1 of each row is copied into row i of `xtrain`; NumPy casts the
    # assigned value to int64.
    xtrain[i] = df_rows[i][1]

In [21]:
# Persist the user-encoding matrix as an HDF5 dataset named "infor".
# Mode 'w' creates the file, overwriting any existing one.
with h5py.File('user_infor.h5', mode='w') as hf:
    hf.create_dataset(name="infor", data=xtrain)

In [22]:
# Read the whole "infor" dataset back from 'user_infor.h5' into memory.
with h5py.File('user_infor.h5', mode='r') as hf:
    trying = hf['infor'][:]

### Final Dataset Split

In [23]:
# Validation split: the 50000 rows at positions 50001..100000 of `rating_val`.
# NOTE(review): the small subset taken earlier is head(50000), i.e. positions
# 0..49999, so position 50000 ends up in neither set — confirm whether
# iloc[50000:100000] was intended.
rating_val_new=rating_val.iloc[50001:100001]

In [24]:
# Training split: the 50000 rows at positions 50001..100000 of `rating_tr`.
# NOTE(review): the small subset taken earlier is head(50000), i.e. positions
# 0..49999, so position 50000 ends up in neither set — confirm whether
# iloc[50000:100000] was intended.
rating_tr_new=rating_tr.iloc[50001:100001]

In [25]:
# Serialize the training split to 'training_data.pkl' with pickle
# (binary write mode; an existing file is overwritten).
with open('training_data.pkl', mode='wb') as output:
    pickle.dump(rating_tr_new, output)