In [None]:
import pandas as pd
import os

Loading the activity training dataset

In [None]:
# Locate act_train.csv in the directory that os.path.realpath('') resolves to
# (the current working directory). os.path.join is used instead of manual
# string concatenation so the path separator is correct on every platform.
act_train_location = os.path.join(os.path.realpath(''), 'act_train.csv')
df_act_train = pd.read_csv(act_train_location)
# Show the first rows as a quick sanity check of the load.
df_act_train.head()

Loading the activity test dataset

In [None]:
# Locate act_test.csv in the directory that os.path.realpath('') resolves to
# (the current working directory). os.path.join is used instead of manual
# string concatenation so the path separator is correct on every platform.
act_test_location = os.path.join(os.path.realpath(''), 'act_test.csv')
df_act_test = pd.read_csv(act_test_location)
# Show the first rows as a quick sanity check of the load.
df_act_test.head()

Loading the people dataset

In [None]:
# Locate people.csv in the directory that os.path.realpath('') resolves to
# (the current working directory). os.path.join is used instead of manual
# string concatenation so the path separator is correct on every platform.
people_location = os.path.join(os.path.realpath(''), 'people.csv')
df_people = pd.read_csv(people_location)

Changing column names to differentiate activity characteristics from people characteristics

In [None]:
def add_column_prefix(columns, prefix):
    """Return the column names with `prefix` added to 'char' and 'date' columns.

    Parameters
    ----------
    columns : iterable of str
        Existing column names, in order.
    prefix : str
        Prefix to prepend (e.g. 'a_' for activity data, 'p_' for people data).

    Returns
    -------
    list of str
        New column names in the same order. A name is prefixed only when it
        contains 'char' or 'date' and does not already start with `prefix`;
        every other name is returned unchanged.
    """
    renamed = []
    for colname in columns:
        is_characteristic = 'char' in colname or 'date' in colname
        # Skip names that already carry the prefix so re-running this cell
        # does not produce 'a_a_date' / 'p_p_date'.
        if is_characteristic and not colname.startswith(prefix):
            renamed.append(prefix + colname)
        else:
            renamed.append(colname)
    return renamed


# Activity frames get the 'a_' prefix and the people frame gets 'p_', so the
# identically named 'char_*' and 'date' columns stay distinguishable once the
# frames are joined.
df_act_train.columns = add_column_prefix(df_act_train.columns, 'a_')
df_people.columns = add_column_prefix(df_people.columns, 'p_')
df_act_test.columns = add_column_prefix(df_act_test.columns, 'a_')

Setting the index of all three dataframes to `people_id` before performing the join

In [None]:
# Re-index all three frames by 'people_id'; DataFrame.join in the next step
# matches rows on the index.
df_act_train, df_people, df_act_test = [
    frame.set_index('people_id')
    for frame in (df_act_train, df_people, df_act_test)
]

Preparing the training and test datasets by joining them with people information

In [None]:
# Attach the people columns to every activity row. DataFrame.join matches on
# the index of both frames; how='left' (the default, spelled out here) keeps
# every activity row.
df_train = df_act_train.join(df_people, how='left')
df_test = df_act_test.join(df_people, how='left')

Fill missing values (NaN) in both joined datasets with the placeholder category 'type -1'

In [None]:
# Replace every missing value with the placeholder string 'type -1', in place,
# for the test frame first and then the training frame.
for joined_frame in (df_test, df_train):
    joined_frame.fillna(value='type -1', inplace=True)

Resetting index to activity id

In [None]:
# Move the current index back into a regular column, then index each frame
# by 'activity_id' instead.
df_train = df_train.reset_index().set_index('activity_id')
df_test = df_test.reset_index().set_index('activity_id')

Concatenate training and test datasets before using label encoder

In [None]:
# Stack the training rows on top of the test rows (row-wise, axis=0) so that
# both are label-encoded together with one shared set of codes per column.
df_comp = pd.concat((df_train, df_test), axis=0)

Importing LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
import numpy as np
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Each column is encoded independently with a fresh ``LabelEncoder``; the
    fitted encoders are not stored, so the integer codes cannot be mapped
    back to the original labels through this object.

    Parameters
    ----------
    columns : sequence of str, optional
        Names of the columns to encode. When ``None`` (the default) every
        column of the frame passed to ``transform`` is encoded.
    """

    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """Do nothing and return ``self``; encoders are fitted in ``transform``."""
        return self  # not relevant here

    def transform(self, X):
        """Return an integer array holding the label codes of the chosen columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns to encode.

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(len(X), number of encoded columns)``.
            Column ``i`` holds the codes of the ``i``-th encoded column, in the
            order of ``self.columns`` (or of ``X.columns`` when ``self.columns``
            is ``None``).
        """
        target_columns = X.columns if self.columns is None else self.columns
        # Size the result by the number of columns actually encoded. Allocating
        # with X.shape would leave uninitialised np.empty garbage in the
        # trailing columns whenever only a subset of columns is requested.
        output = np.empty((X.shape[0], len(target_columns)), dtype=int)
        for i, col in enumerate(target_columns):
            output[:, i] = LabelEncoder().fit_transform(X[col])
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` and then ``transform`` on ``X``."""
        return self.fit(X, y).transform(X)

In [None]:
# Label-encode every column of the combined frame except the two date columns,
# the 'outcome' column and 'p_char_38', which are dropped before encoding.
chk = MultiColumnLabelEncoder().fit_transform(
    df_comp.drop(columns=['a_date', 'outcome', 'p_date', 'p_char_38'])
)

In [None]:
# Append 'p_char_38' (left out of the label encoding) as the last column.
# Series.reshape is not available in current pandas, so the underlying NumPy
# array is reshaped into a single column instead.
# NOTE: this cell is not idempotent - running it twice appends the column twice.
chk = np.append(chk, df_comp['p_char_38'].values.reshape(-1, 1), axis=1)

In [None]:
import h5py

Saving label encoded data

In [None]:
# Write the encoded array to 'data.h5' under the dataset name 'dataset_1'.
# Mode 'w' creates the file, replacing any existing one; the context manager
# closes it when the block exits.
with h5py.File('data.h5', mode='w') as h5_out:
    h5_out.create_dataset('dataset_1', data=chk)