In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import torch.nn as nn
import torch.utils.data as data_utils

In [9]:
# Report which accelerator the kernel sees. `get_device_name` raises when no
# CUDA device is present, so fall back to "cpu" to keep "Restart & Run All"
# working on machines without a GPU.
device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"
device_name

'GeForce GTX 1060'

In [67]:
class build_data(BaseEstimator, TransformerMixin):
    """Load a wide DataFrame and prepare train/test PyTorch ``DataLoader`` objects.

    ``__init__`` cuts the frame into three pieces:

    * ``x``  - columns at positions 26..146, treated as the time-series ("TS")
      block and linearly interpolated along each row in ``_na_fill``;
    * ``x2`` - columns whose name contains ``'Feature'`` and is not in ``drop``;
    * ``y``  - columns from position 147 onward; the last two are exposed by
      ``get_weights`` and the remaining ones are the regression targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is cast to ``float`` as a whole, so every column must
        be numeric.
    drop : collection of str
        Names of ``'Feature'`` columns to exclude from ``x2``.
    split_size : float, default 0.33
        ``test_size`` passed to ``train_test_split``.
    rand : int, default 22391
        ``random_state`` passed to ``train_test_split``.
    batch : int, default 1
        ``batch_size`` for both loaders.
    shuffle : bool, default True
        ``shuffle`` flag for both loaders.
    pin : bool, default True
        ``pin_memory`` flag for both loaders.
    """

    def __init__(self, df, drop, split_size=0.33, rand=22391, batch=1, shuffle=True, pin=True):
        self.rand = rand
        self.split_size = split_size
        self.batch = batch
        self.shuffle = shuffle
        self.pin = pin

        df = df.astype('float')

        ccols = [i for i in df.columns if 'Feature' in i]
        keep = [i for i in ccols if i not in drop]

        # Take explicit copies: the blocks are modified later (NaN filling) and
        # must not be views/flagged slices of the caller's frame.
        self.x = df.iloc[:, 26:147].copy()
        self.x2 = df.loc[:, keep].copy()
        self.y = df.iloc[:, 147:].copy()

    def _na_fill(self, mode):
        """Fill missing values and build ``self.x_fin`` (features first, then TS).

        Feature columns listed in ``mode`` are filled with their most frequent
        value, all other feature columns with their median. The TS block is
        linearly interpolated across columns (``axis=1``).

        NOTE(review): fill statistics are computed on the full data set, i.e.
        before the train/test split.
        """
        for i in self.x2.columns:
            if i in mode:
                self.x2[i] = self.x2[i].fillna(value=self.x2[i].mode()[0])
            else:
                self.x2[i] = self.x2[i].fillna(value=self.x2[i].median())

        self.x = self.x.interpolate(method='linear', axis=1)
        self.x_fin = pd.concat([self.x2, self.x], axis=1)

    def _split(self):
        """Split into train/test and separate feature, TS, target and weight parts."""
        X_train, X_test, y_train, y_test = train_test_split(
            self.x_fin, self.y, test_size=self.split_size, random_state=self.rand)

        # ``x_fin`` is [feature columns | TS columns]; derive the boundary from
        # the feature block instead of hard-coding it, so any ``drop`` list works.
        n_ft = self.x2.shape[1]

        # Separate Features and TS
        self.X_train_ts = X_train.iloc[:, n_ft:]
        self.X_test_ts = X_test.iloc[:, n_ft:]

        self.X_train_ft = X_train.iloc[:, :n_ft]
        self.X_test_ft = X_test.iloc[:, :n_ft]

        # Get Weights for MAE (last two target columns); the rest are the targets.
        self.test_wt, self.train_wt = np.asarray(y_test.iloc[:, -2:]), np.asarray(y_train.iloc[:, -2:])
        self.y_test, self.y_train = np.asarray(y_test.iloc[:, :-2]), np.asarray(y_train.iloc[:, :-2])

    def _scale(self, stsc, lab):
        """Fit scalers/encoders on the training split and apply them to both splits.

        Parameters
        ----------
        stsc : list of str
            Feature columns to standardise.
        lab : list of str
            Feature columns to one-hot encode.

        Feature columns named in neither list are dropped by the
        ``ColumnTransformer`` (its default ``remainder``).
        """
        # sparse_threshold=0 forces a dense array; otherwise a mostly one-hot
        # output may come back as a scipy sparse matrix, which np.concatenate
        # cannot combine with the dense TS block.
        ctrans = ColumnTransformer(
                    [('scale_all', StandardScaler(), stsc),
                     ('cats', OneHotEncoder(categories='auto'), lab)],
                    sparse_threshold=0)

        xtsc = StandardScaler()
        ytsc = StandardScaler()

        self.X_train_ft = ctrans.fit_transform(self.X_train_ft)
        self.X_test_ft = ctrans.transform(self.X_test_ft)
        self.X_train_ts = xtsc.fit_transform(self.X_train_ts)
        self.X_test_ts = xtsc.transform(self.X_test_ts)

        self.x_train = np.concatenate([self.X_train_ft, self.X_train_ts], axis=1)
        self.x_test = np.concatenate([self.X_test_ft, self.X_test_ts], axis=1)

        self.y_train = ytsc.fit_transform(self.y_train)
        self.y_test = ytsc.transform(self.y_test)

        # Kept for ``reverse_trans``. ``xtrans`` covers the TS block only, not
        # the encoded feature columns.
        self.xtrans = xtsc
        self.ytrans = ytsc

    def fit(self, mode, stsc, lab):
        """Run the whole pipeline and return ``(train_loader, test_loader)``.

        Parameters
        ----------
        mode : collection of str
            Feature columns whose NaNs are filled with the mode (others: median).
        stsc : list of str
            Feature columns to standardise.
        lab : list of str
            Feature columns to one-hot encode.

        Returns
        -------
        tuple of torch.utils.data.DataLoader
            Loaders over float32 ``(x, y)`` tensors for the train and test split.

        NOTE(review): both loaders use ``self.shuffle``; when it is True the
        batch order does not follow the row order of ``get_weights()``.
        """
        self._na_fill(mode)
        self._split()
        self._scale(stsc, lab)

        torch_x_train, torch_y_train = torch.from_numpy(self.x_train).float(), torch.from_numpy(self.y_train).float()
        torch_x_test, torch_y_test = torch.from_numpy(self.x_test).float(), torch.from_numpy(self.y_test).float()

        train = data_utils.TensorDataset(torch_x_train, torch_y_train)
        test = data_utils.TensorDataset(torch_x_test, torch_y_test)

        train_loader = data_utils.DataLoader(train, batch_size=self.batch, shuffle=self.shuffle, pin_memory=self.pin)
        test_loader = data_utils.DataLoader(test, batch_size=self.batch, shuffle=self.shuffle, pin_memory=self.pin)

        return train_loader, test_loader

    def get_weights(self):
        """Return ``(train_wt, test_wt)``, the last two target columns of each split."""
        return self.train_wt, self.test_wt

    def reverse_trans(self, x=False, y=False):
        """Undo the standard scaling applied in ``_scale``.

        Parameters
        ----------
        x : array-like or False
            Scaled TS values (same column count as the TS block the x-scaler
            was fit on). ``False`` means "not supplied".
        y : array-like or False
            Scaled target values. ``False`` means "not supplied".

        Returns
        -------
        The inverse-transformed ``x``, the inverse-transformed ``y``, a tuple
        ``(x, y)`` when both are supplied, or ``None`` when neither is.
        """
        has_x = x is not False
        has_y = y is not False

        # The combined case must be tested first; otherwise the single-argument
        # branches return early and the tuple form can never be reached.
        if has_x and has_y:
            return self.xtrans.inverse_transform(x), self.ytrans.inverse_transform(y)
        if has_x:
            return self.xtrans.inverse_transform(x)
        if has_y:
            return self.ytrans.inverse_transform(y)
        return None
        

In [68]:
# Location of the training CSV.
# NOTE(review): absolute, machine-specific path - consider a configurable DATA_DIR.
path = r"C:\Users\rlagr\fin\winton\data\train.csv"
# Feature columns left out of the model input.
exclude = ['Feature_1', 'Feature_10']
# Feature columns whose NaNs are filled with the mode (all others use the median).
mode = ['Feature_9', 'Feature_13', 'Feature_16', 'Feature_20']
# Feature columns that are one-hot encoded.
cats = ['Feature_5', 'Feature_13', 'Feature_16', 'Feature_20']
# Feature columns that are standardised.
scale = ['Feature_2', 'Feature_3', 'Feature_4', 'Feature_6', 'Feature_8', 'Feature_11', 'Feature_12', 'Feature_14', 'Feature_17', 'Feature_18',
         'Feature_19', 'Feature_21', 'Feature_22', 'Feature_23', 'Feature_24', 'Feature_25', 'Feature_7', 'Feature_9', 'Feature_15']

# Load the raw frame only if it is not already in the kernel: this avoids
# re-reading the CSV on every re-run while still working from a fresh kernel
# (previously `df` was never defined because the read was commented out).
if 'df' not in globals():
    df = pd.read_csv(path)
data = build_data(df, exclude)

train_loader, test_loader = data.fit(mode, scale, cats)

In [66]:
# Sanity check: map one random row from scaled space back through the
# time-series scaler. A local seeded RandomState makes the displayed values
# reproducible without touching NumPy's global RNG state.
test = np.random.RandomState(22391).randn(1, 121)  # 121 = width of the TS block (columns 26..146)
test2 = data.reverse_trans(x=test)
test2

array([[ 1.21894779e-02,  1.73349014e-02, -2.91885102e-04,
         8.30703863e-04,  2.09945252e-03,  4.92398669e-04,
         3.38496041e-04, -1.53148135e-03, -7.20635039e-04,
        -6.81625984e-04,  1.79684565e-04,  3.00475905e-04,
        -2.56438979e-04,  6.84541312e-05, -7.58982448e-04,
        -8.24113657e-04, -2.24700064e-04, -3.20417908e-04,
        -5.90317011e-04, -9.34957083e-05,  9.66957408e-04,
        -2.08054895e-04, -4.36955712e-04, -1.63127325e-03,
        -6.83008882e-04,  1.59965161e-03,  2.20838620e-04,
        -2.07533869e-03, -1.02349472e-03, -3.37716242e-04,
         8.77656203e-04, -4.07518496e-04,  8.47507454e-04,
         1.29475068e-04,  7.27885672e-04, -6.95361844e-04,
        -5.44769920e-04,  6.50712376e-06, -6.65644611e-04,
         3.95128886e-05,  9.94189975e-04, -5.07781482e-04,
         6.07276109e-04,  9.06447970e-05,  1.91861123e-04,
         1.01588382e-03, -3.71109506e-04, -1.10269846e-03,
        -1.39045539e-03, -4.13796597e-04, -5.91094770e-0