In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
%matplotlib inline
sns.set_style('darkgrid')
init_notebook_mode(connected=True)
cf.go_offline()

  import pandas.util.testing as tm


In [2]:
data = pd.read_csv('train/train.csv')

In [3]:
categorical = ['Feature_5', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_12', 'Feature_13', 'Feature_15', 'Feature_20']
numerical = ['Feature_2', 'Feature_3', 'Feature_4', 'Feature_6',
             'Feature_11', 'Feature_14', 'Feature_17',
             'Feature_18', 'Feature_19','Feature_21', 'Feature_22',
             'Feature_23', 'Feature_24', 'Feature_25']

In [4]:
def label_target_split(data):
    drop_feat = ['Id', 'Feature_1', 'Feature_10', 'Feature_16']
    data.drop(axis=1, columns=drop_feat, inplace=True)
    
    ret_agg_label=[]
    for f in range(2,121):
        ret_agg_label.append(f'Ret_{f}')
    ret_agg_target=[]
    for f in range(121,181):
        ret_agg_target.append(f'Ret_{f}')
    data['RetAgg_sum_label']=data[ret_agg_label].sum(axis=1)
    data['RetAgg_std_label']=data[ret_agg_label].std(axis=1)
    data['Ret_std_label']=data[['Ret_MinusOne','Ret_MinusTwo','RetAgg_sum_label']].std(axis=1)
    data['RetAgg_sum_target']=data[ret_agg_target].sum(axis=1)
    data['RetAgg_std_target']=data[ret_agg_target].std(axis=1)
    data['Ret_std_target']=data[['Ret_PlusOne','Ret_PlusTwo','RetAgg_sum_target']].std(axis=1)
    data.drop(columns=ret_agg_label+ret_agg_target, axis=1, inplace=True)
    
    targets = ['Ret_PlusOne', 'Ret_PlusTwo', 'RetAgg_sum_target', 'RetAgg_std_target', 'Ret_std_target']
    X = data.drop(axis = 1, columns=targets+['Weight_Intraday', 'Weight_Daily'])
    y = data[targets]
    
    return X, y

X, y = label_target_split(data)

In [5]:
class numeric_feature_selector(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._ft = numerical
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self._ft]

class to_df(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return pd.DataFrame(X)
    
class format_categorical(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._ft = categorical
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X[self._ft]
        return X.astype('object')

class get_dummies(BaseEstimator, TransformerMixin):
    def __init__(self, cat=categorical, drop=True):
        self._drop = drop
        self._cat = cat
    def fit(self,X ,y=None):
        return self
    def transform(self, X, y=None):
        X = pd.DataFrame(X, columns=self._cat)
        X = pd.get_dummies(X, drop_first=self._drop)
        return X
    
class ret(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._ret_ft = ['Ret_MinusTwo', 'Ret_MinusOne', 'RetAgg_sum_label', 'RetAgg_std_label', 'Ret_std_label']
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self._ret_ft]

In [6]:
numeric_transformer = Pipeline([('selector', numeric_feature_selector()), 
                                ('imputer', SimpleImputer(strategy='median')),
                                ('scaler', MinMaxScaler(feature_range=(0,1))),
                                ('pca', PCA(n_components=12)),
                                ('to_df', to_df())])

categorical_transformer = Pipeline([('format', format_categorical()),
                                    ('impute', SimpleImputer(strategy='most_frequent')),
                                    ('encode', get_dummies(drop=True))])

In [12]:
preprocessor = FeatureUnion([('numerical_transformer', numeric_transformer),
                             ('categorical_transformer', categorical_transformer),
                             ('Return values', ret())])
X = preprocessor.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
model = MultiOutputRegressor(estimator=MLPRegressor(activation='tanh',
                                                    early_stopping=True,
                                                    hidden_layer_sizes=(100,),
                                                    learning_rate='adaptive',
                                                    max_iter=1000))

In [19]:
model.fit(X_train, y_train)

MultiOutputRegressor(estimator=MLPRegressor(activation='tanh', alpha=0.0001,
                                            batch_size='auto', beta_1=0.9,
                                            beta_2=0.999, early_stopping=True,
                                            epsilon=1e-08,
                                            hidden_layer_sizes=(100,),
                                            learning_rate='adaptive',
                                            learning_rate_init=0.001,
                                            max_fun=15000, max_iter=1000,
                                            momentum=0.9, n_iter_no_change=10,
                                            nesterovs_momentum=True,
                                            power_t=0.5, random_state=None,
                                            shuffle=True, solver='adam',
                                            tol=0.0001, validation_fraction=0.1,
                                            ver

In [20]:
y_pred = model.predict(X_test)
y_pred_df=pd.DataFrame(y_pred, columns=targets)

In [21]:
mean_absolute_error(y_test, y_pred)

0.007685260778947112

In [23]:
RetPlusOne = pd.DataFrame()
RetPlusOne['y_test'] = y_test.reset_index()['Ret_PlusOne']
RetPlusOne['y_pred'] = y_pred_df['Ret_PlusOne']

RetPlusTwo = pd.DataFrame()
RetPlusTwo['y_test'] = y_test.reset_index()['Ret_PlusTwo']
RetPlusTwo['y_pred'] = y_pred_df['Ret_PlusTwo']

In [24]:
RetPlusOne.iplot()

In [25]:
RetPlusTwo.iplot()