In [29]:
from kagglegym import make
import kagglegym
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import linear_model
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
class Model():
    def __init__(self, model, columns):

        self.model   = model
        self.columns = columns
        
    def train_with_validation(self, train, validation):
        # Get the X, and y values, 
        X_train = train[self.columns]
        y_train = np.array(train.y)
        X_val = validation[self.columns]
        y_val = np.array(validation.y)
        
        self.model.fit(X_train, y_train)
        reward = get_reward(y_val, self.model.predict(X_val))
        score = self.model.score(X_val, y_val)
        
        print "{} validation: {} score: {}".format(
            str(self.model).split('(')[0], 
            "{0:.8f}".format(reward),
            "{0:.8f}".format(score))
        
    def train(self, train):
        X_train = train[self.columns]
        y_train = np.array(train.y)
        self.model.fit(X_train, y_train)
        score = self.model.score(X_train, y_train)
        print "{} trained successfully with score: {}".format(
            str(self.model).split('(')[0], 
            "{0:.8f}".format(score))
    
    def predict(self, features):   
        return self.model.predict(features[self.columns])

In [3]:
# initialize kaggle gym environment
env = make()
o = env.reset()

In [4]:
raw_train = o.train.fillna(method='ffill')

In [5]:
# split train and validation sets
train_split = 700
train = raw_train[raw_train['timestamp'] < train_split]
val = raw_train[raw_train['timestamp'] > train_split]
X_val = val.drop('y', 1)
y_val = np.array(val.y)

In [6]:
model_dict = {}

In [7]:
ENcolumns = ['technical_30', 'technical_20', 'technical_40',  ]
model_dict['elasticNet'] = Model(linear_model.ElasticNetCV(normalize=True), ENcolumns)

In [8]:
Rcolumns = ['technical_7', 'technical_20','technical_30', 'technical_40', 'fundamental_8' ]
model_dict['ridge'] = Model(linear_model.Ridge(normalize=True), Rcolumns)

In [9]:
Lcolumns = ['technical_7', 'technical_30', 'fundamental_8', 'fundamental_28', 'fundamental_42', 'fundamental_55' ]
model_dict['lasso'] = Model(linear_model.Lasso(normalize=False, alpha=0.000001), Lcolumns)

In [10]:
Tcolumns = ['technical_20', 'technical_30', 'technical_33', 'fundamental_53']
model_dict['tree'] = Model(xgb.XGBRegressor(), Tcolumns)

In [11]:
for model in model_dict.values():
    model.train(train)

ElasticNetCV trained successfully with score: 0.00048650
XGBRegressor trained successfully with score: 0.00215098
Ridge trained successfully with score: 0.00038739
Lasso trained successfully with score: 0.00018481


In [12]:
l1_features = pd.DataFrame()

In [13]:
for name, model in model_dict.iteritems():
    l1_features[name] = model.predict(train.drop('y', 1))

In [14]:
print l1_features.shape

(610795, 4)


In [15]:
dmat = xgb.DMatrix(l1_features, train.y)

In [16]:
print l1_features.columns

Index([u'elasticNet', u'tree', u'ridge', u'lasso'], dtype='object')


In [17]:
params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'reg:linear', 'max_depth':3, 'min_child_weight':1}
booster = xgb.train(params = params, dtrain = dmat)

In [18]:
l1_val = pd.DataFrame()

In [19]:
for name, model in model_dict.iteritems():
    l1_val[name] = model.predict(X_val)

In [25]:
valmat = xgb.DMatrix(l1_val)


In [26]:
pls = booster.predict(valmat)

In [27]:
# A custom function to compute the R score
def get_reward(y_true, y_fit):
    R2 = 1 - np.sum((y_true - y_fit)**2) / np.sum((y_true - np.mean(y_true))**2)
    R = np.sign(R2) * math.sqrt(abs(R2))
    return(R)

In [31]:
print get_reward(y_val, l1_val['elasticNet'])

0.0175315900548
