In [1]:
from kagglegym import make
import kagglegym
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import linear_model, ensemble
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
# A custom function to compute the R score
def get_reward(y_true, y_fit):
    R2 = 1 - np.sum((y_true - y_fit)**2) / np.sum((y_true - np.mean(y_true))**2)
    R = np.sign(R2) * math.sqrt(abs(R2))
    return(R)

In [111]:
class Model():
    def __init__(self, model, columns):

        self.model   = model
        self.columns = columns
        
    def train_with_validation(self, train, validation):
        # Get the X, and y values, 
        X_train = train[self.columns]
        y_train = np.array(train.y)
        X_val = validation[self.columns]
        y_val = np.array(validation.y)
        
        self.model.fit(X_train, y_train)
        reward = get_reward(y_val, self.model.predict(X_val))
        score = self.model.score(X_val, y_val)
        
        print "{} validation: {} score: {}".format(
            str(self.model).split('(')[0], 
            "{0:.8f}".format(reward),
            "{0:.8f}".format(score))
        
    def train(self, train):
        X_train = train[self.columns]
        y_train = np.array(train.y)
        self.model.fit(X_train, y_train)
        score = self.model.score(X_train, y_train)
#         print "{} trained successfully with score: {}".format(
#             str(self.model).split('(')[0], 
#             "{0:.8f}".format(score))
    
    def predict(self, features):   
        return self.model.predict(features[self.columns])

In [162]:
class Stacked():
    def __init__(self):
        self.stacked_model = None
        self.cv_xgb = None

        self.model_dict = {}
        
        ENcolumns = ['technical_30', 'technical_20', 'technical_40',  ]
        self.model_dict['elasticNet'] = Model(linear_model.ElasticNetCV(normalize=True), ENcolumns)
        
        Rcolumns = [ 'fundamental_52', 'technical_30', 'technical_33', 'technical_20']
        self.model_dict['ridge'] = Model(linear_model.Ridge(normalize=True), Rcolumns)
        
        Lcolumns = ['fundamental_11', 'fundamental_12', 'technical_44' ]
        self.model_dict['lasso'] = Model(linear_model.Lasso(normalize=True), Lcolumns)
        
#         Tcolumns = ['fundamental_53', 'technical_3' ,'technical_30']
#         self.model_dict['tree'] = Model(xgb.XGBRegressor(), Tcolumns)
        
        Icolumns = ['fundamental_20', 'technical_13','technical_20' ,'technical_30']
        self.model_dict['linear'] = Model(linear_model.LinearRegression(normalize=True), Icolumns)
        
        #self.columns = ['elasticNet', 'ridge', 'lasso', 'tree']
        self.columns = ['elasticNet', 'ridge', 'linear', 'lasso']
        
    def train_with_validation(self, raw_train, train_split=700):
        # split train and validation sets
        X_train = raw_train[raw_train['timestamp'] < train_split]
        val = raw_train[raw_train['timestamp'] > train_split]
        X_val = val.drop('y', 1)
        y_val = np.array(val.y)
        
        self.train(X_train)
        val_preds = self.predict(X_val)

        print "Validation reward: {}".format(get_reward(y_val, val_preds))    
        
         
    def train(self, raw_train, train_split=500):
        train = raw_train[raw_train['timestamp'] < train_split]
        val = raw_train[raw_train['timestamp'] > train_split]
        train = raw_train
        for model in self.model_dict.values():
            model.train(train)
               
        # train layer 1
        l1_features = pd.DataFrame()
        
        for name, model in self.model_dict.iteritems():
            if name in self.columns:
                l1_features[name] = model.predict(train.drop('y', 1))   
        l1_features['y'] = train.y.values
        l1_features = l1_features.fillna(0)
        print l1_features.shape
        self.stacked_model = Model(linear_model.Ridge(alpha=.0001), self.columns)
        self.stacked_model.train(l1_features)
#         dmat = xgb.DMatrix(train.drop('y', 1), train.y)
#         params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
#                      'objective': 'reg:linear', 'max_depth':6, 'min_child_weight':1}
#         self.cv_xgb = xgb.cv(params = params, dtrain = dmat, num_boost_round = 3000, nfold = 5,
#                 metrics = ['error'], # Make sure you enter metrics inside a list or you may encounter issues!
#                 early_stopping_rounds = 100) # Look for early stopping that minimizes error\
        #final_gb = xgb.train(our_params, xgdmat, num_boost_round = 432)
        print "Stacked model successfully trained."
            
        
    def predict(self, features):
        assert self.stacked_model is not None, 'Model has not trained yet.'
        l1_features = pd.DataFrame()
        for name, model in self.model_dict.iteritems():
            if name in self.columns:
                l1_features[name] = model.predict(features)
        preds = self.stacked_model.predict(l1_features)
        return preds

In [163]:
# initialize kaggle gym environment
env = make()
o = env.reset()
raw_train = o.train.fillna(0)

In [164]:
averages_by_ts = raw_train.groupby('timestamp').aggregate(np.mean)
offset_averages = averages_by_ts.shift(1)
added_features = averages_by_ts.join(offset_averages, how='inner', rsuffix='_offset')
raw_train = raw_train.join(added_features, on='timestamp', how='left', rsuffix='_rolling')

In [165]:
print raw_train['technical_42_rolling'].shape

(806298,)


In [166]:
# stacked_val = Stacked()
# stacked_val.train_with_validation(raw_train)
# stacked= stacked_val
stacked = Stacked()
stacked.train(raw_train)

(806298, 5)
Stacked model successfully trained.


In [167]:
# dmat = xgb.DMatrix(l1_features, train.y)
# params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
#              'objective': 'reg:linear', 'max_depth':3, 'min_child_weight':1}
# booster = xgb.train(params = params, dtrain = dmat)

In [168]:
env = make()
o = env.reset()
print('Starting a new calculation for score')
rewards = []

print('Starting to fit a model')
while True:
    features = o.features.copy().fillna(0)
#     averages_by_ts = features[['technical_42', 'technical_12', 'timestamp']].groupby('timestamp').aggregate(np.mean)
#     features = features.join(averages_by_ts, on='timestamp', how='left', rsuffix = '_rolling')
    prediction  = stacked.predict(features)
    target      = o.target
    target['y'] = prediction

    timestamp = o.features["timestamp"][0]

    if timestamp % 100 == 0 :
        print timestamp
    
    o, reward, done, info = env.step(target)
    rewards.append(reward)
    if done: break

print info['public_score']

Starting a new calculation for score
Starting to fit a model
1000
1100
1200
1300
1400
1500
1600
1700
1800
0.0163421951821


In [160]:
raw_train = raw_train.fillna(0)

In [161]:
# columns_to_test = ['technical_20', 'technical_20_rolling', 'technical_20_offset', 
#                    'technical_30', 'technical_30_rolling', 'technical_30_offset',
#                    'technical_40', 'technical_40_rolling', 'technical_40_offset',
#                    'fundamental_11', 'fundamental_11_rolling', 'fundamental_11_offset']

columns_to_test = ['technical_22', 'technical_20', 'technical_30', 'technical_13', 
    'technical_34', 'fundamental_11', 'technical_40', 'technical_19', 'technical_11', 
                  'technical_7', 'fundamental_53', 'fundamental_51']
models = {}
for i in range(len(columns_to_test)):
    for j in range(i+1, len(columns_to_test)):
        
        m = Model(linear_model.LinearRegression(normalize=True), [columns_to_test[i], columns_to_test[j]])
        m.train(raw_train)
        models[columns_to_test[i] + '_' +  columns_to_test[j]]= m

In [1]:
print max(models, key=models.get)


NameError: name 'models' is not defined

In [52]:
raw_columns = ['technical_20', 'technical_30', 'technical_40', 'fundamental_11']

for model in models:
    counter = 0
    env = make()
    o = env.reset()
    print('Starting a new calculation for score')
    rewards = []
    
    print('Starting to fit a model')
    while True:
        features = o.features.copy().fillna(method='ffill')
        averages_by_ts = features[raw_columns +  ['timestamp']].groupby('timestamp').aggregate(np.mean)
        averages_by_ts = averages_by_ts.join(averages_by_ts.shift(1),  how='inner', rsuffix='_offset')
        features = features.join(averages_by_ts, on='timestamp', how='left', rsuffix = '_rolling')
        prediction  = model.predict(features)
        target      = o.target
        target['y'] = prediction

        timestamp = o.features["timestamp"][0]

        o, reward, done, info = env.step(target)
        rewards.append(reward)
        if done: break
    counter += 1
    print 'column:' + str(info['public_score'])

Starting a new calculation for score
Starting to fit a model
column:-0.0110238809088
Starting a new calculation for score
Starting to fit a model
column:-0.0750978618279
Starting a new calculation for score
Starting to fit a model
column:-0.0848172390404
Starting a new calculation for score
Starting to fit a model
column:-0.0172011256038
Starting a new calculation for score
Starting to fit a model
column:-0.0950112445907
Starting a new calculation for score
Starting to fit a model
column:-0.0874539856616
Starting a new calculation for score
Starting to fit a model
column:-0.00861636034261
Starting a new calculation for score
Starting to fit a model
column:-0.0353368838057
Starting a new calculation for score
Starting to fit a model
column:-0.0882262871807
Starting a new calculation for score
Starting to fit a model
column:-0.0106844739705
Starting a new calculation for score
Starting to fit a model
column:-0.0605628374869
Starting a new calculation for score
Starting to fit a model
col