In [1]:
import kagglegym
import numpy as np
import pandas as pd
import random
from sklearn import ensemble, linear_model, metrics, feature_selection, neighbors
import math

In [2]:
# Configuration
# ========================================================================================

# Feature-engineering switches.
add_na_indicators = True
add_diff_features = True

# Columns that receive a '<col>_isna' missingness indicator
# (allegedly selected by tree-based algorithms).
na_indicator_cols = [
    'technical_9',
    'technical_0',
    'technical_32',
    'technical_16',
    'technical_38',
    'technical_44',
    'technical_20',
    'technical_30',
    'technical_13',
]

# Columns that receive a '<col>_d1' first-difference feature
# (also allegedly selected by tree-based algorithms).
diff_cols = [
    'technical_22',
    'technical_20',
    'technical_30',
    'technical_13',
    'technical_34',
]

# Column subsets for individual model families.
# NOTE(review): how these are consumed is not shown in the cells nearby - confirm.
univar_rlm_cols = [
    'technical_22',
    'technical_20',
    'technical_30_d1',
    'technical_20_d1',
    'technical_30',
    'technical_13',
    'technical_34',
]
useful_columns = [
    'technical_20',
    'technical_30',
    'fundamental_11',
    'technical_40',
    'technical_19',
    'technical_11',
    'technical_7',
    'fundamental_53',
    'fundamental_51',
    'technical_22_d1',
    'technical_20_d1',
    'technical_30_d1',
    'technical_13_d1',
]

# Model hyper-parameters.
nr_l2_best_models = 10
wrlm_quant = 0.99
wrlm_min_trainset_fraction = 0.9
wslm_n_covar_sets = 30
wslm_max_nr_covars_per_set = 3
wslm_max_abs_y = 0.086
l1_et_n_estimators = 100
l1_et_max_depth = 4
l3_et_n_estimators = 100
l3_et_max_depth = 4

# Seed shared by the random number generators and the tree ensembles.
rnd_seed = 17

In [3]:
# A custom function to compute the R score
def get_reward(y_true, y_fit):
    """Return the signed square root of the coefficient of determination.

    R2 = 1 - SS_res / SS_tot and R = sign(R2) * sqrt(|R2|).

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_fit : array-like
        Predicted values, compared with ``y_true`` element by element in
        positional order.

    Returns
    -------
    float
        R score; 1.0 for a perfect fit, negative when the fit is worse than
        predicting the mean of ``y_true``.

    Notes
    -----
    Both inputs are converted to float ndarrays first. Without that, two
    pandas Series carrying different indexes are aligned by label, turn
    into NaN and are silently skipped by the sum (yielding R == 1.0), plain
    lists cannot be subtracted, and integer arrays would use floor division
    on Python 2.

    A constant ``y_true`` has zero total variance, so the result is nan or
    +/-inf (NumPy emits a RuntimeWarning). NaNs in either input propagate to
    the result.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_fit = np.asarray(y_fit, dtype=float)
    ss_res = np.sum((y_true - y_fit) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    R2 = 1 - ss_res / ss_tot
    return np.sign(R2) * math.sqrt(abs(R2))


In [4]:
class stepwise_regressors:
    """Bank of small regressors, one per (timestamp interval, estimator) pair.

    For every interval in ``ts_intervals`` and every estimator prototype in
    ``estimators``, ``fit`` selects three features with recursive feature
    elimination and trains a fresh copy of the estimator on them.
    ``predict`` returns one prediction column per trained model.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Unfitted estimator prototypes; fit() trains copies and leaves
        # these untouched.
        self.estimators = [
            ensemble.ExtraTreesRegressor(),
            linear_model.LinearRegression(),
            linear_model.LinearRegression(normalize=True),
            linear_model.ElasticNet(alpha = 0.0001),
            linear_model.ElasticNet(normalize=True),
            linear_model.Lasso(),
            linear_model.Lasso(normalize=True),
            linear_model.Ridge(),
            linear_model.Ridge(normalize=True),
            linear_model.LassoLars(),
            linear_model.BayesianRidge(),
        ]
        # Fitted models, ordered interval-major then estimator-minor.
        self.models = []
        # selected_features[i] holds the columns models[i] was trained on.
        self.selected_features = []
        # Consecutive pairs are the bounds of one training interval.
        self.ts_intervals = [0, 300, 600, 1000]

    def fit(self, data):
        """Train one model per (interval, estimator) pair.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a 'timestamp' column and the target column 'y'.
            Every other column (only 'y' is removed) is a candidate feature
            for the selector.

        Notes
        -----
        * Intervals are half-open, ``lo <= timestamp < hi``, so rows lying
          exactly on an interior boundary are used exactly once instead of
          being dropped from every interval.
        * Each model is an independent deep copy of its prototype. Fitting
          the prototype itself would make every interval share one object
          that ends up trained on the last interval only, while
          ``selected_features`` still lists the earlier intervals' columns.
        * Results of a previous ``fit`` call are discarded.
        """
        import copy

        self.models = []
        self.selected_features = []

        timestamps = data['timestamp']
        bounds = zip(self.ts_intervals[:-1], self.ts_intervals[1:])
        for lo, hi in bounds:
            # The interval subset does not depend on the estimator, so it is
            # built once per interval.
            train = data[(timestamps >= lo) & (timestamps < hi)]
            sinput = train.drop('y', axis=1)
            target = train.y.values

            for estimator in self.estimators:
                # RFE fits its own internal clone of `estimator`.
                selector = feature_selection.RFE(estimator, step=2,
                                                 verbose=0,
                                                 n_features_to_select=3)
                selector = selector.fit(sinput, target)
                selected_columns = sinput.columns[selector.support_]

                model = copy.deepcopy(estimator)
                model.fit(train[selected_columns], target)

                self.selected_features.append(selected_columns)
                self.models.append(model)
                print("Fit estimator {} with features {} for ts {} to {}".format(
                    estimator, selected_columns, lo, hi))

    def predict(self, X):
        """Return the predictions of every fitted model as a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``selected_features``.

        Returns
        -------
        pandas.DataFrame
            One column per model, named 'stacked_<i>' in training order.
            Rows are in the same order as ``X`` but carry a fresh 0..n-1
            index rather than ``X``'s index.
        """
        stacked = pd.DataFrame()
        pairs = zip(self.models, self.selected_features)
        for index, (model, columns) in enumerate(pairs):
            stacked['stacked_' + str(index)] = model.predict(X[columns])
        return stacked

In [5]:
print('Initializing')
# Seed both generators: scikit-learn estimators created without an explicit
# random_state draw from NumPy's global RNG, which random.seed() alone does
# not touch.
random.seed(rnd_seed)
np.random.seed(rnd_seed)

# Create the competition environment and take its initial observation.
env = kagglegym.make()
obs = env.reset()

# Training frame exposed by the initial observation.
train = obs.train

Initializing


In [6]:
# Column medians, taken before any engineered column is added.
train_median = train.median(axis=0)

print('Adding missing value counts per row')
train['nr_missing'] = train.isnull().sum(axis = 1)

print('Adding missing value indicators')
if add_na_indicators:
    # Iterate over a snapshot: na_indicator_cols is pruned inside the loop,
    # and removing from a list while iterating over it skips elements.
    for col in list(na_indicator_cols):
        isna_col = col + '_isna'
        train[isna_col] = train[col].isnull().astype(int)
        if train[isna_col].nunique() == 1:
            # A constant indicator carries no information; drop it and keep
            # the configuration list in sync.
            print('Dropped constant missingness indicator: ' + isna_col)
            del train[isna_col]
            na_indicator_cols.remove(col)

print('Adding diff features')
if add_diff_features:
    train = train.sort_values(by = ['id', 'timestamp'])
    for col in diff_cols:
        # Lag-1 difference computed within each id, so the first row of an
        # id is never differenced against the last row of the preceding id.
        # Rows without a usable previous value (first row of an id, or a NaN
        # on either side) get 0.
        # FIXME: why only a lag of 1 sample?
        train[col + '_d1'] = train.groupby('id')[col].diff().fillna(0)
    # Drop the first timestamp, which has no diffs.
    # (FIXME can be confusing; why not leave missing? ids that first appear
    # at a later timestamp keep their 0-filled first row.)
    train = train[train.timestamp != 0]

# We're going to use all of these features for modeling
# base_features = [x for x in train.columns if x not in ['id',  'y']]
# labels = train['y']

Adding missing value counts per row
Adding missing value indicators
Adding diff features


In [7]:
# Impute the remaining NaNs column by column with the stored medians.
train = train.fillna(value=train_median)

In [None]:
# Every column except the row identifier and the target.
base_features = [name for name in train.columns if name not in ('id', 'y')]

# Train the per-interval regressors on the prepared frame.
sr = stepwise_regressors()
sr.fit(train)

In [None]:
# One prediction column per fitted model; display every 100000th row.
preds = sr.predict(train)
preds.iloc[::100000]

In [None]:
# Keep only the stacked predictions that score a positive R on the training
# target.
keep_columns = [cname for cname in preds.columns
                if get_reward(train.y.values, preds[cname].values) > 0.0]
# print() call instead of a print statement: same output on Python 2, and
# valid syntax on Python 3 like the other cells.
print(keep_columns)
preds = preds[keep_columns]
# Drop (near-)constant prediction columns.
train_df = preds.loc[:, preds.std() > 1e-5]

In [None]:
# Display every 100000th row of the retained stacked predictions.
train_df.iloc[::100000]

In [None]:
# Number of leading rows used to fit the level-1 model; the remaining rows
# are held out for scoring.
l1_train_rows = 400000

l1 = ensemble.ExtraTreesRegressor(n_estimators = l1_et_n_estimators,
    max_depth = l1_et_max_depth, n_jobs = -1, random_state = rnd_seed, verbose = 0)
l1.fit(train_df[:l1_train_rows], train.y.values[:l1_train_rows])

# get_reward takes (y_true, y_fit). The score is not symmetric in its
# arguments, so the observed target must come first.
get_reward(train.y.values[l1_train_rows:], l1.predict(train_df[l1_train_rows:]))

In [None]:
# Compare the first 20 held-out predictions with the observed targets.
# print() calls give the same output on Python 2 and are valid on Python 3.
print(l1.predict(train_df[400000:400020]))
print(train.y.values[400000:400020])

In [None]:
# Positional row bounds: [start:end] is used for fitting, [end:val] for scoring.
start, end, val = 400000, 500000, 600000

In [None]:
# `labels` was previously only assigned inside commented-out code, so this
# cell raised NameError on a fresh kernel; define it here from the target.
labels = train['y']

# Two-feature linear baseline fitted on the rows in [start:end) by position.
lr = linear_model.LinearRegression(normalize=True)
lr.fit(train[['technical_20', 'technical_30']].iloc[start:end], labels.iloc[start:end])

In [None]:
# Predict over the whole frame and display every 10000th value.
lr.predict(train.loc[:, ['technical_20', 'technical_30']])[::10000]

In [None]:
# Score the linear baseline on the rows in [end:val) by position. The target
# is read straight from `train` so the cell does not depend on a `labels`
# variable, and print() is used so the cell is valid on Python 2 and 3.
print(get_reward(train['y'].values[end:val],
                 lr.predict(train[['technical_20', 'technical_30']].iloc[end:val])))