In [1]:
from kagglegym import make
import numpy as np
import pandas as pd
import random
import xgboost as xgb
from sklearn import ensemble, linear_model, metrics
import time

In [2]:
rnd = 0

In [3]:
env = make()
o = env.reset()
train = o.train
print(train.shape)
d_mean= train.median(axis=0)
train["nbnulls"]=train.isnull().sum(axis=1)
col=[x for x in train.columns if x not in ['id', 'timestamp', 'y']]

(806298, 111)


In [4]:
#keeping na information on some columns (best selected by the tree algorithms)
add_nas_ft=True
nas_cols=['technical_9', 'technical_0', 'technical_32', 'technical_16', 'technical_38', 
'technical_44', 'technical_20', 'technical_30', 'technical_13']
#columns kept for evolution from one month to another (best selected by the tree algorithms)
add_diff_ft=True
diff_cols=['technical_22','technical_20', 'technical_30', 'technical_13', 'technical_34']

In [5]:

#homemade class used to infer randomly on the way the model learns
class createLinearFeatures:
    
    def __init__(self, n_neighbours=1, max_elts=None, verbose=True, random_state=None):
        self.rnd=random_state
        self.n=n_neighbours
        self.max_elts=max_elts
        self.verbose=verbose
        self.neighbours=[]
        self.clfs=[]
        
    def fit(self,train,y):
        if self.rnd!=None:
            random.seed(self.rnd)
        if self.max_elts==None:
            self.max_elts=len(train.columns)
        list_vars=list(train.columns)
        random.shuffle(list_vars)
        
        lastscores=np.zeros(self.n)+1e15

        for elt in list_vars[:self.n]:
            self.neighbours.append([elt])
        list_vars=list_vars[self.n:]
        
        for elt in list_vars:
            indice=0
            scores=[]
            for elt2 in self.neighbours:
                if len(elt2)<self.max_elts:
                    clf=linear_model.LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
                    clf.fit(train[elt2+[elt]], y)
                    scores.append(metrics.mean_squared_error(y,clf.predict(train[elt2 + [elt]])))
                    indice=indice+1
                else:
                    scores.append(lastscores[indice])
                    indice=indice+1
            gains=lastscores-scores
            if gains.max()>0:
                temp=gains.argmax()
                lastscores[temp]=scores[temp]
                self.neighbours[temp].append(elt)

        indice=0
        for elt in self.neighbours:
            clf=linear_model.LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
            clf.fit(train[elt], y)
            self.clfs.append(clf)
            if self.verbose:
                print(indice, lastscores[indice], elt)
            indice=indice+1
                    
    def transform(self, train):
        indice=0
        for elt in self.neighbours:
            #this line generates a warning. Could be avoided by working and returning
            #with a copy of train.
            #kept this way for memory management
            train['neighbour'+str(indice)]=self.clfs[indice].predict(train[elt])
            indice=indice+1
        return train
    
    def fit_transform(self, train, y):
        self.fit(train, y)
        return self.transform(train)

In [6]:

#a home-made class attempt to remove outliers by successive quantization on residuals
class recurrent_linear_approx():
    def __init__(self, quant=.999, limit_size_train=.9):
        self.quant=quant
        self.limit_size_train=limit_size_train
        self.bestmodel=[]
       
    def fit(self, train, y):
        internal_model=linear_model.Ridge(fit_intercept=False)
        bestscore=1e15
        better=True
        indextrain=train.dropna().index
        limitlen=len(train)*self.limit_size_train
        while better:
            internal_model.fit(train.ix[indextrain], y.ix[indextrain])
            score=metrics.mean_squared_error(internal_model.predict(train.ix[indextrain]), y.ix[indextrain])
            if score < bestscore:
                bestscore=score
                self.bestmodel=internal_model
                residual=y.ix[indextrain]-internal_model.predict(train.ix[indextrain])
                indextrain=residual[abs(residual)<=abs(residual).quantile(self.quant)].index
                if len(indextrain)<limitlen:
                    better=False
            else:
                better=False
                self.bestmodel=internal_model

    def predict(self, test):
        return self.bestmodel.predict(test)

In [7]:

if add_nas_ft:
    for elt in nas_cols:
        train[elt + '_na'] = pd.isnull(train[elt]).apply(lambda x: 1 if x else 0)
        #no need to keep columns with no information
        if len(train[elt + '_na'].unique())==1:
            print("removed:", elt, '_na')
            del train[elt + '_na']
            nas_cols.remove(elt)

In [8]:
if add_diff_ft:
    train=train.sort_values(by=['id','timestamp'])
    for elt in diff_cols:
        #a quick way to obtain deltas from one month to another but it is false on the first
        #month of each id
        train[elt+"_d"]= train[elt].rolling(2).apply(lambda x:x[1]-x[0]).fillna(0)
    #removing month 0 to reduce the impact of erroneous deltas
    train=train[train.timestamp!=0]

print(train.shape)
cols=[x for x in train.columns if x not in ['id', 'timestamp', 'y']]

(805548, 126)


In [9]:

#generation of linear models
cols2fit=['technical_22','technical_20', 'technical_30_d', 'technical_20_d', 'technical_30', 
'technical_13', 'technical_34']
models=[]
columns=[]
residuals=[]
for elt in cols2fit:
    print("fitting linear model on ", elt)
    model=recurrent_linear_approx(quant=.99, limit_size_train=.9)
    model.fit(train.loc[:,[elt]],train.loc[:, 'y'])
    models.append(model)
    columns.append([elt])
    residuals.append(abs(model.predict(train[[elt]].fillna(d_mean))-train.y))

('fitting linear model on ', 'technical_22')
('fitting linear model on ', 'technical_20')
('fitting linear model on ', 'technical_30_d')
('fitting linear model on ', 'technical_20_d')
('fitting linear model on ', 'technical_30')
('fitting linear model on ', 'technical_13')
('fitting linear model on ', 'technical_34')


In [10]:
train=train.fillna(d_mean)
    
#adding all trees generated by a tree regressor
print("adding new features")
featureexpander=createLinearFeatures(n_neighbours=30, max_elts=2, verbose=True, random_state=rnd)
index2use=train[abs(train.y)<0.086].index
featureexpander.fit(train.ix[index2use,cols],train.ix[index2use,'y'])
trainer=featureexpander.transform(train[cols])
treecols=trainer.columns

adding new features
(0, 0.00037784251617267728, [u'fundamental_1', u'fundamental_9'])
(1, 0.00037784251617267728, [u'technical_31', u'fundamental_14'])
(2, 0.00037783913025333236, [u'fundamental_25', 'technical_22_d'])
(3, 0.00037783940206281841, [u'fundamental_22', u'fundamental_16'])
(4, 0.00037779127560153377, [u'technical_38', 'technical_30_d'])
(5, 0.00037782202707603574, [u'technical_9', u'fundamental_55'])
(6, 0.0003778045647777617, ['technical_38_na', u'fundamental_26'])
(7, 0.00037783448351547122, ['technical_13_na', u'fundamental_48'])
(8, 0.00037784266169182956, [u'technical_24', u'technical_44'])
(9, 0.0003778428363148123, [u'fundamental_20', u'fundamental_49'])
(10, 0.00037783599691465497, [u'technical_17', u'technical_1'])
(11, 0.00037784263258799911, [u'fundamental_3', u'technical_33'])
(12, 0.00037784242886118591, [u'fundamental_12', u'fundamental_34'])
(13, 0.00037778879050165415, [u'technical_22', u'fundamental_33'])
(14, 0.00037783512379974127, [u'fundamental_8', u'f

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
param = {
#          'bst:max_depth':2, 
#          'bst:eta':1, 
         'silent':0, # 0 => verbose 
         'objective':'reg:linear',
         'nthread':8,
         #'eval_metric':'rmse',
        }

print("training trees")
dmatrix = xgb.DMatrix(trainer, label=train.y)
#model = ensemble.ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=rnd, verbose=0)
#model.fit(trainer,train.y)
model = xgb.train(param, dmatrix, num_boost_round=10)
#print(pd.DataFrame(model.feature_importances_,index=treecols).sort_values(by=[0]).tail(30))


training trees


In [12]:
print train.columns

Index([             u'id',       u'timestamp',       u'derived_0',
             u'derived_1',       u'derived_2',       u'derived_3',
             u'derived_4',   u'fundamental_0',   u'fundamental_1',
         u'fundamental_2',
       ...
       u'technical_38_na', u'technical_44_na', u'technical_20_na',
       u'technical_30_na', u'technical_13_na',  u'technical_22_d',
        u'technical_20_d',  u'technical_30_d',  u'technical_13_d',
        u'technical_34_d'],
      dtype='object', length=126)


In [13]:
#model selection : create a new target selecting models with lowest asolute residual for each line
#the objective at this step is to keep only the few best elements which should
#lead to a better generalization
num_to_keep=10
targetselector=np.array(residuals).T
targetselector=np.argmin(targetselector, axis=1)
print("selecting best models:")
print(pd.Series(targetselector).value_counts().head(num_to_keep))

tokeep=pd.Series(targetselector).value_counts().head(num_to_keep).index
tokeepmodels=[]
tokeepcolumns=[]
tokeepresiduals=[]
for elt in tokeep:
    tokeepmodels.append(models[elt])
    tokeepcolumns.append(columns[elt])
    tokeepresiduals.append(residuals[elt])

selecting best models:
0    288446
1    151625
2    108203
6     86043
3     77312
4     52441
5     41478
dtype: int64


In [14]:

#creating a new target for a model in charge of predicting which model is best for the current line
targetselector=np.array(tokeepresiduals).T
targetselector=np.argmin(targetselector, axis=1)

In [15]:
print("training selection model")
modelselector = ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=4, n_jobs=-1, random_state=rnd, verbose=0)
modelselector.fit(trainer, targetselector)
print(pd.DataFrame(modelselector.feature_importances_,index=treecols).sort_values(by=[0]).tail(30))

lastvalues=train[train.timestamp==905][['id']+diff_cols].copy()

training selection model
                       0
technical_14    0.005686
technical_12    0.005951
technical_27    0.006126
technical_29    0.006325
technical_39    0.007406
technical_6     0.007577
fundamental_21  0.009345
fundamental_0   0.009535
neighbour18     0.010082
technical_32    0.010626
neighbour10     0.010674
fundamental_59  0.011042
technical_37    0.011807
neighbour4      0.012927
nbnulls         0.013760
technical_38    0.014425
technical_17    0.018520
technical_11    0.019415
technical_2     0.020817
technical_13_d  0.022886
technical_20_d  0.023922
technical_7     0.026224
technical_30_d  0.026827
technical_40    0.034245
technical_30    0.041481
technical_13    0.046331
technical_20    0.046564
technical_34    0.121276
neighbour13     0.126897
technical_22    0.214560
