In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

import gc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata

['test.csv', 'train.csv', 'sample_submission.csv']


In [11]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

features = [x for x in train_df.columns if x.startswith("var")]

In [12]:
for var in features:
    if np.corrcoef( train_df['target'], train_df[var] )[1][0] < 0:
        train_df[var] = train_df[var] * -1
        test_df[var]  = test_df[var]  * -1

In [13]:
#count all values
var_stats = {}
hist_df = pd.DataFrame()
for var in features:
    var_stats = train_df[var].append(test_df[var]).value_counts()
    hist_df[var] = pd.Series(test_df[var]).map(var_stats)
    hist_df[var] = hist_df[var] > 1
#remove fake test rows
ind = hist_df.sum(axis=1) != 200

In [14]:
#recount values without fake rows
var_stats = {}
for var in features:
    var_stats[var] = train_df[var].append(test_df[ind][var]).value_counts()

In [15]:
def logit(p):
    return np.log(p) - np.log(1 - p)

def var_to_feat(vr, var_stats, feat_id ):
    new_df = pd.DataFrame()
    new_df["var"] = vr.values
    new_df["hist"] = pd.Series(vr).map(var_stats)
    new_df["feature_id"] = feat_id
    new_df["var_rank"] = new_df["var"].rank()/200000.
    return new_df.values

In [16]:
TARGET = np.array( list(train_df['target'].values) * 200 )

TRAIN = []
var_mean = {}
var_var  = {}
for var in features:
    tmp = var_to_feat(train_df[var], var_stats[var], int(var[4:]) )
    var_mean[var] = np.mean(tmp[:,0]) 
    var_var[var]  = np.var(tmp[:,0])
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    TRAIN.append( tmp )
TRAIN = np.vstack( TRAIN )

del train_df
_=gc.collect()

print( TRAIN.shape, len( TARGET ) )

(40000000, 4) 40000000


In [17]:
import time
model = lgb.LGBMClassifier(**{
     'learning_rate': 0.04,
     'num_leaves': 31,
     'max_bin': 1023,
     'min_child_samples': 1000,
     'reg_alpha': 0.1,
     'reg_lambda': 0.2,
     'feature_fraction': 1.0,
     'bagging_freq': 1,
     'bagging_fraction': 0.85,
     'objective': 'binary',
     'n_jobs': -1,
     'n_estimators':200,})

MODELS = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=11111)

for fold_, (train_indexes, valid_indexes) in enumerate(skf.split(TRAIN, TARGET)):
    print('Fold:', fold_ )
    model = model.fit( TRAIN[train_indexes], TARGET[train_indexes],
                      eval_set = (TRAIN[valid_indexes], TARGET[valid_indexes]),
                      verbose = 10,
                      eval_metric='auc',
                      early_stopping_rounds=25,
                      categorical_feature = [2] )
    
    
    MODELS.append( model )

del TRAIN, TARGET
_=gc.collect()

Fold: 0


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528007	valid_0's binary_logloss: 0.325586
[20]	valid_0's auc: 0.528201	valid_0's binary_logloss: 0.325356
[30]	valid_0's auc: 0.528269	valid_0's binary_logloss: 0.325253
[40]	valid_0's auc: 0.528356	valid_0's binary_logloss: 0.325203
[50]	valid_0's auc: 0.528396	valid_0's binary_logloss: 0.325177
[60]	valid_0's auc: 0.528405	valid_0's binary_logloss: 0.325163
[70]	valid_0's auc: 0.528382	valid_0's binary_logloss: 0.325156
[80]	valid_0's auc: 0.528378	valid_0's binary_logloss: 0.325151
Early stopping, best iteration is:
[55]	valid_0's auc: 0.528421	valid_0's binary_logloss: 0.325169
Fold: 1


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.526874	valid_0's binary_logloss: 0.325591
[20]	valid_0's auc: 0.527129	valid_0's binary_logloss: 0.325363
[30]	valid_0's auc: 0.527225	valid_0's binary_logloss: 0.325261
[40]	valid_0's auc: 0.527317	valid_0's binary_logloss: 0.325213
[50]	valid_0's auc: 0.527387	valid_0's binary_logloss: 0.325188
[60]	valid_0's auc: 0.527431	valid_0's binary_logloss: 0.325174
[70]	valid_0's auc: 0.527434	valid_0's binary_logloss: 0.325167
[80]	valid_0's auc: 0.527446	valid_0's binary_logloss: 0.325162
[90]	valid_0's auc: 0.527446	valid_0's binary_logloss: 0.325159
[100]	valid_0's auc: 0.527424	valid_0's binary_logloss: 0.325158
Early stopping, best iteration is:
[84]	valid_0's auc: 0.527457	valid_0's binary_logloss: 0.32516
Fold: 2


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528154	valid_0's binary_logloss: 0.325583
[20]	valid_0's auc: 0.528495	valid_0's binary_logloss: 0.325349
[30]	valid_0's auc: 0.52855	valid_0's binary_logloss: 0.325244
[40]	valid_0's auc: 0.528604	valid_0's binary_logloss: 0.325194
[50]	valid_0's auc: 0.528586	valid_0's binary_logloss: 0.325168
[60]	valid_0's auc: 0.528563	valid_0's binary_logloss: 0.325155
Early stopping, best iteration is:
[41]	valid_0's auc: 0.528611	valid_0's binary_logloss: 0.32519
Fold: 3


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.527289	valid_0's binary_logloss: 0.325586
[20]	valid_0's auc: 0.52764	valid_0's binary_logloss: 0.325357
[30]	valid_0's auc: 0.527809	valid_0's binary_logloss: 0.325255
[40]	valid_0's auc: 0.527853	valid_0's binary_logloss: 0.325207
[50]	valid_0's auc: 0.527873	valid_0's binary_logloss: 0.325183
[60]	valid_0's auc: 0.527876	valid_0's binary_logloss: 0.32517
[70]	valid_0's auc: 0.527907	valid_0's binary_logloss: 0.325162
[80]	valid_0's auc: 0.527897	valid_0's binary_logloss: 0.325158
[90]	valid_0's auc: 0.527918	valid_0's binary_logloss: 0.325156
[100]	valid_0's auc: 0.527923	valid_0's binary_logloss: 0.325154
[110]	valid_0's auc: 0.527921	valid_0's binary_logloss: 0.325154
[120]	valid_0's auc: 0.52791	valid_0's binary_logloss: 0.325153
[130]	valid_0's auc: 0.527893	valid_0's binary_logloss: 0.325153
Early stopping, best iteration is:
[106]	valid_0's auc: 0.527927	valid_0's binary_logloss: 0.325154
Fold:

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528635	valid_0's binary_logloss: 0.325579
[20]	valid_0's auc: 0.528838	valid_0's binary_logloss: 0.325341
[30]	valid_0's auc: 0.52902	valid_0's binary_logloss: 0.325233
[40]	valid_0's auc: 0.529106	valid_0's binary_logloss: 0.325181
[50]	valid_0's auc: 0.529139	valid_0's binary_logloss: 0.325154
[60]	valid_0's auc: 0.529153	valid_0's binary_logloss: 0.325139
[70]	valid_0's auc: 0.52915	valid_0's binary_logloss: 0.32513
[80]	valid_0's auc: 0.529171	valid_0's binary_logloss: 0.325125
[90]	valid_0's auc: 0.529169	valid_0's binary_logloss: 0.325122
[100]	valid_0's auc: 0.529176	valid_0's binary_logloss: 0.32512
[110]	valid_0's auc: 0.529166	valid_0's binary_logloss: 0.325119
[120]	valid_0's auc: 0.52918	valid_0's binary_logloss: 0.325118
[130]	valid_0's auc: 0.529186	valid_0's binary_logloss: 0.325118
[140]	valid_0's auc: 0.529202	valid_0's binary_logloss: 0.325117
[150]	valid_0's auc: 0.529204	valid_0's bi

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528034	valid_0's binary_logloss: 0.325597
[20]	valid_0's auc: 0.528367	valid_0's binary_logloss: 0.325369
[30]	valid_0's auc: 0.528518	valid_0's binary_logloss: 0.325268
[40]	valid_0's auc: 0.528574	valid_0's binary_logloss: 0.32522
[50]	valid_0's auc: 0.528628	valid_0's binary_logloss: 0.325195
[60]	valid_0's auc: 0.528692	valid_0's binary_logloss: 0.325181
[70]	valid_0's auc: 0.528702	valid_0's binary_logloss: 0.325174
[80]	valid_0's auc: 0.528713	valid_0's binary_logloss: 0.32517
[90]	valid_0's auc: 0.52872	valid_0's binary_logloss: 0.325167
[100]	valid_0's auc: 0.528755	valid_0's binary_logloss: 0.325165
[110]	valid_0's auc: 0.528742	valid_0's binary_logloss: 0.325165
[120]	valid_0's auc: 0.528752	valid_0's binary_logloss: 0.325164
Early stopping, best iteration is:
[100]	valid_0's auc: 0.528755	valid_0's binary_logloss: 0.325165
Fold: 6


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528573	valid_0's binary_logloss: 0.325576
[20]	valid_0's auc: 0.528827	valid_0's binary_logloss: 0.325339
[30]	valid_0's auc: 0.528901	valid_0's binary_logloss: 0.325232
[40]	valid_0's auc: 0.528978	valid_0's binary_logloss: 0.32518
[50]	valid_0's auc: 0.52902	valid_0's binary_logloss: 0.325154
[60]	valid_0's auc: 0.529047	valid_0's binary_logloss: 0.325139
[70]	valid_0's auc: 0.529033	valid_0's binary_logloss: 0.32513
[80]	valid_0's auc: 0.529023	valid_0's binary_logloss: 0.325125
Early stopping, best iteration is:
[59]	valid_0's auc: 0.529053	valid_0's binary_logloss: 0.32514
Fold: 7


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528384	valid_0's binary_logloss: 0.325589
[20]	valid_0's auc: 0.528671	valid_0's binary_logloss: 0.325359
[30]	valid_0's auc: 0.528747	valid_0's binary_logloss: 0.325257
[40]	valid_0's auc: 0.528807	valid_0's binary_logloss: 0.325207
[50]	valid_0's auc: 0.528839	valid_0's binary_logloss: 0.325183
[60]	valid_0's auc: 0.528865	valid_0's binary_logloss: 0.325169
[70]	valid_0's auc: 0.528869	valid_0's binary_logloss: 0.325162
[80]	valid_0's auc: 0.528885	valid_0's binary_logloss: 0.325157
[90]	valid_0's auc: 0.528879	valid_0's binary_logloss: 0.325155
[100]	valid_0's auc: 0.52889	valid_0's binary_logloss: 0.325152
[110]	valid_0's auc: 0.528882	valid_0's binary_logloss: 0.325152
[120]	valid_0's auc: 0.528886	valid_0's binary_logloss: 0.32515
Early stopping, best iteration is:
[103]	valid_0's auc: 0.528893	valid_0's binary_logloss: 0.325152
Fold: 8


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.527704	valid_0's binary_logloss: 0.325588
[20]	valid_0's auc: 0.528011	valid_0's binary_logloss: 0.325357
[30]	valid_0's auc: 0.528141	valid_0's binary_logloss: 0.325253
[40]	valid_0's auc: 0.528205	valid_0's binary_logloss: 0.325204
[50]	valid_0's auc: 0.528267	valid_0's binary_logloss: 0.325179
[60]	valid_0's auc: 0.528317	valid_0's binary_logloss: 0.325165
[70]	valid_0's auc: 0.528325	valid_0's binary_logloss: 0.325158
[80]	valid_0's auc: 0.528322	valid_0's binary_logloss: 0.325153
[90]	valid_0's auc: 0.528323	valid_0's binary_logloss: 0.325151
Early stopping, best iteration is:
[68]	valid_0's auc: 0.528335	valid_0's binary_logloss: 0.325159
Fold: 9


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528262	valid_0's binary_logloss: 0.325586
[20]	valid_0's auc: 0.528341	valid_0's binary_logloss: 0.325356
[30]	valid_0's auc: 0.528393	valid_0's binary_logloss: 0.325253
[40]	valid_0's auc: 0.528546	valid_0's binary_logloss: 0.325204
[50]	valid_0's auc: 0.528629	valid_0's binary_logloss: 0.325177
[60]	valid_0's auc: 0.528676	valid_0's binary_logloss: 0.325164
[70]	valid_0's auc: 0.528662	valid_0's binary_logloss: 0.325157
[80]	valid_0's auc: 0.528719	valid_0's binary_logloss: 0.325151
[90]	valid_0's auc: 0.528723	valid_0's binary_logloss: 0.325149
[100]	valid_0's auc: 0.52873	valid_0's binary_logloss: 0.325147
[110]	valid_0's auc: 0.528738	valid_0's binary_logloss: 0.325145
[120]	valid_0's auc: 0.52873	valid_0's binary_logloss: 0.325145
[130]	valid_0's auc: 0.528735	valid_0's binary_logloss: 0.325144
Early stopping, best iteration is:
[107]	valid_0's auc: 0.528746	valid_0's binary_logloss: 0.325146


In [20]:
ypred = np.zeros( (200000,200) )
for feat,var in enumerate(features):
    tmp = var_to_feat(test_df[var], var_stats[var], int(var[4:]) )
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    for model_id in range(10):
        model = MODELS[model_id]
        ypred[:,feat] += model.predict_proba( tmp )[:,1] / 10.
ypred = np.mean( logit(ypred), axis=1 )

sub = test_df[['ID_code']]
sub['target'] = ypred
sub['target'] = sub['target'].rank() / 200000.
sub.to_csv('sample_submission1.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
