In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss

### train data

In [2]:
# Per-model prediction files for the training set. Each file's 'score'
# column is renamed to a model-specific column so the frames can be merged
# side by side afterwards.
train_sources = [
    ('../data/model/baseline_v1.csv', 'model_1'),
    ('../data/model/baseline_v2.csv', 'model_2'),
    ('../data/model/vgg11_v1.csv', 'model_3'),
    ('../data/model/vgg11_v2.csv', 'model_4'),
    ('../data/model/vgg16_v1.csv', 'model_5'),
    ('../data/model/vgg16_v2.csv', 'model_6'),
]
model_1, model_2, model_3, model_4, model_5, model_6 = [
    pd.read_csv(csv_path).rename(columns={'score': column_name})
    for csv_path, column_name in train_sources
]

In [3]:
# Start from the first model's frame and join the remaining ones on the
# shared ('id', 'label') key, giving one score column per model.
train = model_1.copy()
for other_model in (model_2, model_3, model_4, model_5, model_6):
    train = train.merge(other_model, on=['id','label'])
print('train:', train.shape)

# Columns after the first two are the per-model scores; report each model's
# log loss against the label.
for model_col in train.columns[2:]:
    print(model_col, ':', log_loss(train['label'], train[model_col]))
train.head()

train: (1604, 8)
model_1 : 0.194535667787
model_2 : 0.185711479653
model_3 : 0.199571412049
model_4 : 0.191928795004
model_5 : 0.198212263034
model_6 : 0.194420133159


Unnamed: 0,id,label,model_1,model_2,model_3,model_4,model_5,model_6
0,dfd5f913,0,0.00526,0.011012,0.005000035,0.017239,0.00141,0.022507
1,525ab75c,0,0.017436,0.017317,0.4303748,0.133345,0.0001,0.0001
2,161a6860,0,0.0001,0.0001,9.335847e-08,0.0001,0.000178,0.000151
3,86730f0d,1,0.726244,0.722004,0.8014979,0.860262,0.857558,0.878166
4,a210f335,0,0.012685,0.002879,0.02527743,0.005114,0.025568,0.04032


### test data

In [4]:
# Per-model submission files for the test set. Each file's 'is_iceberg'
# column is renamed to a model-specific column. Note that this rebinds
# model_1 .. model_6, which previously held the training-set frames.
submission_sources = [
    ('../data/submit/baseline_v1.csv', 'model_1'),
    ('../data/submit/baseline_v2.csv', 'model_2'),
    ('../data/submit/vgg11_v1.csv', 'model_3'),
    ('../data/submit/vgg11_v2.csv', 'model_4'),
    ('../data/submit/vgg16_v1.csv', 'model_5'),
    ('../data/submit/vgg16_v2.csv', 'model_6'),
]
model_1, model_2, model_3, model_4, model_5, model_6 = [
    pd.read_csv(csv_path).rename(columns={'is_iceberg': column_name})
    for csv_path, column_name in submission_sources
]

In [5]:
# Start from the first model's frame and join the remaining ones on 'id',
# giving one score column per model.
test = model_1.copy()
for other_model in (model_2, model_3, model_4, model_5, model_6):
    test = test.merge(other_model, on=['id'])
print('test:', test.shape)
test.head()

test: (8424, 7)


Unnamed: 0,id,model_1,model_2,model_3,model_4,model_5,model_6
0,5941774d,0.026424,0.018251,0.073787,0.043357,0.169123,0.072555
1,4023181e,0.561251,0.988309,0.308845,0.894838,0.224208,0.497121
2,b20200e4,0.001384,0.235232,1.7e-05,0.592631,0.114,0.005318
3,e7f018bb,0.994056,0.998084,0.997888,0.998768,0.999978,1.0
4,4371c8c3,0.039178,0.683242,0.005809,0.052006,0.053348,0.048794


### stacking

In [6]:
def stack_func(value, low, high):
    """Combine one sample's model scores into a single stacked score.

    If every score is strictly below ``low`` the smallest score is returned;
    if every score is strictly above ``high`` the largest score is returned;
    otherwise the scores are averaged.

    Parameters
    ----------
    value : array-like of float
        Scores for one sample. May be a pandas Series, a NumPy array, or a
        plain list/tuple of numbers.
    low : float
        Lower threshold for the "all models agree on a low score" case.
    high : float
        Upper threshold for the "all models agree on a high score" case.

    Returns
    -------
    float
        The minimum, maximum, or mean of ``value`` as described above.

    Raises
    ------
    ValueError
        If ``value`` contains no scores.
    """
    # Convert up front so plain lists/tuples support the element-wise
    # comparisons below (a bare list raises TypeError on `value < low`).
    arr = np.asarray(value, dtype=float)
    if arr.size == 0:
        raise ValueError('stack_func requires at least one score')

    if np.all(arr < low):
        return arr.min()
    if np.all(arr > high):
        return arr.max()
    # Average the caller's original object rather than the converted array,
    # so Series/ndarray inputs get exactly the mean they got before.
    return np.mean(value)

In [7]:
# Pull the per-model score columns out of the training frame and show
# their pairwise correlation matrix.
model_columns = ['model_1','model_2','model_3','model_4','model_5','model_6']
scores = train.loc[:, model_columns].copy()
scores.corr()

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,model_6
model_1,1.0,0.967201,0.961674,0.952086,0.943551,0.936648
model_2,0.967201,1.0,0.956585,0.963948,0.940643,0.945073
model_3,0.961674,0.956585,1.0,0.967572,0.948122,0.938075
model_4,0.952086,0.963948,0.967572,1.0,0.946191,0.94987
model_5,0.943551,0.940643,0.948122,0.946191,1.0,0.948658
model_6,0.936648,0.945073,0.938075,0.94987,0.948658,1.0


In [8]:
# Grid-search the two stack_func thresholds on the training scores.
# Thresholds are listed in percent and divided by 100 when applied.
# Every (low, high) pair whose log loss is below `benchmark` is printed.
low_values = list(range(5, 41, 5))
high_values = list(range(60,96,5))
benchmark = 0.17


for low_ in low_values:
    for high_ in high_values:
        predict = scores.apply(lambda x : stack_func(x, low_/100,high_/100), axis=1).clip(0.001, 0.999)
        # Evaluate the loss once and reuse it for both the comparison and
        # the report instead of calling log_loss a second time.
        loss = log_loss(train['label'], predict)
        if benchmark > loss:
            print('low, high:', low_, high_)
            print('loss:', loss)

low, high: 10 95
loss: 0.16971692854
low, high: 15 80
loss: 0.169797479442
low, high: 15 95
loss: 0.168777285066
low, high: 25 95
loss: 0.169907763856
low, high: 30 95
loss: 0.169666824118


In [9]:
# Stack the training scores with thresholds 15/100 and 95/100, clip the
# result to [0.001, 0.999], store it as the 'stack' column and report its
# log loss.
stacked_train = scores.apply(lambda row: stack_func(row, 15/100, 95/100), axis=1)
train['stack'] = stacked_train.clip(0.001, 0.999)
print('loss:', log_loss(train['label'], train['stack']))
train.head()

loss: 0.168777285066


Unnamed: 0,id,label,model_1,model_2,model_3,model_4,model_5,model_6,stack
0,dfd5f913,0,0.00526,0.011012,0.005000035,0.017239,0.00141,0.022507,0.00141
1,525ab75c,0,0.017436,0.017317,0.4303748,0.133345,0.0001,0.0001,0.099779
2,161a6860,0,0.0001,0.0001,9.335847e-08,0.0001,0.000178,0.000151,0.001
3,86730f0d,1,0.726244,0.722004,0.8014979,0.860262,0.857558,0.878166,0.807622
4,a210f335,0,0.012685,0.002879,0.02527743,0.005114,0.025568,0.04032,0.002879


In [10]:
# Write the training frame (per-model scores plus the stacked score) to
# disk without the index column.
train_scores_path = '../data/train_scores.csv'
train.to_csv(train_scores_path, index=False)

In [11]:
# Pull the per-model score columns out of the test frame (rebinding
# `scores`) and show their pairwise correlation matrix.
model_columns = ['model_1','model_2','model_3','model_4','model_5','model_6']
scores = test.loc[:, model_columns].copy()
scores.corr()

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,model_6
model_1,1.0,0.716529,0.908832,0.562962,0.834328,0.772714
model_2,0.716529,1.0,0.54427,0.856853,0.65938,0.678721
model_3,0.908832,0.54427,1.0,0.429716,0.783679,0.666838
model_4,0.562962,0.856853,0.429716,1.0,0.626266,0.683814
model_5,0.834328,0.65938,0.783679,0.626266,1.0,0.905164
model_6,0.772714,0.678721,0.666838,0.683814,0.905164,1.0


In [12]:
# Stack the test scores with thresholds 15/100 and 95/100, clip the result
# to [0.001, 0.999] and store it as the 'stack' column.
stacked_test = scores.apply(lambda row: stack_func(row, 15/100, 95/100), axis=1)
test['stack'] = stacked_test.clip(0.001, 0.999)
test.head()

Unnamed: 0,id,model_1,model_2,model_3,model_4,model_5,model_6,stack
0,5941774d,0.026424,0.018251,0.073787,0.043357,0.169123,0.072555,0.06725
1,4023181e,0.561251,0.988309,0.308845,0.894838,0.224208,0.497121,0.579095
2,b20200e4,0.001384,0.235232,1.7e-05,0.592631,0.114,0.005318,0.158097
3,e7f018bb,0.994056,0.998084,0.997888,0.998768,0.999978,1.0,0.999
4,4371c8c3,0.039178,0.683242,0.005809,0.052006,0.053348,0.048794,0.147063


In [13]:
# Write the test frame (per-model scores plus the stacked score) to disk
# without the index column.
test_scores_path = '../data/test_scores.csv'
test.to_csv(test_scores_path, index=False)

In [None]:
# Build the submission frame: keep 'id' and the stacked score, expose the
# score under the 'is_iceberg' column name, and write it out without the
# index column.
submit = test[['id','stack']].rename(columns={'stack':'is_iceberg'})
submit.to_csv('../data/submit/stacked.csv', index=False)