In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
pd.options.mode.chained_assignment = None

### datasets

In [3]:
# Load the per-model score files for the training set. In each file every
# column except the first is prefixed with the model tag so the three sets of
# scores can sit side by side; the first column is left untouched and must be
# 'id' for the merges below to work.
train_data = None
for prefix, path in (
    ('m1_', '../data/model/model_12.csv'),
    ('m2_', '../data/model/model_13.csv'),
    ('m3_', '../data/model/model_14.csv'),
):
    train_score = pd.read_csv(path)
    train_score.columns = [train_score.columns[0]] + [prefix + x for x in train_score.columns[1:]]
    # Default (inner) merge: only ids present in every score file survive.
    train_data = train_score if train_data is None else train_data.merge(train_score, on='id')
# Drop the loop temporaries so no per-model frame stays alive in the kernel.
del train_score, prefix, path

train_labels = pd.read_csv('../data/download/train.csv').drop('comment_text', axis=1)

# Keep only the labelled rows that also have scores, and order both frames by
# id so that row i of the labels corresponds to row i of the scores.
train_labels = train_labels.sort_values(by='id').reset_index(drop=True)
train_labels = train_labels.merge(train_data[['id']], on='id')
train_data = train_data.sort_values(by='id').reset_index(drop=True)

# Labels and scores are paired row-by-row further down, so fail loudly here if
# duplicate or missing ids have left the two frames out of step.
assert np.array_equal(train_labels['id'].values, train_data['id'].values), \
    'train_labels and train_data are not aligned row-by-row on id'

train_ids = train_labels[['id']].copy()
train_data = train_data.drop('id', axis=1)
train_labels = train_labels.drop('id', axis=1)
print('train_data:', train_data.shape, train_labels.shape)

train_data: (143614, 18) (143614, 6)


In [4]:
# Load the per-model score files for the test set. In each file every column
# except the first is prefixed with the model tag; the first column is left
# untouched and must be 'id' for the merges below to work.
test_data = None
for prefix, path in (
    ('m1_', '../data/submit/model_12.csv'),
    ('m2_', '../data/submit/model_13.csv'),
    ('m3_', '../data/submit/model_14.csv'),
):
    test_score = pd.read_csv(path)
    test_score.columns = [test_score.columns[0]] + [prefix + x for x in test_score.columns[1:]]
    # Default (inner) merge: only ids present in every score file survive.
    test_data = test_score if test_data is None else test_data.merge(test_score, on='id')
# Drop the loop temporaries so no per-model frame stays alive in the kernel.
del test_score, prefix, path

# Keep the ids aside (same row order as the scores) and leave only the score
# columns in test_data.
test_ids = test_data[['id']].copy()
test_data = test_data.drop('id', axis=1)
print('test_data:', test_data.shape)

test_data: (153164, 18)


### model

In [5]:
# Running total of the per-label AUC values; model() adds to it on each call.
overall = 0.0

def eval_metric(labels, predict):
    """Return the area under the ROC curve of ``predict`` vs ``labels``.

    The curve is built with ``roc_curve`` and integrated with ``auc``; the
    result is rounded to 4 decimal places.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(labels, predict)
    area = auc(false_pos_rate, true_pos_rate)
    return round(area, 4)
    
def model(label):
    """Blend the three per-model scores for ``label`` by a plain average.

    Reads the columns ``m1_<label>``, ``m2_<label>`` and ``m3_<label>`` from
    the module-level ``train_data`` and ``test_data`` frames, averages them
    row by row, prints the AUC of the train-set average against
    ``train_labels[label]`` and adds that AUC to the global ``overall``.

    Parameters
    ----------
    label : str
        Name of the target column (also the suffix of the score columns).

    Returns
    -------
    list
        ``[train_frame, test_frame]``: copies of ``train_ids`` and
        ``test_ids`` respectively, each with an extra ``label`` column holding
        the averaged score.
    """
    global overall
    feats = ['m1_' + label, 'm2_' + label, 'm3_' + label]

    # Vectorized row-wise mean. Computed as a standalone Series so nothing is
    # written back into a column-subset of train_data / test_data.
    train_mean = train_data[feats].mean(axis=1)
    test_mean = test_data[feats].mean(axis=1)

    # Score once and reuse the value for both the printout and the total.
    score = eval_metric(train_labels[label], train_mean)
    print('auc:', score)
    overall += score

    # Named train_pred rather than `model` to avoid shadowing this function.
    train_pred = train_ids.copy()
    train_pred[label] = train_mean
    submit = test_ids.copy()
    submit[label] = test_mean
    return [train_pred, submit]

In [6]:
# model() adds each label's AUC to the global `overall`. Reset it here so that
# re-running this cell yields the total of exactly these six calls instead of
# stacking on top of a previous run.
overall = 0.

# Each call returns [train-set blend, test-set blend] for one label.
toxic = model('toxic')
severe_toxic = model('severe_toxic')
obscene = model('obscene')
threat = model('threat')
insult = model('insult')
identity_hate = model('identity_hate')

auc: 0.9864
auc: 0.9921
auc: 0.9949
auc: 0.9932
auc: 0.9897
auc: 0.9912


In [7]:
# Join the six single-label train-set blends into one frame keyed by 'id'.
# merge() returns a new frame, so toxic[0] itself is left untouched.
# Note: this rebinds the name `model`, which until now was the blending function.
model = (
    toxic[0]
    .merge(severe_toxic[0], on='id')
    .merge(obscene[0], on='id')
    .merge(threat[0], on='id')
    .merge(insult[0], on='id')
    .merge(identity_hate[0], on='id')
)

In [8]:
# Join the six single-label test-set blends into one frame keyed by 'id'.
# merge() returns a new frame, so toxic[1] itself is left untouched.
submit = (
    toxic[1]
    .merge(severe_toxic[1], on='id')
    .merge(obscene[1], on='id')
    .merge(threat[1], on='id')
    .merge(insult[1], on='id')
    .merge(identity_hate[1], on='id')
)

In [9]:
# Preview the first rows of the merged train-set blend (id plus one column per label).
model.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0.000343,2.063982e-08,4.400472e-05,9.074979e-07,4.145759e-06,8.587443e-08
1,000103f0d9cfb60f,0.000925,7.461638e-09,0.0001231259,2.982393e-07,1.546507e-05,3.740627e-07
2,00013fa6fb6ef643,4.7e-05,3.369212e-09,7.469913e-07,6.055486e-08,7.792089e-07,1.02083e-07
3,0001b41b1c6bb37e,1e-05,1.976703e-09,4.574772e-07,2.592655e-08,4.502377e-07,3.216462e-07
4,0001d958c54c6e35,0.006757,3.632239e-07,4.492467e-05,0.0001219883,0.0006395739,5.310035e-06


In [10]:
# Preview the first rows of the merged test-set blend (id plus one column per label).
submit.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997683,0.4283029,0.972121,0.07699043,0.91205,0.407813
1,0000247867823ef7,7.2e-05,2.060591e-09,2e-06,1.302585e-08,3e-06,2.063223e-07
2,00013b17ad220c46,0.000391,4.63201e-08,1.5e-05,2.429554e-07,2.6e-05,7.086313e-06
3,00017563c3f7919a,4.4e-05,4.227803e-09,2e-06,5.042865e-07,4e-06,7.298163e-08
4,00017695ad8997eb,0.001649,5.380736e-08,4.7e-05,4.278784e-06,3.1e-05,1.579944e-06


In [11]:
# Persist the train-set blend; index=False keeps the positional row index out of the file.
model.to_csv('../data/model/simple_stack_1.csv', index=False)

In [12]:
# Persist the test-set blend; index=False keeps the positional row index out of the file.
submit.to_csv('../data/submit/simple_stack_1.csv', index=False)

In [13]:
# Mean AUC over the labels: `overall` is the sum that model() accumulated
# (each term already rounded to 4 decimals by eval_metric), and 6 is the
# number of model() calls made above — assumes each ran exactly once.
overall / 6

0.99124999999999996