In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
pd.options.mode.chained_assignment = None

### datasets

In [2]:
# Per-model training score files. Their position in this list fixes the
# m1_ .. m6_ prefix given to each file's score columns.
train_score_files = ['fasttext_1.csv', 'fasttext_2.csv', 'fasttext_3.csv',
                     'fasttext_4.csv', 'glove_1.csv', 'glove_2.csv']

train_data = None
for model_no, file_name in enumerate(train_score_files, start=1):
    score_frame = pd.read_csv('../data/other/model/' + file_name)
    # Keep the first column (the 'id' merge key) untouched and prefix every
    # other column with the model number so the merged frame has unique names.
    score_frame.columns = [score_frame.columns[0]] + [
        'm%d_' % model_no + col for col in score_frame.columns[1:]
    ]
    if train_data is None:
        train_data = score_frame
    else:
        train_data = train_data.merge(score_frame, on='id')

train_labels = pd.read_csv('../data/download/train.csv').drop('comment_text', axis=1)

# Put labels and scores in the same id order. Both frames get a fresh
# 0..n-1 index: downstream code combines them both positionally (AUC) and by
# index label (column assignment), so the two orderings must coincide.
train_labels = train_labels.sort_values(by='id').reset_index(drop=True)
train_labels = train_labels.merge(train_data[['id']], on='id')
train_data = train_data.sort_values(by='id').reset_index(drop=True)
assert np.array_equal(train_data['id'].values, train_labels['id'].values), \
    'train scores and train labels are not aligned on id'

train_ids = train_labels[['id']].copy()
train_data = train_data.drop('id', axis=1)
train_labels = train_labels.drop('id', axis=1)
print('train_data:', train_data.shape, train_labels.shape)

# Drop the loop temporaries so no per-model frame is kept alive.
del score_frame, model_no, file_name

train_data: (159571, 36) (159571, 6)


In [3]:
# Per-model test score files. Their position in this list fixes the
# m1_ .. m6_ prefix given to each file's score columns.
test_score_files = ['fasttext_1.csv', 'fasttext_2.csv', 'fasttext_3.csv',
                    'fasttext_4.csv', 'glove_1.csv', 'glove_2.csv']

test_data = None
for model_no, file_name in enumerate(test_score_files, start=1):
    score_frame = pd.read_csv('../data/other/submit/' + file_name)
    # Keep the first column (the 'id' merge key) untouched and prefix every
    # other column with the model number so the merged frame has unique names.
    score_frame.columns = [score_frame.columns[0]] + [
        'm%d_' % model_no + col for col in score_frame.columns[1:]
    ]
    if test_data is None:
        test_data = score_frame
    else:
        test_data = test_data.merge(score_frame, on='id')

# Split the merge key from the score columns; both keep the same row order
# and index, so they can be recombined later.
test_ids = test_data[['id']].copy()
test_data = test_data.drop('id', axis=1)
print('test_data:', test_data.shape)

# Drop the loop temporaries so no per-model frame is kept alive.
del score_frame, model_no, file_name

test_data: (153164, 36)


### model

In [5]:
# Running sum of the per-label AUCs; model() adds to it on every call.
overall = 0.0

def eval_metric(labels, predict):
    """Return the area under the ROC curve of ``predict`` against ``labels``.

    The curve comes from ``roc_curve`` and is integrated with ``auc``; the
    result is rounded to 4 decimal places.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(labels, predict)
    area = auc(false_pos_rate, true_pos_rate)
    return round(area, 4)
    
def model(label):
    """Blend the six base-model scores for ``label`` by simple averaging.

    Reads the notebook-level frames ``train_data``, ``train_labels``,
    ``train_ids``, ``test_data`` and ``test_ids``.

    Side effects: prints the training AUC of the averaged score and adds that
    (4-decimal rounded) AUC to the module-level ``overall`` accumulator.

    Returns a two-element list ``[train_frame, test_frame]``; each frame holds
    the ``id`` column plus one column named ``label`` with the averaged score.

    NOTE: a later cell rebinds the name ``model`` to a DataFrame, so this
    definition has to be re-run before the function can be called again.
    """
    global overall
    feats = ['m%d_%s' % (model_no, label) for model_no in range(1, 7)]

    # Vectorised row mean; no column is inserted into a slice of train_data.
    train_avg = train_data[feats].mean(axis=1)
    score = eval_metric(train_labels[label], train_avg)
    print('auc:', score)
    overall += score

    stacked = train_ids.copy()
    # Assign by position, not by index label: train_ids and train_data are
    # matched row-for-row (the same assumption the AUC above relies on), but
    # their index labels are not guaranteed to agree.
    stacked[label] = train_avg.values

    submit = test_ids.copy()
    submit[label] = test_data[feats].mean(axis=1).values
    return [stacked, submit]

In [6]:
# One [train_frame, test_frame] pair per target label, evaluated in this order.
toxic, severe_toxic, obscene, threat, insult, identity_hate = [
    model(target)
    for target in ('toxic', 'severe_toxic', 'obscene',
                   'threat', 'insult', 'identity_hate')
]

auc: 0.9856
auc: 0.9915
auc: 0.9939
auc: 0.9921
auc: 0.9888
auc: 0.9902


In [7]:
# Join the per-label training frames (element 0 of each pair) on 'id' into
# one frame with a column per label.
model = (
    toxic[0].copy()
    .merge(severe_toxic[0], on='id')
    .merge(obscene[0], on='id')
    .merge(threat[0], on='id')
    .merge(insult[0], on='id')
    .merge(identity_hate[0], on='id')
)

In [8]:
# Join the per-label test frames (element 1 of each pair) on 'id' into one
# frame with a column per label.
submit = (
    toxic[1].copy()
    .merge(severe_toxic[1], on='id')
    .merge(obscene[1], on='id')
    .merge(threat[1], on='id')
    .merge(insult[1], on='id')
    .merge(identity_hate[1], on='id')
)

In [9]:
# Preview the first rows of the merged training-score frame.
model.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0.000189,8.70739e-10,9e-06,1.626924e-08,2e-06,4.019145e-08
1,000103f0d9cfb60f,0.002525,2.700227e-07,9.8e-05,1.845108e-05,7.7e-05,5.589548e-06
2,000113f07ec002fd,0.001443,1.0096e-08,3.7e-05,1.744054e-06,9e-06,1.953806e-07
3,00013fa6fb6ef643,0.000213,1.79159e-09,4e-06,3.520491e-08,2e-06,1.106725e-07
4,0001b41b1c6bb37e,0.000391,8.747799e-09,9.1e-05,2.27635e-08,1.4e-05,2.555719e-07


In [10]:
# Preview the first rows of the merged test-score frame.
submit.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998933,0.378591,0.9847157,0.151054,0.95217,0.4267313
1,0000247867823ef7,6.7e-05,1.005848e-08,1.351508e-06,1.244558e-07,1.12166e-06,1.382291e-07
2,00013b17ad220c46,0.000295,5.03885e-08,8.424013e-06,5.419039e-07,8.327425e-06,1.058228e-06
3,00017563c3f7919a,3.5e-05,3.838036e-09,8.975482e-07,4.560075e-07,8.23662e-07,3.621174e-08
4,00017695ad8997eb,0.001715,1.076093e-07,3.974976e-05,3.513278e-06,2.090479e-05,1.41536e-06


In [11]:
# Save the averaged training scores (id plus one column per label), without the index.
model.to_csv('../data/other/model/simple_stack.csv', index=False)

In [12]:
# Save the averaged test scores (id plus one column per label), without the index.
submit.to_csv('../data/other/submit/simple_stack.csv', index=False)

In [13]:
# Mean training AUC over the six labels: `overall` is the sum of the
# (4-decimal rounded) AUCs added by the six model() calls.
overall / 6

0.99034999999999995