In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
pd.options.mode.chained_assignment = None

### datasets

In [2]:
# Load each baseline model's train-set scores and prefix every score column with
# its model tag (m1_/m2_/m3_) so the columns stay distinct after merging on 'id'.
train_score_1 = pd.read_csv('../data/model/baseline_1.csv')
train_score_1.columns = [train_score_1.columns[0]] + ['m1_' + x for x in train_score_1.columns[1:]]
train_score_2 = pd.read_csv('../data/model/baseline_2.csv')
train_score_2.columns = [train_score_2.columns[0]] + ['m2_' + x for x in train_score_2.columns[1:]]
train_score_3 = pd.read_csv('../data/model/baseline_3.csv')
# Use model 3's own first column name (previously taken from train_score_2).
train_score_3.columns = [train_score_3.columns[0]] + ['m3_' + x for x in train_score_3.columns[1:]]
train_labels = pd.read_csv('../data/download/train.csv').drop('comment_text', axis=1)
train_data = train_score_1.merge(train_score_2, on='id').merge(train_score_3, on='id')
# Scores and labels are compared row-by-row later on, so put BOTH frames in the
# same id order instead of sorting only the labels and trusting the file order
# of the score CSVs.
train_data = train_data.sort_values(by='id').reset_index(drop=True)
train_labels = train_labels.sort_values(by='id').reset_index(drop=True)
# Fail loudly if the two frames do not describe exactly the same rows; a silent
# mismatch would make every AUC computed from them meaningless.
assert np.array_equal(train_data['id'].values, train_labels['id'].values), \
    'train scores and train labels do not cover the same ids'
train_ids = train_labels[['id']].copy()
train_data = train_data.drop('id', axis=1)
train_labels = train_labels.drop('id', axis=1)
print('train_data:', train_data.shape, train_labels.shape)
# Release all three per-model frames; only the merged train_data is used from here on.
del train_score_1, train_score_2, train_score_3

train_data: (159571, 18) (159571, 6)


In [3]:
# Load each baseline model's test-set predictions and prefix every score column
# with its model tag (m1_/m2_/m3_) so the columns stay distinct after merging on 'id'.
test_score_1 = pd.read_csv('../data/submit/baseline_1.csv')
test_score_1.columns = [test_score_1.columns[0]] + ['m1_' + x for x in test_score_1.columns[1:]]
test_score_2 = pd.read_csv('../data/submit/baseline_2.csv')
test_score_2.columns = [test_score_2.columns[0]] + ['m2_' + x for x in test_score_2.columns[1:]]
test_score_3 = pd.read_csv('../data/submit/baseline_3.csv')
# Use model 3's own first column name (previously taken from test_score_2).
test_score_3.columns = [test_score_3.columns[0]] + ['m3_' + x for x in test_score_3.columns[1:]]
test_data = test_score_1.merge(test_score_2, on='id').merge(test_score_3, on='id')
# test_ids is taken from the merged frame itself, so ids and score rows share one order.
test_ids = test_data[['id']].copy()
test_data = test_data.drop('id', axis=1)
print('test_data:', test_data.shape)
# Release all three per-model frames; only the merged test_data is used from here on.
del test_score_1, test_score_2, test_score_3

test_data: (153164, 18)


### model

In [4]:
def eval_metric(labels, predict):
    """Return the area under the ROC curve of `predict` against `labels`.

    The curve is built with `roc_curve` and integrated with `auc`; the result
    is rounded to 4 decimal places.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(labels, predict)
    area = auc(false_pos_rate, true_pos_rate)
    return round(area, 4)
    
def model(label):
    """Blend the three baseline models for one label by simple averaging.

    Reads the notebook-level frames `train_data`, `train_labels`, `test_data`
    and `test_ids`. Prints the AUC of the averaged train scores against the
    true train labels, then returns the averaged test scores.

    Parameters
    ----------
    label : str
        Target column name; the columns 'm1_<label>', 'm2_<label>' and
        'm3_<label>' must exist in both `train_data` and `test_data`.

    Returns
    -------
    pandas.DataFrame
        A copy of `test_ids` with one extra column named `label` holding the
        blended prediction for each test row.
    """
    feats = ['m1_' + label, 'm2_' + label, 'm3_' + label]

    # Vectorised row mean: replaces the per-row Python lambda and avoids writing
    # a new column onto a slice of train_data / test_data. Missing values are
    # skipped, as they were with np.mean on each row Series.
    train_blend = train_data[feats].mean(axis=1)
    print('auc:', eval_metric(train_labels[label], train_blend))

    submit = test_ids.copy()
    # Assignment aligns on the index, which test_ids and test_data share.
    submit[label] = test_data[feats].mean(axis=1)
    return submit

In [5]:
# Blend each of the six target labels in turn. Every model() call prints the
# train AUC for its label and returns that label's test predictions keyed by id.
toxic, severe_toxic, obscene, threat, insult, identity_hate = (
    model(target)
    for target in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
)

auc: 0.9821
auc: 0.9904
auc: 0.991
auc: 0.9916
auc: 0.9867
auc: 0.9854


In [6]:
# Assemble the submission: start from a copy of the 'toxic' frame and join the
# remaining per-label frames onto it one by one, matching rows on 'id'.
submit = (
    toxic.copy()
    .merge(severe_toxic, on='id')
    .merge(obscene, on='id')
    .merge(threat, on='id')
    .merge(insult, on='id')
    .merge(identity_hate, on='id')
)

In [7]:
# Preview the first rows of the merged submission frame as a sanity check.
submit.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.993498,0.4225,0.951626,0.168819,0.913571,0.625295
1,0000247867823ef7,0.000543,5e-06,0.00014,4e-06,8.6e-05,1.2e-05
2,00013b17ad220c46,0.001079,3.9e-05,0.000668,9e-06,0.000276,7.2e-05
3,00017563c3f7919a,0.000177,2e-06,5.9e-05,8e-06,6e-05,7e-06
4,00017695ad8997eb,0.003841,6.3e-05,0.000889,6e-05,0.000378,5.6e-05


In [8]:
# Write the blended predictions to CSV; index=False keeps the row index out of the file.
submit.to_csv('../data/submit/baseline_4.csv', index=False)

In [9]:
# Mean of the six per-label train AUCs. The operands are hard-coded literals equal
# to the 'auc:' values printed by the model() calls above, so they have to be
# re-typed by hand whenever those printed values change.
(0.9821 + 0.9904 + 0.991 + 0.9916 + 0.9867 + 0.9854) / 6

0.9878666666666667