In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [5]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the model fits
# further down — confirm that silencing them is intended.
warnings.filterwarnings('ignore')

In [6]:
# Read the three competition CSVs (training set, test set, submission
# template) in that order and bind them to their notebook-level names.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data_toxic_regression/train.csv',
        './data_toxic_regression/test.csv',
        './data_toxic_regression/sample_submission.csv',
    )
)

In [7]:
# Target label columns.
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Number of training rows: rows [0, nrow_train) of the stacked corpus come
# from `train`, the remaining rows from `test`.
nrow_train = len(train)

# Stack train and test comments into one corpus and replace missing text
# with the literal "unknown" so the vectorizer only ever sees strings.
df = pd.concat(
    [train['comment_text'], test['comment_text']],
    axis=0,
).fillna("unknown")

# TF-IDF fitted on the combined corpus, English stop words removed,
# vocabulary limited to 50,000 features.
vectorizer = TfidfVectorizer(max_features=50000, stop_words='english')
X = vectorizer.fit_transform(df)

# Prediction buffer: one row per test comment, one column per label.
preds = np.zeros((len(test), len(col)))

In [8]:
# Fit one independent LogisticRegression per label column and store, for the
# test rows, column 1 of predict_proba (the second entry of model.classes_;
# presumably the positive class — labels are assumed to be binary 0/1).
#
# NOTE: the ROC AUC printed here is computed on the very rows the model was
# fitted on, so it is an optimistic in-sample figure, not a validation score.

# Per-label training ROC AUC. The name `loss` is kept for compatibility with
# the rest of the notebook; the values are scores (higher is better).
loss = []

# Slice the feature matrix once instead of re-slicing it on every iteration.
X_train = X[:nrow_train]
X_test = X[nrow_train:]

for label_idx, label in enumerate(col):
    print('===Fit '+label)
    model = LogisticRegression()
    model.fit(X_train, train[label])
    preds[:, label_idx] = model.predict_proba(X_test)[:, 1]

    pred_train = model.predict_proba(X_train)[:, 1]
    # Compute the score once and reuse it for both the log line and the list.
    train_auc = roc_auc_score(train[label], pred_train)
    print('ROC AUC:', train_auc)
    loss.append(train_auc)

print('mean column-wise ROC AUC:', np.mean(loss))

===Fit toxic
ROC AUC: 0.9840078635569363
===Fit severe_toxic
ROC AUC: 0.9922876067203097
===Fit obscene
ROC AUC: 0.9930288209490679
===Fit threat
ROC AUC: 0.9952588847863963
===Fit insult
ROC AUC: 0.9873622174227361
===Fit identity_hate
ROC AUC: 0.9900230119956406
mean column-wise ROC AUC: 0.9903280675718479


In [10]:
# Build the submission file: the `id` column followed by one predicted
# probability column per label in `col`.
#
# The sample submission is already loaded as `sample_submission`, so it is
# not re-read from disk here. Only its ids are used: concatenating the whole
# frame would carry every other column of the sample file into the output,
# side by side with the predictions.
sample_submission_id = pd.DataFrame({'id': sample_submission["id"]})

# pd.concat(axis=1) aligns on the index; a row-count mismatch would silently
# pad with NaN, so fail loudly instead.
assert len(sample_submission_id) == preds.shape[0], \
    "sample submission and prediction matrix have different row counts"

submission = pd.concat(
    [sample_submission_id, pd.DataFrame(preds, columns=col)],
    axis=1,
)
submission.to_csv('submission_7.csv', index=False)

In [9]:
# submission_samples = pd.read_csv('../data/sample_submission.csv')
# sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
# submission_output = pd.concat([sample_submission_id, pd.DataFrame(predictions, columns = columns_toxic)], axis=1)
# submission_output.to_csv('submission_6.csv', index=False)