In [None]:
import os
import gc
import copy
import time
import random
import string

import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import rankdata

In [None]:
def ridge_cv(vex, X, y, X_test, folds, stratified, val_df=None):
    """Train one Ridge model per stratified fold and average the predictions.

    Parameters
    ----------
    vex : fitted vectorizer
        Object whose ``transform`` maps raw text into the feature space of
        ``X``; used to vectorize the validation pairs.
    X : feature matrix
        Training features; must support indexing with integer index arrays.
    y : array
        Regression targets aligned with the rows of ``X``.
    X_test : feature matrix
        Test features, scored by every fold model.
    folds : int
        Number of splits for ``StratifiedKFold``.
    stratified : array-like
        Labels handed to ``StratifiedKFold.split`` to balance the folds.
    val_df : DataFrame, optional
        Frame with ``'less_toxic'`` and ``'more_toxic'`` text columns. When
        omitted, the notebook-level ``df_val`` is used.

    Returns
    -------
    p1, p2, preds : arrays
        Fold-averaged predictions for the ``less_toxic`` texts, the
        ``more_toxic`` texts and ``X_test`` respectively.
    """
    if val_df is None:
        # Backward compatible: existing callers rely on the notebook global.
        val_df = df_val

    # The vectorizer does not change between folds, so the validation pairs
    # are transformed once instead of once per fold.
    X_less_toxic = vex.transform(val_df['less_toxic'])
    X_more_toxic = vex.transform(val_df['more_toxic'])

    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=123)
    val_scores = []
    rmse_scores = []
    less_toxic_preds = []
    more_toxic_preds = []
    preds = []

    for fold, (train_index, val_index) in enumerate(kf.split(X, stratified)):
        model = Ridge()
        model.fit(X[train_index], y[train_index])

        # RMSE on the held-out fold. The square root is taken explicitly
        # because the `squared=False` keyword of mean_squared_error was
        # deprecated and later removed from scikit-learn.
        holdout_pred = model.predict(X[val_index])
        rmse_score = np.sqrt(mean_squared_error(y[val_index], holdout_pred))
        rmse_scores.append(rmse_score)

        p1 = model.predict(X_less_toxic)
        p2 = model.predict(X_more_toxic)
        less_toxic_preds.append(p1)
        more_toxic_preds.append(p2)

        # Share of pairs where the 'more_toxic' text gets the higher score.
        val_acc = (p1 < p2).mean()
        val_scores.append(val_acc)

        preds.append(model.predict(X_test))

        print(f'FOLD:{fold}, rmse_fold:{rmse_score:.5f}, val_acc:{val_acc:.5f}')

    mean_val_acc = np.mean(val_scores)
    mean_rmse_score = np.mean(rmse_scores)

    # Average the per-fold predictions, then re-score the pair ordering.
    p1 = np.mean(np.vstack(less_toxic_preds), axis=0)
    p2 = np.mean(np.vstack(more_toxic_preds), axis=0)

    val_acc = (p1 < p2).mean()

    print(f'00F: val_acc:{val_acc:.5f}, mean val_acc:{mean_val_acc:.5f}, mean rmse_score:{mean_rmse_score:.5f}')

    preds = np.mean(np.vstack(preds), axis=0)

    return p1, p2, preds

In [None]:
# Root directory of the input datasets; every CSV path below is built by
# appending a dataset-relative path to it.
data_path = '../input/'

In [None]:
# df_val: validation pairs (its 'less_toxic' / 'more_toxic' columns are scored in ridge_cv).
# df_test: comments to score (its 'text' and 'comment_id' columns are used for the submission).
df_val = pd.read_csv(data_path + "jigsaw-toxic-severity-rating/validation_data.csv")
df_test = pd.read_csv(data_path + "jigsaw-toxic-severity-rating/comments_to_score.csv")

# Toxic Comment Classification Challenge data set

In [None]:
# Load the Toxic Comment Classification Challenge training set and preview it.
jf_train_df = pd.read_csv(data_path + "jigsaw-toxic-comment-classification-challenge/train.csv")
jf_train_df.head()

In [None]:
# Base score: the largest of the six label columns for each row.
jf_train_df['y'] = jf_train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].max(axis=1)
# Keep only rows where at least one label is above zero.
jf_train_df = jf_train_df.query('y > 0')
print(jf_train_df.shape)
jf_train_df.head()

In [None]:
# Per-label weights added on top of the base score `y` by create_train.
toxic = 1.0  # NOTE(review): defined but never applied by create_train - confirm intent
severe_toxic = 2.0
obscene = 1.0
threat = 1.0
insult = 1.0
identity_hate = 2.0

def create_train(df):
    """Build the regression training frame from the labelled comments.

    Starting from the existing ``y`` column, adds each label column multiplied
    by its module-level weight (``severe_toxic``, ``obscene``, ``threat``,
    ``insult``, ``identity_hate``).

    Parameters
    ----------
    df : DataFrame
        Must contain ``comment_text``, ``y`` and the six label columns.

    Returns
    -------
    DataFrame
        New frame with ``comment_text`` renamed to ``text``, the weighted
        ``y`` and the six label columns. The input frame is left untouched.
    """
    label_weights = {
        'severe_toxic': severe_toxic,
        'obscene': obscene,
        'threat': threat,
        'insult': insult,
        'identity_hate': identity_hate,
    }

    # Work on a copy: the caller's frame is a filtered selection, so writing
    # into it triggers SettingWithCopyWarning and makes the cell stack the
    # weights again on every re-run.
    out = df[['comment_text', 'y', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].copy()
    for column, weight in label_weights.items():
        out['y'] = out['y'] + out[column] * weight

    # Undersampling: rows with a score of 0 were already removed upstream,
    # so it is skipped for now.
    # min_len = (df['y'] >= 1).sum()

    return out.rename(columns={'comment_text': 'text'})

df = create_train(jf_train_df)
# Report the weighted target from the returned frame; the input is no longer mutated.
print(df['y'].value_counts())

In [None]:
# Number of cross-validation folds.
FOLDS = 5

# Character n-gram TF-IDF (4 to 6 characters), fitted on the training text.
vec = TfidfVectorizer(analyzer='char_wb', max_df=0.5, min_df=3, ngram_range=(4, 6))
X = vec.fit_transform(df['text'])
y = df['y'].values
# Test comments are projected into the same feature space.
X_test = vec.transform(df_test['text'])

# Stratification labels: the target rounded to the nearest integer.
stratified = np.around(y)

jf_p1, jf_p2, jf_preds = ridge_cv(vec, X, y, X_test, FOLDS, stratified)

# Jigsaw Unintended Bias in Toxicity Classification

In [None]:
# Load the Unintended Bias in Toxicity Classification training set and preview it.
js_train_df = pd.read_csv(data_path + "jigsaw-unintended-bias-in-toxicity-classification/train.csv")
js_train_df.head()

In [None]:
# Base score: the largest of the six toxicity columns for each row.
js_train_df['y'] = js_train_df[['target', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']].max(axis=1)
print(js_train_df.shape)
# Keep only rows where at least one of those columns is above zero.
js_train_df = js_train_df.query('y > 0')
print(js_train_df.shape)

In [None]:
def build_js_train(frame):
    """Return a two-column (``text``, ``y``) training frame.

    ``y`` is ``target`` itself when ``target <= 0.5``; otherwise it is the sum
    of the ``severe_toxicity``, ``obscene``, ``identity_attack``, ``insult``
    and ``threat`` columns. The rule is evaluated column-wise instead of with
    a row-by-row ``apply``, and a new frame is built so nothing is written
    into the filtered input (which would raise SettingWithCopyWarning).
    """
    target = frame['target']
    aux_sum = frame[['severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']].sum(axis=1)
    return pd.DataFrame({
        'text': frame['comment_text'],
        # Keep `target` where it is <= 0.5, otherwise fall back to the summed score.
        'y': target.where(target <= 0.5, aux_sum),
    })

# NOTE: rebinding js_train_df drops the source columns, so re-running this
# cell requires re-running the load and filter cells first.
js_train_df = build_js_train(js_train_df)

# Down sampling: none is applied; the full filtered set is used.

df = js_train_df
print(df['y'].value_counts())

In [None]:
# Number of cross-validation folds.
FOLDS = 5
# Character n-gram TF-IDF (4 to 6 characters), refitted on this dataset's text.
vec = TfidfVectorizer(analyzer='char_wb', max_df=0.5, min_df=3, ngram_range=(4, 6))
X = vec.fit_transform(df['text'])
y = df['y'].values
# Test comments are projected into the same feature space.
X_test = vec.transform(df_test['text'])

# Stratification labels: the target rounded to one decimal, scaled by 10 and cast to int.
stratified = (np.around(y, decimals=1)*10).astype(int)
js_p1, js_p2, js_preds = ridge_cv(vec, X, y, X_test, FOLDS, stratified)

# Ruddit data set

In [None]:
# Load the Ruddit comments; scores that are zero or negative are mapped to 0.0.
rd_train_df = pd.read_csv(data_path + "ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
rd_train_df['y'] = rd_train_df['offensiveness_score'].map(lambda x : 0.0 if x <=0 else x)
print(rd_train_df.shape)

In [None]:
# Keep only rows with a positive score, reduce the frame to the text and the
# target, and rename the text column to match the other datasets.
rd_train_df = (
    rd_train_df
    .query('y > 0')
    .loc[:, ['txt', 'y']]
    .rename(columns={'txt': 'text'})
)

print(rd_train_df['y'].value_counts())
print(rd_train_df.shape)

In [None]:
# Number of cross-validation folds.
FOLDS = 5
df = rd_train_df
# Character n-gram TF-IDF (4 to 6 characters), refitted on this dataset's text.
vec = TfidfVectorizer(analyzer='char_wb', max_df=0.5, min_df=3, ngram_range=(4, 6))
X = vec.fit_transform(df['text'])
y = df['y'].values
# Test comments are projected into the same feature space.
X_test = vec.transform(df_test['text'])

# Stratification labels: the target rounded to one decimal, scaled by 10 and cast to int.
stratified = (np.around(y, decimals=1)*10).astype(int)
rd_p1, rd_p2, rd_preds = ridge_cv(vec, X, y, X_test, FOLDS, stratified)

# Ensemble

In [None]:
# The first model (jf_*) performed very poorly, so it is excluded for now.
# jf_max = max(jf_p1.max(), jf_p2.max())
# Per-model scale: the largest validation prediction of each model.
js_max = max(js_p1.max(), js_p2.max())
rd_max = max(rd_p1.max(), rd_p2.max())

# p1 = jf_p1/jf_max + js_p1/js_max + rd_p1/rd_max
# p2 = jf_p2/jf_max + js_p2/js_max + rd_p2/rd_max

# Blend: sum of the max-scaled predictions of the two remaining models.
p1 = js_p1/js_max + rd_p1/rd_max
p2 = js_p2/js_max + rd_p2/rd_max

# Share of validation pairs where the 'more_toxic' text gets the higher blended score.
val_acc = (p1 < p2).mean()
print(f'Ensemble: val_acc:{val_acc:.5f}')

In [None]:
# Blend the test predictions with the same per-model scaling used on the validation pairs.
score = js_preds/js_max + rd_preds/rd_max

# Convert the blended scores to ranks; method='ordinal' gives every row a distinct rank.
df_test['score'] = rankdata(score, method='ordinal')

# Write the submission file with the comment id and its rank score.
df_test[['comment_id', 'score']].to_csv("submission.csv", index=False)

df_test.head()