## Setup

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import Ridge, LinearRegression

## Import dataset

### Jigsaw toxic comment classification dataset

In [None]:
# Load the Jigsaw toxic-comment classification training data and collapse its
# per-category toxicity columns into one weighted score column named 'label'.
df1 = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

# Weight applied to each category column before the columns are summed.
cols = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
        'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}
for col, weight in cols.items():
    df1[col] = df1[col] * weight

# Row-wise sum over the label-based column span 'toxic'..'identity_hate'
# (both ends inclusive); this relies on the CSV's column order.
df1['label'] = df1.loc[:, 'toxic':'identity_hate'].sum(axis=1)
df1 = df1[['comment_text', 'label']].rename(columns={'comment_text': 'text'})

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the cell output.
df1.info()
df1.sample(10)

### Jigsaw regression based dataset

In [None]:
# Load the regression-style Jigsaw training data and rename its target
# column 'y' to 'label' so it matches the other training frames.
df2 = pd.read_csv("../input/jigsaw-regression-based-data/train_data_version2.csv")
df2 = df2.rename(columns={'y': 'label'})

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the cell output.
df2.info()
df2

### Ruddit with text dataset

In [None]:
# Load the Ruddit data and keep only the comment text and its offensiveness
# score, renamed to the shared 'text' / 'label' column names.
df3 = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
df3 = df3[['txt', 'offensiveness_score']].rename(columns={'txt': 'text', 'offensiveness_score': 'label'})

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the cell output.
df3.info()
df3

### Validation dataset

In [None]:
# Load the validation pairs; later cells read its 'less_toxic' and
# 'more_toxic' text columns.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the cell output.
df_val.info()
df_val

### Test dataset

In [None]:
# Load the comments to be scored for the submission; later cells read its
# 'text' and 'comment_id' columns.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the cell output.
df_sub.info()
df_sub

## Model 1 - Jigsaw toxic comment classification dataset

In [None]:
%%time

# Model 1 inputs: train on the weighted classification frame, evaluate on the
# shared validation pairs.
df_train1 = df1
df_val1 = df_val

# Character n-gram TF-IDF (3 to 5 characters, built inside word boundaries).
# n-grams seen in fewer than 4 documents or in more than half of them are dropped.
tfidf1 = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=4, max_df=0.5)

# The vocabulary is learned from the training text only and then reused for
# both sides of every validation pair.
X_train1 = tfidf1.fit_transform(df_train1['text'])
X_val_less1 = tfidf1.transform(df_val1['less_toxic'])
X_val_more1 = tfidf1.transform(df_val1['more_toxic'])
y_train1 = df_train1['label'].values

# Shapes of the training matrix and of one validation matrix, one per line.
print(X_train1.shape, X_val_less1.shape, sep='\n')

In [None]:
%%time

# Fit a ridge regressor with default settings on the model-1 features
# (fit() returns the estimator itself, so the call can be chained).
model1 = Ridge().fit(X_train1, y_train1)

# Score both sides of every validation pair.
p_less1 = model1.predict(X_val_less1)
p_more1 = model1.predict(X_val_more1)

# Score the comments that go into the submission.
X_test1 = tfidf1.transform(df_sub['text'])
p1_sub = model1.predict(X_test1)

# Validation score: share of pairs in which the 'less_toxic' comment gets a
# strictly lower prediction than the 'more_toxic' one.
np.mean(p_less1 < p_more1)

## Model 2 - Jigsaw regression based dataset

In [None]:
%%time

# Model 2 inputs: train on the regression-style frame, evaluate on the shared
# validation pairs.
df_train2 = df2
df_val2 = df_val

# Character n-gram TF-IDF (3 to 5 characters, built inside word boundaries).
# n-grams seen in fewer than 4 documents or in more than half of them are dropped.
tfidf2 = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=4, max_df=0.5)

# The vocabulary is learned from the training text only and then reused for
# both sides of every validation pair.
X_train2 = tfidf2.fit_transform(df_train2['text'])
X_val_less2 = tfidf2.transform(df_val2['less_toxic'])
X_val_more2 = tfidf2.transform(df_val2['more_toxic'])

# Targets are rounded to one decimal place before training.
y_train2 = np.around(df_train2['label'].values, decimals=1)

# Shapes of the training matrix and of one validation matrix, one per line.
print(X_train2.shape, X_val_less2.shape, sep='\n')

In [None]:
%%time

# Fit a ridge regressor with default settings on the model-2 features
# (fit() returns the estimator itself, so the call can be chained).
model2 = Ridge().fit(X_train2, y_train2)

# Score both sides of every validation pair.
p_less2 = model2.predict(X_val_less2)
p_more2 = model2.predict(X_val_more2)

# Score the comments that go into the submission.
X_test2 = tfidf2.transform(df_sub['text'])
p2_sub = model2.predict(X_test2)

# Validation score: share of pairs in which the 'less_toxic' comment gets a
# strictly lower prediction than the 'more_toxic' one.
np.mean(p_less2 < p_more2)

## Model 3 - Ruddit with text dataset

In [None]:
%%time

# Model 3 inputs: train on the Ruddit frame, evaluate on the shared
# validation pairs.
df_train3 = df3
df_val3 = df_val

# Character n-gram TF-IDF (3 to 5 characters, built inside word boundaries).
# n-grams seen in fewer than 4 documents or in more than half of them are dropped.
tfidf3 = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=4, max_df=0.5)

# The vocabulary is learned from the training text only and then reused for
# both sides of every validation pair.
X_train3 = tfidf3.fit_transform(df_train3['text'])
X_val_less3 = tfidf3.transform(df_val3['less_toxic'])
X_val_more3 = tfidf3.transform(df_val3['more_toxic'])
y_train3 = df_train3['label'].values

# Shapes of the training matrix and of one validation matrix, one per line.
print(X_train3.shape, X_val_less3.shape, sep='\n')

In [None]:
%%time

# Fit a ridge regressor with default settings on the model-3 features
# (fit() returns the estimator itself, so the call can be chained).
model3 = Ridge().fit(X_train3, y_train3)

# Score both sides of every validation pair.
p_less3 = model3.predict(X_val_less3)
p_more3 = model3.predict(X_val_more3)

# Score the test set, i.e. the comments that go into the submission.
X_test3 = tfidf3.transform(df_sub['text'])
p3_sub = model3.predict(X_test3)

# Validation score: share of pairs in which the 'less_toxic' comment gets a
# strictly lower prediction than the 'more_toxic' one.
np.mean(p_less3 < p_more3)

## Evaluation

In [None]:
# Unweighted ensemble of all three models: share of validation pairs where the
# summed 'less_toxic' prediction is strictly below the summed 'more_toxic' one.
np.mean(p_less1 + p_less2 + p_less3 < p_more1 + p_more2 + p_more3)

In [None]:
# Unweighted ensemble of models 1 and 2: share of validation pairs where the
# summed 'less_toxic' prediction is strictly below the summed 'more_toxic' one.
np.mean(p_less1 + p_less2 < p_more1 + p_more2)

In [None]:
# Unweighted ensemble of models 2 and 3: share of validation pairs where the
# summed 'less_toxic' prediction is strictly below the summed 'more_toxic' one.
np.mean(p_less2 + p_less3 < p_more2 + p_more3)

In [None]:
# Unweighted ensemble of models 1 and 3: share of validation pairs where the
# summed 'less_toxic' prediction is strictly below the summed 'more_toxic' one.
np.mean(p_less1 + p_less3 < p_more1 + p_more3)

In [None]:
# Search for the best blend of models 1 and 3: the model-1 weight w runs from
# 0.1 up to (but excluding) 1 in steps of 0.05, and model 3 gets 1 - w.
# Each line printed is "<w> / <1 - w> : <pairwise validation accuracy>".
for w in np.arange(0.1, 1, 0.05):
    blend_less = p_less1*w + p_less3*(1-w)
    blend_more = p_more1*w + p_more3*(1-w)
    pair_accuracy = (blend_less < blend_more).mean()
    print(f'{round(w,2)} / {round(1-w,2)} : {pair_accuracy}\n')

## Submission

In [None]:
# Final score for each comment: a fixed blend of the test-set predictions,
# 0.6 from model 1 and 0.4 from model 3. Model 2 is not part of the blend.
df_sub['score'] = 0.6 * p1_sub + 0.4 * p3_sub
df_sub

In [None]:
# Write only the comment id and blended score, without the index column,
# to submission.csv in the working directory.
df_sub[['comment_id', 'score']].to_csv('./submission.csv', index=False)
# Plain string: the message has no placeholders, so the f-prefix was dropped;
# the spelling of "successfully" is corrected.
print("Export successfully!")