In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

First a quick check on the data

In [None]:
# Load the Jigsaw toxic-comment classification training set and preview the first rows.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df.head()

As can be seen below, we have almost 160 thousand comments; most of them are not toxic, and the other categories are even less frequent.

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

# First approach

With the loaded classification dataset I build a simple multiclass classification model that gives me the probabilities that a comment is toxic, severe toxic, obscene, etc.

These probabilities can be weighted using the validation dataset, which contains pairs of comments: a less toxic one and a more toxic one. The more toxic comment gets a value of 1 and the less toxic one -1 (or 0), and then another classification or regression model is trained on them.

I need to build a data processing pipeline element, that formats the text and removes numbers.

In [None]:
# creating a custom transformer
from sklearn.base import BaseEstimator,TransformerMixin

class TextCleaner(BaseEstimator,TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw comment text.

    ``transform`` wraps its input in a DataFrame and runs :meth:`cleaner` on
    every column, so every column must hold strings (the ``.str`` accessor is
    used on each of them).
    """

    def __init__(self):
        # No hyper-parameters to store; the cleaning rules are fixed.
        pass

    def cleaner(self,X,y=None):
        """Apply the cleaning rules to one string Series and return the result.

        ``regex`` is passed explicitly on every ``str.replace`` call. pandas 2.0
        changed the default to ``regex=False``; relying on the default would
        make the patterns below match only as literal text and silently
        disable the cleaning.

        Parameters
        ----------
        X : pandas.Series of str
            The texts to clean.
        y : ignored
            Present only for signature compatibility.

        Returns
        -------
        pandas.Series
            The cleaned texts, same index as ``X``.
        """
        # Surround every newline with spaces.
        X = X.str.replace('\n', ' \n ', regex=False)
        # Split words glued together by / ! ? or . : "word.word" -> "word . word".
        X = X.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)',r'\1 \2 \3', regex=True)
        # Runs of four or more identical * ! ? or ' are cut down to three.
        X = X.str.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1', regex=True)
        # Remove digits -> these are possibly not toxic.
        X = X.str.replace(r'[0-9]', '', regex=True)
        # Put spaces around every run of * ! ? or '.
        X = X.str.replace(r'([*!?\']+)',r' \1 ', regex=True)
        # A letter repeated 3+ times right before a word boundary -> two letters.
        X = X.str.replace(r'([a-zA-Z])\1{2,}\b',r'\1\1', regex=True)
        # A letter repeated 4+ times followed by another word character -> three letters.
        X = X.str.replace(r'([a-zA-Z])\1\1{2,}\B',r'\1\1\1', regex=True)
        # Collapse runs of spaces and trim surrounding whitespace.
        X = X.str.replace(r'[ ]{2,}',' ', regex=True).str.strip()
        return X

    def fit(self,X,y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self,X,y=None):
        """Return a cleaned DataFrame copy of ``X`` (the input is not modified)."""
        X = pd.DataFrame(X).copy()

        # DataFrame.apply hands each column to cleaner as a Series.
        return X.apply(self.cleaner)
    

# Keeping only the important columns

+ now the first pipeline can be built 
+ TFIDF vectorizer

In [None]:
# step 2: keep the comment text and the six label columns only
kept_columns = [
    "comment_text",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
df = df[kept_columns]
df.head()

## Creating the first pipeline and fitting

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import GaussianNB

def reshaper(X):
    """Flatten ``X`` to one dimension by calling its ``reshape`` method."""
    flat_shape = (-1,)
    return X.reshape(flat_shape)
# Wrap reshaper as a transformer so it can be used as a pipeline step.
rs = FunctionTransformer(reshaper)

def _to_dense(X):
    """Convert a sparse matrix to a dense array by calling ``X.toarray()``."""
    return X.toarray()
# The helper has its own name so that `denser` refers to one thing only.
# Previously the function was also called `denser` and was immediately shadowed
# by the transformer, which also stops the standard pickle module from
# resolving the wrapped function by name.
denser = FunctionTransformer(_to_dense)


# Features: a one-column DataFrame holding the raw comment text.
comment_frame = pd.DataFrame(np.array(df.comment_text))
# Targets: one 0/1 column per label, in the order used throughout the notebook.
label_matrix = np.array(df[["toxic", "severe_toxic",
                            "obscene","threat","insult","identity_hate"]] )

X_train, X_test, y_train, y_test = train_test_split(
    comment_frame,
    label_matrix,
    test_size = 0.3,
    random_state = 123,
)
 


# Run TextCleaner on column 0 of whatever frame the pipeline receives.
ct = ColumnTransformer(transformers=[ ('textCleaner',TextCleaner(), 0) ])

# Feature extraction. Only the character n-gram TF-IDF ("tfid_imp") is active;
# the word-level TF-IDF below is kept commented out.
vectorizer = FeatureUnion([
       # ("tfid_1", TfidfVectorizer(min_df= 3
       #                          , max_df=0.3
       #                          , analyzer = 'word'
       #                          , ngram_range = (1,3)
       #                          , stop_words="english"
       #                         )),
        ("tfid_imp", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5)))
    ])

# NOTE: every pipeline built from `ct`, `rs` and `vectorizer` shares these very
# objects. Pipeline does not clone its steps, so fitting one pipeline also
# changes the fitted state seen by the others.
# pipeline_0: cleaning + flattening + TF-IDF features (no model).
pipeline_0 = Pipeline(steps=[["textCleaner", ct], 
                             ["reshaper", rs]
                           , ["features", vectorizer]])
# pipeline_cleaner: cleaning + flattening only.
pipeline_cleaner = Pipeline(steps=[["textCleaner", ct], 
                             ["reshaper", rs]
                           ])

In [None]:
## pipeline 1
from sklearn.multioutput import MultiOutputClassifier

# First-stage model: one liblinear logistic regression per label, fitted on
# top of the cleaning, flattening and TF-IDF steps.
per_label_classifier = MultiOutputClassifier(
    LogisticRegression(solver='liblinear'),
    n_jobs=-1,
)
pipeline_1 = Pipeline(steps=[
    ["textCleaner", ct],
    ["reshaper", rs],
    ["features", vectorizer],
    ["multiclass", per_label_classifier],
])
pipeline_1.fit(X_train, y_train)

## Predicting with the first pipeline

In [None]:
# predictions
# predict_proba yields one (n_samples, 2) array per label; np.log turns the
# list into a single array of log-probabilities indexed as [label][row, class].
# NOTE(review): a probability of exactly 0 would become -inf here — confirm
# this cannot happen for the data at hand.
y_mc = np.log(pipeline_1.predict_proba(X_test ))

In [None]:
import seaborn as sns
# Distribution of the positive-class log-probability of the first label ("toxic") on the hold-out set.
sns.displot(y_mc[0][:,1])

Evaluate the ordering capability of the models one by one

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
def eval_multiclass_one(test, y_mc, title, index):
    """Print the ROC AUC and draw the ROC curve for one output label.

    Parameters
    ----------
    test : indexable
        ``test[index]`` holds the true binary labels of the chosen output.
    y_mc : indexable
        ``y_mc[index][:, 1]`` holds the score of the positive class for the
        chosen output.
    title : str
        Title shown above the plot.
    index : int
        Position of the label to evaluate.
    """
    y_true = test[index]
    y_score = y_mc[index][:, 1]
    print("AUC: {0:.4f}".format( roc_auc_score(y_true, y_score)))
    # roc_curve returns (fpr, tpr, thresholds) in that order.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.figure()
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title(title)
    plt.show()
eval_multiclass_one(np.transpose(y_test), y_mc, "toxic", 0)

In [None]:
# ROC / AUC for label column 1 ("severe_toxic").
eval_multiclass_one(np.transpose(y_test), y_mc, "severe_toxic", 1)

In [None]:
# ROC / AUC for label column 3 ("threat").
eval_multiclass_one(np.transpose(y_test), y_mc, "threat", 3)

In [None]:
# ROC / AUC for label column 2 ("obscene").
eval_multiclass_one(np.transpose(y_test), y_mc, "obscene", 2)

## Now we can test it on the validation dataset

1. importing the dataset
2. splitting into more and less toxic

In [None]:
# test the scoring :)
# Load the pairwise validation set, print its row count and preview it.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(len(df_val))
df_val.head()

In [None]:
# The two sides of every validation pair.
df_val_lt = df_val["less_toxic"]
df_val_mt = df_val["more_toxic"]

# Wrap each side in a one-column DataFrame (the input layout pipeline_1 was
# fitted on) and take the log of the predicted probabilities.
less_toxic_frame = pd.DataFrame(np.array(df_val_lt))
more_toxic_frame = pd.DataFrame(np.array(df_val_mt))
y_lt = np.log(pipeline_1.predict_proba(less_toxic_frame))
y_mt = np.log(pipeline_1.predict_proba(more_toxic_frame))

In [None]:
# Store the positive-class log-probability of the first label ("toxic") for
# both sides of each pair.
df_val["lt_score"] = y_lt[0][:, 1] 
df_val["mt_score"] = y_mt[0][:, 1]

df_val[["less_toxic", "more_toxic", "lt_score", "mt_score"]].head()

Checking how well the model decides between more and less toxic, based only on the toxic scores.

In [None]:
# Share of pairs in which the less-toxic comment scores strictly lower.
n_pairs = len(df_val)
n_ranked_correctly = len(df_val[df_val.lt_score < df_val.mt_score])
print("Accuracy: \n")
print(n_ranked_correctly/n_pairs)

In [None]:
# Overlay both score distributions on the same axes. displot is a figure-level
# function and opens a new figure per call, so the two semi-transparent
# histograms were never actually drawn on top of each other.
sns.histplot(df_val.lt_score, alpha = 0.5, label = "less toxic")
sns.histplot(df_val.mt_score, alpha = 0.5, color = "orange", label = "more toxic")
plt.xlabel("toxic log-probability")
plt.legend()
plt.show()

In [None]:
# 1 when the more-toxic comment scores strictly higher than the less-toxic one, else 0;
# then preview the pairs the model ranks the wrong way round.
df_val["correct"] = np.where(df_val.mt_score > df_val.lt_score, 1, 0)
df_val[df_val.correct == 0].head()

In [None]:
# Less-toxic scores, split by whether the pair was ranked correctly
# (orange = misranked pairs).
misranked_pairs = df_val[df_val.correct == 0]
well_ranked_pairs = df_val[df_val.correct == 1]
sns.histplot(misranked_pairs.lt_score, cumulative=False, color = "orange")
sns.histplot(well_ranked_pairs.lt_score, cumulative=False, alpha = 0.5)
plt.show()

In [None]:
# More-toxic scores, split by whether the pair was ranked correctly
# (orange = misranked pairs).
misranked_pairs = df_val[df_val.correct == 0]
well_ranked_pairs = df_val[df_val.correct == 1]
sns.histplot(misranked_pairs.mt_score, cumulative=False, color = "orange")
sns.histplot(well_ranked_pairs.mt_score, cumulative=False, alpha = 0.5)
plt.show()

Refit on full and predict again...

In [None]:
# Per-pair score gap (less toxic minus more toxic): cumulative histogram
# first, then the plain histogram.
for show_cumulative in (True, False):
    sns.histplot(df_val.lt_score-df_val.mt_score, cumulative = show_cumulative)
    plt.show()

## Seems like it is working fine so far

* weight the other predictions -> build a model to find the optimal weights


In [None]:
# np.transpose reverses the axes of the [label, row, class] array, so index 1
# on the first axis keeps the positive class and leaves one row per comment
# with one column per label.
lt_positive_scores = np.transpose(y_lt)[1,:,:]
mt_positive_scores = np.transpose(y_mt)[1,:,:]

# Targets for the second-stage model: -1 = less toxic side, 1 = more toxic side.
df_val_lt_scores = pd.DataFrame(lt_positive_scores).assign(target=-1)
df_val_mt_scores = pd.DataFrame(mt_positive_scores).assign(target=1)
df_val_lt_scores.head()

In [None]:
# Stack both sides and shuffle the rows. The shuffle is seeded so that the
# train/test split and every metric computed from it are reproducible.
X_new = (
    pd.concat([df_val_lt_scores, df_val_mt_scores])
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)
X_new.head()

In [None]:
# Mean of the -1 / 1 targets.
np.mean(X_new.target)

## Benchmark shows that without a model the AUC is 0.6705

In [None]:
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new.drop("target", axis = 1), np.array(X_new.target),
                                                                   test_size = 0.3, random_state = 123)

# benchmark: rank by the first score column alone (the "toxic" log-probability)
benchmark_score = X_new_test.iloc[:,0]
print("AUC: {0:.4f}".format( roc_auc_score(y_new_test, benchmark_score)))
# roc_curve returns (fpr, tpr, thresholds) in that order.
fpr, tpr, _ = roc_curve(y_new_test, benchmark_score)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("Benchmark: toxic score only")
plt.show()

In [None]:
## B

## Building the second pipeline

* Polynomial features
* Standard scaler
* Ridge regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Second-stage model: degree-2 interactions of the six label scores,
# standardised, then a ridge regression on the -1 / 1 target.
pol = PolynomialFeatures(degree = 2)
scaler = StandardScaler()

steps = [
    ["pol", pol],
    ["scaler", scaler],
    ["Ridge", Ridge(alpha = 0.7)],
]
pipeline_weight_find = Pipeline(steps = steps)

pipeline_weight_find.fit(X = X_new_train, y = y_new_train)
y_final = pipeline_weight_find.predict(X_new_test)

In [None]:
# ROC of the second-stage ridge output on the held-out validation rows.
print("AUC: {0:.4f}".format( roc_auc_score(y_new_test, y_final)))
# roc_curve returns (fpr, tpr, thresholds) in that order.
fpr, tpr, _ = roc_curve(y_new_test, y_final)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("Ridge on polynomial label scores")
plt.show()

In [None]:
# Distribution of the ridge predictions on the held-out rows.
sns.histplot(y_final[:])

## try cascading

X_new_train

In [None]:
# Training rows whose first score (column 0) lies strictly between -3 and -1.
in_band = (X_new_train[0] > -3) & (X_new_train[0] < -1)
X_gb_new_t = X_new_train[in_band]
Y_gb_new_t = y_new_train[in_band]

In [None]:
# Mean of the -1 / 1 targets inside the selected score band.
np.mean(Y_gb_new_t)

In [None]:
from sklearn.base import clone

# Give the cascade its own, unfitted copies of the transformers. Pipeline fits
# its steps in place (they are not cloned), so reusing the very `pol` / `scaler`
# objects of pipeline_weight_find would refit that pipeline's scaler on this
# subset and silently change its later predictions.
pipeline_cascade = Pipeline([["pol", clone(pol)],
                             ["scaler", clone(scaler)],
                             ["GB", GradientBoostingRegressor(random_state= 123)]])
pipeline_cascade.fit(X_gb_new_t, Y_gb_new_t)

In [None]:
y_casc = pipeline_cascade.predict(X_new_test)

# Blend: 80 % ridge prediction, 20 % cascade prediction.
y_final_ = y_final*0.8 + y_casc * 0.2

print("AUC: {0:.4f}".format( roc_auc_score(y_new_test, y_final_)))
# roc_curve returns (fpr, tpr, thresholds) in that order.
fpr, tpr, _ = roc_curve(y_new_test, y_final_)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("Ridge + cascade blend")
plt.show()

In [None]:
# Distribution of the cascade predictions on the held-out rows.
sns.histplot(y_casc)

## Start the full training cycle

In [None]:
# Refit the first-stage model on every labelled comment (no hold-out).
all_comments = pd.DataFrame(np.array(df.comment_text))
all_labels = np.array(df[["toxic", "severe_toxic",
                          "obscene","threat","insult","identity_hate"]])
pipeline_1.fit(all_comments, all_labels)

In [None]:
# Re-score the validation pairs with the refitted first-stage model.
df_val_lt = df_val["less_toxic"]
df_val_mt = df_val["more_toxic"]
y_lt = np.log(pipeline_1.predict_proba(pd.DataFrame(np.array(df_val_lt))))
y_mt = np.log(pipeline_1.predict_proba(pd.DataFrame(np.array(df_val_mt))) )              

# Positive-class log-probabilities, one row per comment and one column per
# label, with target -1 for the less toxic side and 1 for the more toxic side.
df_val_lt_scores = pd.DataFrame(np.transpose(y_lt)[1,:,:])
df_val_lt_scores["target"] = -1
df_val_mt_scores = pd.DataFrame(np.transpose(y_mt)[1,:,:])
df_val_mt_scores["target"] = 1

# Seeded shuffle so the rows later used for the cascade come in a
# reproducible order.
X_new = (
    pd.concat([df_val_lt_scores, df_val_mt_scores])
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)
# Refit the second-stage ridge pipeline on all validation rows.
pipeline_weight_find.fit(np.array(X_new.drop("target", axis = 1)),np.array(X_new.target) )

In [None]:
# Rows whose first score (column 0) lies strictly between -3 and -1.
in_band = (X_new[0] > -3) & (X_new[0] < -1)
band_rows = X_new[in_band]
X_gb_new_t = band_rows.drop("target", axis = 1)
Y_gb_new_t = band_rows.target

pipeline_cascade.fit(X_gb_new_t, Y_gb_new_t)

In [None]:
# Counts of the -1 / 1 targets among the rows used to fit the cascade.
sns.histplot(Y_gb_new_t)

In [None]:
#pipeline_weight_find.fit(np.array(X_new.drop("target", axis = 1)),np.array(X_new.target) )


# Submission 0:

In [None]:
# Load the comments that have to be scored for the submission and preview them.
df_to_submit = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_to_submit.head()

In [None]:
# First stage: log-probabilities for the second column of the submission file
# (presumably the comment text — verify against the CSV).
X_new_sub = np.log(pipeline_1.predict_proba(pd.DataFrame(np.array(df_to_submit.iloc[:, 1]))) )

# Second stage: both regressors read the same positive-class score matrix.
submission_scores = np.transpose(X_new_sub)[1,:,:]
y_to_submit = pipeline_weight_find.predict(submission_scores)
y_cascade = pipeline_cascade.predict(submission_scores)

In [None]:
# Distribution of the cascade predictions for the submission comments
# (the ridge-prediction histogram below is disabled).
#sns.histplot(y_to_submit, bins = 1000, alpha = 0.2)
sns.histplot(y_cascade, bins = 1000, color = "orange", alpha = 1)

In [None]:
# Show the raw cascade predictions.
y_cascade

In [None]:
# y_to_submit = pipeline_1.predict_proba(df_to_submit["cleaned"].values.astype('U'))
# y_to_submit[:10]

#y_to_submit = pipeline_mc.predict_proba(df_to_submit["cleaned"].values.astype('U'))
#y_to_submit[:10]

In [None]:
# extra step lg


In [None]:
from scipy.stats import rankdata
# Final score: 80 % ridge prediction blended with 20 % cascade prediction.
blended_score = 0.8 * y_to_submit + 0.2 * y_cascade
df_to_submit["score"] = blended_score

# Only the id and the score go into the submission file.
submission_columns = ["comment_id", "score"]
df_to_submit = df_to_submit[submission_columns]
df_to_submit.head()

In [None]:
# Distribution of the final blended scores.
sns.histplot(df_to_submit.score, bins = 1000)

In [None]:
# Write the submission file without the index column.
df_to_submit.to_csv("./submission.csv", index = False)
