In [None]:
import os
import gc
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import glob

In [None]:
import string
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from gensim import utils
import gensim.parsing.preprocessing as gsp

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Logistic Regression

In [None]:
# Load the two training CSVs, the validation CSV and the test CSV from the
# Kaggle competition input directory.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')

In [None]:
# train2 stores toxicity as a score; round it to a hard 0/1 label.
train2.toxic = train2.toxic.round().astype(int)

# Training frame = all of train1 + every toxic row of train2 + 100k non-toxic
# rows sampled from train2.  The sample is seeded so that Restart & Run All
# rebuilds exactly the same training set.
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=42),
])
# Optional subsampling for quick experiments:
#rate=10
#train = train[::rate]
train.head()

In [None]:
valid.head()

In [None]:
test.head()

In [None]:
print("Validation data set size:",valid.shape)
print("Test data set size:",test.shape)
print("Training data set size:",train.shape)

In [None]:
def get_language(text):
    """Return the name of the first language reported by polyglot's Detector.

    Non-printable characters are removed from ``text`` before detection and
    the detector is run with ``quiet=True``.
    """
    printable_only = "".join(ch for ch in text if ch.isprintable())
    detector = Detector(printable_only, quiet=True)
    return detector.languages[0].name

In [None]:
pip install pyicu

In [None]:
pip install pycld2

In [None]:
from polyglot.detect import Detector
from polyglot.utils import pretty_list

In [None]:
train['language'] = train["comment_text"].apply(get_language)

In [None]:
train.head()

In [None]:
train['toxic'].value_counts()

In [None]:
# Characters that become stand-alone tokens: ASCII punctuation plus a set of
# typographic symbols.  re.escape is required here: string.punctuation contains
# the sequence `[\]`, and when interpolated raw into a character class the
# backslash only escapes `]`, so a literal backslash was never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` on whitespace, with every punctuation mark as its own token.

    Each character matched by ``re_tok`` is padded with spaces before the
    string is split, so ``"Hello, world!"`` becomes
    ``["Hello", ",", "world", "!"]``.
    """
    return re_tok.sub(r' \1 ', s).split()
 
# gensim preprocessing steps applied, in this order, by clean_text().
filters = [
           gsp.strip_tags, # remove markup tags
           gsp.strip_punctuation, # remove punctuation
           gsp.strip_multiple_whitespaces, # standardize the spaces
           gsp.strip_numeric, # remove digits
           gsp.remove_stopwords, # remove stop words
           gsp.strip_short, # drop short tokens (gensim's default length threshold applies)
           gsp.stem_text # stemming
          ]

def clean_text(s):
    """Lower-case ``s`` and pass it through every function in ``filters``.

    ``s`` is first converted with ``str()``, so non-string values are cleaned
    as their string representation.  The filters run in list order, each
    receiving the previous filter's output.
    """
    cleaned = utils.to_unicode(str(s).lower())
    for apply_filter in filters:
        cleaned = apply_filter(cleaned)
    return cleaned

In [None]:
train[train.language=='xx']

In [None]:
# Clean the text first; missing comments become the literal "unknown".
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: an in-place call there is chained assignment, which may act
# on a temporary object and leave `train` unchanged.
train["comment_text"] = train["comment_text"].fillna("unknown").apply(clean_text)

In [None]:
# TF-IDF vectorization feeding a logistic regression.
# use_idf / smooth_idf / sublinear_tf are boolean switches, so they are passed
# as True rather than the integer 1.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

pipeline = Pipeline([
    ('tfidf', vec),
    # Defaults chosen to be a valid combination on their own (the previous
    # penalty='elasticnet' is not supported by the default solver).  The search
    # space below overrides both penalty and solver for every candidate.
    ('logreg', LogisticRegression(penalty='l2', solver='liblinear')),
])

# Hyper-parameter space sampled by the randomized search.
parameters = {
    'tfidf__max_features': [None, 1000, 5000, 50000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # unigrams or unigrams + bigrams
    'logreg__penalty' : ['l1', 'l2'],
    'logreg__C' : np.logspace(-4, 4, 20),
    'logreg__solver' : ['liblinear'],
}

## ROC AUC

In [None]:
# 5-fold stratified CV with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sample 10 candidates from `parameters`, score them by ROC AUC and refit the
# best one on the full training data.  random_state fixes which candidates are
# drawn, so the search is reproducible across runs.
random_search = RandomizedSearchCV(
    pipeline,
    parameters,
    n_iter=10,
    scoring=['roc_auc'],
    refit='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [None]:
%%time 

random_search.fit(train["comment_text"], train['toxic']);

In [None]:
pd.DataFrame(random_search.cv_results_).sort_values('mean_test_roc_auc', ascending=False)

### Scoring the validation dataset
- Using the model selected by ROC AUC

In [None]:
# Clean the validation comments the same way as the training comments.
# Assigned back instead of fillna(inplace=True) on a column selection, which is
# chained assignment and may not modify `valid`.
valid["comment_text"] = valid["comment_text"].fillna("unknown").apply(clean_text)

# Score the validation set with the refitted best pipeline.
y_valid = valid['toxic']
y_pred_valid = random_search.best_estimator_.predict_proba(valid["comment_text"])
# Column 1 of predict_proba is the probability of the positive (toxic) class.
print('Validation AUC score %s' % roc_auc_score(y_valid, y_pred_valid[:, 1]))

In [None]:
# The test file keeps its raw text in "content"; store the cleaned version
# under the column name used for training.
test["comment_text"] = test["content"].apply(clean_text)

# Class probabilities for every test comment from the best pipeline.
y_pred = random_search.best_estimator_.predict_proba(test["comment_text"])

# Ids are taken from the sample submission file.
sample_sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
submid = pd.DataFrame({'id': sample_sub["id"]})

# id | cleaned text | probability of the toxic class, joined column-wise.
toxic_proba = pd.DataFrame(y_pred[:, 1], columns=['toxic'])
submission = pd.concat([submid, test.comment_text, toxic_proba], axis=1)
submission.head()

In [None]:
# Round the predicted probabilities to hard 0/1 labels (this discards the
# probability ranking).
submission.toxic = submission.toxic.round().astype(int)

In [None]:
submission.toxic.value_counts()

In [None]:
submission.head(10)

# DistilBERT

In [None]:
!pip install -U transformers

In [None]:
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from tqdm.notebook import tqdm
import tokenizers
from tokenizers import BertWordPieceTokenizer

In [None]:
def clean_text(text):
    """Strip digits, quotes, hashtags, mentions and links from ``text``.

    ``text`` is converted with ``str()`` first.  The substitutions below are
    applied in order, then leading and trailing whitespace is removed.
    """
    substitutions = (
        (r'[0-9"]', ''),     # digits and double quotes
        (r'#[\S]+\b', ''),   # hashtags
        (r'@[\S]+\b', ''),   # mentions
        (r'https?\S+', ''),  # links
        (r'\s+', ' '),       # collapse runs of whitespace
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def text_process(text, head=160, tail=32):
    """Shorten a long comment to its first ``head`` and last ``tail`` words.

    Words are the pieces obtained by splitting ``text`` on single spaces.
    Texts with at most ``head + tail`` words are returned unchanged.

    Args:
        text: the comment to shorten.
        head: number of leading words to keep (default 160).
        tail: number of trailing words to keep (default 32).

    Returns:
        ``text`` itself, or the kept words joined by single spaces.
    """
    ws = text.split(' ')
    # Truncate only when the head and tail slices cannot overlap.  Cutting as
    # soon as len(ws) > head repeated the overlapping words and made texts of
    # head+1 .. head+tail words longer instead of shorter.
    if len(ws) > head + tail:
        # ws[-0:] would be the whole list, so handle tail == 0 explicitly.
        kept = ws[:head] + (ws[-tail:] if tail > 0 else [])
        text = ' '.join(kept)
    return text

def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Tokenize ``texts`` in chunks and return the padded token ids as an array.

    Args:
        texts: sliceable collection whose slices provide ``.tolist()``
            (the notebook passes a pandas Series of strings).
        tokenizer: a ``transformers`` DistilBERT tokenizer (slow or fast) or a
            ``tokenizers`` ``BertWordPieceTokenizer``.
        chunk_size: number of texts encoded per tokenizer call.
        maxlen: every sequence is truncated / padded to this many tokens.

    Returns:
        ``np.ndarray`` built from the list of per-text token-id lists.

    Raises:
        TypeError: if ``tokenizer`` is none of the supported types.  Previously
            this case failed later on an unbound ``all_ids``.
    """
    print('encoding with', tokenizer)

    # for transformers 3.5: the tokenizer object is called directly
    if isinstance(tokenizer, (transformers.DistilBertTokenizer,
                              transformers.DistilBertTokenizerFast)):
        def encode_chunk(text_chunk):
            encs = tokenizer(text_chunk, padding='max_length', truncation=True, max_length=maxlen)
            return encs['input_ids']
    # Check the `tokenizer` argument here, not the notebook-level
    # `fast_tokenizer` global that the original condition referenced.
    elif isinstance(tokenizer, tokenizers.implementations.bert_wordpiece.BertWordPieceTokenizer):
        tokenizer.enable_truncation(max_length=maxlen)
        tokenizer.enable_padding(max_length=maxlen)

        def encode_chunk(text_chunk):
            return [enc.ids for enc in tokenizer.encode_batch(text_chunk)]
    else:
        raise TypeError(f'unsupported tokenizer type: {type(tokenizer).__name__}')

    all_ids = []
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        all_ids.extend(encode_chunk(text_chunk))

    return np.array(all_ids)

In [None]:
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')

# Save a local copy of the tokenizer files.  exist_ok=True replaces the
# separate "exists?" check, so re-running the cell is safe.
save_path = '/kaggle/working/distilbert_base_cased/'
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)
fast_tokenizer = tokenizer

# Alternative: the Rust-backed `tokenizers` implementation, loaded from the saved vocabulary.
# fast_tokenizer = BertWordPieceTokenizer('distilbert_base_cased/vocab.txt', lowercase=False)

## TPU Config

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialize it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is used below to scale the global batch size.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Configuration
AUTO = tf.data.experimental.AUTOTUNE  # passed to Dataset.prefetch()
SHUFFLE = 2048  # buffer size passed to Dataset.shuffle() for the training set
EPOCHS1 = 20  # epochs for the first fit (training data, validated on `valid`)
EPOCHS2 = 4  # epochs for the second fit (run on the validation dataset)
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica
MAX_LEN = 192  # token length used for encoding and for the model input
VERBOSE = 2  # `verbose` argument of model.fit()

In [None]:
# Reload the raw CSVs: the frames loaded for the logistic-regression section
# had their text columns overwritten with cleaned text in earlier cells.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')

In [None]:
# train2 stores toxicity as a score; round it to a hard 0/1 label.
train2.toxic = train2.toxic.round().astype(int)

# Training frame = all of train1 + every toxic row of train2 + 100k non-toxic
# rows sampled from train2.  The sample is seeded so that Restart & Run All
# rebuilds exactly the same training set.
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=42),
])
# Optional subsampling for quick experiments:
#rate=10
#train = train[::rate]
train.head()

In [None]:
def len_sent(data):
    """Return the number of whitespace-separated words in ``data``."""
    words = data.split()
    return len(words)
# Word count of every comment, stored as an extra column.
train["num_words_comment_text"] = train["comment_text"].apply(len_sent)
# Optional: length distribution per class.
#sns.kdeplot(train[train["toxic"] == 0]["num_words_comment_text"].values, shade = True, color = "red", label='non_toxity')
#sns.kdeplot(train[train["toxic"] == 1]["num_words_comment_text"].values, shade = True, color = "blue", label='toxity')

# Labels as a plain array; the 'toxic' column is dropped from `train` later.
y_train = train['toxic'].values



In [None]:
train.head()

In [None]:
del train['toxic']; gc.collect()

In [None]:
# Clean each comment, then shorten long ones, before tokenizing.
train['comment_text'] = train['comment_text'].apply(clean_text).apply(text_process)
x_train = fast_encode(train['comment_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)

In [None]:

# Training input pipeline: repeat indefinitely, shuffle, batch, prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(SHUFFLE)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# The encoded array is no longer needed once the dataset has been built.
del x_train
gc.collect()

# Validation data: reload, then clean / shorten / encode exactly like the training text.
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
valid['comment_text'] = valid['comment_text'].apply(clean_text).apply(text_process)
x_valid = fast_encode(valid['comment_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
y_valid = valid['toxic'].values

# Validation input pipeline: batched, cached, prefetched (no shuffling).
valid_dataset = (
    tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

del x_valid
gc.collect()

## Callbacks

In [None]:
# Stage 1 (fit with validation data): both callbacks watch validation AUC.
lrs = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.7, min_lr=1e-7, verbose=1, patience=2)
es1 = EarlyStopping(monitor='val_auc', mode='max', verbose=1, patience=5, restore_best_weights=True)

# Stage 2 is fitted without validation data, so only the training 'auc' is logged.
es2 = EarlyStopping(monitor='auc', mode='max', verbose=1, patience=1, restore_best_weights=True)

callbacks_list1 = [lrs, es1]
# `lrs` is deliberately not reused for stage 2: it monitors 'val_auc', which
# that fit never logs, so it could not act there and only emitted a
# "metric not available" warning every epoch.
callbacks_list2 = [es2]

## Build Model

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of ``transformer``.

    Args:
        transformer: callable Keras-compatible model; the first element of its
            output is indexed as ``[:, 0, :]``.
        max_len: length of the integer token-id input.

    Returns:
        A compiled ``Model`` with one sigmoid output, binary cross-entropy loss
        and the metrics ``auc`` and ``accuracy``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the vector at the first token position.
    cls_token = sequence_output[:, 0, :]
    x = tf.keras.layers.Dropout(0.4)(cls_token)
    # Feed the dropout output to the classifier head.  Previously Dense was
    # applied to `cls_token`, which left the Dropout layer out of the graph.
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc'), 'accuracy'],
    )

    return model

## Load model in TPU

In [None]:
%%time
# Create the pretrained transformer and the classifier inside the strategy
# scope so that their variables are created under the chosen strategy.
with strategy.scope():
    transformer_layer = transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)

model.summary()

## Run Model

In [None]:
# One "epoch" is 1/8 of a full pass over the training set: the dataset repeats
# indefinitely, so steps_per_epoch decides how many batches each epoch draws.
n_steps = len(y_train) // (BATCH_SIZE * 8)

train_history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS1,
    steps_per_epoch=n_steps,
    verbose=VERBOSE,
    callbacks=callbacks_list1,
)

# Release the training pipeline once the first stage is done.
del train_dataset
gc.collect()

In [None]:
train_history_df = pd.DataFrame.from_dict(train_history.history)
train_history_df

In [None]:
import matplotlib.pyplot as plt

# Validation vs. training AUC per epoch for the first training stage.
plt.figure(figsize=(10, 5))
for metric in ('val_auc', 'auc'):
    plt.plot(train_history_df[metric], label=metric)
plt.legend(fontsize=15)

In [None]:
# Training vs. validation accuracy per epoch for the first training stage.
plt.figure(figsize=(10, 5))
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(train_history_df[metric], label=metric)
plt.legend(fontsize=15)

In [None]:
# Second stage: continue training on the validation data itself.
# One epoch is one full pass over the validation examples.
n_steps = len(y_valid) // BATCH_SIZE

train_history_2 = model.fit(
    valid_dataset.repeat(),
    epochs=EPOCHS2,
    steps_per_epoch=n_steps,
    verbose=VERBOSE,
    callbacks=callbacks_list2,
)

In [None]:
train_history2_df = pd.DataFrame.from_dict(train_history_2.history)
train_history2_df

In [None]:
# Loss, AUC and accuracy per epoch for the second training stage.
plt.figure(figsize=(10, 5))
for metric in ('loss', 'auc', 'accuracy'):
    plt.plot(train_history2_df[metric], label=metric)
plt.legend(fontsize=15)

In [None]:
# Apply the same cleaning and shortening to the test text that the training
# and validation text received; previously the raw 'content' column was
# encoded directly, so inference inputs were preprocessed differently from
# the inputs the model was trained on.
test_text = test['content'].apply(clean_text).apply(text_process)
x_test = fast_encode(test_text.astype(str), fast_tokenizer, maxlen=MAX_LEN)

# Batched, unshuffled dataset so that predictions stay in row order.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

In [None]:
# Start from the sample submission and overwrite its 'toxic' column with the
# model's predictions for the test dataset.
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
sub['toxic'] = model.predict(test_dataset, verbose=1)

In [None]:
test["comment_text"] = test["content"].apply(clean_text)

In [None]:
# Attach the cleaned test text for inspection and round the predicted
# probabilities to hard 0/1 labels (this discards the probability ranking).
sub["comment_text"] = test.comment_text
sub.toxic = sub.toxic.round().astype(int)

In [None]:
sub.head(10)

In [None]:
sub.toxic.value_counts()