# Begin

In [None]:
import numpy as np
import pandas as pd
import re

all_lines = []

VERSION = 0


def next_version(lines):
    """Return the version number for this run.

    Parameters
    ----------
    lines : list of str
        Lines read from the previous run's ``version.info`` (may be empty).

    Returns
    -------
    int
        The first integer found in ``lines`` plus one, or ``0`` when the
        lines are empty or contain no digits at all.
    """
    found = re.search(r'\d+', '\n'.join(lines))
    return int(found.group()) + 1 if found else 0


try:
    with open('../input/jigsaw-toxic-severity-lightgbm-training/version.info', 'r') as f:
        all_lines = f.readlines()
    print("Version file read successfully.")
except (OSError, UnicodeError):
    # Best effort: a missing or unreadable previous version file means we start at 0.
    # Only file-related errors are swallowed; anything else should surface.
    pass

# Compute the version before touching the output file so a parsing problem
# can never leave a truncated version.info behind.
VERSION = next_version(all_lines)

print("Updating version file...")
with open('version.info', 'w') as f:
    f.write(f"Version: {VERSION}")
print("current version:", VERSION)

We will be using multiple datasets from previous Jigsaw competitions as complementary training data.

In [None]:
# Load the competition's validation pairs; a later cell turns them into labelled rows.
validation_csv = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
train_1 = pd.read_csv(validation_csv)
train_1

For the main training data, we will simply associate the column `less_toxic` with a score of `0` and the column `more_toxic` with a score of `1`. Some texts are duplicated, so we keep only the unique texts for each level of toxicity.

In [None]:
# Distinct comments from each side of the validation pairs.
less_texts = train_1.less_toxic.unique()
more_texts = train_1.more_toxic.unique()

# Size the label arrays from the text arrays themselves: `unique()` keeps a NaN
# entry while `nunique()` drops it by default, so sizing with `nunique()` would
# produce mismatched column lengths if a text were missing.
train_1_cleaned = pd.concat(
    [pd.DataFrame({"text": less_texts, "score": np.zeros(len(less_texts))}),   # less toxic -> 0
     pd.DataFrame({"text": more_texts, "score": np.ones(len(more_texts))})],   # more toxic -> 1
    axis=0).reset_index(drop=True)
train_1_cleaned

In [None]:
# Load the training split of the unintended-bias competition.
unintended_bias_csv = "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
train_2 = pd.read_csv(unintended_bias_csv)
train_2

For the data from `jigsaw-unintended-bias-in-toxicity-classification`, we will only use the `comment_text` and `target` columns. Targets at or below `0.05` are set to `0`; every other target is rescaled onto a 50% base, i.e. `0.5 + target / 2`.

In [None]:
def rescale_target(target):
    """Map a raw target to 0 when it is at most 0.05, otherwise to 0.5 + target / 2."""
    if target <= 0.05:
        return 0
    return 0.5 + (target / 2)


# Keep only the comment text and its target, then rescale the target.
train_2_cleaned = pd.DataFrame({'text': train_2.comment_text, 'score': train_2.target})
train_2_cleaned['score'] = train_2_cleaned['score'].apply(rescale_target)
train_2_cleaned

In [None]:
# Load the (zipped) training split of the toxic-comment-classification challenge.
toxic_comment_csv = "../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
train_3 = pd.read_csv(toxic_comment_csv)
train_3

For the data from `jigsaw-toxic-comment-classification-challenge`, we will score each comment by the average of its six target columns (`toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, `identity_hate`). As with `train_2`, every average above `0.05` is rescaled onto a 50% base.

In [None]:
# Row-wise average of the six label columns.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
score = np.mean(train_3[label_columns], axis=1)


def shift_mean_label(mean_label):
    """Return 0 for averages of at most 0.05, otherwise 0.5 + average / 2."""
    return 0 if mean_label <= 0.05 else 0.5 + (mean_label / 2)


train_3_cleaned = pd.DataFrame({'text': train_3.comment_text, 'score': score})
train_3_cleaned['score'] = train_3_cleaned['score'].apply(shift_mean_label)
train_3_cleaned

Now, we will merge all train data as one.

In [None]:
# Stack the three cleaned sources on top of each other and renumber the rows from 0.
cleaned_frames = [train_1_cleaned, train_2_cleaned, train_3_cleaned]
train_df = pd.concat(cleaned_frames, axis=0, ignore_index=True)
train_df

Let's check one random comment per score (keeping only the first score for each distinct value after rounding to 2 decimal places), printing the first 100 characters of the `repr` of the corresponding text.

In [None]:
# Rounded scores already shown; the list length doubles as the running counter.
printed = []
for score_value in sorted(train_df.score.unique()):
    rounded = np.round(score_value, 2)
    if rounded not in printed:
        printed.append(rounded)
        # Draw one random comment among those carrying exactly this score.
        candidates = train_df[train_df.score == score_value]['text']
        sample_text = np.random.choice(candidates)
        print(f"{len(printed):<3}: {score_value:.5f}\t{repr(sample_text)[:100]}")

# Data Preprocessing

Now that we have training data, let's focus on preparing it for training. The first step is to clean the input text.

In [None]:
# Fraction of rows whose score is exactly zero.
zero_row_count = int((train_df.score == 0.).sum())
zero_row_count / len(train_df)

Over 70% of the data is non-toxic (`score` is `0.0`), so we need to reduce the number of such rows.

In [None]:
# NOTE: this cell reassigns `train_df`, so running it again shrinks the zero rows again.
is_zero = train_df.score == 0.

# Every row with a non-zero score is kept as is.
train_df_sel = train_df[~is_zero]

# Of the zero-score rows, only the first quarter (in current row order) is kept.
train_df_zer = train_df[is_zero].reset_index(drop=True)
quarter = train_df_zer.shape[0] // 4
train_df_zer = train_df_zer.iloc[:quarter]

train_df = pd.concat([train_df_zer, train_df_sel], axis=0, ignore_index=True)
print(int((train_df.score == 0.).sum()) / len(train_df))
train_df

Now, we will vectorize our train data.

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

import re

# Word -> integer id vocabulary; the empty string (used as padding) owns id 0.
W_to_I = {'': 0}

_NON_WORD_OR_UNDERSCORE = re.compile(r'(\W|_)')
_WHITESPACE_RUN = re.compile(r'\s+')
_TRIPLE_OR_MORE = re.compile(r'(.)\1{2,}')


def add_space_before_punc(x):
    """Put a space on both sides of every non-word character and underscore."""
    return _NON_WORD_OR_UNDERSCORE.sub(r' \1 ', x)


def remove_whitespaces(x):
    """Collapse every run of whitespace into one single space."""
    return _WHITESPACE_RUN.sub(' ', x)


def remove_multiples(x):
    """Shorten any character repeated three or more times in a row to two."""
    return _TRIPLE_OR_MORE.sub(r'\1\1', x)

def clean_comment(raw_text):
    """Space out punctuation, shorten long character repeats, then squeeze whitespace."""
    spaced = add_space_before_punc(raw_text)
    shortened = remove_multiples(spaced)
    return remove_whitespaces(shortened)


train_df['clean_text'] = train_df.text.progress_apply(clean_comment)

# Number of whitespace-separated words per cleaned comment.
# Author's note: average len is 44 with min of 1 word and max of 4948.
train_df['len'] = train_df.clean_text.progress_apply(lambda cleaned: len(cleaned.split()))

def convert_to_int(word):
    """Return the integer id of `word` in `W_to_I`.

    A word not seen before is registered with the next free id (the current
    vocabulary size), so calling this function mutates `W_to_I`.
    """
    if word not in W_to_I:
        W_to_I[word] = len(W_to_I)
    return W_to_I[word]

def convert_text_to_arr(text, max_len=60):
    """Encode `text` as an array of word ids.

    The text is split on whitespace, truncated to `max_len` words and, when
    shorter, right-padded with the empty string before each word is mapped
    through `convert_to_int`.
    """
    tokens = text.split()[:max_len]
    # A non-positive repeat count yields an empty list, so no guard is needed.
    tokens.extend([''] * (max_len - len(tokens)))
    return np.array([convert_to_int(token) for token in tokens])

# Encode every cleaned comment, then stack the per-row arrays into one 2-D matrix.
encoded_rows = train_df.clean_text.progress_apply(convert_text_to_arr)
X = np.stack(encoded_rows.tolist(), axis=0)
y = train_df['score']
# Two-class label used only to stratify the train/validation split.
stratify = np.where(y >= 0.5, 'toxic', 'not').tolist()

# Training

In [None]:
from lightgbm import LGBMRegressor as method
from sklearn.model_selection import train_test_split as tts

# A fresh seed is drawn on every run and printed so the run can be traced.
current_seed = np.random.randint(7, 1e6)
print("Using seed:", current_seed)

print("Splitting train and validation set...")
split_options = dict(
    test_size=0.1,
    shuffle=True,
    random_state=current_seed,
    stratify=stratify,
)
train_x, valid_x, train_y, valid_y = tts(X, y, **split_options)
print("Train shapes:", train_x.shape, train_y.shape,
      "\nTest shapes:", valid_x.shape, valid_y.shape)

# `depth` drives the estimator budget here and is read again by the fitting code.
depth = 1000

# Options left disabled by the author: gpu_platform_id=0, gpu_device_id=0,
# subsample_freq=10, subsample=0.75, max_depth=2, num_leaves=5, force_col_wise=True.
model_params = {
    'device': 'cpu',
    'boosting_type': 'goss',
    'objective': 'mse',
    'is_unbalance': True,
    'n_estimators': depth**2,
    # The learning rate shrinks as the notebook version grows.
    'learning_rate': 0.05/(VERSION+1),
    'max_bin': 64,
    'reg_alpha': 1.5,
    'reg_lambda': 5.75,
    'random_state': current_seed,
    'silent': True,
    'n_jobs': 64,
}
model = method(**model_params)
import os

# Booster saved by a previous run of this notebook. On the very first run that
# file does not exist, so fall back to training from scratch instead of failing.
previous_weights = "../input/jigsaw-toxic-severity-lightgbm-training/model_booster_weights.txt"
if not os.path.isfile(previous_weights):
    print("No previous booster found; training from scratch.")
    previous_weights = None

print("Model fitting...")
model.fit(train_x, train_y,
          eval_set=[[train_x, train_y], [valid_x, valid_y]],
          early_stopping_rounds=depth//5,   # stop after this many rounds without improvement
          verbose=depth//50,                # log the evaluation every this many rounds
          init_model=previous_weights,
         )
# Persist the booster so a later run can continue from it.
model.booster_.save_model("model_booster_weights.txt", num_iteration=model.best_iteration_)
print("Training end.")

In [None]:
import matplotlib.pyplot as plt

preds = model.predict(valid_x, num_iteration=model.best_iteration_)

# `valid_y` holds the numeric scores, not the 'toxic'/'not' stratification labels,
# so comparing it with the string 'toxic' was always False and the plot showed
# abs(0 - pred). Use the true scores so the figure matches its title.
abs_error = np.abs(np.asarray(valid_y) - preds)

plt.title("Prediction and True value difference with abs(y - pred):")
plt.ylim([0, 1])
plt.plot(range(len(preds)), abs_error, c='#ff0000', alpha=0.5)
plt.show()

# Submission

In [None]:
from scipy.stats import rankdata

test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


def clean_test_comment(raw_text):
    """Apply the same three cleaning steps that were applied to the training text."""
    return remove_whitespaces(remove_multiples(add_space_before_punc(raw_text)))


test['clean_text'] = test.text.progress_apply(clean_test_comment)

# NOTE: convert_text_to_arr registers words it has not seen before in W_to_I,
# so test-only words receive ids that never occurred during training.
encoded_test_rows = test.clean_text.progress_apply(convert_text_to_arr)
test_X = np.stack(encoded_test_rows.tolist(), axis=0)

preds = model.predict(test_X, num_iteration=model.best_iteration_)
display(preds, preds.min(), preds.max())

# The submitted score is the ordinal rank of each prediction.
ranked_scores = rankdata(preds, method='ordinal')
sub = pd.DataFrame({'comment_id': test.comment_id.values, 'score': ranked_scores})
sub.to_csv('submission.csv', index=False)
sub