# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import re
from bs4 import BeautifulSoup
import os
import random
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.kernel_ridge import KernelRidge

import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, entry) for entry in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Comments to be scored for the submission; loaded into data_test, whose
# 'text' and 'comment_id' columns are used below.
TEST_DATA_PATH = '../input/jigsaw-toxic-severity-rating/comments_to_score.csv'
# Comment pairs ('less_toxic' / 'more_toxic' columns) used to check how often
# a model ranks the pair in the expected order.
VALID_DATA_PATH = '../input/jigsaw-toxic-severity-rating/validation_data.csv'
# Labelled comments ('comment_text' plus the 'toxic'..'identity_hate' label
# columns) that the models are trained on.
TRAIN_DATA_PATH = '../input/jigsaw-toxic-comment-classification-challenge/train.csv'

In [None]:
# Seed handed to set_seed() for reproducibility.
SEED = 10
# Passed as max_features to every TfidfVectorizer built in this notebook.
MAX_FEATURES = 10_000

In [None]:
def set_seed(seed=42):
    """Seed the random number sources used for reproducibility.

    Seeds NumPy's global generator and the standard-library ``random``
    module, then stores the seed as text in the ``PYTHONHASHSEED``
    environment variable.

    NOTE: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    environment assignment is presumably aimed at processes started later.
    :param seed: Random seed
    :return: None
    """
    # NumPy first, then the stdlib generator, both with the same seed.
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)
    seed_text = str(seed)
    os.environ['PYTHONHASHSEED'] = seed_text


def set_display():
    """Configure how charts and pd.DataFrames are displayed.

    Applies the 'fivethirtyeight' plot style with a 12x8 figure and font
    size 14, removes the column/row display limits for DataFrames and
    formats floats with four decimal places.
    """
    # Charts: apply the theme first, then override figure size and font size.
    plt.style.use('fivethirtyeight')
    plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 14})
    # DataFrames: None lifts the limit on displayed columns and rows.
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)
    pd.set_option('display.float_format', '{:.4f}'.format)
    
    
# The patterns below are compiled once at definition time: text_cleaning is
# applied row by row to whole DataFrame columns, so building them inside the
# function would repeat the same work on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')  # website links
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")  # special characters
_MULTI_SPACE_PATTERN = re.compile(' +')  # extra spaces


def text_cleaning(text: str) -> str:
    """Function cleans text removing special characters,
    extra spaces, embedded URL links, HTML tags and emojis.
    Code source: https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-infer

    Steps, in order:
    1. delete ``http(s)://...`` and ``www....`` links;
    2. parse the remainder with BeautifulSoup ('lxml') and keep only its text;
    3. delete characters matched by the emoji ranges;
    4. replace every character that is not an ASCII letter or a digit
       with a space;
    5. collapse runs of spaces and strip both ends.
    :param text: Original text
    :return: Preprocessed text
    """
    # Links go first, before the markup is parsed away.
    text = _URL_PATTERN.sub(r'', text)

    text = BeautifulSoup(text, 'lxml').get_text()  # HTML tags

    # Emoji are deleted rather than replaced by a space, so words that were
    # separated only by an emoji end up joined. The last range is very wide
    # and also covers many non-emoji characters.
    text = _EMOJI_PATTERN.sub(r'', text)

    text = _NON_ALNUM_PATTERN.sub(" ", text)
    text = _MULTI_SPACE_PATTERN.sub(' ', text)

    # spaces at the beginning and at the end of string
    return text.strip()

In [None]:
# Seed the random generators and apply the chart / DataFrame display settings
# before any data is loaded or plotted.
set_seed(SEED)
set_display()

In [None]:
# Load the labelled comments and run every comment through text_cleaning.
data_train = pd.read_csv(TRAIN_DATA_PATH)
data_train['comment_text'] = data_train['comment_text'].map(text_cleaning)
data_train.head()

In [None]:
# Column-wise totals of the label columns, shown as one bar per category.
label_columns = data_train.loc[:, 'toxic':'identity_hate']
categories = label_columns.sum()
plt.bar(categories.index, categories.values)
plt.title('Category Frequency')
plt.show()

In [None]:
# Row-wise sum of the label columns, then how often each sum occurs.
label_sum_per_comment = data_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)
scores = label_sum_per_comment.value_counts()
plt.title('Scores Distribution: Simple Sum')
plt.bar(scores.index, scores.values)
plt.show()

In [None]:
# Multiplication factors for categories.
cat_mtpl = {'toxic': 1, 'severe_toxic': 1.75, 'obscene': 0.95,
            'threat': 2, 'insult': 1.6, 'identity_hate': 1.95}

# Toxicity score = weighted sum of the label columns.
# The factors are applied to a temporary frame instead of being written back
# into data_train: overwriting the label columns in place would compound the
# factors (and inflate the score) every time this cell is re-run.
weighted_labels = data_train[list(cat_mtpl)].mul(pd.Series(cat_mtpl), axis='columns')
data_train['score'] = weighted_labels.sum(axis=1)

In [None]:
# Histogram of the weighted scores.
adjusted_scores = data_train['score']
plt.title('Scores Distribution: Adjusted Sum')
plt.hist(adjusted_scores)
plt.show()

In [None]:
# Keep every sample with a non-zero score, but only the first
# (toxic count // 5) zero-score samples in row order.
is_neutral = data_train['score'] == 0
n_samples_normal = int(is_neutral.sum())
n_samples_toxic = len(data_train) - n_samples_normal

# Index labels of the zero-score rows beyond the ones that are kept.
idx_to_drop = data_train.index[is_neutral.to_numpy()][n_samples_toxic//5:]
data_train = data_train.drop(index=idx_to_drop)

print(f'Reduced number of neutral text samples from {n_samples_normal} to {n_samples_toxic//5}.')
print(f'Total number of training samples: {len(data_train)}')

In [None]:
# Mean and standard deviation of the score column of the reduced training set.
print(f'Mean toxicity score: {data_train["score"].mean()}\n'
      f'Standard deviation: {data_train["score"].std()}')

# Model 

In [None]:
# Candidate models: each pipeline gets its own TF-IDF vectorizer, configured
# identically, followed by a regressor.
tfidf_settings = dict(decode_error='ignore', stop_words='english', max_features=MAX_FEATURES)

kridge = make_pipeline(TfidfVectorizer(**tfidf_settings), KernelRidge())

randforest = make_pipeline(TfidfVectorizer(**tfidf_settings), RandomForestRegressor(n_jobs=-1))

In [None]:
# (name, pipeline) pairs; the name is used in column names and in the
# file name of the dumped model.
models = [('KernelRidge', kridge), ('RandomForest', randforest)]

In [None]:
# Validation data: pairs of texts in the 'less_toxic' / 'more_toxic' columns.
data_valid = pd.read_csv(VALID_DATA_PATH)

# Both text columns get the same cleaning as the training comments.
for pair_column in ('less_toxic', 'more_toxic'):
    data_valid[pair_column] = data_valid[pair_column].apply(text_cleaning)

data_valid.head()

In [None]:
# Fit every candidate on the training comments, score both sides of each
# validation pair, report the share of correctly ordered pairs and save
# the fitted model.
for name, model in models:
    print('-' * 50)
    model.fit(data_train['comment_text'], data_train['score'])
    print(f'{name} model completed training.')

    # Predicted toxicity for each side of the pair.
    less_column = f'less_toxic_score_{name}'
    more_column = f'more_toxic_score_{name}'
    data_valid[less_column] = model.predict(data_valid['less_toxic'])
    data_valid[more_column] = model.predict(data_valid['more_toxic'])
    print(f'{name} model completed prediction.')

    # A pair counts as correct only when the 'more_toxic' text scores
    # strictly higher than the 'less_toxic' one.
    result_column = f'result_{name}'
    data_valid[result_column] = data_valid[more_column] > data_valid[less_column]

    # Ratio of correctly scored text pairs.
    n_correct = data_valid[result_column].sum()
    print('Correct predictions:', n_correct / len(data_valid))

    joblib.dump(model, f'{name}.joblib')

In [None]:
# Average the per-model scores for each side of the pair and check how often
# the averaged scores order the pair correctly.
less_score_columns = ['less_toxic_score_KernelRidge', 'less_toxic_score_RandomForest']
more_score_columns = ['more_toxic_score_KernelRidge', 'more_toxic_score_RandomForest']

data_valid['less_toxic_score'] = data_valid[less_score_columns].mean(axis=1)
data_valid['more_toxic_score'] = data_valid[more_score_columns].mean(axis=1)

data_valid['result'] = data_valid['more_toxic_score'] > data_valid['less_toxic_score']
n_correct_averaged = data_valid['result'].sum()
print('Correct averaged predictions:', n_correct_averaged / len(data_valid))

In [None]:
# Comments to score for the submission, cleaned like the training texts.
data_test = pd.read_csv(TEST_DATA_PATH)
data_test['text'] = data_test['text'].map(text_cleaning)
data_test.head()

# Predictions

In [None]:
# Score the test comments with every fitted model, one column per model.
for name, model in models:
    score_column = f'score_{name}'
    data_test[score_column] = model.predict(data_test['text'])
    print(f'{name} model completed prediction.')

In [None]:
# Final score: row-wise mean of the two per-model score columns.
model_score_columns = ['score_KernelRidge', 'score_RandomForest']
data_test['score'] = data_test[model_score_columns].mean(axis=1)

In [None]:
# Write the comment ids with their averaged scores, then preview the rows.
submission = data_test[['comment_id', 'score']]
submission.to_csv('submission.csv', index=False)
submission.head()