In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from datasets import Dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)
import torch
import torch.nn as nn

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# --- Cross-validation settings -------------------------------------------
N_SPLITS = 5                  # number of KFold splits
EARLY_STOPPING_ROUNDS = 200   # NOTE(review): not referenced in the visible cells
SEED = 42                     # random_state for KFold and undersampling

# --- Kaggle input locations ----------------------------------------------
CLASSIFY_DATA = "/kaggle/input/comment-classify/train.csv"
RUDDIT_DATA = "/kaggle/input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv"
TEST_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
RANK_MODEL_PATH = '/kaggle/input/classic-toxic-ensemble/save_small_model'

# --- Ridge keyword arguments (unpacked as Ridge(**params)) ---------------
params = dict(
    max_iter=1096,
    tol=0.013287415929166577,
    alpha=0.4270990730406918,
    solver='auto',
)

# Comments to be scored; read once and reused by the later cells.
df_test = pd.read_csv(TEST_DATA_PATH)

In [None]:
def dummy_fun(doc):
    """Return ``doc`` exactly as received.

    Serves as a pass-through hook for the TF-IDF vectorizer's ``tokenizer``
    and ``preprocessor`` arguments, so documents reach it untouched.
    """
    return doc

# Documents handed to this vectorizer are already token sequences (the later
# cells pass tokenizer `input_ids`), so both text hooks are the pass-through
# `dummy_fun` and no regex token pattern is used.
_passthrough_settings = {
    'analyzer': 'word',
    'tokenizer': dummy_fun,
    'preprocessor': dummy_fun,
    'token_pattern': None,
}
vectorizer = TfidfVectorizer(**_passthrough_settings)


def cross_validate_sub(
    model,
    train_data,
    test_data
):
    """Fit ``model`` with K-fold cross-validation and predict the test set per fold.

    For every fold the module-level ``vectorizer`` is re-fitted on the
    training part only, ``model`` is re-fitted on the resulting features,
    and predictions are produced for the held-out part and for ``test_data``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train_data
        DataFrame with an ``'X'`` column (documents accepted by ``vectorizer``)
        and a ``'Y'`` column (regression targets).
    test_data
        Sized collection of documents accepted by ``vectorizer.transform``.

    Returns
    -------
    train_oof
        1-D array of length ``len(train_data)`` holding out-of-fold predictions.
    predictions_test
        Array of shape ``(len(test_data), N_SPLITS)``; column ``k`` holds the
        test predictions of the model fitted on fold ``k``.
    model
        The same estimator object, left in the state of the last fold's fit.
    """
    train_oof = np.zeros(len(train_data))
    predictions_test = np.zeros((len(test_data), N_SPLITS))

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

    # Wrap the split generator (not `enumerate`) and pass `total` so the
    # progress bar knows how many folds to expect.
    folds = tqdm(kf.split(train_data), total=N_SPLITS)
    for fold, (train_idx, valid_idx) in enumerate(folds):
        train_fold, valid_fold = train_data.iloc[train_idx], train_data.iloc[valid_idx]

        y_train = train_fold['Y']
        y_valid = valid_fold['Y']

        X_train = train_fold['X']
        X_valid = valid_fold['X']

        # Vocabulary and IDF weights come from the training part only, so the
        # validation and test documents never influence the features.
        X_train_transformed = vectorizer.fit_transform(X_train)

        model.fit(
            X_train_transformed,
            y_train
        )

        X_valid_transformed = vectorizer.transform(X_valid)

        temp_oof = model.predict(X_valid_transformed)
        train_oof[valid_idx] = temp_oof

        # The message promises RMSE, so take the square root of the MSE.
        fold_rmse = np.sqrt(mean_squared_error(y_valid, temp_oof))
        print(f'Fold {fold} RMSE: ', fold_rmse)

        # Must stay inside the loop: the vectorizer was just re-fitted.
        test_data_transform = vectorizer.transform(test_data)
        predictions_test[:, fold] = model.predict(test_data_transform)

    # This is a regression error, not an AUC; report it under the right name.
    oof_rmse = np.sqrt(mean_squared_error(train_data['Y'], train_oof))
    print('OOF RMSE: ', oof_rmse)

    return train_oof, predictions_test, model


In [None]:
def get_ruddit_data():
    """Load the Ruddit comments and train a WordPiece tokenizer on them.

    Reads ``RUDDIT_DATA``, min-max scales the offensiveness score using the
    minimum and maximum of all rows as read, then removes ``'[deleted]'``
    comments, duplicate rows and rows with missing values.

    Returns
    -------
    comments : pandas.Series
        Comment texts (column ``'text'``).
    labels : pandas.Series
        Scaled scores (column ``'y'``).
    tokenizer : PreTrainedTokenizerFast
        Wrapper around the WordPiece tokenizer trained on the kept comments.
    """
    ruddit = (
        pd.read_csv(RUDDIT_DATA)[['txt', 'offensiveness_score']]
        .rename(columns={'txt': 'text', 'offensiveness_score': 'y'})
    )

    # Scaling happens before any row is dropped, so the bounds come from the
    # full file.
    y_min = ruddit['y'].min()
    y_max = ruddit['y'].max()
    ruddit['y'] = (ruddit['y'] - y_min) / (y_max - y_min)

    ruddit = (
        ruddit[ruddit['text'] != '[deleted]']
        .drop_duplicates()
        .dropna()
    )

    # BERT-style WordPiece tokenizer: lower-casing normalizer + BERT pre-tokenizer.
    wordpiece = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    wordpiece.normalizer = normalizers.BertNormalizer(lowercase=True)
    wordpiece.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    trainer = trainers.WordPieceTrainer(
        vocab_size=25000,
        special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
    )

    corpus = Dataset.from_pandas(ruddit[['text']])

    # Feed the trainer batches of up to 1000 comments at a time.
    batches = (
        corpus[start : start + 1000]["text"]
        for start in range(0, len(corpus), 1000)
    )
    wordpiece.train_from_iterator(batches, trainer=trainer)

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=wordpiece,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    return ruddit['text'], ruddit['y'], tokenizer



In [None]:
def get_data_classify():
    """Load the toxic-comment classification data and train a WordPiece tokenizer.

    Each binary label column is multiplied by its weight in ``cat_mtpl`` and
    the row-wise mean of the weighted labels becomes the regression target
    ``y``. Rows with ``y == 0`` are undersampled to (at most) the number of
    rows with ``y > 0``.

    Returns
    -------
    comments : pandas.Series
        Comment texts (column ``'comment_text'``) of the balanced frame.
    labels : pandas.Series
        Weighted toxicity scores (column ``'y'``) of the balanced frame.
    tokenizer : PreTrainedTokenizerFast
        Wrapper around the WordPiece tokenizer trained on the balanced comments.

    Raises
    ------
    KeyError
        If any label column named in ``cat_mtpl`` is missing from the CSV.
    """
    df_train = pd.read_csv(CLASSIFY_DATA)
    cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
                'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

    missing = [category for category in cat_mtpl if category not in df_train.columns]
    if missing:
        raise KeyError(f'label columns missing from {CLASSIFY_DATA}: {missing}')

    # Select exactly the weighted label columns (in file order) instead of a
    # 'toxic':'identity_hate' range slice, which depends on CSV column order
    # and would also average any unrelated column lying between the two.
    label_cols = [column for column in df_train.columns if column in cat_mtpl]
    weighted = df_train[label_cols].mul(pd.Series(cat_mtpl), axis='columns')
    df_train['y'] = weighted.mean(axis=1)

    toxic_rows = df_train[df_train['y'] > 0]
    clean_rows = df_train[df_train['y'] == 0]

    # Balance the classes; cap the request so sampling without replacement
    # cannot fail when there are fewer clean rows than toxic ones.
    n_clean = min(len(toxic_rows), len(clean_rows))
    df_y0_undersample = clean_rows.sample(n=n_clean, random_state=SEED)
    df_train_final = pd.concat([toxic_rows, df_y0_undersample])

    # BERT-style WordPiece tokenizer: lower-casing normalizer + BERT pre-tokenizer.
    raw_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    raw_tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
    raw_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

    dataset = Dataset.from_pandas(df_train_final[['comment_text']])

    def get_training_corpus():
        # Yield the comments in batches of up to 1000 texts.
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["comment_text"]

    raw_tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    labels = df_train_final['y']
    comments = df_train_final['comment_text']

    return comments, labels, tokenizer

In [None]:
# Ruddit branch: tokenize the comments, then cross-validate a Ridge regressor
# on TF-IDF features of the token ids and keep its per-fold test predictions.
comments, labels, tokenizer = get_ruddit_data()
tokenized_comments = tokenizer(comments.to_list())['input_ids']

train_data = pd.DataFrame({
    'X': tokenized_comments,
    'Y': labels.tolist(),
})

model = Ridge(**params)

# The test comments are encoded with the tokenizer trained on this dataset.
test_token_ids = tokenizer(df_test['text'].to_list())['input_ids']
_, output_ruddit, _ = cross_validate_sub(model, train_data, test_token_ids)

In [None]:
# Classification-data branch: tokenize the comments, then cross-validate a
# Ridge regressor on TF-IDF features of the token ids and keep its per-fold
# test predictions.
comments, labels, tokenizer = get_data_classify()
tokenized_comments = tokenizer(comments.to_list())['input_ids']

train_data = pd.DataFrame({
    'X': tokenized_comments,
    'Y': labels.tolist(),
})

model = Ridge(**params)

# The test comments are encoded with the tokenizer trained on this dataset.
test_token_ids = tokenizer(df_test['text'].to_list())['input_ids']
_, output_classify, _ = cross_validate_sub(model, train_data, test_token_ids)

In [None]:
# Place the per-fold test predictions of both regressors side by side:
# one row per test comment, one column per (dataset, fold) pair.
scoring_data = np.hstack((np.asarray(output_ruddit), np.asarray(output_classify)))
ranking_input = torch.tensor(scoring_data, dtype=torch.float32)

class BinaryClassification(nn.Module):
    """ReLU followed by a single linear layer producing one value per row.

    Despite its name, ``hidden_layers`` is the number of input features fed
    to the linear layer.
    """

    def __init__(self, hidden_layers):
        super().__init__()
        # Attribute names are part of the state-dict keys; keep them stable.
        self.layer_1 = nn.Linear(hidden_layers, 1)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Apply ReLU to ``inputs`` and project the result to a single output."""
        return self.layer_1(self.relu(inputs))

# Size the scorer from the stacked predictions instead of a literal 10
# (two datasets x N_SPLITS fold columns); the checkpoint must match this width.
model = BinaryClassification(ranking_input.shape[1])

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# source. map_location='cpu' lets a checkpoint saved on a GPU load in a
# CPU-only session.
state_dict = torch.load(RANK_MODEL_PATH, map_location='cpu')
model.load_state_dict(state_dict)
model.eval()

# `ranking_input` is already a float tensor, so it is passed as-is (re-wrapping
# it in torch.tensor() copies it and emits a UserWarning). Gradients are not
# needed for scoring.
with torch.no_grad():
    final_output = model(ranking_input).view(-1).tolist()

In [None]:
# Pair every test comment id with its final score and write the submission file.
submission = pd.DataFrame(
    {
        'comment_id': df_test['comment_id'],
        'score': final_output,
    }
)

submission.to_csv('submission.csv', index=False)