In [None]:
import pandas as pd

# Comments to be scored. This frame is left untouched by the per-model sections;
# the blended `score` column is attached and written out in the final cells.
submission_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# DistilBERT

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import DistilBertModel, DistilBertForMaskedLM, get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.utils.rnn as rnn

from sklearn.model_selection import train_test_split

from tqdm import tqdm

from collections import namedtuple
from copy import deepcopy
import random

import re
from bs4 import BeautifulSoup

%matplotlib inline


# Tokenizer is read from the same local directory as the DistilBERT weights loaded further down.
tokenizer = DistilBertTokenizer.from_pretrained('../input/jigsaw-bert-lm')


class CommentDataset(data.Dataset):
    """Map-style dataset returning ``(text, target)`` pairs by position.

    ``X`` and ``y`` must both support positional ``.iloc`` indexing
    (e.g. pandas Series); the dataset length is ``len(X)``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Positional lookup, so the Series index labels do not matter.
        return self.X.iloc[idx], self.y.iloc[idx]
    
    
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Lightweight return type of the regression model: (loss or None, raw predictions).
RegressionOutput = namedtuple("RegressionOutput", ['loss', 'logits'])

class DistilBertGruRegression(nn.Module):
    """Encoder -> bidirectional GRU -> max-pool over time -> linear head with ReLU.

    Args:
        base: encoder module. It is called as ``base(input_ids, attention_mask)``
            and the first element of its return value is used as a
            ``(batch, seq_len, 768)`` tensor of token embeddings.
        pad_length: minimum length the GRU output is re-padded to before the
            max-pool. The default of 120 reproduces the previously hard-coded
            value; inputs longer than this are padded to their own length
            instead of raising.
    """

    def __init__(self, base, pad_length=120):
        super().__init__()
        self.distilbert = base
        self.gru = nn.GRU(768, 50, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=0.2)
        self.classifier = nn.Linear(100, 1)
        # Plain attribute (not a parameter/buffer), so saved state_dicts still load.
        self.pad_length = pad_length

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Score a batch.

        Args:
            input_ids: token ids, passed straight to the encoder.
            attention_mask: ``(batch, seq_len)`` mask; its row sums are used as the
                true sequence lengths, so every row needs at least one non-zero entry.
            labels: optional targets; when given, an MSE loss is computed.

        Returns:
            RegressionOutput(loss, logits) where ``logits`` is ``(batch, 1)`` and
            non-negative (ReLU), and ``loss`` is ``None`` without labels.
        """
        context_embs = self.distilbert(input_ids, attention_mask)[0]  # {bs, seq_len, 768}
        seq_len = context_embs.size(1)

        # Pack so the GRU skips padded positions; lengths must live on the CPU.
        lengths = attention_mask.sum(dim=1).cpu()
        packed = rnn.pack_padded_sequence(context_embs, lengths,
                                          batch_first=True, enforce_sorted=False)

        gru_output, _ = self.gru(packed)  # packed {bs, <=seq_len, 50 * 2}
        # Re-pad with zeros. Never request less than the actual sequence length,
        # otherwise pad_packed_sequence raises for inputs longer than pad_length.
        gru_output = rnn.pad_packed_sequence(
            gru_output, batch_first=True,
            total_length=max(self.pad_length, seq_len))[0]

        # Max over time; zero-padded positions take part in the max.
        pooled_output = gru_output.max(dim=1)[0]  # {bs, 50 * 2}
        pooled_output = self.dropout(pooled_output)
        output = torch.relu(self.classifier(pooled_output))

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(output.squeeze(), labels.squeeze())

        return RegressionOutput(loss, output)
    
    
# Encoder backbone; the fine-tuned regression weights are loaded on top of it below.
base_model = DistilBertModel.from_pretrained("../input/jigsaw-bert-lm")


reg_model = DistilBertGruRegression(base_model)
# map_location="cpu" makes the load independent of the device the checkpoint was
# saved from, so the CPU fallback of `device` keeps working; the model is moved
# to `device` right before inference.
reg_model.load_state_dict(
    torch.load("../input/jigsaw-bert-lm/finetuned-model-e1-l0.405.pt", map_location="cpu")
)


# Working copy of the comments for this model's own text cleaning and scoring.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


def text_cleaning(text):
    '''
    Normalise a raw comment into plain lowercase text.

    Steps, applied in this order:
    1. Drop URLs (http://..., https://..., www....)
    2. Strip HTML markup, keeping only the text content
    3. Drop characters in the listed emoji / pictograph ranges
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces, trim both ends and lowercase

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    url_re = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_re.sub(r'', text)

    # Parse as HTML and keep the text nodes only.
    plain = BeautifulSoup(without_urls, 'lxml').get_text()

    emoji_re = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)
    without_emoji = emoji_re.sub(r'', plain)

    letters_digits = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', letters_digits)
    return single_spaced.strip().lower()


# Register `progress_apply` on pandas objects so the cleaning pass shows a progress bar.
tqdm.pandas()
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)


sub_dataset = CommentDataset(df_sub.text, df_sub.comment_id)  # comment_id will not be used, just dummy arg

# shuffle=False keeps the predictions in the same row order as df_sub.
sub_loader = data.DataLoader(sub_dataset, shuffle=False, batch_size=32, num_workers=2)


# Raw model outputs, one per comment, appended batch by batch.
scores = []

reg_model.to(device)
reg_model.eval()
with torch.no_grad():
    with tqdm(total=len(sub_loader)) as t:
        for X, _ in sub_loader:
            # Every batch is padded/truncated to exactly 120 tokens.
            inputs = tokenizer.batch_encode_plus(X, return_tensors='pt',
                                       padding='max_length', max_length=120,
                                       truncation=True).to(device)

            # Take the first (index 0) output column as a flat Python list.
            pred = reg_model(**inputs).logits[:, 0].cpu().detach().tolist()
            scores.extend(pred)
            
            t.update()
            

# Drop the references to the model objects now that inference is finished.
del reg_model
del tokenizer
del base_model

            
df_sub['score_distilbert'] = scores
# Scaled by the maximum only (no minimum shift in this cell).
# NOTE(review): `df_sub` is re-read in the Ridge section, so this column does not
# survive; the later post-processing starts from the `scores` list instead.
df_sub['score_distilbert'] /= df_sub['score_distilbert'].max()


"DONE."

# Ridge Ensemble

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors, FastText

import re 
from scipy import sparse
import time
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns=100

from sklearn.linear_model import Ridge

import pickle as pkl


def text_cleaning(text):
    '''
    Normalise a raw comment into plain text. Letter case is preserved.

    Steps, applied in this order:
    1. Drop URLs (http://..., https://..., www....)
    2. Strip HTML markup, keeping only the text content
    3. Drop characters in the listed emoji / pictograph ranges
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces and trim both ends

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    url_re = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_re.sub(r'', text)

    # Parse as HTML and keep the text nodes only.
    plain = BeautifulSoup(without_urls, 'lxml').get_text()

    emoji_re = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)
    without_emoji = emoji_re.sub(r'', plain)

    letters_digits = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', letters_digits)
    return single_spaced.strip()


# Pre-fitted vectorizer (used via `vec.transform` and `vec.vocabulary_` below).
# NOTE: unpickling executes code stored in the file; only load trusted artefacts.
with open('../input/jigsaw-preprocessed-tc/tfidf-vectorizer.pkl', mode='rb') as f:
    vec = pkl.load(f)
    
    
# FastText model; its `.wv` lookup supplies the word vectors averaged in `vectorizer`.
fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')


def splitter(text):
    """Split `text` on single spaces.

    Consecutive spaces produce empty-string tokens, and an empty input
    yields ``['']``.
    """
    return text.split(' ')

def vectorizer(text):
    """Build the feature row for one cleaned comment.

    Concatenates the dense `vec.transform` output for `text` with the mean of
    the FastText vectors of its space-separated tokens.

    Returns a float16 array of shape (1, n_vec_features + n_embedding_dims).
    """
    words = splitter(text)

    sparse_part = vec.transform([text]).toarray()
    dense_part = np.mean(fmodel.wv[words], axis = 0).reshape(1, -1)

    # float16 halves the memory of the stacked feature matrix built by the caller.
    return np.concatenate([sparse_part, dense_part], axis = -1).astype(np.float16)


# Four pickled models are loaded; only l_model, d_model and s_model are used for
# the predictions in this cell.
# NOTE(review): `model` (m_05.pkl) is loaded but not part of the average below —
# confirm this is intentional.
# NOTE: unpickling executes code stored in the file; only load trusted artefacts.
with open('../input/ridge-models/m_05.pkl', 'rb') as f:
    model = pkl.load(f)

with open('../input/ridge-models/m_1.pkl', 'rb') as f:
    l_model = pkl.load(f)

with open('../input/ridge-models/m_15.pkl', 'rb') as f:
    d_model = pkl.load(f)

with open('../input/ridge-models/m_2.pkl', 'rb') as f:
    s_model = pkl.load(f)

    
# Fresh copy of the comments; this rebinds `df_sub` from the DistilBERT section.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


tqdm.pandas()
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)


# One (1, EMB_DIM) float16 row per comment.
X_sub_temp = []
for text in tqdm(df_sub.text):
    X_sub_temp.append(vectorizer(text))

    
# Vectorizer vocabulary size plus 256 (the width reserved for the FastText part).
EMB_DIM = len(vec.vocabulary_) + 256
EMB_DIM


# Stack the rows into a 2-D matrix, then convert to CSR for prediction.
X_sub_temp = np.array(X_sub_temp).reshape(-1, EMB_DIM)
X_test = sparse.csr_matrix(X_sub_temp)

# Release the dense copy once the sparse matrix exists.
del X_sub_temp


ptl = l_model.predict(X_test)
ptd = d_model.predict(X_test)
pts = s_model.predict(X_test)


# Mean of the three predictions, shifted to a zero minimum and scaled to a maximum of 1.
df_sub['score'] = (ptl + ptd + pts) / 3
df_sub['score'] -= df_sub.score.min()
df_sub['score'] /= df_sub.score.max()


"DONE."

# Active Learning

In [None]:
import numpy as np
import pandas as pd
import pickle

import fasttext
import lightgbm as lgb
from tqdm.auto import tqdm
from tqdm import tqdm
from bs4 import BeautifulSoup
import re

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
import keras.preprocessing
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.models import Model
import keras.layers
from sklearn.model_selection import train_test_split
import skopt


def convert_to_tensor(text,tokenizer):
    """Turn texts into a fixed-width int32 tensor of token ids.

    text - iterable of strings handed to `tokenizer.texts_to_sequences`.
    tokenizer - object providing `texts_to_sequences`.

    Each sequence is cut or zero-padded at the end to exactly 63 entries.
    Returns the result of `tf.convert_to_tensor` on the padded array.
    """
    sequences = tokenizer.texts_to_sequences(text)
    padded = keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=63,
        dtype='int32',
        padding='post',
        truncating='post',
        value=0,
    )
    return tf.convert_to_tensor(padded)


# Fitted text tokenizer for the Keras model; this rebinds the name `tokenizer`.
# NOTE: unpickling executes code stored in the file; only load trusted artefacts.
with open('../input/bogdan-dataset/tokenizer_bogdan.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Rebinds `model`, which previously held one of the pickled Ridge models.
model = keras.models.load_model('../input/bogdan-dataset/keras_bogdan.h5')
# Weights multiplied with the model's outputs before they are summed into one
# score per comment; presumably one weight per output column — TODO confirm.
coefs= [3.606628582431543,
 1.3735886700947442,
 3.482668891973004,
 2.7860855581794235,
 2.686352276545167,
 1.009564541977703]

def text_cleaning(text):
    '''
    Normalise a raw comment into lowercase, lemmatized text without stop words.

    Steps, applied in this order:
    1. Drop URLs (http://..., https://..., www....)
    2. Strip HTML markup, keeping only the text content
    3. Drop characters in the listed emoji / pictograph ranges
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces, trim both ends and lowercase
    6. Lemmatize each space-separated word with the module-level `lemmatizer`
    7. Drop words contained in the module-level `stop` collection

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    url_re = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_re.sub(r'', text)

    # Parse as HTML and keep the text nodes only.
    plain = BeautifulSoup(without_urls, 'lxml').get_text()

    emoji_re = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)
    without_emoji = emoji_re.sub(r'', plain)

    letters_digits = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', letters_digits)
    normalised = single_spaced.strip().lower()

    # Lemmatize word by word, then re-join before the stop-word pass.
    lemmas = [lemmatizer.lemmatize(word) for word in normalised.split(' ')]
    lemmatized = ' '.join(lemmas)

    kept = [word for word in lemmatized.split(' ') if word not in stop]
    return ' '.join(kept)

lemmatizer = WordNetLemmatizer()
# Stored as a set: text_cleaning tests every word with `word not in stop`, which
# is a constant-time lookup on a set instead of a linear scan over a list.
stop = set(stopwords.words('english'))

tqdm.pandas()

TEST_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
df_test = pd.read_csv(TEST_DATA_PATH)
df_test['clean_text'] = df_test['text'].progress_apply(text_cleaning)
test_data = convert_to_tensor(df_test['clean_text'],tokenizer)
# Single forward pass over all comments; squeeze drops size-1 dimensions.
test_pred = tf.squeeze(model(test_data)).numpy()
# Weighted sum of the model's output columns, then shift to a zero minimum and
# scale to a maximum of 1.
df_test['score'] = (test_pred * coefs).sum(axis=1)
df_test['score']-=df_test['score'].min()
df_test['score']/=df_test['score'].max()


"DONE."

# LightGBM

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import string

from scipy import sparse
from bs4 import BeautifulSoup
import re

from gensim.models import FastText
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import pickle
from tqdm import tqdm

import lightgbm as lgbm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import utils
from sklearn.preprocessing import MinMaxScaler


# Comments to score for the LightGBM model; `text` is used as read (no cleaning is applied in this section).
test_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

def splitter(text):
    """Split `text` on single spaces; repeated spaces give empty-string tokens."""
    return text.split(" ")


def get_fasttext_embeddings(text):
    """Return the mean FastText vector of the space-separated tokens of `text`.

    The vectors come from the module-level `fmodel.wv`; the result is reshaped
    to a single row, i.e. shape (1, embedding_dim).
    """
    word_vectors = fmodel.wv[splitter(text)]
    mean_vector = np.mean(word_vectors, axis=0)
    return mean_vector.reshape(1, -1)


def send_to_words(sentences):
    """Lazily yield the `splitter` token list of each sentence, in order."""
    yield from map(splitter, sentences)
        
        
def get_doc2vec_embeddings(model, corpus_size, vectors_size, words_list):
    """Infer one document vector per token list.

    model - object exposing `infer_vector(words)`.
    corpus_size - number of documents to embed (rows of the result).
    vectors_size - width of each inferred vector (columns of the result).
    words_list - indexable collection of token lists; entries 0..corpus_size-1 are read.

    Returns a (corpus_size, vectors_size) float array.
    """
    embeddings = np.zeros((corpus_size, vectors_size))
    for row in tqdm(range(corpus_size)):
        embeddings[row] = model.infer_vector(words_list[row])
    return embeddings


# Pre-fitted text vectorizer and SVD reducer. The files are opened with `with`
# so the handles are closed right after loading.
# NOTE: unpickling executes code stored in the file; only load trusted artefacts.
with open("../input/lgbm-tuned/vec.pickle", "rb") as f:
    vec = pickle.load(f)
with open("../input/lgbm-tuned/svd.pickle", "rb") as f:
    svd = pickle.load(f)

lgbm_model = lgbm.Booster(model_file="../input/lgbm-tuned/lgbm_tuned_v18.txt")

# Mean FastText vector per comment, stacked into an (n_comments, 256) matrix.
fmodel = FastText.load("../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin")
fasttext_embeddings_test = [get_fasttext_embeddings(text) for text in tqdm(test_df.text)]
fasttext_embeddings_test = np.array(fasttext_embeddings_test).reshape(-1, 256)

# Doc2Vec vector (width 200) inferred for every comment's token list.
doc2vec_model = Doc2Vec.load("../input/lgbm-tuned/doc2vecmodel.mod")
corpus_test = test_df.text.values.tolist()
text_words_test = list(send_to_words(corpus_test))
doc2vec_embeddings_test = get_doc2vec_embeddings(doc2vec_model, len(text_words_test), 200, text_words_test)


# Vectorizer output reduced by the SVD, then joined column-wise with both embeddings.
X_sub = vec.transform(test_df["text"])
X_sub = svd.transform(X_sub)

X_sub = np.hstack((X_sub, fasttext_embeddings_test, doc2vec_embeddings_test))

# Raw predictions; shifting and scaling happen in the post-processing cell.
y_test_preds = lgbm_model.predict(X_sub)


"DONE."

In [None]:
# LightGBM post-processing: shift to a zero minimum, remap the tail above the
# cap to `cap + 1% of the value`, then scale so the maximum is 1.
lgbm_cap = 4.5
pred_lightgbm = pd.Series(y_test_preds)
pred_lightgbm -= pred_lightgbm.min()
lgbm_tail = pred_lightgbm > lgbm_cap
pred_lightgbm[lgbm_tail] = lgbm_cap + 0.01 * pred_lightgbm[lgbm_tail]
pred_lightgbm /= pred_lightgbm.max()
pred_lightgbm.describe()

In [None]:
# Ridge post-processing on the summed predictions: shift to a zero minimum,
# remap the tail above the cap to `cap + 1% of the value`, then scale to max 1.
ridge_cap = 10.9
pred_ridge_ensemble = pd.Series(ptl + ptd + pts)
pred_ridge_ensemble -= pred_ridge_ensemble.min()
ridge_tail = pred_ridge_ensemble > ridge_cap
pred_ridge_ensemble[ridge_tail] = ridge_cap + 0.01 * pred_ridge_ensemble[ridge_tail]
pred_ridge_ensemble /= pred_ridge_ensemble.max()
pred_ridge_ensemble.describe()

In [None]:
# Active-learning post-processing on the weighted output sum: shift to a zero
# minimum, remap the tail above the cap to `cap + 1% of the value`, scale to max 1.
active_cap = 10.2
pred_active_learning = pd.Series((test_pred * coefs).sum(axis=1))
pred_active_learning -= pred_active_learning.min()
active_tail = pred_active_learning > active_cap
pred_active_learning[active_tail] = active_cap + 0.01 * pred_active_learning[active_tail]
pred_active_learning /= pred_active_learning.max()
pred_active_learning.describe()

In [None]:
# DistilBERT post-processing on the raw `scores` list: shift to a zero minimum,
# remap the tail above the cap to `cap + 1% of the value`, then scale to max 1.
distilbert_cap = 3
pred_distilbert = pd.Series(scores)
pred_distilbert -= pred_distilbert.min()
distilbert_tail = pred_distilbert > distilbert_cap
pred_distilbert[distilbert_tail] = distilbert_cap + 0.01 * pred_distilbert[distilbert_tail]
pred_distilbert /= pred_distilbert.max()
pred_distilbert.describe()

# Aggregate scores and submit

In [None]:
# Weighted blend of the four rescaled predictions (the weights add up to 1.0).
# The terms are added in the order DistilBERT, Ridge, active learning, LightGBM.
weighted_distilbert = pred_distilbert * 0.19
weighted_ridge = pred_ridge_ensemble * 0.35
weighted_active = pred_active_learning * 0.19
weighted_lgbm = pred_lightgbm * 0.27
submission_df['score'] = weighted_distilbert + weighted_ridge + weighted_active + weighted_lgbm

In [None]:
# Summary statistics of the blended score, as a quick check before writing the file.
submission_df[['score']].describe()

In [None]:
# Write only the id and blended score columns, without the DataFrame index.
submission_df[['comment_id', 'score']].to_csv("submission.csv", index=False)