In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.linear_model import Ridge

In [None]:
class CFG:
    """Settings used while the RoBERTa-base checkpoints are scored.

    Only ``seed``, ``folds``, ``max_len``, ``valid_bs``, ``model_name`` and
    ``tokenizer`` are read by the code visible in this notebook; the remaining
    attributes are presumably training-time settings carried over — confirm
    before removing them.
    """
    ######################
    # Globals #
    ######################
    EXP_ID = 'XX' # '029'
    seed = 2021 # 71  (passed to set_seed)
    epochs = 3
    folds = [0, 1, 2, 3, 4]  # fold indices used to build the checkpoint paths
    N_FOLDS = 5
    LR = 2e-5
    max_len = 128 # 256  (tokenizer truncation / padding length)
    train_bs = 16 
    valid_bs = 64 # 128  (DataLoader batch size at inference)
    log_interval = 4 # 10
    model_name = '../input/roberta-base'  # local directory of the pretrained backbone
    # NOTE(review): ID_COL / TARGET_COL are not referenced in the visible code and
    # 'Pawpularity' looks like a leftover from a different project — confirm.
    ID_COL = 'Id'
    TARGET_COL = 'Pawpularity'
    TARGET_DIM = 1
    EARLY_STOPPING = True
    DEBUG = True # False # True
    margin = 0.5
    # Loaded while the class body executes, so defining CFG performs disk I/O.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator and exported as
            ``PYTHONHASHSEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding entry point takes the same single integer argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
# Seed every RNG once, before any model or data loader is created.
set_seed(CFG.seed)
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class Jigsaw4DatasetTest:
    """Map-style inference dataset that tokenizes one comment per lookup."""

    def __init__(self, df, cfg):
        """Keep ``cfg.tokenizer``, ``cfg.max_len`` and the ``text`` column of ``df``."""
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        self.text = df['text'].values

    def __len__(self):
        """Number of comments held by the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize comment ``item`` and return its ids and mask as long tensors."""
        encoded = self.tokenizer(
            self.text[item],
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
        )
        # Same two keys, same order, as the model's forward pass expects.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }

In [None]:
class RoBERTaBase(nn.Module):
    """Pretrained backbone followed by dropout and a one-unit linear head.

    The attribute names ``roberta``, ``dropout`` and ``l0`` are also the
    ``state_dict`` keys, so they must stay unchanged for saved checkpoints
    to load.
    """

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create the scoring head."""
        super().__init__()
        self.in_features = 768
        self.roberta = AutoModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(0.2)
        self.l0 = nn.Linear(self.in_features, 1)

    def forward(self, ids, mask):
        """Score a batch: returns the head output with its last axis squeezed."""
        backbone_out = self.roberta(
            ids,
            attention_mask=mask,
            output_hidden_states=False
        )
        # Index 1 of the backbone output feeds the head (for Hugging Face
        # RoBERTa models this is the pooler output).
        pooled = backbone_out[1]
        return self.l0(self.dropout(pooled)).squeeze(-1)

In [None]:
def make_preds(model_paths1, model_paths2, model_paths3, model_paths4,
               model_paths5, model_paths6):
    """Fold-averaged predictions of six RoBERTa experiments on the comments to score.

    Each argument is a sequence of checkpoint paths (one per fold) for one
    experiment. Fold ``k`` of every experiment is loaded together and all six
    models score the same tokenized batches. As with a plain ``zip`` over the
    arguments, iteration stops at the shortest path list.

    Reads the globals ``CFG`` (backbone name, tokenizer settings, batch size)
    and ``device`` at call time.

    Returns:
        Tuple of six arrays in argument order; each is the mean over folds of
        the per-comment model outputs.
    """
    df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

    # The dataset and loader do not depend on the fold, so build them once.
    dataset = Jigsaw4DatasetTest(df=df, cfg=CFG)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=CFG.valid_bs, num_workers=0, pin_memory=True, shuffle=False
    )

    experiments = (model_paths1, model_paths2, model_paths3,
                   model_paths4, model_paths5, model_paths6)
    fold_preds = [[] for _ in experiments]

    for fold_paths in zip(*experiments):
        models = []
        for path in fold_paths:
            model = RoBERTaBase(CFG.model_name)
            model.to(device)
            # map_location keeps GPU-saved checkpoints loadable when `device` is the CPU.
            model.load_state_dict(torch.load(path, map_location=device))
            model.eval()
            models.append(model)

        fold_outputs = [[] for _ in models]
        with torch.no_grad():
            for data in tqdm(data_loader):
                inputs = data['input_ids'].to(device)
                masks = data['attention_mask'].to(device)
                for outputs, model in zip(fold_outputs, models):
                    outputs.extend(model(inputs, masks).detach().cpu().numpy().tolist())

        for preds, outputs in zip(fold_preds, fold_outputs):
            preds.append(np.array(outputs))

        # Drop the references first: empty_cache() cannot release memory that
        # live models still hold.
        del models, model
        torch.cuda.empty_cache()

    return tuple(np.mean(preds, 0) for preds in fold_preds)

In [None]:
# One checkpoint path per fold in CFG.folds, for each of the six RoBERTa experiments.
model_paths1 = [f'../input/kaerururu-jigsaw4-017/fold-{i}.bin' for i in CFG.folds]
model_paths2 = [f'../input/kaerururu-jigsaw4-029/fold-{i}.bin' for i in CFG.folds]
model_paths3 = [f'../input/kaerururu-jigsaw4-047/fold-{i}.bin' for i in CFG.folds]
model_paths4 = [f'../input/kaerururu-jigsaw4-052/fold-{i}.bin' for i in CFG.folds]
model_paths5 = [f'../input/kaerururu-jigsaw4-076/fold-{i}.bin' for i in CFG.folds]
model_paths6 = [f'../input/kaerururu-jigsaw4-0080/fold-{i}.bin' for i in CFG.folds]

# NOTE(review): `sub` is not referenced again in the visible notebook — confirm
# whether this read is still needed.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
# Fold-averaged predictions, named after the experiment number in each checkpoint directory.
preds_017, preds_029, preds_047, preds_052, preds_076, preds_080 = make_preds(model_paths1, model_paths2, model_paths3, model_paths4, 
                                                                                         model_paths5, model_paths6)

In [None]:
# electra base discriminator

class CFG:
    """Settings used while the ELECTRA base-discriminator checkpoints are scored.

    Defining this class rebinds the global name ``CFG``; code that runs
    afterwards reads these values. Attributes other than ``folds``,
    ``max_len``, ``valid_bs``, ``model_name`` and ``tokenizer`` are not read
    by the visible inference code — presumably training-time leftovers.
    """
    ######################
    # Globals #
    ######################
    EXP_ID = '0XX'
    seed = 2021 # 71
    epochs = 3
    folds = [0, 1, 2, 3, 4]  # fold indices used to build the checkpoint paths
    N_FOLDS = 5
    LR = 2e-5
    max_len = 128 # 256  (tokenizer truncation / padding length)
    train_bs = 16 
    valid_bs = 128 # 32 * 2  (DataLoader batch size at inference)
    log_interval = 4 # 10
    model_name = '../input/electra/base-discriminator'  # local directory of the pretrained backbone
    TARGET_DIM = 1
    EARLY_STOPPING = True
    DEBUG = True # False # True
    margin = 0.5
    # Loaded while the class body executes, so defining CFG performs disk I/O.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class ElectraBaseDiscriminator(nn.Module):
    """Pretrained backbone whose first-token hidden state feeds a linear scorer.

    The backbone is stored under the attribute ``roberta`` (and the head under
    ``l0``) because those names are the ``state_dict`` keys of the saved
    checkpoints.
    """

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and create dropout plus head."""
        super().__init__()
        self.in_features = 768
        self.roberta = AutoModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(0.2)
        self.l0 = nn.Linear(self.in_features, 1)

    def forward(self, ids, mask):
        """Return the head output for each row, last axis squeezed away."""
        encoded = self.roberta(
            ids,
            attention_mask=mask,
            output_hidden_states=False
        )
        first_token = encoded['last_hidden_state'][:, 0, :]
        scores = self.l0(self.dropout(first_token))
        return scores.squeeze(-1)

In [None]:
def make_preds(model_paths1):
    """Fold-averaged ELECTRA predictions on the comments to score.

    Args:
        model_paths1: Sequence of checkpoint paths, one per fold.

    Reads the globals ``CFG`` (backbone name, tokenizer settings, batch size)
    and ``device`` at call time.

    Returns:
        Array with the mean over folds of the per-comment model outputs.
    """
    df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

    # The dataset and loader do not depend on the fold, so build them once.
    dataset = Jigsaw4DatasetTest(df=df, cfg=CFG)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=CFG.valid_bs, num_workers=0, pin_memory=True, shuffle=False
    )

    fold_preds = []
    for model_path in model_paths1:
        model = ElectraBaseDiscriminator(CFG.model_name)
        model.to(device)
        # map_location keeps GPU-saved checkpoints loadable when `device` is the CPU.
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

        outputs = []
        with torch.no_grad():
            for data in tqdm(data_loader):
                inputs = data['input_ids'].to(device)
                masks = data['attention_mask'].to(device)
                outputs.extend(model(inputs, masks).detach().cpu().numpy().tolist())

        fold_preds.append(np.array(outputs))

        # Drop the reference first: empty_cache() cannot release memory that a
        # live model still holds.
        del model
        torch.cuda.empty_cache()

    return np.mean(fold_preds, 0)

In [None]:
# One ELECTRA checkpoint path per fold in CFG.folds.
model_paths = [f'../input/kaerururu-jigsaw4-0082/fold-{i}.bin' for i in CFG.folds]

# Fold-averaged predictions for the experiment stored in the 0082 checkpoint directory.
preds_082 = make_preds(model_paths)

In [None]:
# distil bert base uncased
# 050, 066

class CFG1:
    """Settings for the first DistilBERT experiment (tokenized to 128 tokens).

    Attributes other than ``folds``, ``max_len``, ``valid_bs``, ``model_name``
    and ``tokenizer`` are presumably training-time leftovers — confirm.
    """
    ######################
    # Globals #
    ######################
    EXP_ID = '050'
    seed = 2021 # 71
    epochs = 3
    folds = [0, 1, 2, 3, 4]
    N_FOLDS = 5
    LR = 2e-5
    max_len = 128 # 256  (tokenizer truncation / padding length)
    train_bs = 16 
    valid_bs = 128 # 32 * 2
    model_name = '../input/distilbertbaseuncased'  # local directory of the pretrained backbone
    TARGET_DIM = 1
    # Loaded while the class body executes, so defining CFG1 performs disk I/O.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    
class CFG2:
    """Settings for the second DistilBERT experiment (tokenized to 224 tokens).

    Differs from ``CFG1`` only in ``max_len``.
    """
    ######################
    # Globals #
    ######################
    # NOTE(review): identical to CFG1.EXP_ID although this config is paired with
    # the 066 checkpoints further down — looks like a copy-paste leftover; confirm.
    EXP_ID = '050'
    seed = 2021 # 71
    epochs = 3
    folds = [0, 1, 2, 3, 4]
    N_FOLDS = 5
    LR = 2e-5
    max_len = 224  # tokenizer truncation / padding length
    train_bs = 16 
    valid_bs = 128 # 32 * 2
    model_name = '../input/distilbertbaseuncased'  # local directory of the pretrained backbone
    TARGET_DIM = 1
    # Loaded while the class body executes, so defining CFG2 performs disk I/O.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class DistilBertBaseUncased(nn.Module):
    """Backbone + dropout + single-output linear layer, read from the first token.

    ``roberta``, ``dropout`` and ``l0`` are the ``state_dict`` key prefixes of
    the saved checkpoints and therefore keep their names.
    """

    def __init__(self, model_path):
        """Instantiate the pretrained backbone found at ``model_path`` and the head."""
        super().__init__()
        self.in_features = 768
        self.roberta = AutoModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(0.2)
        self.l0 = nn.Linear(self.in_features, 1)

    def forward(self, ids, mask):
        """Compute one score per input row."""
        hidden = self.roberta(
            ids,
            attention_mask=mask,
            output_hidden_states=False
        )['last_hidden_state']
        # Position 0 of the sequence axis is the representation fed to the head.
        head_input = self.dropout(hidden[:, 0, :])
        return self.l0(head_input).squeeze(-1)

In [None]:
def make_preds(model_paths1, model_paths2):
    """Fold-averaged predictions of the two DistilBERT experiments.

    ``model_paths1`` is scored with the ``CFG1`` settings and ``model_paths2``
    with the ``CFG2`` settings (they differ in ``max_len``), so each experiment
    gets its own dataset and loader. Each argument is a sequence of checkpoint
    paths, one per fold; as with a plain ``zip``, iteration stops at the
    shorter list.

    Reads the globals ``CFG1``, ``CFG2`` and ``device`` at call time.

    Returns:
        Tuple ``(y_pred1, y_pred2)`` of arrays, each the mean over folds of the
        per-comment model outputs.
    """
    df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

    configs = (CFG1, CFG2)
    # Loaders do not depend on the fold, so build them once. The batch size now
    # comes from each experiment's own config rather than an unrelated global CFG.
    loaders = [
        torch.utils.data.DataLoader(
            Jigsaw4DatasetTest(df=df, cfg=cfg),
            batch_size=cfg.valid_bs, num_workers=0, pin_memory=True, shuffle=False
        )
        for cfg in configs
    ]
    fold_preds = [[] for _ in configs]

    for fold_paths in zip(model_paths1, model_paths2):
        # Each model walks its own loader in full, so differing batch sizes can
        # never truncate one experiment's predictions.
        for preds, cfg, loader, path in zip(fold_preds, configs, loaders, fold_paths):
            model = DistilBertBaseUncased(cfg.model_name)
            model.to(device)
            # map_location keeps GPU-saved checkpoints loadable when `device` is the CPU.
            model.load_state_dict(torch.load(path, map_location=device))
            model.eval()

            outputs = []
            with torch.no_grad():
                for data in tqdm(loader):
                    inputs = data['input_ids'].to(device)
                    masks = data['attention_mask'].to(device)
                    outputs.extend(model(inputs, masks).detach().cpu().numpy().tolist())

            preds.append(np.array(outputs))

            # Drop the reference first: empty_cache() cannot release memory
            # that a live model still holds.
            del model
            torch.cuda.empty_cache()

    y_pred1 = np.mean(fold_preds[0], 0)
    y_pred2 = np.mean(fold_preds[1], 0)
    return y_pred1, y_pred2

In [None]:
# One checkpoint path per fold. Folds are taken from the DistilBERT configs
# (CFG1 / CFG2) instead of whichever unrelated CFG happens to be bound globally.
model_paths1 = [f'../input/kaerururu-jigsaw4-050/fold-{i}.bin' for i in CFG1.folds]
model_paths2 = [f'../input/kaerururu-jigsaw4-066/fold-{i}.bin' for i in CFG2.folds]

# Fold-averaged predictions, named after the experiment number in each checkpoint directory.
preds_050, preds_066 = make_preds(model_paths1, model_paths2)

In [None]:
# Albert base v2

class CFG:
    """Settings used while the ALBERT-base-v2 checkpoints are scored.

    Defining this class rebinds the global name ``CFG``; code that runs
    afterwards reads these values. Attributes other than ``folds``,
    ``max_len``, ``valid_bs``, ``model_name`` and ``tokenizer`` are not read
    by the visible inference code — presumably training-time leftovers.
    """
    ######################
    # Globals #
    ######################
    EXP_ID = '049'
    seed = 2021 # 71
    epochs = 3
    folds = [0, 1, 2, 3, 4]  # fold indices used to build the checkpoint paths
    N_FOLDS = 5
    LR = 2e-5
    max_len = 128 # 256  (tokenizer truncation / padding length)
    train_bs = 16 
    valid_bs = 128 # 32 * 2  (DataLoader batch size at inference)
    log_interval = 4 # 10
    model_name = '../input/pretrained-albert-pytorch/albert-base-v2'  # local directory of the pretrained backbone
    # NOTE(review): ID_COL / TARGET_COL are not referenced in the visible code and
    # 'Pawpularity' looks like a leftover from a different project — confirm.
    ID_COL = 'Id'
    TARGET_COL = 'Pawpularity'
    TARGET_DIM = 1
    EARLY_STOPPING = True
    DEBUG = True # False # True
    margin = 0.5
    # Loaded while the class body executes, so defining CFG performs disk I/O.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class AlbertBaseV2(nn.Module):
    """Pretrained backbone with a dropout + linear scoring head on output index 1.

    Attribute names (``roberta``, ``dropout``, ``l0``) match the ``state_dict``
    keys of the saved checkpoints and must not be renamed.
    """

    def __init__(self, model_path):
        """Load the backbone stored at ``model_path`` and build the head."""
        super().__init__()
        self.in_features = 768
        self.roberta = AutoModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(0.2)
        self.l0 = nn.Linear(self.in_features, 1)

    def forward(self, ids, mask):
        """Return a score per row of ``ids`` (last axis of the head output squeezed)."""
        outputs = self.roberta(
            ids,
            attention_mask=mask,
            output_hidden_states=False
        )
        # The head consumes the second element of the backbone output.
        features = self.dropout(outputs[1])
        scores = self.l0(features)
        return scores.squeeze(-1)

In [None]:
def make_preds(model_paths1, model_paths2):
    """Fold-averaged predictions of two ALBERT experiments on the comments to score.

    Each argument is a sequence of checkpoint paths, one per fold. Fold ``k``
    of both experiments is loaded together and both models score the same
    tokenized batches; as with a plain ``zip``, iteration stops at the shorter
    list.

    Reads the globals ``CFG`` (backbone name, tokenizer settings, batch size)
    and ``device`` at call time.

    Returns:
        Tuple ``(y_pred1, y_pred2)`` of arrays, each the mean over folds of the
        per-comment model outputs.
    """
    df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

    # The dataset and loader do not depend on the fold, so build them once.
    dataset = Jigsaw4DatasetTest(df=df, cfg=CFG)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=CFG.valid_bs, num_workers=0, pin_memory=True, shuffle=False
    )

    fold_preds = ([], [])
    for fold_paths in zip(model_paths1, model_paths2):
        models = []
        for path in fold_paths:
            model = AlbertBaseV2(CFG.model_name)
            model.to(device)
            # map_location keeps GPU-saved checkpoints loadable when `device` is the CPU.
            model.load_state_dict(torch.load(path, map_location=device))
            model.eval()
            models.append(model)

        fold_outputs = [[] for _ in models]
        with torch.no_grad():
            for data in tqdm(data_loader):
                inputs = data['input_ids'].to(device)
                masks = data['attention_mask'].to(device)
                for outputs, model in zip(fold_outputs, models):
                    outputs.extend(model(inputs, masks).detach().cpu().numpy().tolist())

        for preds, outputs in zip(fold_preds, fold_outputs):
            preds.append(np.array(outputs))

        # Drop the references first: empty_cache() cannot release memory that
        # live models still hold.
        del models, model
        torch.cuda.empty_cache()

    y_pred1 = np.mean(fold_preds[0], 0)
    y_pred2 = np.mean(fold_preds[1], 0)
    return y_pred1, y_pred2

In [None]:
# One checkpoint path per fold in CFG.folds, for each of the two ALBERT experiments.
model_paths1 = [f'../input/kaerururu-jigsaw4-049/fold-{i}.bin' for i in CFG.folds]
model_paths2 = [f'../input/kaerururu-jigsaw4-051/fold-{i}.bin' for i in CFG.folds]

# Fold-averaged predictions, named after the experiment number in each checkpoint directory.
preds_049, preds_051 = make_preds(model_paths1, model_paths2)

In [None]:
# microsoft/deberta-v3-base

class CFG:
    """Settings used while the DeBERTa-v3-base checkpoints are scored.

    Defining this class rebinds the global name ``CFG``; code that runs
    afterwards reads these values.
    """
    seed = 2021 # 71
    epochs = 3
    folds = [0, 1, 2, 3, 4]  # fold indices used to build the checkpoint paths
    N_FOLDS = 5
    max_len = 128 # 256  (tokenizer truncation / padding length)
    train_bs = 16 
    valid_bs = 128 # 32 * 2  (DataLoader batch size at inference)
    model_name = '../input/deberta-v3-base/deberta-v3-base'  # local directory of the pretrained backbone
    # Loaded while the class body executes, so defining CFG performs disk I/O.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class DebertaV3Base(nn.Module):
    """Pretrained backbone; the first-token hidden state goes through dropout and a linear layer.

    The attribute names ``roberta``, ``dropout`` and ``l0`` are kept because
    they are the ``state_dict`` keys of the saved checkpoints.
    """

    def __init__(self, model_path):
        """Create the backbone from ``model_path`` together with the scoring head."""
        super().__init__()
        self.in_features = 768
        self.roberta = AutoModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(0.2)
        self.l0 = nn.Linear(self.in_features, 1)

    def forward(self, ids, mask):
        """Return one score per row of the batch."""
        result = self.roberta(
            ids,
            attention_mask=mask,
            output_hidden_states=False
        )
        token_states = result['last_hidden_state']
        leading = token_states[:, 0, :]  # hidden state at sequence position 0
        return self.l0(self.dropout(leading)).squeeze(-1)

In [None]:
def make_preds(model_paths1, model_paths2):
    """Fold-averaged predictions of two DeBERTa-v3 experiments on the comments to score.

    Each argument is a sequence of checkpoint paths, one per fold. Fold ``k``
    of both experiments is loaded together and both models score the same
    tokenized batches; as with a plain ``zip``, iteration stops at the shorter
    list.

    Reads the globals ``CFG`` (backbone name, tokenizer settings, batch size)
    and ``device`` at call time.

    Returns:
        Tuple ``(y_pred1, y_pred2)`` of arrays, each the mean over folds of the
        per-comment model outputs.
    """
    df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

    # The dataset and loader do not depend on the fold, so build them once.
    dataset = Jigsaw4DatasetTest(df=df, cfg=CFG)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=CFG.valid_bs, num_workers=0, pin_memory=True, shuffle=False
    )

    fold_preds = ([], [])
    for fold_paths in zip(model_paths1, model_paths2):
        models = []
        for path in fold_paths:
            model = DebertaV3Base(CFG.model_name)
            model.to(device)
            # map_location keeps GPU-saved checkpoints loadable when `device` is the CPU.
            model.load_state_dict(torch.load(path, map_location=device))
            model.eval()
            models.append(model)

        fold_outputs = [[] for _ in models]
        with torch.no_grad():
            for data in tqdm(data_loader):
                inputs = data['input_ids'].to(device)
                masks = data['attention_mask'].to(device)
                for outputs, model in zip(fold_outputs, models):
                    outputs.extend(model(inputs, masks).detach().cpu().numpy().tolist())

        for preds, outputs in zip(fold_preds, fold_outputs):
            preds.append(np.array(outputs))

        # Drop the references first: empty_cache() cannot release memory that
        # live models still hold.
        del models, model
        torch.cuda.empty_cache()

    y_pred1 = np.mean(fold_preds[0], 0)
    y_pred2 = np.mean(fold_preds[1], 0)
    return y_pred1, y_pred2

In [None]:
# One checkpoint path per fold in CFG.folds, for each of the two DeBERTa-v3 experiments.
model_paths1 = [f'../input/kaerururu-jigsaw4-055/fold-{i}.bin' for i in CFG.folds]
model_paths2 = [f'../input/kaerururu-jigsaw4-059/fold-{i}.bin' for i in CFG.folds]

# Fold-averaged predictions, named after the experiment number in each checkpoint directory.
preds_055, preds_059 = make_preds(model_paths1, model_paths2)

In [None]:
# distil roberta base

class CFG:
    """Settings used while the DistilRoBERTa-base checkpoints are scored.

    Defining this class rebinds the global name ``CFG``; code that runs
    afterwards reads these values.
    """
    seed = 2021 # 71
    epochs = 3
    folds = [0, 1, 2, 3, 4]  # fold indices used to build the checkpoint paths
    N_FOLDS = 5
    max_len = 128 # 256  (tokenizer truncation / padding length)
    train_bs = 16 
    valid_bs = 128 # 32 * 2  (DataLoader batch size at inference)
    model_name = '../input/distilroberta-base'  # local directory of the pretrained backbone
    # Loaded while the class body executes, so defining CFG performs disk I/O.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class DistilRoBERTaBase(nn.Module):
    """Pretrained backbone scored from its first-token hidden state.

    ``roberta``, ``dropout`` and ``l0`` keep their names because the saved
    checkpoints use them as ``state_dict`` keys.
    """

    def __init__(self, model_path):
        """Load the backbone located at ``model_path`` and attach dropout and the linear head."""
        super().__init__()
        self.in_features = 768
        self.roberta = AutoModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(0.2)
        self.l0 = nn.Linear(self.in_features, 1)

    def forward(self, ids, mask):
        """Return the per-row head output with the trailing axis removed."""
        model_out = self.roberta(
            ids,
            attention_mask=mask,
            output_hidden_states=False
        )
        # Slice out sequence position 0 for every row of the batch.
        cls_state = model_out['last_hidden_state'][:, 0, :]
        dropped = self.dropout(cls_state)
        return self.l0(dropped).squeeze(-1)

In [None]:
def make_preds(model_paths1, model_paths2):
    """Fold-averaged predictions of two DistilRoBERTa experiments on the comments to score.

    Each argument is a sequence of checkpoint paths, one per fold. Fold ``k``
    of both experiments is loaded together and both models score the same
    tokenized batches; as with a plain ``zip``, iteration stops at the shorter
    list.

    Reads the globals ``CFG`` (backbone name, tokenizer settings, batch size)
    and ``device`` at call time.

    Returns:
        Tuple ``(y_pred1, y_pred2)`` of arrays, each the mean over folds of the
        per-comment model outputs.
    """
    df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

    # The dataset and loader do not depend on the fold, so build them once.
    dataset = Jigsaw4DatasetTest(df=df, cfg=CFG)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=CFG.valid_bs, num_workers=0, pin_memory=True, shuffle=False
    )

    fold_preds = ([], [])
    for fold_paths in zip(model_paths1, model_paths2):
        models = []
        for path in fold_paths:
            model = DistilRoBERTaBase(CFG.model_name)
            model.to(device)
            # map_location keeps GPU-saved checkpoints loadable when `device` is the CPU.
            model.load_state_dict(torch.load(path, map_location=device))
            model.eval()
            models.append(model)

        fold_outputs = [[] for _ in models]
        with torch.no_grad():
            for data in tqdm(data_loader):
                inputs = data['input_ids'].to(device)
                masks = data['attention_mask'].to(device)
                for outputs, model in zip(fold_outputs, models):
                    outputs.extend(model(inputs, masks).detach().cpu().numpy().tolist())

        for preds, outputs in zip(fold_preds, fold_outputs):
            preds.append(np.array(outputs))

        # Drop the references first: empty_cache() cannot release memory that
        # live models still hold.
        del models, model
        torch.cuda.empty_cache()

    y_pred1 = np.mean(fold_preds[0], 0)
    y_pred2 = np.mean(fold_preds[1], 0)
    return y_pred1, y_pred2

In [None]:
# One checkpoint path per fold in CFG.folds, for each of the two DistilRoBERTa experiments.
model_paths1 = [f'../input/kaerururu-jigsaw4-057/fold-{i}.bin' for i in CFG.folds]
model_paths2 = [f'../input/kaerururu-jigsaw4-067/fold-{i}.bin' for i in CFG.folds]

# Fold-averaged predictions, named after the experiment number in each checkpoint directory.
preds_057, preds_067 = make_preds(model_paths1, model_paths2)

# Loading Data 

In [None]:
def text_cleaning(text):
    '''
    Normalize a raw comment into lowercase alphanumeric words separated by
    single spaces. The steps run in this order:

    1. Remove URLs (``http://``, ``https://`` and ``www.`` prefixes).
    2. Remove HTML markup by parsing with BeautifulSoup (lxml) and keeping the text.
    3. Replace ``'s``, newlines, tabs and the punctuation ``. , : ; "`` with a space.
    4. Delete characters matched by the emoji pattern (removed outright, so the
       neighbouring characters become adjacent).
    5. Replace every character that is not a letter a-z/A-Z or a digit with a space.
    6. Collapse runs of spaces, strip both ends and lowercase.

    text - Text piece to be cleaned.

    Returns the cleaned string.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') # matches embedded website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') # parse so that HTML tags can be dropped
    only_text = soup.get_text()
    text = only_text
    
    ### punctuation / whitespace characters turned into plain spaces
    text = text.replace("'s", ' ')
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('.', ' ')
    text = text.replace(',', ' ')
    text = text.replace(':', ' ')
    text = text.replace(';', ' ')
    text = text.replace('"', ' ')
    ###
    
    # NOTE(review): the last range (U+24C2 to U+1F251) spans far more than emoji
    # (for example, the CJK blocks fall inside it). Left untouched because the
    # output presumably has to match the preprocessing used at training time.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) # replace remaining special characters with spaces
    text = re.sub(' +', ' ', text) # collapse runs of spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    text = text.lower()
    return text


# Single characters (zero-width marks, exotic spaces, control codes) that
# rm_spaces() turns into an ordinary space.
spaces = [
    '\u200b', '\u200e', '\u202a', '\u2009', '\u2028', '\u202c', '\ufeff',
    '\uf0d8', '\u2061', '\u3000', '\x10', '\x7f', '\x9d', '\xad',
    '\x97', '\x9c', '\x8b', '\x81', '\x80', '\x8c', '\x85',
    '\x92', '\x88', '\x8d', '\x80', '\x8e', '\x9a', '\x94', '\xa0',
    '\x8f', '\x82', '\x8a', '\x93', '\x90', '\x83', '\x96',
    '\x9b', '\x9e', '\x99', '\x87', '\x84', '\x9f',
]


def rm_spaces(x):
    """Return ``x`` with every character listed in ``spaces`` replaced by ' '."""
    cleaned = x
    for odd_char in spaces:
        # Skipping absent characters gives the same result as replacing unconditionally.
        if odd_char in cleaned:
            cleaned = cleaned.replace(odd_char, ' ')
    return cleaned

In [None]:
from scipy.sparse import hstack
from scipy import sparse


def splitter(text):
    """Split ``text`` on single spaces and return the pieces as a list.

    Consecutive spaces yield empty-string tokens, and an empty input yields
    ``['']``.
    """
    return text.split(' ')

def vectorizer(text):
    """Build the feature row for one comment.

    Concatenates the dense TF-IDF vector from the global ``vec`` with the mean
    of the ``fmodel.wv`` vectors of the space-separated tokens, then casts the
    result to float16.

    Returns:
        Array with a single row: TF-IDF features followed by the mean embedding.
    """
    words = splitter(text)

    features = np.concatenate(
        [
            vec.transform([text]).toarray(),
            np.mean(fmodel.wv[words], axis = 0).reshape(1, -1),
        ],
        axis = -1,
    )
    return features.astype(np.float16)

In [None]:
import pickle

def to_pickle(filename, obj):
    """Serialize ``obj`` with pickle into the file at ``filename`` (overwriting it)."""
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink)

def unpickle(filename):
    """Load and return the pickled object stored at ``filename``.

    pickle can execute arbitrary code while loading, so use this only on
    trusted files.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

In [None]:
from gensim.models import KeyedVectors, FastText

# Pretrained FastText model; vectorizer() reads its word vectors through the global `fmodel`.
fmodel = FastText.load('../input/jigsaw-rate-severity-good-score-train-dataset/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')

In [None]:
# Load the pickled Ridge model and its TF-IDF vectorizer. vectorizer() reads `vec` as a global.
# NOTE: unpickling runs code embedded in the file, so only load trusted artifacts.
model = unpickle('../input/jigsaw4-ridge-0-874-rm-space/cv_0.6772_lb_0.874_Ridge.pkl')
vec = unpickle('../input/jigsaw4-ridge-0-874-rm-space/cv_0.6772_lb_0.874_TfidfVectorizer.pkl')
# Feature width: TF-IDF vocabulary size plus 256 (presumably the FastText
# embedding size, going by the "256D" model file name — confirm).
EMB_DIM = len(vec.vocabulary_) + 256

In [None]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Load the comments to score and clean the `text` column in place.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
# NOTE(review): text_cleaning has already replaced every non-alphanumeric
# character with a space, so rm_spaces is probably a no-op here; presumably
# kept to mirror the training-time pipeline — confirm.
df_sub['text'] = df_sub['text'].progress_apply(lambda x: rm_spaces(x).strip())

In [None]:
# Build one dense feature row per comment with vectorizer() (TF-IDF + mean embedding).
X_sub_temp = []
for text in tqdm(df_sub.text):
    X_sub_temp.append(vectorizer(text))
    
# Stack the rows into a 2-D array with EMB_DIM columns, then store it as CSR.
X_sub_temp = np.array(X_sub_temp).reshape(-1, EMB_DIM)
X_test = sparse.csr_matrix(X_sub_temp)

# Release the dense intermediate.
del X_sub_temp; gc.collect()

In [None]:
# Score the submission features with the first Ridge model (`model`).
ridge_preds = model.predict(X_test)

In [None]:
# Three more pickled (TF-IDF vectorizer, Ridge model) pairs; the data each was
# trained on is indicated only by the file names.
# NOTE: unpickling runs code embedded in the file, so only load trusted artifacts.
vec2 = unpickle('../input/jigsaw4-ridge-0-874-ruddit/TfidfVectorizer_Ridge_ruddit.pkl')
model2 = unpickle('../input/jigsaw4-ridge-0-874-ruddit/Tfidf_Ridge_ruddit.pkl')

vec3 = unpickle('../input/jigsaw4-ridge-jigsaw1-weighted/jigsaw1_weighted_TfidfVectorizer.pkl')
model3 = unpickle('../input/jigsaw4-ridge-jigsaw1-weighted/jigsaw1_weighted_Ridge.pkl')

vec4 = unpickle('../input/jigsaw4-ridge-jigsaw1-v2/jigsaw1_v2_TfidfVectorizer.pkl')
model4 = unpickle('../input/jigsaw4-ridge-jigsaw1-v2/jigsaw1_v2_Ridge.pkl')

In [None]:
# These three models use TF-IDF features only (no embedding columns):
# transform the cleaned text with each vectorizer and predict with its paired model.
X_test2 = vec2.transform(df_sub['text'])
reddit_preds = model2.predict(X_test2)

X_test3 = vec3.transform(df_sub['text'])
j1_weighted_preds = model3.predict(X_test3)

X_test4 = vec4.transform(df_sub['text'])
j1_v2_preds = model4.predict(X_test4)

In [None]:
# CAUTION: this rebinds the globals `vec` and `EMB_DIM`. vectorizer() reads `vec`
# at call time, so every later vectorizer() call uses this TF-IDF vocabulary.
vec = unpickle('../input/jigsaw4-ridge-0-874-pl-pub-test-v2/0874_pseudo_label_only_public_test_tfidf_TfidfVectorizer_v2.pkl')
model5 = unpickle('../input/jigsaw4-ridge-0-874-pl-pub-test-v2/0874_pseudo_label_only_public_test_tfidf_Ridge_v2.pkl')
EMB_DIM = len(vec.vocabulary_) + 256

# Rebuild the TF-IDF + mean-embedding feature rows with the new vectorizer.
X_sub_temp = []
for text in tqdm(df_sub.text):
    X_sub_temp.append(vectorizer(text))
    
X_sub_temp = np.array(X_sub_temp).reshape(-1, EMB_DIM)
X_test5 = sparse.csr_matrix(X_sub_temp)

# Release the dense intermediate.
del X_sub_temp; gc.collect()

# NOTE(review): `ridge_pseudo_labeled_preds` is not referenced anywhere else in
# the visible notebook, including the final blend — confirm whether it should be
# part of the ensemble or whether this cell can be dropped.
ridge_pseudo_labeled_preds = model5.predict(X_test5) # 0.876

Ensembling the Ridge regression and transformer model predictions


In [None]:
# https://www.kaggle.com/kaerunantoka/jigsaw4-ensemble-v27-cv?scriptVersionId=87223243

# Blend weights, one per prediction vector in `blend_components` below (same order).
w = [0.06324371, 0.01615546, 0.09597985, 0.03304393, 0.05108132,
        0.03772867, 0.02205191, 0.04967104, 0.08629689, 0.08855165,
        0.04448891, 0.07703826, 0.09451529, 0.09916092, 0.02796084,
        0.01080381, 0.01141286, 0.06257114, 0.02824354] # 0.7790952570745316

# Prediction vectors in the order their weights appear in `w`.
blend_components = [
    ridge_preds,
    preds_017,
    preds_029,
    preds_047,
    preds_049,
    preds_050,
    preds_051,
    preds_052,
    preds_055,
    preds_057,
    preds_059,
    reddit_preds,
    j1_weighted_preds,
    j1_v2_preds,
    preds_066,
    preds_067,
    preds_076,
    preds_080,
    preds_082,
]

# Accumulate the weighted sum left to right, term by term.
blended = w[0] * blend_components[0]
for weight, component in zip(w[1:], blend_components[1:]):
    blended = blended + weight * component

df_sub['score'] = blended

In [None]:
# Sanity check: number of non-null blended scores.
df_sub['score'].count()

In [None]:
# Replace the blended scores by their ranks; method='first' breaks ties by row
# order, so every row receives a distinct rank.
df_sub['score'] = df_sub['score'].rank(method='first')

In [None]:
# Sanity check: number of distinct ranks.
df_sub['score'].nunique()

<h2>Prepare submission file</h2>

In [None]:
# Write the two submission columns to disk without the DataFrame index.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)