In [None]:
import sys
# Python Libraries
import os
import gc
import pickle
from bs4 import BeautifulSoup
import re 

# Third party
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
import plotly.express as px
%matplotlib inline

# Pytorch 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModel,AutoConfig
from transformers import AlbertModel,AlbertTokenizer
from transformers import BertModel,BertTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import DebertaModel, DebertaTokenizer
from transformers import XLNetModel,XLNetTokenizer,XLNetConfig


# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Receives one fold-averaged prediction array per transformer experiment, in run order.
all_preds = list()

In [None]:
# Comments that have to be scored for the submission.
_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
# A file of exactly 7537 rows is cut down to its first 320 rows
# (presumably to keep runs on the public file short -- TODO confirm).
if len(_df) == 7537:
    _df = _df.iloc[:320]
    
def load_pickle(filename):
    """Open `filename` in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code; only use this on trusted files.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
# 1. Two TF-IDF + Ridge pipelines on the raw text, blended by rank
# =================================================================================
df_test = _df.copy()

# First vectorizer/regressor pair: predict, then convert predictions to ordinal ranks.
vec = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/tfidf1.pkl')
model1 = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/Ridge1.pkl')
test = vec.transform(df_test['text'])
jr_preds = model1.predict(test)
df_test['score1'] = rankdata(jr_preds, method='ordinal')

# Second pair: same steps; `vec`, `model1` and `test` are rebound on purpose.
vec = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/tfidf2.pkl')
model1 = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/Ridge2.pkl')
test = vec.transform(df_test['text'])
rud_preds = model1.predict(test)
df_test['score2'] = rankdata(rud_preds, method='ordinal')

# Add the two rank columns, re-rank the sum, and write the first intermediate submission.
df_test['score'] = rankdata(df_test['score1'] + df_test['score2'], method='ordinal')
df_test.loc[:, ['comment_id', 'score']].to_csv("submission1.csv", index=False)
# 2
# =================================================================================
def dummy_fun(doc):
    """Identity function: return `doc` unchanged.

    NOTE(review): never called directly in this notebook; presumably it must be
    defined so that one of the pickled objects loaded below can be unpickled -- confirm.
    """
    return doc
# Pipeline 2: pickled tokenizer -> TF-IDF over token ids -> regressor.
tokenizer = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/tokenizer1.pkl')
vectorizer = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/tfidf3.pkl')
regressor = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/Redge3.pkl')

df_test2 = _df.copy()
# Tokenize all comments at once and keep only the token-id lists, which are what
# the vectorizer is fed.
texts = df_test2['text']
texts = tokenizer(texts.to_list())['input_ids']
texts = vectorizer.transform(texts)

df_test2['prediction'] = regressor.predict(texts)
# Select and rename in a single step. Assigning a new column onto the
# column-subset frame (as was done before) triggers pandas' SettingWithCopyWarning;
# the resulting frame (comment_id, score) is the same.
df_test2 = df_test2[['comment_id', 'prediction']].rename(columns={'prediction': 'score'})

df_test2.to_csv('./submission2.csv', index=False)
# 3
# =================================================================================
def text_cleaning2(text):
    '''
    Reduce a raw comment to alphanumeric words separated by single spaces.

    Steps, in the order they are applied:
    1. Remove URLs (http/https links and www. addresses).
    2. Parse with BeautifulSoup (lxml) and keep only the text content.
    3. Delete characters that fall in the emoji / symbol code-point ranges.
    4. Replace every character that is not a letter a-z/A-Z or a digit with a space.
    5. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    # 1. URLs
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2. HTML markup
    text = BeautifulSoup(text, 'lxml').get_text()

    # 3. Emoji and related symbols (removed outright, no space is inserted)
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    text = re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

    # 4. Everything that is not a letter or digit becomes a space
    text = re.sub(r"[^a-zA-Z\d]", " ", text)
    # 5. Squeeze repeated spaces, then strip leading/trailing whitespace
    text = re.sub(' +', ' ', text)
    return text.strip()

# Pipeline 3: one TF-IDF vectorizer shared by three regressors, on cleaned text.
vec = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/tfidf4.pkl')
model = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/Redge4.pkl')
l_model = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/Redge5.pkl')
s_model = load_pickle('../input/overfitting-lb-is-easier-than-solving-the-problem/Redge6.pkl')

df_sub = _df.copy()
df_sub['text'] = df_sub['text'].apply(text_cleaning2)
X_test = vec.transform(df_sub['text'])

# Each regressor scores the same feature matrix; the final score is their plain average.
p3, p4, p5 = (reg.predict(X_test) for reg in (model, l_model, s_model))
df_sub['score'] = (p3 + p4 + p5) / 3.
df_sub.loc[:, ['comment_id', 'score']].to_csv("submission3.csv", index=False)

# Ensemble of the three intermediate submissions written above
# =================================================================================
# submission1 already holds ordinal ranks; its frame (indexed by comment_id) is the base.
data = pd.read_csv("./submission1.csv",index_col="comment_id")
data["score1"] = data["score"]

# Raw scores from submission2 are attached by comment_id index, then converted to ranks.
data["score2"] = pd.read_csv("./submission2.csv",index_col="comment_id")["score"]
data["score2"] = rankdata( data["score2"], method='ordinal')

# Same treatment for submission3.
data["score3"] = pd.read_csv("./submission3.csv",index_col="comment_id")["score"]
data["score3"] = rankdata( data["score3"], method='ordinal')

# Weighted sum of the three rank columns (weights 2, .66, .33), re-ranked afterwards.
data["score"] = 2*data["score1"] + .66*data["score2"] + data["score3"]*.33
data["score"] = rankdata( data["score"], method='ordinal')
data.head()

In [None]:
# Drop the classical-pipeline objects; only `data` (the blended ranks) is needed later.
del _df, vec, model1, df_test, test, jr_preds, rud_preds
del tokenizer, vectorizer, regressor, df_test2, texts
del l_model, s_model, X_test, p3, p4, p5, df_sub
gc.collect()

In [None]:
# ====================================================
# text_cleaning
# ====================================================
def text_cleaning(text):
    '''
    Clean a comment down to letters and digits separated by single spaces.

    Applied in this order:
    1. URL removal (http/https links and www. addresses).
    2. HTML stripping via BeautifulSoup with the lxml parser.
    3. Removal of characters in the emoji / symbol code-point ranges.
    4. Replacement of every non-alphanumeric character with a space.
    5. Collapsing of repeated spaces and trimming of both ends.

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    url_re = re.compile(r'https?://\S+|www\.\S+')
    emoji_re = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)

    without_urls = url_re.sub(r'', text)
    plain = BeautifulSoup(without_urls, 'lxml').get_text()
    without_emoji = emoji_re.sub(r'', plain)
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    return re.sub(' +', ' ', alnum_only).strip()

In [None]:
# ====================================================
# Dataset
# ====================================================
class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes the `text` column of a DataFrame on access.

    cfg       - object exposing `max_len`, the fixed length every sample is padded/truncated to.
    df        - DataFrame with a `text` column.
    tokenizer - object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
    """

    def __init__(self, cfg, df, tokenizer):
        self.df = df
        self.max_len = cfg.max_len
        self.tokenizer = tokenizer
        # Plain array of the texts so items are fetched by position.
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return {'ids': LongTensor, 'mask': LongTensor} for the text at `index`."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        fields = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {name: torch.tensor(encoded[key], dtype=torch.long) for name, key in fields}
# ====================================================
# model
# ====================================================
class bert_model(nn.Module):
    """Encoder loaded via AutoModel plus a LayerNorm -> Linear(1) head on mean-pooled states.

    cfg - object exposing `model_name`, the checkpoint path/name passed to `from_pretrained`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model_name)
        # Both dropout probabilities are overridden to 0 when the encoder is loaded.
        self.bert = AutoModel.from_pretrained(
            cfg.model_name,
            hidden_dropout_prob=0,
            attention_probs_dropout_prob=0,
        )
        width = self.config.hidden_size
        self.ln = nn.LayerNorm(width)
        self.out = nn.Linear(width, 1)

    def forward(self, ids, mask):
        """Return one value per sample for token ids `ids` and attention mask `mask`."""
        # With return_dict=False the encoder returns a 2-tuple; only the first item is used.
        token_states, _ = self.bert(ids, attention_mask=mask, return_dict=False)
        # Plain mean over dim 1; `mask` is not applied to the average.
        pooled = token_states.mean(dim=1)
        return self.out(self.ln(pooled))

class distilbert_model(nn.Module):
    """Encoder loaded via AutoModel plus a LayerNorm -> Linear(1) head on the first position.

    cfg - object exposing `model_name`, the checkpoint path/name passed to `from_pretrained`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model_name)
        # Attention and general dropout are both overridden to 0 at load time.
        self.bert = AutoModel.from_pretrained(
            cfg.model_name,
            attention_dropout=0,
            dropout=0,
        )
        width = self.config.hidden_size
        self.ln = nn.LayerNorm(width)
        self.out = nn.Linear(width, 1)

    def forward(self, ids, mask):
        """Return one value per sample for token ids `ids` and attention mask `mask`."""
        hidden = self.bert(ids, attention_mask=mask)['last_hidden_state']
        # Only the state at position 0 of each sequence feeds the head.
        first_token = hidden[:, 0, :]
        return self.out(self.ln(first_token))

class roberta_model(nn.Module):
    """Encoder loaded via AutoModel plus a LayerNorm -> Linear(1) head on mean-pooled states.

    Reference: https://github.com/TakoiHirokazu/kaggle_commonLit_readability_prize/blob/master/exp/ex072.py

    cfg - object exposing `model_name`, the checkpoint path/name passed to `from_pretrained`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.roberta = AutoModel.from_pretrained(cfg.model_name)
        # Head width is taken from the output size of the encoder's pooler projection.
        self.num_features = self.roberta.pooler.dense.out_features
        self.ln = nn.LayerNorm(self.num_features)
        self.out = nn.Linear(self.num_features, 1)

    def forward(self, ids, mask):
        """Return one value per sample for token ids `ids` and attention mask `mask`."""
        encoded = self.roberta(input_ids=ids, attention_mask=mask)
        # Plain mean over dim 1; `mask` is not applied to the average.
        pooled = encoded["last_hidden_state"].mean(dim=1)
        return self.out(self.ln(pooled))

class deberta_model(nn.Module):
    """Encoder loaded via AutoModel plus a LayerNorm -> Linear(1) head on the first position.

    Reference: https://github.com/TakoiHirokazu/kaggle_commonLit_readability_prize/blob/master/exp/ex182.py

    cfg - object exposing `model_name`, the checkpoint path/name passed to `from_pretrained`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Dropout is zeroed and the activation is overridden to "gelu_new" at load time.
        self.deberta_model = AutoModel.from_pretrained(
            cfg.model_name,
            hidden_dropout_prob=0,
            attention_probs_dropout_prob=0,
            hidden_act="gelu_new",
        )
        # Head width is read from the encoder's relative-position embedding table.
        self.num_features = self.deberta_model.encoder.rel_embeddings.embedding_dim
        self.ln = nn.LayerNorm(self.num_features)
        self.out = nn.Linear(self.num_features, 1)

    def forward(self, ids, mask):
        """Return one value per sample for token ids `ids` and attention mask `mask`."""
        hidden = self.deberta_model(ids, attention_mask=mask)['last_hidden_state']
        # Only the state at position 0 of each sequence feeds the head.
        first_token = hidden[:, 0, :]
        return self.out(self.ln(first_token))
    
class xlnet_model(nn.Module):
    """Encoder loaded via AutoModel plus a LayerNorm -> Linear(1) head on mean-pooled states.

    Reference: https://github.com/TakoiHirokazu/kaggle_commonLit_readability_prize/blob/master/exp/ex182.py

    cfg - object exposing `model_name`, the weights path passed to `from_pretrained`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # The config always comes from this fixed JSON file, independent of cfg.model_name.
        self.config = AutoConfig.from_pretrained(
            '../input/xlnet-pretrained-models-pytorch/xlnet-base-cased-config.json'
        )
        # Zero out every dropout-related field before the weights are loaded.
        for field in ('hidden_dropout_prob', 'attention_probs_dropout_prob', 'dropout'):
            setattr(self.config, field, 0)
        self.xlnet_model = AutoModel.from_pretrained(cfg.model_name, config=self.config)
        width = self.config.hidden_size
        self.ln = nn.LayerNorm(width)
        self.out = nn.Linear(width, 1)

    def forward(self, ids, mask):
        """Return one value per sample for token ids `ids` and attention mask `mask`."""
        hidden = self.xlnet_model(ids, attention_mask=mask)['last_hidden_state']
        # Plain mean over dim 1; `mask` is not applied to the average.
        pooled = hidden.mean(dim=1)
        return self.out(self.ln(pooled))
    
class gpt2_model(nn.Module):
    """GPT-2 encoder loaded via AutoModel plus a LayerNorm -> Linear(1) head on mean-pooled states.

    Reference: https://github.com/TakoiHirokazu/kaggle_commonLit_readability_prize/blob/master/exp/ex429.py

    cfg - object exposing `model_name` (checkpoint path/name) and `tokenizer`
          (its length sets the size of the token-embedding table).
    """

    def __init__(self, cfg):
        super(gpt2_model, self).__init__()
        self.cfg = cfg
        # All dropout probabilities are overridden to 0 at load time.
        self.gpt2_model = AutoModel.from_pretrained(self.cfg.model_name,
                                                    attn_pdrop=0,
                                                    embd_pdrop=0,
                                                    resid_pdrop=0,
                                                    summary_first_dropout=0)
        # Resize the embedding table to the tokenizer's vocabulary size.
        self.gpt2_model.resize_token_embeddings(len(cfg.tokenizer))

        # Size the head from the loaded model's configuration instead of a hard-coded
        # 1024, so the class also works with GPT-2 checkpoints of other widths
        # (still 1024 for the gpt2-medium checkpoint used in this notebook).
        hidden_size = self.gpt2_model.config.hidden_size
        self.ln = nn.LayerNorm(hidden_size)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, ids, mask):
        """Return one value per sample for token ids `ids` and attention mask `mask`."""
        emb = self.gpt2_model(ids, attention_mask=mask)["last_hidden_state"]
        # Plain mean over dim 1; `mask` is not applied to the average.
        emb = torch.mean(emb, axis=1)
        output = self.ln(emb)
        output = self.out(output)
        return output
    
class albert_model(nn.Module):
    """ALBERT encoder plus a LayerNorm -> Linear(1) head on mean-pooled states.

    cfg - object exposing `model_name`, the checkpoint path/name passed to `from_pretrained`.
    """

    def __init__(self, cfg):
        super(albert_model, self).__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(self.cfg.model_name)
        # Both dropout probabilities are overridden to 0 when the encoder is loaded.
        self.albert = AlbertModel.from_pretrained(
            self.cfg.model_name,
            hidden_dropout_prob=0,
            attention_probs_dropout_prob=0
        )

        self.ln = nn.LayerNorm(self.config.hidden_size)
        self.out = nn.Linear(self.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids=None):
        """Return one value per sample.

        ids            - token ids.
        mask           - attention mask.
        token_type_ids - optional segment ids. Defaults to None so the model can be
                         driven by the shared `predict(model, loader)` helper, which
                         calls `model(ids, mask)`; None is forwarded to the encoder,
                         which then uses its own default segment ids.
        """
        emb = self.albert(ids, attention_mask=mask, token_type_ids=token_type_ids)[
            "last_hidden_state"]
        # Plain mean over dim 1; `mask` is not applied to the average.
        emb = torch.mean(emb, axis=1)
        output = self.ln(emb)
        output = self.out(output)
        return output
    
class bart_model(nn.Module):
    """Encoder-decoder loaded via AutoModel plus a LayerNorm -> Linear(1) head on mean-pooled states.

    Reference: https://github.com/TakoiHirokazu/kaggle_commonLit_readability_prize/blob/master/exp/ex107.py

    cfg - object exposing `model_name`, the checkpoint path/name passed to `from_pretrained`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model_name)
        # Set on the local config object only; this object is not passed to the model below.
        self.config.hidden_dropout_prob = 0
        # Dropout and attention dropout are overridden to 0 at load time.
        self.bart = AutoModel.from_pretrained(
            cfg.model_name,
            dropout=0.0,
            attention_dropout=0.0,
        )
        width = self.config.hidden_size
        self.ln = nn.LayerNorm(width)
        self.out = nn.Linear(width, 1)

    def forward(self, ids, mask):
        """Return one value per sample for token ids `ids` and attention mask `mask`."""
        hidden = self.bart(ids, attention_mask=mask)['last_hidden_state']
        # Plain mean over dim 1; `mask` is not applied to the average.
        pooled = hidden.mean(dim=1)
        return self.out(self.ln(pooled))
    
# ====================================================
# comon prediction func
# ====================================================
def predict(model, loader):
    """Score every batch of `loader` with `model` and return all scores as one NumPy array.

    model  - module called as `model(ids, mask)`; its output is squeezed on dim 1.
    loader - iterable of dicts with 'ids' and 'mask' tensors.

    The model is moved to the global `device` and put in eval mode; outputs are
    passed through a sigmoid before being collected.
    """
    model = model.to(device).eval()
    batch_scores = []
    for batch in tqdm(loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        # Inference only: no autograd bookkeeping.
        with torch.no_grad():
            logits = model(ids, mask).squeeze(1)
        batch_scores.append(logits.sigmoid().detach().cpu())
    return np.concatenate(batch_scores)

In [None]:
# Reload the comments to score for the transformer models.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# A file of exactly 7537 rows is cut down to its first 320 rows.
if len(df) == 7537:
    df = df.iloc[:320]

# Separate copy that receives text cleaning; `df` itself keeps the raw text.
df_cleaned = df.copy()
df.head()

In [None]:
# Clean every comment in the copy; the models in the next section read `df_cleaned`.
df_cleaned['text'] = [text_cleaning(raw) for raw in df_cleaned['text']]

## Trained Binary Data Models

In [None]:
# %%time
# """====================================================
# roberta_base binary exp001
# ===================================================="""
# roberta_base_MODEL_PATH = '../input/roberta-transformers-pytorch/roberta-base'
# roberta_base_tokenizer = RobertaTokenizer.from_pretrained(roberta_base_MODEL_PATH)

# class CFG_BI_001:
#     CV = 0.6981
#     model_name = roberta_base_MODEL_PATH
#     max_len = 256
#     # DataLoader
#     loader = {
#         "batch_size": 32,
#         "num_workers": 2,
#         "shuffle": False,
#         "pin_memory": True,
#         "drop_last": False,
#     }
# _CFG = CFG_BI_001 # edit
# ds = JigsawDataset(_CFG, df_cleaned, roberta_base_tokenizer) # edit
# loader = DataLoader(ds, **_CFG.loader)

# predictions = []
# for fold in tqdm(range(5)):
#     model = roberta_model(_CFG) # edit
#     model.load_state_dict(torch.load(f"../input/jigsaw4-model2/bi001_roberta-base/Bi_exp001_fold{fold}_best.pth")) # edit
#     p = predict(model, loader)
#     predictions.append(p)
#     del model
#     torch.cuda.empty_cache()
#     gc.collect()

    
# pred = np.mean(predictions,axis=0)
# all_preds.append(pred)
# # plot
# plt.hist(pred)
# plt.show()

# del ds, loader, pred
# torch.cuda.empty_cache()
# gc.collect()



In [None]:
# %%time
# """====================================================
# # distilroberta-base binary exp002
# ===================================================="""
# distilroberta_base_MODEL_PATH = '../input/distilroberta-base'
# distilroberta_base_tokenizer = RobertaTokenizer.from_pretrained(distilroberta_base_MODEL_PATH)

# class CFG_BI_002:
#     CV = 0.6955
#     model_name = distilroberta_base_MODEL_PATH
#     max_len = 256
#     # DataLoader
#     loader = {
#         "batch_size": 64,
#         "num_workers": 2,
#         "shuffle": False,
#         "pin_memory": True,
#         "drop_last": False,
#     }
# _CFG = CFG_BI_002 # edit
# ds = JigsawDataset(_CFG, df_cleaned, distilroberta_base_tokenizer) # edit
# loader = DataLoader(ds, **_CFG.loader)

# predictions = []
# for fold in tqdm(range(5)):
#     model = roberta_model(_CFG) # edit
#     model.load_state_dict(torch.load(f"../input/jigsaw4-binary-task/Bi_exp002_fold{fold}_best.pth")) # edit
#     p = predict(model, loader)
#     predictions.append(p)
#     del model
#     torch.cuda.empty_cache()
#     gc.collect()

    
# pred = np.mean(predictions,axis=0)
# all_preds.append(pred)
# # plot
# plt.hist(pred)
# plt.show()
# del ds, loader, pred
# torch.cuda.empty_cache()
# gc.collect()


In [None]:
%%time
"""====================================================
# deberta_large binary exp003
===================================================="""
deberta_large_MODEL_PATH = '../input/deberta/large'
deberta_large_tokenizer = DebertaTokenizer.from_pretrained(deberta_large_MODEL_PATH)


class CFG_BI_003:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.7032
    model_name = deberta_large_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=4,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG_BI_003
# This experiment reads the cleaned text.
ds = JigsawDataset(_CFG, df_cleaned, deberta_large_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = deberta_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model2/bi003_deberta-large/Bi_exp003_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
# %%time
# """====================================================
# # distilbert-base-uncased-distilled-squad binary exp004
# ===================================================="""
# distilbert_base_MODEL_PATH = '../input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad'
# distilbert_base_tokenizer = AutoTokenizer.from_pretrained(distilbert_base_MODEL_PATH)

# class CFG_BI_004:
#     CV = 0.6967
#     model_name = distilbert_base_MODEL_PATH
#     max_len = 256
#     # DataLoader
#     loader = {
#         "batch_size": 72,
#         "num_workers": 2,
#         "shuffle": False,
#         "pin_memory": True,
#         "drop_last": False,
#     }
# _CFG = CFG_BI_004 # edit
# ds = JigsawDataset(_CFG, df_cleaned, distilbert_base_tokenizer) # edit
# loader = DataLoader(ds, **_CFG.loader) # edit

# predictions = []
# for fold in tqdm(range(5)):
#     model = distilbert_model(_CFG)
#     model.load_state_dict(torch.load(f"../input/jigsaw4-binary-task-exp004/Bi_exp004_fold{fold}_best.pth")) # edit
#     p = predict(model, loader)
#     predictions.append(p)
#     del model
#     torch.cuda.empty_cache()
#     gc.collect()

    
# pred = np.mean(predictions,axis=0)
# all_preds.append(pred)
# # plot
# plt.hist(pred)
# plt.show()
# del ds, loader, pred
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
%%time
"""====================================================
# gpt2-medium binary exp006
===================================================="""
from transformers import GPT2Model, GPT2Tokenizer, GPT2Config
gpt2_medium_MODEL_PATH = "../input/jigsaw4-gpt-medium-save/gpt-medium/"
gpt2_medium_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_medium_MODEL_PATH)


class CFG_BI_006:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.6993
    model_name = gpt2_medium_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=16,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    # gpt2_model reads cfg.tokenizer to size its embedding table.
    tokenizer = gpt2_medium_tokenizer


_CFG = CFG_BI_006
# This experiment reads the cleaned text.
ds = JigsawDataset(_CFG, df_cleaned, gpt2_medium_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = gpt2_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model2/Bi_006_gpt2-medium/Bi_exp006_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
# %%time
# """====================================================
# # albert-xlarge-v2 binary exp007
# ===================================================="""
# albert_xlarge_MODEL_PATH = '../input/jigsaw4-albert-xlarge-v2-save/albert-xlarge-v2/'
# albert_xlarge_tokenizer = AlbertTokenizer.from_pretrained(albert_xlarge_MODEL_PATH)

# class CFG_BI_007:
#     CV = 0.6962
#     model_name = albert_xlarge_MODEL_PATH
#     max_len = 256
#     # DataLoader
#     loader = {
#         "batch_size": 4,
#         "num_workers": 2,
#         "shuffle": False,
#         "pin_memory": True,
#         "drop_last": False,
#     }
# _CFG = CFG_BI_007 # edit
# ds = JigsawDataset(_CFG, df_cleaned, albert_xlarge_tokenizer) # edit
# loader = DataLoader(ds, **_CFG.loader)

# predictions = []
# for fold in tqdm(range(5)):
#     model = albert_model(_CFG)
#     model.load_state_dict(torch.load(f"../input/jigsaw4-model2/bi007_albert-xlarge-v2/Bi_exp007_fold{fold}_best.pth")) # edit
#     p = predict(model, loader)
#     predictions.append(p)
#     del model
#     torch.cuda.empty_cache()
#     gc.collect()

    
# pred = np.mean(predictions,axis=0)
# all_preds.append(pred)
# # plot
# plt.hist(pred)
# plt.show()
# del ds, loader, pred
# torch.cuda.empty_cache()
# gc.collect()

## Models trained on validation and extra validation data

In [None]:
%%time
"""====================================================
# roberta_base exp008
===================================================="""
roberta_base_MODEL_PATH = '../input/roberta-transformers-pytorch/roberta-base'
roberta_base_tokenizer = RobertaTokenizer.from_pretrained(roberta_base_MODEL_PATH)


class CFG008:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.7017
    model_name = roberta_base_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=32,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG008
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, roberta_base_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = roberta_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model1/exp008_roberta-base/exp008_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%time
"""====================================================
# deberta_large exp022
===================================================="""
deberta_large_MODEL_PATH = '../input/deberta/large'
deberta_large_tokenizer = DebertaTokenizer.from_pretrained(deberta_large_MODEL_PATH)


class CFG022:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.7003
    model_name = deberta_large_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=4,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG022
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, deberta_large_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = deberta_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model1/022_deberta-large/exp022_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
# %%time
# # ====================================================
# # Hate-speech-CNERG/dehatebert-mono-english exp023
# # ====================================================
# dehatebert_MODEL_PATH = '../input/hatespeechcnergdehatebertmonoenglish/dehatebert-mono-english'
# dehatebert_tokenizer = BertTokenizer.from_pretrained(dehatebert_MODEL_PATH)

# class CFG023:
#     CV = 0.6743
#     model_name = dehatebert_MODEL_PATH
#     max_len = 256
#     # DataLoader
#     loader = {
#         "batch_size": 32,
#         "num_workers": 2,
#         "shuffle": False,
#         "pin_memory": True,
#         "drop_last": False,
#     }
# _CFG = CFG023
# ds = JigsawDataset(_CFG, df, dehatebert_tokenizer)
# loader = DataLoader(ds, **_CFG.loader)

# predictions = []
# for fold in tqdm(range(5)):
#     model = bert_model(_CFG)
#     model.load_state_dict(torch.load(f"../input/jigsaw4-model1/023_hatebert/exo023_fold{fold}_best.pth"))
#     p = predict(model, loader)
#     predictions.append(p)
#     del model
#     torch.cuda.empty_cache()
#     gc.collect()

    
# pred = np.mean(predictions,axis=0)
# all_preds.append(pred)
# # plot
# plt.hist(pred)
# plt.show()
# del ds, loader, pred
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
%%time
"""====================================================
# distilroberta-base exp026
===================================================="""
distilroberta_base_MODEL_PATH = '../input/distilroberta-base'
distilroberta_base_tokenizer = RobertaTokenizer.from_pretrained(distilroberta_base_MODEL_PATH)

class CFG026:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.7012
    model_name = distilroberta_base_MODEL_PATH
    max_len = 256
    # DataLoader keyword arguments
    loader = {
        "batch_size": 64,
        "num_workers": 2,
        "shuffle": False,
        "pin_memory": True,
        "drop_last": False,
    }
_CFG = CFG026
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, distilroberta_base_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = roberta_model(_CFG)
    model.load_state_dict(torch.load(f"../input/jigsaw4-model1/026_distilroberta-base/exp026_fold{fold}_best.pth"))
    # Actually run this fold's model. Without this call the loop appended the stale
    # `p` left behind by the previous experiment cell five times, so this model
    # never contributed its own predictions to the blend.
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()


# Element-wise average over the folds, queued for the final weighted blend.
pred = np.mean(predictions,axis=0)
all_preds.append(pred)
# plot
plt.hist(pred)
plt.show()
del ds, loader, pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%time
"""====================================================
# deberta_base exp032
===================================================="""
deberta_base_MODEL_PATH = '../input/deberta/base'
deberta_base_tokenizer = DebertaTokenizer.from_pretrained(deberta_base_MODEL_PATH)


class CFG032:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.7043
    model_name = deberta_base_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=32,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG032
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, deberta_base_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = deberta_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model2/032_deberta-base/exp032_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%time
"""====================================================
# xlnet-base-cased exp033
===================================================="""
xlnet_base_MODEL_PATH = '../input/xlnet-pretrained-models-pytorch/xlnet-base-cased-pytorch_model.bin'
xlnet_base_tokenizer = XLNetTokenizer.from_pretrained(
    "../input/xlnet-pretrained-models-pytorch/xlnet-base-cased-spiece.model"
)


class CFG033:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.6997
    model_name = xlnet_base_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=32,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG033
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, xlnet_base_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = xlnet_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model2/033_xlnet-base-cased/exp033_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%time
"""====================================================
# roberta-large exp034
===================================================="""
robeota_large_MODEL_PATH = '../input/roberta-transformers-pytorch/roberta-large'
robeota_large_tokenizer = RobertaTokenizer.from_pretrained(robeota_large_MODEL_PATH)


class CFG034:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.702
    model_name = robeota_large_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=8,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG034
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, robeota_large_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = roberta_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model3/034_robelta-large/exp034_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%time
"""====================================================
# facebook/bart-base exp035
===================================================="""
bart_base_MODEL_PATH = '../input/bart-models-hugging-face-model-repository/bart-base'
# Note: the tokenizer is loaded from a different directory than the model weights.
bart_base_tokenizer = AutoTokenizer.from_pretrained('../input/bartbase/')


class CFG035:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.7027
    model_name = bart_base_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=16,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG035
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, bart_base_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = bart_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model3/exp035_bart-base/exp035_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

In [None]:
%%time
"""====================================================
# Hate-speech-CNERG/dehatebert-mono-english exp036
===================================================="""
dehatebert_MODEL_PATH = '../input/hatespeechcnergdehatebertmonoenglish/dehatebert-mono-english'
dehatebert_tokenizer = BertTokenizer.from_pretrained(dehatebert_MODEL_PATH)


class CFG036:
    # CV is not read anywhere in this cell (presumably the cross-validation score -- confirm).
    CV = 0.6769
    model_name = dehatebert_MODEL_PATH
    max_len = 256
    # Keyword arguments forwarded as-is to DataLoader.
    loader = dict(
        batch_size=32,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


_CFG = CFG036
# This experiment reads the raw (uncleaned) text in `df`.
ds = JigsawDataset(_CFG, df, dehatebert_tokenizer)
loader = DataLoader(ds, **_CFG.loader)

# One pass over the data per fold checkpoint; each model is released before the next loads.
predictions = []
for fold in tqdm(range(5)):
    model = bert_model(_CFG)
    model.load_state_dict(
        torch.load(f"../input/jigsaw4-model3/036_hatebert/exp036_fold{fold}_best.pth")
    )
    p = predict(model, loader)
    predictions.append(p)
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Element-wise average over the folds, queued for the final weighted blend.
pred = np.stack(predictions).mean(axis=0)
all_preds.append(pred)

# Distribution of the averaged scores.
plt.hist(pred)
plt.show()

del ds
del loader
del pred
torch.cuda.empty_cache()
gc.collect()

best score: 0.7141623488773747  
best weight: [0.18375869 0.00485101 0.18612081 0.20428201 0.06021268 0.2105402
 0.061306   0.02023979 0.05850578 0.01018304]  
 https://www.kaggle.com/teyosan1229/jigsaw4-optuna?scriptVersionId=87206038

In [None]:
# Per-experiment blend weights, in the order the experiment cells append to `all_preds`.
weight = [0.18375869, 0.00485101, 0.18612081, 0.20428201, 0.06021268, 0.2105402, 0.061306, 0.02023979, 0.05850578, 0.01018304]
# zip() silently stops at the shorter sequence, so enabling or commenting out an
# experiment cell without updating `weight` would pair weights with the wrong
# models. Fail loudly instead.
if len(all_preds) != len(weight):
    raise ValueError(
        f"expected {len(weight)} prediction arrays to match the blend weights, "
        f"got {len(all_preds)}"
    )
# Weighted sum of the per-experiment prediction arrays.
pred = np.zeros(all_preds[0].shape)
for p, w in zip(all_preds, weight):
    pred += p*w

In [None]:
# Show the first five blended scores as a quick sanity check.
pred[:5]

## make submission

In [None]:
# Alternative kept for reference: unweighted mean of all experiments.
# df['score'] = np.mean(all_preds,axis=0)
# Attach the weighted blend to the comments frame (positional assignment).
df['score'] = pred
df.head()

In [None]:
# Replace the blended scores by their ranks; method='first' breaks ties by row order.
df['score'] = df['score'].rank(method='first')
df.head()

In [None]:
# Blend the public-notebook ensemble (`data`) with the BERT-family ensemble (`df`).
# Both sides are combined through `.values`, i.e. by row position, not by comment_id,
# so `df` and `data` must list the comments in the same order.
df["score"] = 0.33*df["score"].values + 0.66*data["score"].values
# Convert the blended value back to ordinal ranks for the submission.
df["score"] = rankdata( df["score"], method='ordinal')
df.head(20)

In [None]:
# Remove the comment text in place, then write the remaining columns as the final submission.
df.drop(columns=['text'], inplace=True)
df.to_csv("submission.csv", index=False)