In [None]:
import os
import copy
import pickle
import random
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from gensim.models import KeyedVectors, FastText

#torch packages
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

#transformer packages
from transformers import BertTokenizer, BertModel
from transformers import AdamW
from transformers import get_scheduler
from transformers import AutoTokenizer, AutoModel

In [None]:
def set_seed(seed=5080):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy and PyTorch (CPU generator and the CUDA
    generator), forces cuDNN into deterministic mode, and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators (default 5080).
    """
    # Exported as a string because environment values must be text.
    # NOTE(review): setting PYTHONHASHSEED after the interpreter has started
    # presumably only affects child processes -- confirm if hash order matters.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Pure-Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU, then CUDA).
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN: disable the auto-tuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
class JigsawTestDataset(Dataset):
    """Inference-time dataset yielding one tokenized comment per item.

    Each item is a dict with ``text_ids`` and ``text_mask`` tensors and,
    depending on the flags, one extra feature tensor: ``tfidf`` (when
    ``use_tfidf``) or ``sent_embed`` (when ``use_sentence_embedding``).
    ``use_tfidf`` takes precedence if both flags are set.

    Args:
        df: DataFrame with a ``text`` column.
        max_length: Length every sequence is truncated/padded to.
        tokenizer: Object exposing ``encode_plus`` that returns
            ``input_ids`` and ``attention_mask``.
        use_tfidf: Attach row ``index`` of ``tfidf_matrix`` to each item.
        tfidf_matrix: Indexable feature matrix, one row per row of ``df``.
        use_sentence_embedding: Attach row ``index`` of ``embed_matrix``.
        embed_matrix: Indexable feature matrix, one row per row of ``df``.
    """

    def __init__(
        self, df, max_length, tokenizer = None, use_tfidf=False,
        tfidf_matrix=None, use_sentence_embedding=False, embed_matrix=None
    ):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df['text'].values
        self.use_tfidf = use_tfidf
        self.use_sentence_embedding = use_sentence_embedding
        if use_tfidf:
            self.tfidf_matrix = tfidf_matrix
        elif use_sentence_embedding:
            self.embed_matrix = embed_matrix

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize comment ``index`` and return its tensors as a dict."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )
        item = {
            'text_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'text_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long)
        }

        # NOTE(review): dtype=torch.long truncates any fractional feature
        # values. It is kept because predict_combine casts these tensors to
        # long as well; presumably the checkpoint was trained the same way --
        # confirm before switching to a float dtype.
        if self.use_tfidf:
            item['tfidf'] = torch.tensor(
                self.tfidf_matrix[index], dtype=torch.long
            )
        elif self.use_sentence_embedding:
            # Fixed: the original tested the bare name `use_sentence_embedding`,
            # which is not defined in this scope (NameError, or a silently
            # picked-up notebook global).
            item['sent_embed'] = torch.tensor(
                self.embed_matrix[index], dtype=torch.long
            )
        return item

In [None]:
class NN(nn.Module):
    """Regression head: dropout followed by a single linear layer to one score.

    The linear layer's input width is ``HID_DIM`` plus the width of the
    optional extra feature vector that is concatenated to the encoder output.

    Args:
        bert_drop_out: Dropout probability applied to the head's input.
        HID_DIM: Width of the encoder's pooled output (default 768).
        tfidf_len: Width of the TF-IDF vector; used only when ``use_tfidf``.
        use_tfidf: Size the head for pooled output + TF-IDF features.
        use_sentence_embedding: Size the head for pooled output + sentence
            embedding. Ignored when ``use_tfidf`` is set.
        embed_len: Width of the sentence embedding; used only when
            ``use_sentence_embedding`` is set and ``use_tfidf`` is not.
    """

    def __init__(
        self, bert_drop_out, HID_DIM=768, tfidf_len=0, use_tfidf=False,
        use_sentence_embedding=False, embed_len=0
    ):
        super().__init__()
        if use_tfidf:
            extra_dim = tfidf_len
        elif use_sentence_embedding:
            extra_dim = embed_len
        else:
            extra_dim = 0

        # Fixed: HID_DIM was accepted but ignored (768 was hard-coded).
        # The Sequential layout is unchanged so existing checkpoints
        # (keys `net.1.weight` / `net.1.bias`) still load.
        self.net = nn.Sequential(
            nn.Dropout(p=bert_drop_out),  # dropout on the head's input
            nn.Linear(HID_DIM + extra_dim, 1)
        )

    def forward(self, x):
        """Return one score per row of ``x``."""
        return self.net(x)
                
class JigsawModel(nn.Module):
    """Encoder plus scoring head.

    Runs the encoder, takes its ``pooler_output``, optionally concatenates an
    extra per-sample feature vector along dim 1, and feeds the result to the
    head.

    Args:
        BERT: Encoder module called with ``input_ids``, ``attention_mask`` and
            ``output_hidden_states``; its result must support
            ``["pooler_output"]``.
        NN: Head module mapping the (possibly widened) pooled vector to a score.
    """

    def __init__(self, BERT, NN):
        super().__init__()
        self.bert = BERT
        self.fc = NN

    def forward(
        self, ids, mask, tfidf_vec=None, use_tfidf=False,
        sent_embed=None, use_sentence_embedding=False
    ):
        """Score a batch; ``use_tfidf`` takes precedence over ``use_sentence_embedding``."""
        encoded = self.bert(
            input_ids=ids, attention_mask=mask, output_hidden_states=False
        )
        pooled = encoded["pooler_output"]

        if use_tfidf:
            return self.fc(torch.cat((pooled, tfidf_vec), dim=1))
        if use_sentence_embedding:
            return self.fc(torch.cat((pooled, sent_embed), dim=1))
        return self.fc(pooled)

In [None]:
def create_test_corpus(df_test):
    """Return the ``text`` column of ``df_test`` as a plain Python list."""
    text_column = df_test["text"]
    return text_column.to_list()

def tokenize_test_by_bert_tokenizer(bert_tokenizer, corpus):
    """Tokenize every sentence in ``corpus`` with ``bert_tokenizer.tokenize``.

    Returns:
        A list with one tokenizer result per sentence, in corpus order.
    """
    tokenized = []
    for sentence in corpus:
        tokenized.append(bert_tokenizer.tokenize(sentence))
    return tokenized

def testCorpus2tfidf(tfidf_transfomer, corpus_tokenized):
    tfidf_matrix_sparse = tfidf_transfomer.transform(corpus_tokenized)
    tfidf_matrix = tfidf_matrix_sparse.toarray()
    return tfidf_matrix

def identity_tokenizer(text):
    """Return ``text`` unchanged.

    NOTE(review): not called anywhere in this notebook; presumably the pickled
    TF-IDF object references it by name, so it must exist before unpickling --
    confirm before removing.
    """
    return text

In [None]:
# Seed all RNGs first so everything below is reproducible.
set_seed(5080)

# Local copy of the pretrained checkpoint; used for both tokenizer and encoder.
PRETRAINED_MODEL_NAME = "../input/transformers/roberta-base"

# Comments to be scored, plus the tokenizer that matches the encoder.
data_test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
bert_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Use the first GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Load the fitted TF-IDF object saved at training time and apply it to the
# test comments.
TFIDF_PATH = "../input/temp-model/tfidf_roberta_obj (1).pickle"
with open(TFIDF_PATH, 'rb') as f:
    # SECURITY: pickle.load can execute arbitrary code from the file; only
    # load pickles from a trusted source.
    tfidf_transformer = pickle.load(f)
    
# Raw text -> tokenizer tokens -> dense TF-IDF matrix (one row per comment).
test_corpus = create_test_corpus(data_test)
corpus_tokenized = tokenize_test_by_bert_tokenizer(bert_tokenizer, test_corpus)
tfidf_matrix = testCorpus2tfidf(tfidf_transformer, corpus_tokenized)

In [None]:
# Vocabulary of the fitted TF-IDF object and the list of its keys.
# Both are only consumed by the (currently commented-out) sentence-embedding cells.
token2idx = tfidf_transformer.vocabulary_
token_list = list(tfidf_transformer.vocabulary_)

In [None]:
# Display the TF-IDF matrix dimensions; the second dimension is later used as tfidf_len.
tfidf_matrix.shape

# Build sentence embedding

In [None]:
# fmodel = FastText.load(
#     '../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin'
# )

In [None]:
# w2v_embed_dim = 256
# w2v = np.zeros(
#     (len(token2idx), w2v_embed_dim)
# )
# for tok in token_list:
#     token_idx = token2idx[tok]
#     w2v[token_idx] = fmodel.wv[tok]

In [None]:
# sentence_embedding = np.dot(
#     tfidf_matrix, w2v
# )

In [None]:
# sentence_embedding.shape

# Dataset and Dataloader

In [None]:
# Inference configuration. `use_tfidf` / `use_sentence_embedding` are also read
# by the model-construction and prediction cells further down.
batch_size = 16
max_token_length = 128
use_tfidf = True
use_sentence_embedding = False
if use_tfidf:
    # Width of the TF-IDF feature vector; later passed to NN to size its linear layer.
    tfidf_len = tfidf_matrix.shape[1]
    test_dataset = JigsawTestDataset(
        data_test, max_length=max_token_length, tokenizer=bert_tokenizer,
        use_tfidf=use_tfidf, tfidf_matrix=tfidf_matrix
    )
elif use_sentence_embedding:
    # NOTE(review): `sentence_embedding` is only built by the commented-out
    # cells above; this branch raises NameError unless they are restored.
    embed_len = sentence_embedding.shape[1]
    test_dataset = JigsawTestDataset(
        data_test, max_length=max_token_length, tokenizer=bert_tokenizer,
        use_sentence_embedding=use_sentence_embedding, embed_matrix=sentence_embedding
    )
else:
    test_dataset = JigsawTestDataset(
        data_test, max_length=max_token_length, tokenizer=bert_tokenizer
    )

# shuffle=False keeps predictions in row order; the scores are later assigned
# to the submission frame positionally.
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False,
    num_workers=2)

In [None]:
# Hyperparameter constants. Only HID_DIM and BDR are used by the code in this
# cell; LR / EPOCH / MARGIN / DATE / WD are not read by any active inference code.
# (LR was previously assigned twice with the same value.)
LR = 1e-5
EPOCH = 10
HID_DIM = 768
MARGIN = 0.5
DATE = "0115"
WD = 0
# NOTE(review): the checkpoint file name says BDR_0.3 while this is 0.2.
# predict_combine calls model.eval() before scoring, which disables dropout.
BDR = 0.2

# Plain string: the original f-string had no placeholders.
MODEL_PATH = "../input/temp-model/0116_roberta_LR_0.0001_WD_1e-06_BDR_0.3.pth"

bert = AutoModel.from_pretrained(PRETRAINED_MODEL_NAME).to(device)
# The head must be sized exactly like the one stored in the checkpoint.
if use_tfidf:
    dnn = NN(
        BDR, HID_DIM,
        tfidf_len, use_tfidf
    ).to(device)
elif use_sentence_embedding:
    dnn = NN(
        BDR, HID_DIM,
        embed_len=embed_len, use_sentence_embedding=True
    ).to(device)
else:
    dnn = NN(
        BDR, HID_DIM
    ).to(device)

model = JigsawModel(bert, dnn).to(device)

# map_location=device handles both the CPU and the CUDA case and also remaps
# tensors that were saved from a different device than the one used here.
checkpoint = torch.load(MODEL_PATH, map_location=device)

# model.bert / model.fc are the same objects as `bert` / `dnn`, so loading
# through the wrapper updates them too.
model.bert.load_state_dict(checkpoint["BERT"])
model.fc.load_state_dict(checkpoint["NN"])

# Check training set accuracy 
## (Please comment out the following sections before submitting to the competition)

In [None]:
# class JigsawDataset(Dataset):
#     def __init__(self, df, tokenizer, max_length, use_tfidf=False, tfidf_matrix=None):
#         self.df = df
#         self.max_len = max_length
#         self.tokenizer = tokenizer
#         self.more_toxic = df['more_toxic'].values
#         self.less_toxic = df['less_toxic'].values
#         self.use_tfidf = use_tfidf
#         if use_tfidf:
#             self.more_toxic_tfidf_idx = df['more_toxic_tfidf_idx'].values
#             self.less_toxic_tfidf_idx = df['less_toxic_tfidf_idx'].values
#             self.tfidf_matrix = tfidf_matrix

#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, index):
#         more_toxic = self.more_toxic[index]
#         less_toxic = self.less_toxic[index]
#         inputs_more_toxic = self.tokenizer.encode_plus(
#                                 more_toxic,
#                                 truncation=True,
#                                 add_special_tokens=True,
#                                 max_length=self.max_len,
#                                 padding='max_length'
#                             )
#         inputs_less_toxic = self.tokenizer.encode_plus(
#                                 less_toxic,
#                                 truncation=True,
#                                 add_special_tokens=True,
#                                 max_length=self.max_len,
#                                 padding='max_length'
#                             )
#         target = 1
        
#         more_toxic_ids = inputs_more_toxic['input_ids']
#         more_toxic_mask = inputs_more_toxic['attention_mask']        
#         less_toxic_ids = inputs_less_toxic['input_ids']
#         less_toxic_mask = inputs_less_toxic['attention_mask']
        
#         if self.use_tfidf:
#             more_toxic_tfidf_idx = self.more_toxic_tfidf_idx[index]
#             less_toxic_tfidf_idx = self.less_toxic_tfidf_idx[index]
#             more_toxic_tfidf = self.tfidf_matrix[more_toxic_tfidf_idx]
#             less_toxic_tfidf = self.tfidf_matrix[less_toxic_tfidf_idx]
#             return {
#                 'more_toxic_ids': torch.tensor(more_toxic_ids, dtype=torch.long),
#                 'more_toxic_mask': torch.tensor(more_toxic_mask, dtype=torch.long),
#                 'more_toxic_tfidf': torch.tensor(more_toxic_tfidf, dtype=torch.long),
#                 'less_toxic_ids': torch.tensor(less_toxic_ids, dtype=torch.long),
#                 'less_toxic_mask': torch.tensor(less_toxic_mask, dtype=torch.long),
#                 'less_toxic_tfidf': torch.tensor(less_toxic_tfidf, dtype=torch.long),
#                 'target': torch.tensor(target, dtype=torch.long)
#             }
#         else:
#             return {
#                 'more_toxic_ids': torch.tensor(more_toxic_ids, dtype=torch.long),
#                 'more_toxic_mask': torch.tensor(more_toxic_mask, dtype=torch.long),
#                 'less_toxic_ids': torch.tensor(less_toxic_ids, dtype=torch.long),
#                 'less_toxic_mask': torch.tensor(less_toxic_mask, dtype=torch.long),
#                 'target': torch.tensor(target, dtype=torch.long)
#             }

In [None]:
# def validate_all_combine(
#     model, criterion, 
#     valid_loader, device, use_tfidf=False
# ):
#     epoch_loss = 0
#     y_preds = []
    
#     model.eval()
#     with torch.no_grad():
#         for data in valid_loader:
#             more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
#             more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
#             less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
#             less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
#             targets = data['target'].to(device, dtype=torch.long)
            
#             if use_tfidf:
#                 more_toxic_tfidf = data['more_toxic_tfidf'].to(device, dtype = torch.long)
#                 less_toxic_tfidf = data['less_toxic_tfidf'].to(device, dtype = torch.long)
#                 more_out = model(more_toxic_ids, more_toxic_mask, more_toxic_tfidf, use_tfidf=True)
#                 less_out = model(less_toxic_ids, less_toxic_mask, less_toxic_tfidf, use_tfidf=True)
#             else:
#                 more_out = model(more_toxic_ids, more_toxic_mask)
#                 less_out = model(less_toxic_ids, less_toxic_mask)
            
#             loss = criterion(more_out, less_out, targets)

#             epoch_loss += loss.item()
#             for i in range(len(data['more_toxic_ids'])):
#                 y_preds.append([less_out[i].item(), more_out[i].item()])
#         df_score = pd.DataFrame(y_preds,columns=['less','more'])
#         accuracy = validate_accuracy(df_score)
#     return df_score, accuracy, (epoch_loss / len(valid_loader))

# def validate_accuracy(df_score):
#     return len(df_score[df_score['less'] < df_score['more']]) / len(df_score)

# def return_wrong_text(df_score, df_valid):
#     df_score_text = pd.concat((df_valid.reset_index().drop('index',axis=1),df_score),axis=1)
#     return df_score_text[df_score_text['less'] > df_score_text['more']]

In [None]:
# def remove_duplicates(df, used_col):
#     """Combine `less_toxic` text and `more_toxic` text,
#     then remove duplicate pair of comments while keeping the last pair
#     """
#     df["combine"] = df["less_toxic"] + df["more_toxic"]
#     df = df.drop_duplicates(subset=used_col, keep="last")
#     return df

In [None]:
# data_train = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# #data_train_remove_duplicates = remove_duplicates(data_train, "combine")

# train_dataset = JigsawDataset(
#         data_train, tokenizer=bert_tokenizer, 
#         max_length=max_token_length, use_tfidf=use_tfidf, tfidf_matrix=tfidf_matrix
#     )

# train_loader = DataLoader(
#         train_dataset, batch_size=batch_size, shuffle=True,
#         num_workers=2)
    
# criterion = nn.MarginRankingLoss(margin=MARGIN)

# df_score, valid_acc, valid_loss = validate_all_combine(
#     model, criterion, all_loader, 
#     device, use_tfidf
# )

# print(f"Recheck: Accuracy = {valid_acc}, loss = {valid_loss}")

# Inference

In [None]:
def predict_combine(
    model, test_loader, device,
    use_tfidf=False, use_sentence_embedding=False
):
    """Score every batch of ``test_loader`` with ``model`` and collect the results.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Callable as ``model(ids, mask, ...)`` (see ``JigsawModel.forward``).
        test_loader: Iterable of dicts with ``text_ids`` and ``text_mask`` and,
            depending on the flags, ``tfidf`` or ``sent_embed``.
        device: Device the batch tensors are moved to.
        use_tfidf: Feed ``batch['tfidf']`` to the model (takes precedence).
        use_sentence_embedding: Feed ``batch['sent_embed']`` to the model.

    Returns:
        A flat list of NumPy scalars, one per sample, in loader order.
    """
    scores = []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            ids = batch['text_ids'].to(device, dtype=torch.long)
            attention = batch['text_mask'].to(device, dtype=torch.long)

            if use_tfidf:
                # Cast to long like the other inputs (truncates fractions).
                tfidf_feats = batch['tfidf'].to(device, dtype=torch.long)
                batch_scores = model(ids, attention, tfidf_feats, use_tfidf)
            elif use_sentence_embedding:
                embed_feats = batch["sent_embed"].to(device, dtype=torch.long)
                batch_scores = model(
                    ids, attention,
                    sent_embed=embed_feats,
                    use_sentence_embedding=use_sentence_embedding
                )
            else:
                batch_scores = model(ids, attention)

            # Flatten to 1-D and append each element individually.
            scores.extend(batch_scores.view(-1).cpu().detach().numpy())
    return scores

In [None]:
# Score every test comment, using the same feature flags the dataset was built with.
predict = predict_combine(
    model, test_loader, device, 
    use_tfidf=use_tfidf, use_sentence_embedding=use_sentence_embedding
)

In [None]:
# Convert the list of per-comment scores into a NumPy array.
predict = np.array(predict)

In [None]:
# Sanity check: how many scores were produced and how many are distinct.
# (Fixed the "Predictiions" typo in the printed label.)
print(f"Total Predictions: {predict.shape[0]}")
print(f"Total Unique Predictions: {np.unique(predict).shape[0]}")

In [None]:
# Reload the comments file as the base of the submission frame.
# This is the same CSV path that was loaded into `data_test` earlier.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df.head()

In [None]:
# Attach the raw model scores. Assignment is positional, so this relies on the
# DataLoader having been built with shuffle=False.
df['score'] = predict
df.head()

In [None]:
# Replace raw scores with their rank; method='first' breaks ties by row order,
# so every comment ends up with a distinct score.
df['score'] = df['score'].rank(method='first')
df.head()

In [None]:
# Write the submission without the raw text column.
# A non-mutating drop keeps this cell re-runnable: the original in-place drop
# raised KeyError on 'text' when the cell was executed a second time.
df.drop(columns='text').to_csv("submission.csv", index=False)