## 1. Environment setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
! pip install pytorch-pretrained-bert pytorch-nlp transformers

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data._utils.collate import default_collate
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import Ridge
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time

import random

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import Ridge


### get device

In [None]:
# Choose the torch device once: fall back to the CPU when CUDA is not
# usable, otherwise take the GPU and report what was found.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Echo the final choice regardless of branch.
print(device)

## 2. Data Load & analysis

In [None]:
# Read the three competition CSV files from the read-only Kaggle input folder.
# comments_to_score: the comments that receive a score in the submission.
comments_to_score = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
# sample_submission: example of the expected submission file.
sample_submission = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
# validation_data: later cells read its "less_toxic" / "more_toxic" columns.
validation_data = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Bare expression: the notebook renders this frame as the cell output.
comments_to_score

In [None]:
# Bare expression: the notebook renders this frame as the cell output.
sample_submission

In [None]:
# Bare expression: the notebook renders this frame as the cell output.
validation_data

In [None]:
# Load the combined Jigsaw training data used to build the regression target.
toxic_data_path = "../input/all-in-one-jigsaw/all_in_one_jigsaw.csv"
# low_memory=False makes pandas read the file in one pass instead of chunks,
# which avoids mixed-dtype inference per chunk.
toxic_data = pd.read_csv(toxic_data_path, low_memory=False)
# Summary statistics of the numeric columns (shown as the cell output).
toxic_data.describe()

In [None]:
# Collapse the six label columns into one target `y`:
#   1. count `severe_toxic` twice,
#   2. sum the six labels per row (cast to int),
#   3. divide by the largest sum so the maximum becomes 1,
#   4. keep only the processed comment text (renamed to `text`) and `y`.
toxic_data = toxic_data.assign(severe_toxic=toxic_data.severe_toxic * 2)
toxic_data = toxic_data.assign(
    y=toxic_data[
        ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    ].sum(axis=1).astype(int)
)
toxic_data = toxic_data.assign(y=toxic_data['y'] / toxic_data['y'].max())
toxic_data = (
    toxic_data[['comment_text_processed', 'y']]
    .rename(columns={'comment_text_processed': 'text'})
)
toxic_data.describe()

In [None]:
# Force the text column to Python strings so the `.str` accessor can be used.
# NOTE(review): astype("str") turns missing values into the literal text
# "nan" — confirm that is acceptable, or drop/fill missing rows first.
toxic_data["text"] = toxic_data["text"].astype("str")
# Frequency of each distinct target value (shown as the cell output).
toxic_data["y"].value_counts()

In [None]:
def clean(data, col):
    """Normalise punctuation and elongated characters in a text column.

    The column is rewritten in place on ``data`` and the same frame is
    returned, so both ``clean(df, c)`` and ``df = clean(df, c)`` work.

    Every regular-expression rule is applied with an explicit ``regex=True``.
    Since pandas 2.0 ``Series.str.replace`` defaults to ``regex=False``, so
    without the flag the patterns are matched as literal text and the
    cleaning silently does nothing.

    Args:
        data: pandas DataFrame holding the text column.
        col: Name of the string column to clean.

    Returns:
        The same DataFrame object, with ``data[col]`` cleaned.
    """
    # Ordered (pattern, replacement) rules; order matters because later rules
    # operate on the output of earlier ones.
    regex_rules = (
        # Clean some punctuation: split "word/word", "word.word", ... apart.
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Runs of four or more of * ! ? ' are shortened to three.
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # Put spaces around any run of those characters.
        (r'([*!?\']+)', r' \1 '),
        # A letter repeated three or more times at a word end becomes two.
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # A letter repeated four or more times inside a word becomes three.
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        # Collapse runs of spaces (newlines are left untouched).
        (r'[ ]{2,}', ' '),
    )

    # Literal replacement: surround newlines with spaces.
    text = data[col].str.replace('\n', ' \n ', regex=False)
    for pattern, replacement in regex_rules:
        text = text.str.replace(pattern, replacement, regex=True)

    data[col] = text.str.strip()
    return data

In [None]:
# Clean the "text" column; clean() modifies the frame and returns it.
toxic_data = clean(toxic_data, "text")
# Show the cleaned frame as the cell output.
toxic_data

In [None]:
# toxic_data[toxic_data["y"] >=0.8]
# temp_len_df = toxic_data["text"].str.len()
# temp_len_df[temp_len_df > 200]

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed (int): Value applied to Python's ``random``, NumPy and PyTorch
            (CPU and CUDA); it is also exported as ``PYTHONHASHSEED``.
    """
    # Environment values must be strings.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Feed the same seed to each generator.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        seeder(seed)

    # Disable the cuDNN autotuner and request deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print("seed_everything")

# NOTE(review): `config` is only defined in a later cell (section 3,
# "class config"). On a fresh kernel run top-to-bottom this line raises
# NameError; run the config cell first or move it above this one.
seed_everything(config.seed)

## 3. Model 

### 1. model: bert + fc

In [None]:
from transformers import AlbertTokenizer, AlbertModel

class config:
    """Notebook-wide settings, read as plain class attributes."""

    # Seed passed to seed_everything().
    seed = 2021
    # Hold-out fraction (presumably for train_test_split — confirm at the call site).
    test_size = 0.2

    # Maximum sequence length and number of samples per batch.
    max_len = 512
    batch_size = 256

    # Local directory of the pretrained BERT checkpoint.
    bert_path = "../input/torch-bert-weights/bert-base-uncased/bert-base-uncased"
    # ALBERT alternative, kept for reference:
    # bert_path = "../input/pretrained-albert-pytorch/albert-base-v2"
    
    
# Load the pretrained BERT encoder from the local path in config.bert_path.
bert = BertModel.from_pretrained(config.bert_path)
# Build the tokenizer from the local bert-base-uncased vocabulary file.
tokenizer = BertTokenizer.from_pretrained('../input/torch-bert-weights/bert-base-uncased-vocab.txt')

# bert = AlbertModel.from_pretrained(config.bert_path, return_dict=False)
# vocab_file = '../input/pretrained-albert-pytorch/albert-base-v2/spiece.model'
# tokenizer = AlbertTokenizer(vocab_file)

In [None]:
# class JigsawDataset(Dataset):
#     def __init__(self, samples, max_seq_length, tokenizer):
#         self.samples = samples
#         self.max_seq_length = max_seq_length - 2
#         self.tokenizer = tokenizer
#         self.length = len(self.samples)

#     def convert2ids(self, sample):
#         sample_text = sample["text"]
#         tokens = self.tokenizer.tokenize(sample_text)
#         if len(tokens) > self.max_seq_length:
#             tokens = tokens[: self.max_seq_length]

#         padding_num = self.max_seq_length - len(tokens)
#         tokens = ["[CLS]"] + tokens + ["[SEP]"]

#         one_token = self.tokenizer.convert_tokens_to_ids(tokens) + [0] * padding_num
#         one_mask = [1] * len(tokens) + [0] * padding_num
#         return torch.tensor(one_token, dtype=torch.int32), torch.tensor(one_mask, dtype=torch.int32)

    
#     def __getitem__(self, index):
#         sample = self.samples.iloc[index]
#         one_token, one_mask = self.convert2ids(sample)
#         logit = self.samples.iloc[index]["y"]
#         return one_token, one_mask, torch.tensor(logit, dtype=torch.float32)

    
#     def __len__(self):
#         return self.length

In [None]:
# train_dataset = JigsawDataset(samples=toxic_data, max_seq_length=config.max_len, tokenizer=tokenizer)
# train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=False)

In [None]:
# for i in test_dataloader:
#     print(len(i))
#     print(i[0].dtype, i[1].dtype, i[2].dtype)
#     print(i[0].shape, i[1].shape, i[2].shape)
#     break

In [None]:
# class ToxicModel(nn.Module):
    
#     def __init__(self, bert_encoder, n_hidden, device):
#         super().__init__()
#         # Load the BERT model and freeze its parameters
#         self.bert = bert_encoder.to(device)
#         for param in self.bert.parameters():
#             param.requires_grad = False
            
#         self.output = nn.Sequential(
#             nn.Dropout(0.2),
#             nn.Linear(768, n_hidden),
#             nn.ReLU(),
#             nn.Linear(n_hidden, 1),
#             nn.Sigmoid()
#         ).to(device)

#     def forward(self, seqs, attention_mask):
#         _, pooled = self.bert(seqs, attention_mask=attention_mask, output_all_encoded_layers=False) #output_all_encoded_layers=False
#         logits = self.output(pooled)
#         return logits

In [None]:
# class Trainner:
#     def __init__(self, num_epochs, lr, device):
#         self.device = device
#         self.net = ToxicModel(bert, 768, device)
#         self.num_epochs = num_epochs
#         self.opt = optim.AdamW(self.net.parameters(), lr=lr)
#         self.mseloss = nn.MSELoss(reduction="sum").to(device)
#         print("ToxicModel is on the", next(self.net.parameters()).device)
    
#     def train(self, train_loader):
#         self.net.train()
#         total_acc, total_count = 0, 0
#         log_interval = 500
    
#         for epoch in range(self.num_epochs):
#             running_loss = 0.0
#             total_count = 0
#             start_time = time.time()
#             for idx, data in enumerate(tqdm(train_loader)):
#                 texts, masks, logits = [item.to(self.device) for item in data]

#                 # zero the parameter gradients
#                 self.opt.zero_grad()

#                 # forward + backward + optimize
#                 outputs = self.net(texts, masks)
#                 loss = self.mseloss(outputs.squeeze(-1), logits)
#                 loss.backward()
#                 self.opt.step()

#                 # print statistics
#                 running_loss += loss.item()
                
#                 if idx % log_interval == 0 and idx > 0:
#                     total_count += log_interval
#                     elapsed = time.time() - start_time
#                     print('| epoch {:3d} | {:5d}/{:5d} batches '
#                           '| loss {:8.3f}'.format(epoch, idx, len(train_loader), running_loss / total_count))
#                     start_time = time.time()

#             print('| epoch {:3d} | {:5d} batches '
#                   '| loss {:8.3f}'.format(epoch, len(train_loader), running_loss / len(train_loader)))
#             print('Finished Training')
            
            
#     def save_model(self, path):
#         torch.save(self.net, path)
    
#     def load_model(self, path):
#         self.net = torch.load(path)
        
#     def evaluate(self, test_loader):
#         self.net.eval()
#         test_loss = 0.
#         with torch.no_grad():
#             for idx, data in enumerate(tqdm(test_loader)):
#                 texts, masks, logits = [item.to(self.device) for item in data]
#                 pred = self.net(texts, masks)
#                 loss = self.mseloss(pred, logits)
#                 test_loss += loss.item()
#         print(test_loss)
    
#     def infer(self, test_loader):
#         self.net.eval()
#         result = []
#         with torch.no_grad():
#             for idx, data in enumerate(tqdm(test_loader)):
#                 texts, masks = [item.to(self.device) for item in data]
#                 pred = self.net(texts, masks)
#                 result.append(pred)
#         return torch.cat(result, dim=0)
    
#     def evaluate_by_contrast(self, less_toxic_loader, more_toxic_loader):
#         less_toxic_pred = trainer.infer(less_toxic_loader).cpu().numpy()
#         more_toxic_pred = trainer.infer(more_toxic_loader).cpu().numpy()
#         valid_result = np.array(less_toxic_pred < more_toxic_pred, dtype=np.int32)
#         acc = valid_result.sum() / len(valid_result)
#         return acc


# trainer = Trainner(1, 1e-5, device)

In [None]:
# mini_data = toxic_data.iloc[0: 2000]
# mini_dataset = JigsawDataset(samples=mini_data, max_seq_length=config.max_len, tokenizer=tokenizer)
# mini_dataloader = DataLoader(mini_dataset, batch_size=config.batch_size, shuffle=False)
# trainer.train(train_dataloader)
# trainer.train(mini_dataloader)

In [None]:
# class EvaluateDataset(Dataset):
#     def __init__(self, samples, max_seq_length, tokenizer):
#         self.samples = samples
#         self.max_seq_length = max_seq_length - 2
#         self.tokenizer = tokenizer
#         self.length = len(self.samples)

#     def convert2ids(self, sample_text):
#         tokens = self.tokenizer.tokenize(sample_text)
#         if len(tokens) > self.max_seq_length:
#             tokens = tokens[: self.max_seq_length]

#         padding_num = self.max_seq_length - len(tokens)
#         tokens = ["[CLS]"] + tokens + ["[SEP]"]

#         one_token = self.tokenizer.convert_tokens_to_ids(tokens) + [0] * padding_num
#         one_mask = [1] * len(tokens) + [0] * padding_num
#         return torch.tensor(one_token, dtype=torch.int32), torch.tensor(one_mask, dtype=torch.int32)

    
#     def __getitem__(self, index):
#         sample = self.samples.iloc[index]
#         return self.convert2ids(sample)

    
#     def __len__(self):
#         return self.length

In [None]:
# temp_dataset = EvaluateDataset(validation_data["less_toxic"], max_seq_length=config.max_len, tokenizer=tokenizer)
# less_toxic_loader = DataLoader(temp_dataset, batch_size=config.batch_size, shuffle=False)

# temp_dataset = EvaluateDataset(validation_data["more_toxic"], max_seq_length=config.max_len, tokenizer=tokenizer)
# more_toxic_loader = DataLoader(temp_dataset, batch_size=config.batch_size, shuffle=False)

In [None]:
# trainer.evaluate_by_contrast(less_toxic_loader, more_toxic_loader)

In [None]:
# temp_dataset = EvaluateDataset(comments_to_score["text"], max_seq_length=config.max_len, tokenizer=tokenizer)
# res_loader = DataLoader(temp_dataset, batch_size=config.batch_size, shuffle=False)
# pred_res = trainer.infer(res_loader).cpu().numpy()

In [None]:
# comment_id = comments_to_score["comment_id"].to_numpy().reshape(-1, 1)
# result = np.hstack((comment_id, pred_res))
# result = pd.DataFrame(result, columns=["comment_id", "score"])
# result.to_csv("submission.csv", index=False)

### 2. Model: Bert + Ridge