In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from torch import cuda

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# import warnings
# warnings.filterwarnings('ignore')

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> Loading the **train** and **test datasets** </a>
---

In [3]:
# Root folder of the competition data. The trailing slash matters: file names
# are appended to this string directly.
INPUT_PATH="../input/metinvest-test-task-toxic-comments/"

# Both CSV files are read with their "id" column as the frame index.
train = pd.read_csv(f"{INPUT_PATH}tox_train.csv", index_col="id")
test = pd.read_csv(f"{INPUT_PATH}tox_test.csv", index_col="id")

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> Preparing **test dataset** for future processing </a>
---

In [4]:
# Give every test row its own placeholder label vector so CustomDataset, which
# reads targets from a `list` column, can be reused for inference.
# A comprehension is used instead of `[[...]] * n` because list repetition
# stores n references to one shared inner list.
test["list"] = [[0.0] * 6 for _ in range(test.shape[0])]
test.head(5)

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> Wrapping the datasets in **CustomDataset** and making a stratified **train/validation split** </a>
---

In [5]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one comment per access.

    The wrapped frame must provide a ``comment_text`` column and a ``list``
    column holding one label vector per row.

    Args:
        dataframe: source frame. Rows are addressed by position, so the index
            does not have to be a clean 0..n-1 range.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_len``) and ``targets`` (float tensor of the labels).
        """
        # .iloc: the index handed in is a position, not a label, so a plain
        # `series[index]` lookup breaks on frames without a 0..n-1 index.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace (newlines, tabs, ...) to one space.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated `pad_to_max_length`:
            # always pad up to max_length and cut anything longer.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split and the shuffles below are identical on every run.
# Without it, a checkpoint reloaded in a later session can end up being
# validated on rows it was trained on.
SEED = 42

# Fraction of each split to keep; lower it for quick experiments.
FRAC_VALUE = 1

# Bucket `target` using 10 evenly spaced edges between its min and max; the
# bucket ids serve as the stratification classes for the split.
target_min = np.amin(train.target)
target_max = np.amax(train.target)
bins = np.linspace(start=target_min, stop=target_max, num=10)
target_binned = np.digitize(train.target, bins, right=True)

train_data, val_data = train_test_split(
    train, test_size=0.1, stratify=target_binned, random_state=SEED
)

# Shuffle (and optionally subsample) each split, then renumber rows from 0.
train_data = train_data.sample(frac=FRAC_VALUE, random_state=SEED).reset_index(drop=True)
val_data = val_data.sample(frac=FRAC_VALUE, random_state=SEED).reset_index(drop=True)
print(train_data.shape)

# The test frame is only renumbered; its row order is left untouched.
test_data = test.reset_index(drop=True)

print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("VALIDATION Dataset: {}".format(val_data.shape))
print("TEST Dataset: {}".format(test_data.shape))



In [6]:
# test_data = test.reset_index(drop=True)

In [None]:
# Label columns that are converted from scores to hard 0/1 classes.
features = ["target", "severe_toxicity", "obscene", "identity_attack", "insult", "threat"]

# A score strictly above 0.5 becomes 1, everything else 0. Only the training
# split is thresholded here.
train_data[features] = np.where(train_data[features].gt(0.5), 1, 0)

In [None]:
# Pack the six label columns of every row into a single Python list stored in
# a `list` column, then keep only the text and that packed column.
train_data = train_data.assign(
    list=train_data[
        ["target", "severe_toxicity", "obscene", "identity_attack", "insult", "threat"]
    ].values.tolist()
)[['comment_text', 'list']]

val_data = val_data.assign(
    list=val_data[
        ["target", "severe_toxicity", "obscene", "identity_attack", "insult", "threat"]
    ].values.tolist()
)[['comment_text', 'list']]


In [7]:
# Sequence length handed to CustomDataset as `max_len`.
MAX_LEN = 220

# NOTE(review): 'bert-base-uncased' is normally paired with lower-casing, so
# do_lower_case=False looks unintended. The saved weights were presumably
# trained with this exact setting — confirm before changing it.
# NOTE(review): truncation=True is a per-call encoding option; passing it to
# from_pretrained presumably has no effect on later encode calls — confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=False)

In [12]:
# Wrap the training and validation frames so rows are tokenized on access.
training_set, val_set = (
    CustomDataset(frame, tokenizer, MAX_LEN) for frame in (train_data, val_data)
)


In [8]:
# Wrap the test frame (with its placeholder `list` column) for inference.
test_set = CustomDataset(test_data, tokenizer, MAX_LEN)

In [9]:

# Mini-batch sizes used by the training and evaluation loaders.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 8
# EPOCHS = 1
# Step size passed to the optimizer.
LEARNING_RATE = 1e-5
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [10]:
# Training batches are reshuffled; evaluation batches are not.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4)
val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=4)

training_loader = DataLoader(training_set, **train_params)
# The validation and test loaders share the same, unshuffled settings.
val_loader = DataLoader(val_set, **val_params)
test_loader = DataLoader(test_set, **val_params)


## <a class="anchor" id="1.5_bullet" style="color:#b14404"> Setting up the **BERT model** </a>
---

In [11]:
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a 6-output linear head.

    ``forward`` returns the raw output of the linear layer; no sigmoid is
    applied inside the model. The attribute names ``l1``/``l2``/``l3`` are
    part of the state-dict keys, so they must stay as they are for saved
    weights to load.
    """

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        """Return one row of 6 scores per input sequence."""
        encoded = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Element 1 of the encoder output is used as the sequence summary —
        # presumably BERT's pooled output; confirm against the installed
        # transformers version.
        pooled = encoded[1]
        return self.l3(self.l2(pooled))

# Build the classifier and move it to the selected device. `Module.to`
# returns the module itself, so assigning its result keeps the cell from
# echoing the full model repr without needing a dummy trailing expression.
model = BERTClass().to(device)

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> Defining the **Loss function** and the **Optimizer** </a>
---

In [12]:
# loss_fn = torch.nn.MSELoss()

def loss_fn(outputs, targets):
    """Binary cross-entropy computed directly on raw (pre-sigmoid) scores.

    Args:
        outputs: model scores before any sigmoid.
        targets: float labels with the same shape as ``outputs``.

    Returns:
        Scalar tensor: the loss averaged over all elements.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [13]:
# Adam over every parameter of `model` (encoder and head alike), using the
# notebook-level LEARNING_RATE.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> Defining the **Train function** </a>
---

In [None]:
def train(epoch):
    """Run one pass over ``training_loader`` and update ``model`` in place.

    Relies on the notebook-level ``model``, ``training_loader``, ``optimizer``,
    ``loss_fn`` and ``device``.

    NOTE: defining this function rebinds the name ``train``, which earlier
    cells use for the training DataFrame; those cells cannot be re-run after
    this one without reloading the data.

    Args:
        epoch: epoch number, used only in the progress messages.
    """
    model.train()
    # tqdm wraps the loader itself (not `enumerate(...)`) so it can read the
    # loader's length and display a total and an ETA.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 1000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients left over from the previous step, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> **Loading** previously saved **model weights** </a>
---

In [14]:
MODEL_WEIGHTS_FILE = 'bert_model_weights_v3.pth'

# map_location remaps tensors that were saved from a GPU onto the current
# device, so the checkpoint also loads in a CPU-only session.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(
    torch.load(INPUT_PATH + MODEL_WEIGHTS_FILE, map_location=device)
)
# Switch to inference mode; the trailing `;` keeps the cell from echoing the
# full model repr.
model.eval();

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> **Training** of the model </a>
---

In [None]:
# Number of full passes over the training loader.
EPOCHS = 2
# Each call runs one pass; `epoch` is only used for the progress messages.
for epoch in range(EPOCHS):
    train(epoch)

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> **Saving** the finished model **weights** </a>
---

In [None]:
# torch.save(model, 'model.bin')
# Persist only the parameters (state dict), not the whole module object; the
# file is written to the current working directory.
torch.save(model.state_dict(), 'bert_model_weights_v3.pth')

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> **Validation** </a>
---

In [None]:
def validation():
    """Score every batch of ``val_loader`` with ``model`` in eval mode.

    Relies on the notebook-level ``model``, ``val_loader`` and ``device``.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per row —
        the sigmoid of the model scores and the matching label vectors.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        # tqdm wraps the loader directly so it can show a total and an ETA.
        for data in tqdm(val_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            # Targets are only collected, never fed to the model, so they are
            # not moved to the device.
            fin_targets.extend(data['targets'].to(dtype=torch.float).tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
    return fin_outputs, fin_targets

In [None]:
from sklearn.metrics import mean_squared_error


# Compare predicted probabilities with the validation labels. The validation
# labels are not thresholded anywhere in this notebook, so the error is
# measured against the original scores.
outputs, targets = validation()
mse = mean_squared_error(y_true=targets, y_pred=outputs)
print(f"MSE Score = {mse}")

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> **Prediction** </a>
---

In [15]:
def predict():
    """Score every batch of ``test_loader`` with ``model`` in eval mode.

    Relies on the notebook-level ``model``, ``test_loader`` and ``device``.

    Returns:
        ``(fin_outputs, fin_targets)``: the sigmoid of the model scores, one
        list per row in loader order, and the ``targets`` values carried by
        the dataset (returned so the result has the same shape as
        ``validation``'s).
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        # tqdm wraps the loader directly so it can show a total and an ETA.
        for data in tqdm(test_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            # Targets are only collected, never fed to the model, so they are
            # not moved to the device.
            fin_targets.extend(data['targets'].to(dtype=torch.float).tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
    return fin_outputs, fin_targets

In [16]:

# Run inference on the test loader. This rebinds `outputs` and `targets`,
# replacing any values left over from the validation cell.
outputs, targets = predict()



In [19]:
# Number of prediction rows returned by predict().
len(outputs)

In [20]:
# Only the "id" column is needed here, so tell pandas to skip parsing the
# (much larger) comment text column.
test_ids = pd.read_csv(INPUT_PATH+"tox_test.csv", usecols=["id"])["id"]

In [24]:
# Shape of the id Series; its length should match len(outputs).
test_ids.shape

In [23]:
# NOTE(review): test_ids is a one-dimensional Series, so `.T` presumably
# gives the same shape as the previous cell — confirm this check is needed.
test_ids.T.shape

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> Making the **resulting DataFrame** for the **submission** </a>
---

In [25]:
# One row per test id; the positional columns 0..5 are renamed to the label
# names in that order.
predicted = pd.DataFrame(outputs, index=test_ids).rename(
    columns=dict(enumerate([
        'target',
        'severe_toxicity',
        'obscene',
        'identity_attack',
        'insult',
        'threat',
    ]))
)

In [32]:
# Display the submission frame for a visual check.
predicted

## <a class="anchor" id="1.5_bullet" style="color:#b14404"> **Saving the submission**</a>
---

In [31]:
# Write the submission to the working directory; the frame's index (the test
# ids) is written as the first column.
predicted.to_csv("predicted_v3.csv")