In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 3.1 MB 4.4 MB/s 
[K     |████████████████████████████████| 56 kB 4.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 23.3 MB/s 
[K     |████████████████████████████████| 895 kB 51.4 MB/s 
[K     |████████████████████████████████| 596 kB 43.1 MB/s 
[?25h

In [2]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import log_loss

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import os
import random
import re

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Choose the compute device once for the whole notebook: the GPU when CUDA
# is usable, the CPU otherwise.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and puts cuDNN into its deterministic,
    non-benchmarking configuration so repeated runs pick the same kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different algorithms from run to run;
    # switch it off explicitly instead of relying on the default.
    torch.backends.cudnn.benchmark = False

SEED = 3407
seed_everything(SEED)

In [5]:
!nvidia-smi

Mon Nov  1 15:41:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8    27W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training & Validating

In [6]:
!wget -q -O train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/fmWGQJvwU5ejog

In [7]:
# Read the training table and collapse every column from position 2 onward
# into a single per-row Python list stored under 'list' (consumed as the
# target vector by CustomDataset).
train_df = pd.read_csv("train.csv")
train_df['list'] = train_df.iloc[:, 2:].values.tolist()

# Keep only the two columns the dataset class reads.
new_train_df = train_df.loc[:, ['text', 'list']].copy()
new_train_df.head()

Unnamed: 0,text,list
0,"Корова, видимо вставая, раздавила себе сосок. ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
1,Корове 8 лет! Месяц назад промеж четвертей вым...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
2,"Молоко течёт само у коровы. Что делать, если у...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]"
3,У нетели болячки на вымени.\nЗдравствуйте. Нет...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
4,"У меня первотелка, на днях отёл, у неё левый п...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [8]:
# Hugging Face Hub id of the pretrained checkpoint; the same id is reused
# below when the sequence-classification model is loaded.
model_checkpoint = "cointegrated/rubert-tiny2"
# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/400 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [9]:
# Token length passed as `max_length` to the tokenizer inside CustomDataset;
# every sample is truncated/padded to this size.
MAX_LEN = 1024
# Batch sizes used by the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

In [10]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text row on demand and pairs it with its targets.

    Args:
        dataframe: pandas DataFrame with a ``text`` column and a ``list``
            column holding one target vector per row.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_len: token length every sample is truncated/padded to. ``None``
            falls back to the notebook-level ``MAX_LEN`` constant, which is
            the value this class previously used unconditionally.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.targets = dataframe['list']
        # Honour the constructor argument; callers that pass None keep
        # getting the global MAX_LEN exactly as before.
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the tokenized sample at position ``index`` as a dict of tensors.

        Keys: ``ids``, ``mask``, ``token_type_ids`` (long tensors) and
        ``targets`` (float tensor).
        """
        # Positional access, so the dataset also works when the frame's index
        # is not a fresh 0..n-1 range.
        raw_text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(raw_text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [11]:
# Fraction of rows used for training; the remainder becomes the validation set.
train_size = 0.8

# Fixed random_state keeps the split identical between runs.
train_dataset = new_train_df.sample(frac=train_size, random_state=200)
# Validation rows are whatever was not sampled; the drop must happen before
# the training frame's index is reset.
valid_dataset = new_train_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))

# Pass the sequence length explicitly instead of None so the datasets do not
# depend on how CustomDataset treats a missing max_len.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
validating_set = CustomDataset(valid_dataset, tokenizer, MAX_LEN)

FULL Dataset: (294, 2)
TRAIN Dataset: (235, 2)
VALID Dataset: (59, 2)


In [12]:
def seed_worker(worker_id):
    """DataLoader ``worker_init_fn`` that seeds NumPy and ``random`` in a worker.

    The seed is derived from ``torch.initial_seed()``, reduced to 32 bits
    because ``np.random.seed`` does not accept larger values.

    Args:
        worker_id: worker index supplied by the DataLoader (unused).
    """
    worker_seed = torch.initial_seed() % 2**32
    # Previously the seed was computed and then discarded; apply it.
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Dedicated, explicitly seeded generator that the DataLoaders receive through
# their `generator` argument.
g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x7ff530211d50>

In [13]:
# Training batches are reshuffled every epoch using the seeded generator `g`.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

# Validation only aggregates predictions, so sample order is irrelevant;
# shuffling is disabled so evaluation does not draw from the shared generator.
valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

training_loader = DataLoader(training_set, **train_params)
# With train_size == 1 there are no held-out rows, so no validation loader.
if train_size != 1:
    validating_loader = DataLoader(validating_set, **valid_params)

In [14]:
from lsep_loss import LSEPLoss
def lsep_fn(outputs, targets):
    """Evaluate a freshly constructed ``LSEPLoss`` on ``outputs`` and ``targets``."""
    criterion = LSEPLoss()
    return criterion(outputs, targets)

def loss_fn(outputs, targets):
    """Evaluate ``torch.nn.MultiLabelSoftMarginLoss`` (default settings) on the inputs."""
    criterion = torch.nn.MultiLabelSoftMarginLoss()
    return criterion(outputs, targets)

In [15]:
# Optimisation hyper-parameters: learning rate and number of training epochs.
LR = 1e-4
EPOCH = 16

# Pretrained encoder with an 11-output classification head, loaded with
# attention outputs enabled and moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=11,
    output_attentions=True,
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)

Downloading:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [16]:
def train(epoch):
    """Fine-tune the global ``model`` on ``training_loader`` for ``epoch`` passes.

    Each batch is moved to ``device``, scored with ``lsep_fn`` and used for one
    step of the global ``optimizer``. After every epoch a single line is
    printed with the loss of that epoch's final batch.

    Note: no validation is run here; the "Vall score" field of the printed
    line is the literal ``0``.

    Args:
        epoch: number of passes over the training loader.
    """
    for ep in range(1, epoch + 1):
        model.train()

        for batch in training_loader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids)['logits']
            loss = lsep_fn(logits, targets)

            # Standard step: clear stale gradients, backpropagate, update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # `loss` still refers to the last batch processed in this epoch.
        print(f'Epoch: {ep}, Loss:  {loss.item()}, Vall score: {0}')

In [17]:
def log_loss_score(gt, pr, n_labels=10):
    """Average the binary log loss over the first ``n_labels`` label columns.

    Args:
        gt: ground-truth matrix (rows are samples, columns are labels); any
            sequence convertible with ``np.array``.
        pr: predicted probabilities with the same layout as ``gt``.
        n_labels: how many leading columns to score. Defaults to 10, the
            count that was previously hard-coded.

    Returns:
        Mean of the per-column ``sklearn.metrics.log_loss`` values.
    """
    gt = np.array(gt)
    pr = np.array(pr)

    total = 0
    for i in range(n_labels):
        # Declaring both classes keeps log_loss from raising when a column of
        # the ground truth happens to contain a single class only.
        total += log_loss(gt[:, i], pr[:, i], labels=[0, 1])

    return total / n_labels

    
def validation():
    """Run the global ``model`` over ``validating_loader`` without gradients.

    Returns:
        A ``(fin_outputs, fin_targets)`` pair of nested Python lists:
        sigmoid-activated logits and the matching targets, one inner list
        per sample.
    """
    model.eval()

    fin_outputs, fin_targets = [], []

    with torch.no_grad():
        for batch in validating_loader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids)['logits']
            probabilities = torch.sigmoid(logits)

            fin_targets.extend(targets.detach().cpu().numpy().tolist())
            fin_outputs.extend(probabilities.detach().cpu().numpy().tolist())

    return fin_outputs, fin_targets

def get_val_metrics():
    """Return ``(1 - mean log loss) * 0.8`` computed on the validation loader."""
    predictions, targets = validation()
    mean_log_loss = log_loss_score(targets, np.array(predictions))
    return 0.8 * (1 - mean_log_loss)

In [18]:
# Run the fine-tuning loop for EPOCH epochs; prints one line per epoch.
train(EPOCH)

Epoch: 1, Loss:  3.540661334991455, Vall score: 0
Epoch: 2, Loss:  3.7506957054138184, Vall score: 0
Epoch: 3, Loss:  2.9695541858673096, Vall score: 0
Epoch: 4, Loss:  2.4830992221832275, Vall score: 0
Epoch: 5, Loss:  2.6292104721069336, Vall score: 0
Epoch: 6, Loss:  1.7622911930084229, Vall score: 0
Epoch: 7, Loss:  1.7835524082183838, Vall score: 0
Epoch: 8, Loss:  1.4629855155944824, Vall score: 0
Epoch: 9, Loss:  1.0915048122406006, Vall score: 0
Epoch: 10, Loss:  0.5894798040390015, Vall score: 0
Epoch: 11, Loss:  0.4304092824459076, Vall score: 0
Epoch: 12, Loss:  1.1250470876693726, Vall score: 0
Epoch: 13, Loss:  0.8867586851119995, Vall score: 0
Epoch: 14, Loss:  0.14032068848609924, Vall score: 0
Epoch: 15, Loss:  0.26743024587631226, Vall score: 0
Epoch: 16, Loss:  0.15810543298721313, Vall score: 0


In [19]:
# Only report validation metrics when some rows were actually held out.
if train_size != 1:
    outputs, targets = validation()
    outputs = np.array(outputs)
    lg = log_loss_score(targets, outputs)

    # Raw log loss, its complement, and the complement scaled by 0.8.
    print("\n".join([
        f"log_loss = {lg}",
        f"log_score = {1 - lg}",
        f"log_score * 0.8 = {(1 - lg) * 0.8}",
    ]))


log_loss = 0.22490507134255103
log_score = 0.775094928657449
log_score * 0.8 = 0.6200759429259592


In [None]:
# model.save_pretrained('./my_model_directory/')

# Submission


In [None]:
!wget -q -O test.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Wo70d4_PAwujqA

In [None]:
# Load the test table from the file written by the wget cell above.
test = pd.read_csv('test.csv')

In [None]:
# Collapse every column from position 2 onward into one per-row list, the
# same layout the training frame uses. The helper column itself is excluded
# so that re-running this cell does not fold 'list' into itself.
test_label_columns = [column for column in test.columns[2:] if column != 'list']
test['list'] = test[test_label_columns].values.tolist()

# Keep only the two columns the dataset class reads.
new_df_test = test[['text', 'list']].copy()
new_df_test.head()

In [None]:
# Fresh 0..n-1 index for the test rows.
test_dataset = new_df_test.reset_index(drop=True)

print("TEST Dataset: {}".format(test_dataset.shape))

# Pass the sequence length explicitly instead of None so the dataset does not
# depend on how CustomDataset treats a missing max_len.
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

In [None]:
# Test batches are read in file order (no shuffling) so predictions line up
# with the rows of the test frame.
test_params = dict(
    batch_size=4,
    shuffle=False,
    num_workers=0,
    worker_init_fn=seed_worker,
    generator=g,
)

testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Inference over the test loader: collect one list of sigmoid-activated
# logits per test row, in loader order.
model.eval()
prediction = []

with torch.no_grad():
    for data in testing_loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # The batch's 'targets' entry is not needed for prediction, so it is
        # no longer copied to the device.

        outputs = model(ids, mask, token_type_ids)['logits']

        prediction.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

In [None]:
# Column names for the submission: the id column followed by the training
# frame's columns from position 2 up to (not including) its last two.
submission_columns = ['text_id'] + list(train_df.columns[2:-2])

# Keep the first 10 predicted values per row and place them next to the ids.
prediction_frame = pd.DataFrame(np.array(prediction)[:, :10])
submission = pd.concat([test['text_id'], prediction_frame], axis=1)
submission.columns = submission_columns

In [None]:
from span_words import span_words
def find_spans(string):
    """Collect ``[start, end]`` offsets of every ``span_words`` match in ``string``.

    Entries of ``span_words`` with 3 characters or fewer are skipped. Each
    remaining entry is handed to ``re.finditer`` as a regular-expression
    pattern, and results are ordered by entry, then by match position.
    """
    return [
        list(match.span())
        for word in span_words
        if len(word) > 3
        for match in re.finditer(word, string)
    ]

In [None]:
# Build the submission payload: one entry per text id with the spans found
# in that row's text and the row's predicted label values.
submission_json = {}
for text_id, label_values in submission.set_index('text_id').to_dict('index').items():
    row_text = test[test.text_id == text_id].text.item()
    submission_json[str(text_id)] = {
        "span": find_spans(row_text),
        "label": list(label_values.values()),
    }

In [None]:
import json

# Serialise the submission dictionary as indented JSON in the working directory.
with open('sample_submission_top2.json', 'w') as final_submit:
    json.dump(
        submission_json,
        final_submit,
        indent=4,
    )