In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 3.1 MB 12.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 16.1 MB/s 
[K     |████████████████████████████████| 596 kB 38.1 MB/s 
[K     |████████████████████████████████| 59 kB 6.3 MB/s 
[K     |████████████████████████████████| 895 kB 39.2 MB/s 
[?25h

In [2]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import log_loss

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import set_seed

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import os
import random
import re

import warnings
warnings.filterwarnings('ignore')

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
def seed_everything(seed: int):
    """Seeds and fixes every possible random state."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    set_seed(seed)


SEED = 3407
seed_everything(SEED)

In [6]:
!nvidia-smi

Sat Nov  6 13:36:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    32W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training & Validating

In [7]:
!wget -q -O train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/fmWGQJvwU5ejog

In [135]:
train_df = pd.read_csv("train.csv")
train_df['list'] = train_df[train_df.columns[2:]].values.tolist()

new_train_df = train_df[['text', 'list']].copy()
new_train_df.head()

Unnamed: 0,text,list
0,"Корова, видимо вставая, раздавила себе сосок. ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
1,Корове 8 лет! Месяц назад промеж четвертей вым...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
2,"Молоко течёт само у коровы. Что делать, если у...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]"
3,У нетели болячки на вымени.\nЗдравствуйте. Нет...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
4,"У меня первотелка, на днях отёл, у неё левый п...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [9]:
!wget -q -O train_labels.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/0nJ2QTRb9-U7tA
!wget -q -O labeled_train_by_hand.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Oq154PAxDGeIFg

In [10]:
train_labels = pd.read_json('train_labels.json').T
labeled_by_hand_df = pd.read_json('labeled_train_by_hand.json')
labeled_by_hand_df['id'] = labeled_by_hand_df['id'] - np.ones(len(labeled_by_hand_df['id']))
labeled_by_hand_df['id'] = labeled_by_hand_df['id'].astype(int)

In [52]:
dct = {'text_id':[], 'span_text':[]}

for i in range(0, len(labeled_by_hand_df)):
    data = labeled_by_hand_df['annotations'][i][0]['result']
    id = labeled_by_hand_df.id[i]

    spans_text = []
    for span in data:
        spans_text.append(span['value']['text'])
    dct['span_text'].append(spans_text)
    dct['text_id'].append(id)


for i in range(0, 30):
    spans = train_labels.span[i]
    spans_text = []
    for span in spans:
        spans_text.append(
            train_df.text[i][span[0]:span[1]])
    
    dct['span_text'].append(spans_text)
    dct['text_id'].append(i)

In [64]:
dct = pd.DataFrame(dct)

In [136]:
for i in range(0, len(new_train_df)):
    new_train_df['text'][i] = " ".join(dct[dct['text_id'] == i]['span_text'].values[0]) +  " | " + new_train_df['text'][i]

In [138]:
model_checkpoint = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [139]:
MAX_LEN = 1024
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

In [140]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [141]:
train_size = 0.8
train_dataset = new_train_df.sample(frac=train_size, random_state=200)

valid_dataset = new_train_df.drop(train_dataset.index).reset_index(drop=True)

train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, None)
validating_set = CustomDataset(valid_dataset, tokenizer, None)

FULL Dataset: (294, 2)
TRAIN Dataset: (235, 2)
VALID Dataset: (59, 2)


In [142]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32

g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x7f61d56a7bf0>

In [143]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

training_loader = DataLoader(training_set, **train_params)
if train_size != 1:
  validating_loader = DataLoader(validating_set, **valid_params)

In [144]:
from lsep_loss import LSEPLoss
def lsep_fn(outputs, targets):
    return LSEPLoss()(outputs, targets)

def loss_fn(outputs, targets):
    return torch.nn.MultiLabelSoftMarginLoss()(outputs, targets)

In [145]:
LR = 0.0001
EPOCH = 16

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=11, output_attentions=True)
model.to(device);

optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [146]:
def train(epoch):

    for ep in range(1, epoch+1):
        model.train()
        for _, data in enumerate(training_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)

            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)['logits']
            
            loss = lsep_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # if _ % 5000 == 0:
            #     print(f'Epoch: {ep}, Loss:  {loss.item()}')

        # model.eval()
        # val_outputs = []
        # val_targets = []
        # for _, data in enumerate(validating_loader, 0):
        #     ids = data['ids'].to(device, dtype = torch.long)
        #     mask = data['mask'].to(device, dtype = torch.long)
            
        #     token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        #     targets = data['targets'].to(device, dtype = torch.float)

        #     outputs = model(ids, mask, token_type_ids)['logits']

        #     val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        #     val_targets.extend(targets.cpu().detach().numpy().tolist())

        
        # val_outputs = np.array(val_outputs)
        # lg = log_loss_score(val_targets, val_outputs)
        # lg = (1 - lg) * 0.8

        print(f'Epoch: {ep}, Loss:  {loss.item()}, Vall score: {0}')

In [147]:
def log_loss_score(gt, pr):
    
    log_loss_ = 0
    
    gt = np.array(gt)
    
    for i in range(10):
        log_loss_ += log_loss(gt[:, i], pr[:, i])
        
    return log_loss_ / 10

    
def validation():
    model.eval()

    fin_targets=[]
    fin_outputs=[]

    with torch.no_grad():
        for _, data in enumerate(validating_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)['logits']

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            
    return fin_outputs, fin_targets

def get_val_metrics():
    outputs, targets = validation()
    outputs = np.array(outputs)
    lg = log_loss_score(targets, outputs)

    return (1 - lg) * 0.8

In [148]:
train(EPOCH)

Epoch: 1, Loss:  3.6025917530059814, Vall score: 0
Epoch: 2, Loss:  3.565387487411499, Vall score: 0
Epoch: 3, Loss:  2.851747751235962, Vall score: 0
Epoch: 4, Loss:  2.3248748779296875, Vall score: 0
Epoch: 5, Loss:  2.4597034454345703, Vall score: 0
Epoch: 6, Loss:  1.48008394241333, Vall score: 0
Epoch: 7, Loss:  2.0557000637054443, Vall score: 0
Epoch: 8, Loss:  1.1140085458755493, Vall score: 0
Epoch: 9, Loss:  0.7736775875091553, Vall score: 0
Epoch: 10, Loss:  0.6339202523231506, Vall score: 0
Epoch: 11, Loss:  0.5432386994361877, Vall score: 0
Epoch: 12, Loss:  1.2711057662963867, Vall score: 0
Epoch: 13, Loss:  0.8317217826843262, Vall score: 0
Epoch: 14, Loss:  0.12835459411144257, Vall score: 0
Epoch: 15, Loss:  0.1850109100341797, Vall score: 0
Epoch: 16, Loss:  0.08588054031133652, Vall score: 0


In [149]:
if train_size != 1:
  outputs, targets = validation()
  outputs = np.array(outputs)
  lg = log_loss_score(targets, outputs)

  print(f"log_loss = {lg}")
  print(f"log_score = {1 - lg}")
  print(f"log_score * 0.8 = {(1 - lg) * 0.8}")


log_loss = 0.188512780847221
log_score = 0.811487219152779
log_score * 0.8 = 0.6491897753222232


# Submission


In [109]:
!wget -q -O test.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Wo70d4_PAwujqA

In [150]:
test = pd.read_csv('test.csv')
test_spans = pd.read_json('test_df_with_spans20.json')

In [151]:
test_dct = {'text_id':[], 'span_text':[]}

for i in range(0, len(test_spans)):
    data = test_spans.span[i]
    id = test_spans.text_id[i]

    spans_text = []
    for span in data:
        text = test_spans.text[i][span[0]:span[1]]
        if text not in spans_text:
            spans_text.append(text)

    test_dct['span_text'].append(spans_text)
    test_dct['text_id'].append(id)

test_dct = pd.DataFrame(test_dct)

In [152]:
test['list'] = test[test.columns[2:]].values.tolist()

new_df_test = test[['text', 'list']].copy()
new_df_test.head()

Unnamed: 0,text,list
0,Понос у месячных телят. Подскажите методы и сп...,[]
1,"Понос у телят, чем лечить? \nЧем можно вылечит...",[]
2,По какой причине у телёнка отнимаются ноги?\nП...,[]
3,"Срочно! Ребятки, помогите, корову что-то укуси...",[]
4,"Сгустки у коровы.\nЗдравствуйте, помогите пожа...",[]


In [153]:
for i in range(0, len(new_df_test)):
    new_df_test['text'][i] = " ".join(test_dct['span_text'][i]) +  " | " + new_df_test['text'][i]

In [154]:
new_df_test

Unnamed: 0,text,list
0,Понос | Понос у месячных телят. Подскажите мет...,[]
1,Понос понос понос с жидкий стул поносить плохо...,[]
2,отнимаются ноги отнимаются аппетит есть не пон...,[]
3,"укусило недомогание вялость | Срочно! Ребятки,...",[]
4,Сгустки сгустки крови | Сгустки у коровы.\nЗдр...,[]
...,...,...
94,"поносит | Большой бык уже неделю поносит, чем ...",[]
95,дерти запоносила | Как запустить желудок у кор...,[]
96,пропоносился какойто прозрачной слизью | Здрав...,[]
97,отошел вымя уплотнения вымени | После отела у ...,[]


In [155]:
test_dataset = new_df_test.reset_index(drop=True)

print("TEST Dataset: {}".format(test_dataset.shape))

testing_set = CustomDataset(test_dataset, tokenizer, None)

TEST Dataset: (99, 2)


In [156]:
test_params = {'batch_size': 4,
                'shuffle': False,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

testing_loader = DataLoader(testing_set, **test_params)

In [157]:
model.eval()
prediction = []

with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)

        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)['logits']

        prediction.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

In [158]:
submission_columns = ['text_id'] + list(train_df.columns[2:-2])
submission = pd.concat([test['text_id'], pd.DataFrame(np.array(prediction)[:, :10])], axis=1)
submission.columns = submission_columns

In [159]:
# submission_json = {str(k): {"span": find_spans(test[test.text_id == k].text.item()), "label": list(v.values())} \
#                    for k,v in submission.set_index('text_id').to_dict('index').items()}

submission_json = {str(k): {"span": test_spans[test_spans.text_id == k].span.item(), "label": list(v.values())} \
                   for k,v in submission.set_index('text_id').to_dict('index').items()}

In [160]:
import json
with open('submisson_test_spans20.json', 'w') as final_submit:
    json.dump(submission_json, final_submit, indent=4)

In [161]:
model.save_pretrained('clf_bert20/')
tokenizer.save_pretrained('clf_bert20/')

('clf_bert20/tokenizer_config.json',
 'clf_bert20/special_tokens_map.json',
 'clf_bert20/vocab.txt',
 'clf_bert20/added_tokens.json',
 'clf_bert20/tokenizer.json')

In [162]:
!zip -r /content/clf_bert20.zip /content/clf_bert20

  adding: content/clf_bert20/ (stored 0%)
  adding: content/clf_bert20/vocab.txt (deflated 64%)
  adding: content/clf_bert20/pytorch_model.bin (deflated 8%)
  adding: content/clf_bert20/special_tokens_map.json (deflated 40%)
  adding: content/clf_bert20/tokenizer.json (deflated 65%)
  adding: content/clf_bert20/tokenizer_config.json (deflated 42%)
  adding: content/clf_bert20/config.json (deflated 57%)


In [163]:
from google.colab import files
files.download("/content/clf_bert20.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>