In [1]:
!pip install -q transformers

In [2]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import log_loss

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import set_seed

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import os
import random
import re

import warnings
warnings.filterwarnings('ignore')

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
def seed_everything(seed: int):
    """Seeds and fixes every possible random state."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    set_seed(seed)


SEED = 3407
seed_everything(SEED)

In [6]:
!nvidia-smi

Sun Nov 14 11:21:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training & Validating

In [7]:
!wget -q -O train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/fmWGQJvwU5ejog

In [8]:
train_df = pd.read_csv("train.csv")
train_df['list'] = train_df[train_df.columns[2:]].values.tolist()

new_train_df = train_df[['text', 'list']].copy()
new_train_df.head()

Unnamed: 0,text,list
0,"Корова, видимо вставая, раздавила себе сосок. ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
1,Корове 8 лет! Месяц назад промеж четвертей вым...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
2,"Молоко течёт само у коровы. Что делать, если у...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]"
3,У нетели болячки на вымени.\nЗдравствуйте. Нет...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
4,"У меня первотелка, на днях отёл, у неё левый п...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [9]:
!wget -q -O train_labels.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/0nJ2QTRb9-U7tA
!wget -q -O labeled_train_by_hand.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Oq154PAxDGeIFg

In [10]:
train_labels = pd.read_json('train_labels.json').T
labeled_by_hand_df = pd.read_json('labeled_train_by_hand.json')
labeled_by_hand_df['id'] = labeled_by_hand_df['id'] - np.ones(len(labeled_by_hand_df['id']))
labeled_by_hand_df['id'] = labeled_by_hand_df['id'].astype(int)

In [11]:
dct = {'text_id':[], 'span_text':[]}

for i in range(0, len(labeled_by_hand_df)):
    data = labeled_by_hand_df['annotations'][i][0]['result']
    id = labeled_by_hand_df.id[i]

    spans_text = []
    for span in data:
        spans_text.append(span['value']['text'])

    # spans_text = np.unique(spans_text)
    dct['span_text'].append(spans_text)
    dct['text_id'].append(id)


for i in range(0, 30):
    spans = train_labels.span[i]
    spans_text = []
    for span in spans:
        spans_text.append(
            train_df.text[i][span[0]:span[1]])
    
    # spans_text = np.unique(spans_text)
    dct['span_text'].append(spans_text)
    dct['text_id'].append(i)

In [12]:
dct = pd.DataFrame(dct)

In [13]:
for i in range(0, len(new_train_df)):
    new_train_df['text'][i] = " ".join(dct[dct['text_id'] == i]['span_text'].values[0]) +  " | " + new_train_df['text'][i]

In [14]:
new_train_df

Unnamed: 0,text,list
0,раздавила себе сосок постоянно бежит молоко | ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
1,промеж четвертей вымени появились бугорки гной...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
2,Молоко течёт слабые соски молоко бежит Молоко ...,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]"
3,болячки на вымени вымя покрылось маленькими бо...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
4,"сосок как будто пустой | У меня первотелка, на...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
...,...,...
289,"идет пена изо рта грызет кормушку не жевала ""ж...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]"
290,понос с кровью понос с кровью | Чем лечить пон...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
291,больного легкого воспаленное | Здравствуйте!\n...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]"
292,"отек | всем привет,корова сегодня отелилась от...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"


In [15]:
model_checkpoint = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [16]:
MAX_LEN = 1024
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

In [17]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [18]:
train_size = 0.8
train_dataset = new_train_df.sample(frac=train_size, random_state=200)

valid_dataset = new_train_df.drop(train_dataset.index).reset_index(drop=True)

train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, None)
validating_set = CustomDataset(valid_dataset, tokenizer, None)

FULL Dataset: (294, 2)
TRAIN Dataset: (235, 2)
VALID Dataset: (59, 2)


In [19]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32

g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x7f5e6a9d1590>

In [20]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

training_loader = DataLoader(training_set, **train_params)
if train_size != 1:
  validating_loader = DataLoader(validating_set, **valid_params)

In [21]:
from lsep_loss import LSEPLoss
def lsep_fn(outputs, targets):
    return LSEPLoss()(outputs, targets)

def loss_fn(outputs, targets):
    return torch.nn.MultiLabelSoftMarginLoss()(outputs, targets)

def bce_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [22]:
LR = 0.0001
EPOCH = 17

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=11, output_attentions=True)
model.to(device);

optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

# optimizer = torch.optim.Adam([
#                 {'params': model.bert.parameters()},
#                 {'params': model.classifier.parameters(), 'lr': 5e-4}
#             ], lr=1e-4)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [23]:
def train(epoch):

    for ep in range(1, epoch+1):
        model.train()
        for _, data in enumerate(training_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)

            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)['logits']
            
            loss = lsep_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            

        print(f'Epoch: {ep}, Loss:  {loss.item()}')

In [24]:
def log_loss_score(gt, pr):
    
    log_loss_ = 0
    
    gt = np.array(gt)
    
    for i in range(10):
        log_loss_ += log_loss(gt[:, i], pr[:, i])
        
    return log_loss_ / 10

    
def validation():
    model.eval()

    fin_targets=[]
    fin_outputs=[]

    with torch.no_grad():
        for _, data in enumerate(validating_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)['logits']

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            
    return fin_outputs, fin_targets

def get_val_metrics():
    outputs, targets = validation()
    outputs = np.array(outputs)
    lg = log_loss_score(targets, outputs)

    return (1 - lg) * 0.8

In [25]:
train(EPOCH)

Epoch: 1, Loss:  3.5059585571289062
Epoch: 2, Loss:  3.4137165546417236
Epoch: 3, Loss:  2.961862564086914
Epoch: 4, Loss:  2.337578296661377
Epoch: 5, Loss:  2.162369728088379
Epoch: 6, Loss:  1.597132921218872
Epoch: 7, Loss:  2.150876760482788
Epoch: 8, Loss:  1.2317945957183838
Epoch: 9, Loss:  0.8949902653694153
Epoch: 10, Loss:  0.5747641921043396
Epoch: 11, Loss:  0.3210470378398895
Epoch: 12, Loss:  0.8705685138702393
Epoch: 13, Loss:  0.9996351003646851
Epoch: 14, Loss:  0.1059616208076477
Epoch: 15, Loss:  0.32470327615737915
Epoch: 16, Loss:  0.12000083923339844
Epoch: 17, Loss:  0.14696013927459717


In [26]:
if train_size != 1:
  outputs, targets = validation()
  outputs = np.array(outputs)
  lg = log_loss_score(targets, outputs)

  print(f"log_loss = {lg}")
  print(f"log_score = {1 - lg}")
  print(f"log_score * 0.8 = {(1 - lg) * 0.8}")


log_loss = 0.19816434424859902
log_score = 0.801835655751401
log_score * 0.8 = 0.6414685246011209


# Submission


In [27]:
!wget -q -O test.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Wo70d4_PAwujqA

In [30]:
test = pd.read_csv('test.csv')
test_spans = pd.read_json('test_df_with_spans229.json')

In [31]:
test_dct = {'text_id':[], 'span_text':[]}

for i in range(0, len(test_spans)):
    data = test_spans.span[i]
    id = test_spans.text_id[i]

    spans_text = []
    for span in data:
        text = test_spans.text[i][span[0]:span[1]]
        if text not in spans_text:
            spans_text.append(text)
    
    # spans_text = np.unique(spans_text)
    test_dct['span_text'].append(spans_text)
    test_dct['text_id'].append(id)

test_dct = pd.DataFrame(test_dct)

In [32]:
test['list'] = test[test.columns[2:]].values.tolist()

new_df_test = test[['text', 'list']].copy()
new_df_test.head()

Unnamed: 0,text,list
0,Понос у месячных телят. Подскажите методы и сп...,[]
1,"Понос у телят, чем лечить? \nЧем можно вылечит...",[]
2,По какой причине у телёнка отнимаются ноги?\nП...,[]
3,"Срочно! Ребятки, помогите, корову что-то укуси...",[]
4,"Сгустки у коровы.\nЗдравствуйте, помогите пожа...",[]


In [33]:
for i in range(0, len(new_df_test)):
    new_df_test['text'][i] = " ".join(test_dct['span_text'][i]) +  " | " + new_df_test['text'][i]

In [34]:
test_dataset = new_df_test.reset_index(drop=True)

print("TEST Dataset: {}".format(test_dataset.shape))

testing_set = CustomDataset(test_dataset, tokenizer, None)

TEST Dataset: (99, 2)


In [35]:
test_params = {'batch_size': 4,
                'shuffle': False,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

testing_loader = DataLoader(testing_set, **test_params)

In [36]:
model.eval()
prediction = []

with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)

        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)['logits']

        prediction.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

In [37]:
submission_columns = ['text_id'] + list(train_df.columns[2:-2])
submission = pd.concat([test['text_id'], pd.DataFrame(np.array(prediction)[:, :10])], axis=1)
submission.columns = submission_columns

In [38]:
submission_json = {str(k): {"span": test_spans[test_spans.text_id == k].span.item(), "label": list(v.values())} \
                   for k,v in submission.set_index('text_id').to_dict('index').items()}

In [39]:
import json
with open('submisson_test_spans231_229.json', 'w') as final_submit:
    json.dump(submission_json, final_submit, indent=4)