In [1]:
!pip install -q transformers

In [2]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import log_loss

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
import os
import random


def seed_everything(seed: int):
    """Seeds and fixes every possible random state."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
SEED = 3407

In [6]:
seed_everything(SEED)

In [7]:
!nvidia-smi

Sat Oct 30 16:40:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
!wget -q -O train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/fmWGQJvwU5ejog

In [9]:
df = pd.read_csv("train.csv")
df['list'] = df[df.columns[2:]].values.tolist()

new_df = df[['text', 'list']].copy()
new_df.head()

Unnamed: 0,text,list
0,"Корова, видимо вставая, раздавила себе сосок. ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
1,Корове 8 лет! Месяц назад промеж четвертей вым...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
2,"Молоко течёт само у коровы. Что делать, если у...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]"
3,У нетели болячки на вымени.\nЗдравствуйте. Нет...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]"
4,"У меня первотелка, на днях отёл, у неё левый п...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [10]:
model_checkpoint = "cointegrated/rubert-tiny2"
# model_checkpoint = "sberbank-ai/ruRoberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
MAX_LEN = 1024
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

In [12]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [14]:
train_size = 1
train_dataset = new_df.sample(frac=train_size, random_state=200)
# test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
# print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, None)
# testing_set = CustomDataset(test_dataset, tokenizer, None)

FULL Dataset: (294, 2)
TRAIN Dataset: (294, 2)


In [15]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    # numpy.random.seed(worker_seed)
    # random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x7ff77ddffd50>

In [16]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

# test_params = {'batch_size': VALID_BATCH_SIZE,
#                 'shuffle': True,
#                 'num_workers': 0,
#                 'worker_init_fn' : seed_worker,
#                 'generator': g
#                 }

training_loader = DataLoader(training_set, **train_params)
# testing_loader = DataLoader(testing_set, **test_params)

In [17]:
from lsep_loss import LSEPLoss
def lsep_fn(outputs, targets):
    return LSEPLoss()(outputs, targets)

def loss_fn(outputs, targets):
    return torch.nn.MultiLabelSoftMarginLoss()(outputs, targets)

In [18]:
# class BERTClass(torch.nn.Module):
#     def __init__(self):
#         super(BERTClass, self).__init__()
#         self.l1 = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=11, output_attentions=True)
# #         self.l2 = torch.nn.Dropout(0.3)
# #         self.l3 = torch.nn.Linear(768, 11)
    
#     def forward(self, ids, mask, token_type_ids):
#         output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
# #         output_2 = self.l2(output_1)
# #         output = self.l3(output_2)
#         return output

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=11, output_attentions=True)
model.to(device);

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [19]:
LR = 0.0001
EPOCH = 16

optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.95)

In [20]:
def train(epoch):
    model.train()

    for ep in range(1, epoch+1):
        for _, data in enumerate(training_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)['logits']
            
            loss = lsep_fn(outputs, targets)
            if _ % 5000 == 0:
                print(f'Epoch: {ep}, Loss:  {loss.item()}')
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # scheduler.step()

In [21]:
train(EPOCH)



Epoch: 1, Loss:  4.908163070678711
Epoch: 2, Loss:  4.733373641967773
Epoch: 3, Loss:  4.61601448059082
Epoch: 4, Loss:  3.78558087348938
Epoch: 5, Loss:  3.3926949501037598
Epoch: 6, Loss:  2.975013256072998
Epoch: 7, Loss:  2.3513426780700684
Epoch: 8, Loss:  2.345792055130005
Epoch: 9, Loss:  1.6584057807922363
Epoch: 10, Loss:  1.5757291316986084
Epoch: 11, Loss:  1.2364106178283691
Epoch: 12, Loss:  0.9897606372833252
Epoch: 13, Loss:  0.6624155044555664
Epoch: 14, Loss:  1.1023290157318115
Epoch: 15, Loss:  0.3205821216106415
Epoch: 16, Loss:  0.3095798194408417


In [22]:
def log_loss_score(gt, pr):
    
    log_loss_ = 0
    
    gt = np.array(gt)
    
    for i in range(10):
        log_loss_ += log_loss(gt[:, i], pr[:, i])
        
    return log_loss_ / 10

    
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)['logits']

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation()
outputs = np.array(outputs)
lg = log_loss_score(targets, outputs)

print(f"log_loss = {lg}")
print(f"log_score = {1 - lg}")
print(f"log_score * 0.8 = {(1 - lg) * 0.8}")


In [None]:
model.save_pretrained('./my_model_directory/')

In [23]:
!wget -O test.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Wo70d4_PAwujqA

--2021-10-30 16:52:55--  https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Wo70d4_PAwujqA
Resolving getfile.dokpub.com (getfile.dokpub.com)... 78.46.92.107
Connecting to getfile.dokpub.com (getfile.dokpub.com)|78.46.92.107|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://downloader.disk.yandex.ru/disk/8022b04c52486df8c82d7be76e7902aed39ad28b23d6bf2a7c35a0e2fcd23e73/617db096/Qm4uol8YwDWoNMl-QrexyAzTHoiTcsXriLWHBMrXfPyevn-ErDVwHXuhEdaJZb3YBlvB00IkXfr6I2b0I_fWfw%3D%3D?uid=0&filename=test.csv&disposition=attachment&hash=p%2B0DopV%2BDMXvVr6WS79GMCvyj9ce6f/MKRhEK/8iSFQUuH/GP3sKCJPonx3kgX%2Bbq/J6bpmRyOJonT3VoXnDag%3D%3D%3A&limit=0&content_type=text%2Fplain&owner_uid=331117880&fsize=57137&hid=1b8254245ab772eb965757e08d9a8211&media_type=spreadsheet&tknv=v2 [following]
--2021-10-30 16:52:55--  https://downloader.disk.yandex.ru/disk/8022b04c52486df8c82d7be76e7902aed39ad28b23d6bf2a7c35a0e2fcd23e73/617db096/Qm4uol8YwDWoNMl-QrexyAzTHoiTc

In [33]:
test = pd.read_csv('test.csv')

In [34]:
test['list'] = test[test.columns[2:]].values.tolist()

new_df_test = test[['text', 'list']].copy()
new_df_test.head()

Unnamed: 0,text,list
0,Понос у месячных телят. Подскажите методы и сп...,[]
1,"Понос у телят, чем лечить? \nЧем можно вылечит...",[]
2,По какой причине у телёнка отнимаются ноги?\nП...,[]
3,"Срочно! Ребятки, помогите, корову что-то укуси...",[]
4,"Сгустки у коровы.\nЗдравствуйте, помогите пожа...",[]


In [36]:
test_dataset = new_df_test.reset_index(drop=True)


# print("FULL Dataset: {}".format(new_df.shape))
# print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# training_set = CustomDataset(train_dataset, tokenizer, None)
testing_set = CustomDataset(test_dataset, tokenizer, None)

TEST Dataset: (99, 2)


In [37]:
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0,
                'worker_init_fn' : seed_worker,
                'generator': g
                }

testing_loader = DataLoader(testing_set, **test_params)

In [39]:
model.eval()
fin_targets = []
fin_outputs=[]
with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)['logits']

        # fin_targets.extend(targets.cpu().detach().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.984777,0.028533,0.989156,0.972996,0.016312,0.017636,0.010771,0.017411,0.019658,0.024005
1,0.968251,0.018059,0.968448,0.904556,0.007197,0.034855,0.005033,0.011681,0.020398,0.017916
2,0.161530,0.029631,0.030454,0.043099,0.007818,0.034194,0.005407,0.929650,0.032584,0.013099
3,0.183778,0.004575,0.012023,0.014413,0.017741,0.097294,0.010190,0.036662,0.646961,0.054183
4,0.015656,0.076258,0.008621,0.006102,0.433398,0.012469,0.248799,0.005941,0.063254,0.023349
...,...,...,...,...,...,...,...,...,...,...
94,0.982812,0.020811,0.978432,0.950763,0.011397,0.014183,0.009994,0.016664,0.015911,0.022440
95,0.823035,0.005850,0.499800,0.302551,0.177266,0.005724,0.013169,0.009183,0.019087,0.013825
96,0.026396,0.138370,0.017271,0.012831,0.029274,0.009364,0.010733,0.025791,0.080594,0.012028
97,0.012397,0.209737,0.013350,0.014576,0.104449,0.009924,0.838528,0.058541,0.039586,0.063909


In [48]:
submission_columns

['text_id']

In [50]:
submission_columns = ['text_id'] + list(df.columns[2:-2])
submission = pd.concat([test['text_id'], pd.DataFrame(np.array(fin_outputs)[:, :10])], axis=1)
submission.columns = submission_columns

In [105]:
submission_json = {str(k): {"span": find_spans(test[test.text_id == k].text.item()), "label": list(v.values())} \
                   for k,v in submission.set_index('text_id').to_dict('index').items()}



In [106]:
import json
with open('sample_submission1.json', 'w') as final_submit:
    json.dump(submission_json, final_submit, indent=4)

In [57]:
span_words = {'Лысые',
 'Молоко',
 'Отек',
 'Отёк',
 'Серку',
 'бежит',
 'бока',
 'болячками',
 'болячки',
 'брюху',
 'бугорки',
 'будто',
 'было',
 'в',
 'впалые',
 'вымени',
 'вымя',
 'вяло',
 'вялое',
 'вялой',
 'газы',
 'гной',
 'гнойные',
 'горечи',
 'горчить',
 'горькие',
 'горячий',
 'густой',
 'дойках',
 'дугой',
 'еды',
 'ела',
 'ест,',
 'есть',
 'жвачки',
 'жиденький',
 'жидкий',
 'жидковат',
 'жует',
 'заднюю',
 'зеленый',
 'и',
 'как',
 'кал',
 'кашлять',
 'кожи',
 'коричневые',
 'корка',
 'крем',
 'кровью',
 'лейкозной',
 'лейкозными',
 'лохо',
 'маленькими',
 'меньше',
 'может',
 'молоко',
 'на',
 'наступает',
 'наступить',
 'не',
 'небыло',
 'немного',
 'нечего',
 'нога',
 'ногах',
 'ногу',
 'нос',
 'объеме',
 'опухла',
 'опухоли',
 'опухоль',
 'от',
 'отказался',
 'отказываются',
 'отравилась',
 'отрыжки',
 'отрыжкой',
 'отёк',
 'отёка',
 'отёком',
 'паратифа',
 'парез',
 'пить',
 'плохо',
 'по',
 'подкожные',
 'покашливает',
 'покрылось',
 'понос',
 'поноса',
 'поносили',
 'поносит',
 'поносить',
 'послеродовой',
 'постоянно',
 'похудела',
 'почти',
 'появились',
 'появляется',
 'приподнят',
 'промеж',
 'пустой',
 'пьёт',
 'раздавила',
 'с',
 'себе',
 'слабые',
 'сливках',
 'сливки',
 'сопельками',
 'сосках',
 'соски',
 'сосок',
 'стал',
 'стул',
 'сухой',
 'твёрдый',
 'температура',
 'течёт',
 'точки',
 'трястись',
 'убавила',
 'увеличение',
 'удой',
 'упала',
 'уплотнение',
 'участки',
 'хвост',
 'хромала',
 'худеет',
 'худеть',
 'худоба',
 'худой',
 'четвертей',
 'шишки',
 'язвочек',
 'язвочки',
 'язвы'}

In [58]:
test

Unnamed: 0,text_id,text,list
0,294,Понос у месячных телят. Подскажите методы и сп...,[]
1,295,"Понос у телят, чем лечить? \nЧем можно вылечит...",[]
2,296,По какой причине у телёнка отнимаются ноги?\nП...,[]
3,297,"Срочно! Ребятки, помогите, корову что-то укуси...",[]
4,298,"Сгустки у коровы.\nЗдравствуйте, помогите пожа...",[]
...,...,...,...
94,388,"Большой бык уже неделю поносит, чем помочь?\nП...",[]
95,389,Как запустить желудок у коровы?\nУ меня объела...,[]
96,390,"Здрав твуйте, искал похожий случай в темах фор...",[]
97,391,"После отела у коровы ,не отошел послед и было ...",[]


In [69]:
test.text[1]

'Понос у телят, чем лечить? \nЧем можно вылечить понос 1,5 месячного телёнка в домашних условиях?\nВетврача у нас нет! Есть лекарство Детрим. Но может быть ещё что-то необходимо? Телёнок месяц сосал корову, потом пил молоко (2 литра за раз).\nВ чём причина? И сколько нужно давать пойла? Только молоко или ещё с чем то смешивать?\nЕщё темы про понос у телят:\nУ телёнка понос с кровью, чем лечить?\nПомогите остановить понос у 10 дневного телёнка\nПонос у месячных телят\nПри поносе у тёлочки может дать геркулесовую кашу?\nУ телёнка жидкий стул с кровью\nТелёнок стал поносить, что это и как остановить понос?\nРотавирусная инфекция телят\nСовет тем, кто выпаивает телят\nДиспепсия новорожденных у сельскохозяйственных животных\nКоронавирусный энтерит телят\nТелёнок плохо встаёт и ест после лечения поноса'

In [79]:
import re
def find_spans(string):
    spans = []
    for word in span_words:
        if len(word) > 3:
            for match in re.finditer(word, string):
                spans.append([match.span()[0], match.span()[1]])
                # print(match.span())
                # print(word, string[match.span()[0]:match.span()[1]])

    return spans

In [80]:
find_spans(test.text[1])

[[47, 52],
 [339, 344],
 [364, 369],
 [412, 417],
 [467, 472],
 [558, 563],
 [593, 598],
 [785, 790],
 [142, 147],
 [484, 489],
 [558, 565],
 [558, 566],
 [531, 535],
 [752, 757],
 [553, 557],
 [214, 220],
 [291, 297],
 [372, 378],
 [538, 544],
 [524, 530],
 [785, 791],
 [753, 757]]