In [462]:
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import math
from tqdm.auto import tqdm
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler

from transformers import BertTokenizer, BertForSequenceClassification, BertModel

In [227]:
# Each line is split on its LAST comma into [text, label]; the text itself may
# contain commas. Files are opened via `with` so the handles are closed promptly.
# NOTE(review): assumes every line has the form "<text>,<label>" and that there
# is no header row -- confirm against the data files.
with open('train.csv') as train_file:
    lines_train = [line.strip('\n').rsplit(',', 1) for line in train_file]
with open('test.csv') as test_file:
    lines_test = [line.strip('\n').rsplit(',', 1) for line in test_file]

In [200]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [305]:
# Load the tokenizer that belongs to the 'bert-base-multilingual-cased' checkpoint
# (the same checkpoint name is used for the encoder further down).
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

Цель: дообучить (fine-tune) BERT для классификации по контекстному эмбеддингу токена ENT (или по среднему эмбеддингу, если таких токенов в тексте несколько).

# Preparing for sequence bucketing 

In [306]:

def crop_len(seq, max_len=508):
    """Crop a token list to at most ``max_len`` tokens around its '[MASK]' tokens.

    The window is centred on the midpoint between the first and the last
    '[MASK]' token and then shifted, if necessary, so that it stays inside the
    sequence. The result therefore always has ``min(len(seq), max_len)``
    tokens instead of being silently shortened near either end.

    Args:
        seq: list of token strings.
        max_len: maximum number of tokens to keep (default 508, the value the
            callers in this notebook use as their length threshold).

    Returns:
        A new list with the cropped tokens.

    Edge cases:
        * No '[MASK]' token at all: the first ``max_len`` tokens are returned.
        * First and last '[MASK]' so far apart that the midpoint window holds
          no '[MASK]': the window is re-centred on the first '[MASK]' so the
          result always contains at least one.
    """
    ent_positions = [pos for pos, token in enumerate(seq) if token == '[MASK]']
    if not ent_positions:
        return seq[:max_len]

    def window_around(center):
        # Clamp the start so the window never runs past either end of `seq`.
        start = center - max_len // 2
        start = max(0, min(start, len(seq) - max_len))
        return seq[start:start + max_len]

    first, last = ent_positions[0], ent_positions[-1]
    window = window_around((first + last) // 2)
    if '[MASK]' not in window:
        window = window_around(first)
    return window
     
            
    

In [307]:
# Tokenise every training row into BERT input ids and collect the raw labels.
tokenized_train = []
labels_train = []

for text, label in tqdm(lines_train):
    # The target entity is written as 'ENT' in the raw text; it is swapped for
    # '[MASK]' so it can be located again after tokenisation.
    # NOTE(review): str.replace also rewrites 'ENT' inside longer words -- confirm
    # the data never contains such words.
    tokens = tokenizer.tokenize(text.replace('ENT', '[MASK]'))
    # Over-long texts are cropped around the entity; 508 presumably leaves room
    # for the special tokens added by `encode` -- TODO confirm.
    if len(tokens) > 508:
        tokens = crop_len(tokens)
    tokenized_train.append(tokenizer.encode(tokens))
    labels_train.append(label)


HBox(children=(FloatProgress(value=0.0, max=23253.0), HTML(value='')))




In [308]:
# Tokenise every test row into BERT input ids and collect the raw labels.
tokenized_test = []
labels_test = []

for text, label in tqdm(lines_test):
    # The target entity is written as 'ENT' in the raw text; it is swapped for
    # '[MASK]' so it can be located again after tokenisation.
    # NOTE(review): str.replace also rewrites 'ENT' inside longer words -- confirm
    # the data never contains such words.
    tokens = tokenizer.tokenize(text.replace('ENT', '[MASK]'))
    # Over-long texts are cropped around the entity; 508 presumably leaves room
    # for the special tokens added by `encode` -- TODO confirm.
    if len(tokens) > 508:
        tokens = crop_len(tokens)
    tokenized_test.append(tokenizer.encode(tokens))
    labels_test.append(label)


HBox(children=(FloatProgress(value=0.0, max=9966.0), HTML(value='')))




In [315]:
# Pair every id sequence with its label. The pairs are materialised as lists:
# a bare zip() is a one-shot iterator, so re-running the cell that consumes it
# would otherwise silently produce empty results.
zipped_train = list(zip(tokenized_train, labels_train))
zipped_test = list(zip(tokenized_test, labels_test))

In [316]:
# Order the (ids, label) pairs from longest to shortest id sequence so that
# neighbouring samples have similar lengths. The sort is stable, so pairs of
# equal length keep their original relative order.
tokenized_train = list(zipped_train)
tokenized_train.sort(key=lambda pair: len(pair[0]), reverse=True)

tokenized_test = list(zipped_test)
tokenized_test.sort(key=lambda pair: len(pair[0]), reverse=True)

# Dataset-forming

In [317]:
batch_size = 32

# Cut the length-sorted samples into consecutive chunks of `batch_size`;
# the final chunk may be shorter.
batches_train = [
    tokenized_train[start:start + batch_size]
    for start in range(0, len(tokenized_train), batch_size)
]

batches_test = [
    tokenized_test[start:start + batch_size]
    for start in range(0, len(tokenized_test), batch_size)
]

In [318]:
# Shuffle the ORDER of the batches (their contents stay length-homogeneous).
# A dedicated, seeded generator makes the order reproducible for the same input
# without touching the global `random` state.
_shuffle_rng = random.Random(42)
_shuffle_rng.shuffle(batches_train)
_shuffle_rng.shuffle(batches_test)

In [384]:
class SequenceBucketingData(torch.utils.data.Dataset):
    """Dataset whose items are whole, pre-built batches.

    ``data`` is a sequence of batches, and every batch is a sequence of
    ``(token_ids, label)`` pairs, where ``token_ids`` is a list of ints and
    ``label`` is one of the keys of ``self.map``. Indexing pads the batch to
    its own longest sequence and returns tensors.
    """

    def __init__(self, data, pad_index):
        """Store the batches and the id used for padding."""
        self.data = data
        self.pad_index = pad_index
        # Label string -> class index.
        self.map = {'1': 0, '0': 1, '-1': 2}

    def __len__(self):
        """Number of batches (not samples)."""
        return len(self.data)

    def pad_seq(self, sequence, max_len):
        """Pad one ``(token_ids, label)`` pair to ``max_len``.

        Returns:
            ``(padded_ids, one_hot)``: a NEW list of ids and a float numpy
            vector with a 1 at the label's class index.
        """
        token_ids, label = sequence[0], sequence[1]
        padding = [self.pad_index] * (max_len - len(token_ids))
        # Build a new list: `+=` would extend the stored sample in place and
        # permanently alter the underlying data.
        padded = list(token_ids) + padding

        one_hot = np.zeros([len(self.map)])
        one_hot[self.map[label]] = 1
        return padded, one_hot

    def make_mask(self, x):
        """Return 1.0 for every real token and 0.0 for every padding id."""
        return [float(token_id != self.pad_index) for token_id in x]

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, one_hot_labels)`` for one batch.

        All three are ``torch.long`` tensors whose first dimension is the
        number of samples in the batch; ids and mask are padded to the
        longest sequence in this batch.
        """
        batch = self.data[index]
        max_len = max(len(sample[0]) for sample in batch)

        batch_x = []
        batch_y = []
        attn_masks = []
        for sample in batch:
            padded, one_hot = self.pad_seq(sample, max_len)
            batch_x.append(padded)
            batch_y.append(one_hot)
            attn_masks.append(self.make_mask(padded))

        batch_x = torch.tensor(batch_x).long()
        # Stack first: building a tensor from a list of numpy arrays is slow.
        batch_y = torch.from_numpy(np.stack(batch_y)).long()
        attn_masks = torch.tensor(attn_masks).long()

        return batch_x, attn_masks, batch_y

In [385]:
# Take the padding id from the tokenizer instead of hard-coding it, so it always
# matches the vocabulary in use (for BERT vocabularies the [PAD] id is 0).
pad_index = tokenizer.pad_token_id

In [386]:
# Despite the name this is a Dataset whose items are complete batches; it is
# iterated directly rather than wrapped in a DataLoader.
train_loader = SequenceBucketingData(data=batches_train, pad_index=pad_index)

In [387]:
# Despite the name this is a Dataset whose items are complete batches; it is
# iterated directly rather than wrapped in a DataLoader.
test_loader = SequenceBucketingData(data=batches_test, pad_index=pad_index)

# Model

In [451]:
class BertForSequenceClassification(nn.Module):
    """BERT encoder plus a linear head that classifies the masked entity.

    Each input is represented by the mean of the contextual embeddings at the
    positions whose id equals ``mask_token_id``; that vector is fed to a
    linear classifier.

    NOTE(review): this class shadows the ``BertForSequenceClassification``
    imported from ``transformers`` at the top of the notebook.
    """

    def __init__(self, num_labels=3, pretrained_name='bert-base-multilingual-cased',
                 mask_token_id=103):
        """
        Args:
            num_labels: number of output classes.
            pretrained_name: checkpoint passed to ``BertModel.from_pretrained``.
            mask_token_id: id of the token whose embeddings are pooled. The
                default 103 is the value previously hard-coded here --
                presumably the tokenizer's [MASK] id; confirm when switching
                checkpoints.
        """
        super().__init__()

        self.num_labels = num_labels
        self.mask_token_id = mask_token_id
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(0.1)
        # Read the width from the encoder config instead of hard-coding 768.
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)

        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return unnormalised class logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: ``(batch, seq_len)`` tensor of token ids.
            token_type_ids: optional segment ids, forwarded to the encoder.
            attention_mask: optional padding mask, forwarded to the encoder.
            labels: accepted for interface compatibility; not used.

        Raises:
            ValueError: if some row contains no ``mask_token_id`` token, since
                there is then nothing to pool.
        """
        # Keyword arguments: BertModel's positional order is
        # (input_ids, attention_mask, token_type_ids), so passing these two
        # positionally in the order of this signature would swap them.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Index 0 is the per-token hidden states; indexing (rather than tuple
        # unpacking) does not depend on how many outputs the encoder returns.
        token_embs = self.dropout(encoder_outputs[0])

        # Vectorised mean over the masked-entity positions of each row.
        ent_mask = input_ids == self.mask_token_id           # (batch, seq_len)
        ent_counts = ent_mask.sum(dim=1, keepdim=True)       # (batch, 1)
        if bool((ent_counts == 0).any()):
            raise ValueError(
                'every input row must contain at least one token with id '
                '{}'.format(self.mask_token_id)
            )
        weights = ent_mask.unsqueeze(-1).to(token_embs.dtype)
        pooled = (token_embs * weights).sum(dim=1) / ent_counts.to(token_embs.dtype)

        return self.classifier(pooled)

    def freeze(self):
        """Stop gradient updates for all encoder parameters (head stays trainable)."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze(self):
        """Re-enable gradient updates for all encoder parameters."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [452]:

# Build the classifier and move its parameters to the selected device
# (`Module.to` returns the module itself, so the calls can be chained).
model = BertForSequenceClassification().to(device)

In [466]:
# Separate learning rates: a small one for the pretrained encoder and a larger
# one for the freshly initialised classification head.
lrlast = .001
lrmain = .00001

# `Adam` is reached through `torch.optim` explicitly: the result is stored under
# the name `optim`, which replaces the `torch.optim` module alias imported at the
# top, so `optim.Adam(...)` would fail the second time this cell is run.
optim = torch.optim.Adam(
    [
        {"params": model.bert.parameters(), "lr": lrmain},
        {"params": model.classifier.parameters(), "lr": lrlast},
    ]
)

In [459]:
# Multiply every learning rate by 0.1 after each 3 scheduler steps.
# `StepLR` is reached through `torch.optim.lr_scheduler` explicitly: the result
# is stored under the name `lr_scheduler`, which replaces the imported module of
# the same name, so `lr_scheduler.StepLR(...)` would fail on a second run.
# NOTE(review): the schedule only advances when `lr_scheduler.step()` is called
# -- confirm the training loop does so once per epoch.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=3, gamma=0.1)

In [461]:
# No `ignore_index`: class index 0 is a real class here (the dataset maps the
# label '1' to index 0), so ignoring it would drop every such sample from the loss.
criterion = nn.CrossEntropyLoss()

In [492]:
def train(model, iterator, optimizer, criterion, epoch_number):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: module called as ``model(input_ids)`` and returning class logits.
        iterator: sized iterable of ``(input_ids, attention_mask, one_hot_labels)``
            batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking ``(logits, class_indices)``.
        epoch_number: zero-based epoch index, used only for the progress-bar title.

    Returns:
        Mean training loss over the epoch as a float (``nan`` if there were no
        batches).
    """
    model.train()
    progress_bar = tqdm(iterator, total=len(iterator), desc='Epoch {}'.format(epoch_number + 1))
    losses_list = []

    for batch in progress_bar:
        input_ids = batch[0].to(device)
        labels = batch[2].to(device)
        # TODO(review): batch[1] (the attention mask) is not forwarded. Pass it as
        # `attention_mask=` once model.forward is confirmed to route it to the
        # encoder's attention_mask argument.

        optimizer.zero_grad()

        # Raw logits go straight into the criterion: a cross-entropy loss applies
        # log-softmax itself, so an explicit softmax here would squash the
        # logits twice and flatten the gradients.
        logits = model(input_ids).float()

        # Batches carry one-hot labels; the criterion wants class indices.
        targets = torch.argmax(labels, dim=1)

        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        losses_list.append(loss.item())
        # Show a running mean over the most recent 100 batches.
        progress_bar.set_postfix(train_loss=np.mean(losses_list[-100:]))

    return float(np.mean(losses_list)) if losses_list else float('nan')
            

In [1]:
# Run the first epoch (epoch numbers are zero-based; the progress bar shows "Epoch 1").
train(model, train_loader, optim, criterion, epoch_number=0)

NameError: name 'train' is not defined