In [98]:
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git

fatal: destination path 'character-tokenizer' already exists and is not an empty directory.


In [99]:
!pip install transformers



In [100]:
import string
import sys
sys.path.append("character-tokenizer")
from charactertokenizer import CharacterTokenizer

chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
model_max_length = 64
tokenizer = CharacterTokenizer(chars, model_max_length)

In [101]:
len(chars)

66

In [102]:
example = "Привет"
tokens = tokenizer(example)
print(tokens)

{'input_ids': [0, 39, 42, 26, 12, 18, 46, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [103]:
example = "Привет"
tokens = tokenizer.encode_plus(
    example,
    add_special_tokens = True,
    max_length = 20,
    padding = 'max_length'
)
print(tokens)

{'input_ids': [0, 39, 42, 26, 12, 18, 46, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [104]:
#!rm -rf all_accents.zip

In [105]:
!wget https://github.com/Koziev/NLP_Datasets/raw/master/Stress/all_accents.zip
!unzip all_accents.zip

--2023-11-22 15:18:20--  https://github.com/Koziev/NLP_Datasets/raw/master/Stress/all_accents.zip
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Stress/all_accents.zip [following]
--2023-11-22 15:18:20--  https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Stress/all_accents.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10624775 (10M) [application/zip]
Saving to: ‘all_accents.zip.1’


2023-11-22 15:18:20 (163 MB/s) - ‘all_accents.zip.1’ saved [10624775/10624775]

Archive:  all_accents.zip
replace all_accents.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename

In [114]:
import pandas as pd
file_path = 'all_accents.tsv'
df = pd.read_csv(file_path, delimiter='\t')

df.sample(20)

Unnamed: 0,-де,-д^е
1508487,триенале,триен^але
834827,обтиснуло,обт^иснуло
56703,атрофирующимся,атроф^ирующимся
276290,голубевшему,голуб^евшему
751551,наяривающей,на^яривающей
679202,моддингу,м^оддингу
1590761,фунгицидному,фунгиц^идному
139501,венчурному,в^енчурному
72629,басовитых,басов^итых
1014626,пленении,плен^ении


In [115]:
len(df)

1680534

In [116]:
max_length = df['-де'].apply(len).max()

longest_string = df[df['-де'].apply(len) == max_length]['-де'].values[0]

print("Самая длинная строка:", longest_string)
print("Длина строки:", max_length)

Самая длинная строка: лланвайрпуллгуингиллгогерихуирндробуллллантисилиогогогох
Длина строки: 56


In [117]:
df.iloc[624999]

-де     лланвайрпуллгуингиллгогерихуирндробуллллантиси...
-д^е    лланвайрпуллгуингиллгогерихуирндробуллллантиси...
Name: 624999, dtype: object

*Жесткач :)*

In [118]:
import torch
from torch.utils.data import Dataset

In [119]:
from sklearn.model_selection import train_test_split

In [120]:
class StressDataset(Dataset):
    def __init__(self, file_path, max_length=56, test_size=0.5, random_seed=42):
        self.df = pd.read_csv(file_path, delimiter='\t')
        self.chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
        self.model_max_length = 64
        self.tokenizer = CharacterTokenizer(self.chars, self.model_max_length)
        self.word_max_length = max_length

        self.train_data_index, self.test_data_index = (
            train_test_split(
                self.df.index,
                test_size=test_size,
                random_state=random_seed
            )
        )
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.loc[idx, '-де']
        label = self.df.loc[idx, '-д^е']

        tokens = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.word_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # [CLS]востфак[SEP][PAD]...[PAD]
        # востфак востф^ак
        number_stress_token = label.find('^') + 1
        labels = [0]*len(tokens['attention_mask'].squeeze())
        labels[number_stress_token] = 1
        
        labels = torch.tensor(labels, dtype=torch.long)
        tokens['labels'] = labels
        return tokens
        

In [121]:
file_path = 'all_accents.tsv'
stress_dataset = StressDataset(file_path)

In [122]:
INDEX = 107500
word_tokens = stress_dataset[INDEX]
(
word_tokens['labels'],
df.iloc[INDEX].values[0],
df.iloc[INDEX].values[1],
tokenizer.decode(word_tokens['input_ids'].squeeze().tolist())
)

(tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'бордель',
 'борд^ель',
 '[CLS]бордель[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]')

In [123]:
from torch.utils.data import Subset, DataLoader

In [124]:
file_path = 'all_accents.tsv'
stress_dataset = StressDataset(file_path)

In [125]:
def collate_fn(batch):
    input_ids = torch.stack([sample['input_ids'] for sample in batch])
    attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
    labels = torch.stack([sample['labels'] for sample in batch])

    return [input_ids.squeeze(1), attention_mask.squeeze(1), labels]

batch_size = 256

train_loader = DataLoader(
    Subset(stress_dataset, stress_dataset.train_data_index),
    batch_size=batch_size,
    shuffle=True,
    drop_last = True,
    collate_fn = collate_fn,
)

test_loader = DataLoader(
    Subset(stress_dataset, stress_dataset.test_data_index),
    batch_size=batch_size,
    shuffle=False,
    collate_fn = collate_fn,
)

In [126]:
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [127]:
!pip install transformers
!pip install pynvml



In [128]:
#TEST
cnt = 0
for batch in train_loader:
    print(len(batch[0]), len(batch[1]), len(batch[2]))
    print(type(batch[0]), type(batch[1]), type(batch[2]))
    cnt += 1
    if cnt == 3:
        break

256 256 256
<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>
256 256 256
<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>
256 256 256
<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [131]:
#TEST
for batch in train_loader:
    print(batch[0].shape, batch[1].shape, batch[2].shape)
    break

torch.Size([256, 56]) torch.Size([256, 56]) torch.Size([256, 56])


In [133]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [134]:
import torch
from torch.utils.data import DataLoader
from transformers import BertForTokenClassification, AdamW

from transformers import BertConfig

config = BertConfig(
    hidden_size=512,
    num_labels=2,  # 2 класса: "NO", "PRIMARY"
    output_attentions=False,
    output_hidden_states=False,
    num_attention_heads=8,
    num_hidden_layers=4,
    max_position_embeddings=60,
    pad_token_id=0,
    id2label={
        "0": "NO",
        "1": "PRIMARY",
    },
    label2id={
        "NO": 0,
        "PRIMARY": 1,
    },
)

model = BertForTokenClassification(config)
model.cuda()


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(60, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, eleme

In [135]:
from transformers import get_linear_schedule_with_warmup

In [136]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1

total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
criterion = torch.nn.CrossEntropyLoss()



In [137]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_labels_flat = np.argmax(preds, axis=-1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_labels_flat == labels_flat) / len(labels_flat)

In [138]:
# TEST
#for batch in train_loader:
#    with torch.no_grad():
#        output = model(
#            batch[0].to(device),
#            token_type_ids=None,
#            attention_mask=batch[1].to(device),
#            labels=batch[2].to(device)
#        )
#    
#    logits = output.logits.detach().cpu().numpy()
#    label_ids = batch[2].to('cpu').numpy()
#    break
#

In [None]:
#logits.shape

In [None]:
#label_ids.shape

In [None]:
#np.argmax(logits, axis=-1)[0], label_ids[0]


In [139]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

## пайплайн обучения

In [140]:
import random
import numpy as np

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


training_stats = []


total_t0 = time.time()

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()

    total_train_loss = 0
    total_train_accuracy = 0
    
    model.train()

    for step, batch in enumerate(train_loader):
        # Progress update every 100 batches.
        if step % 100 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        output = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
            return_dict=True
        )

        total_train_loss += output.loss.item()

        output.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        logits = output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()


        total_train_accuracy += flat_accuracy(logits, label_ids)


    avg_train_accuracy = total_train_accuracy / len(train_loader)

    avg_train_loss = total_train_loss / len(train_loader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Accuracy: {0:.2f}".format(avg_train_accuracy))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()


    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in test_loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():


            output = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)

        # Accumulate the validation loss.
        total_eval_loss += output.loss.item()

        # Move logits and labels to CPU
        logits = output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()


        total_eval_accuracy += flat_accuracy(logits, label_ids)


    avg_val_accuracy = total_eval_accuracy / len(test_loader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(test_loader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch   100  of  3,282.    Elapsed: 0:00:31.
  Batch   200  of  3,282.    Elapsed: 0:01:01.
  Batch   300  of  3,282.    Elapsed: 0:01:31.
  Batch   400  of  3,282.    Elapsed: 0:02:01.
  Batch   500  of  3,282.    Elapsed: 0:02:31.
  Batch   600  of  3,282.    Elapsed: 0:03:01.
  Batch   700  of  3,282.    Elapsed: 0:03:31.
  Batch   800  of  3,282.    Elapsed: 0:04:01.
  Batch   900  of  3,282.    Elapsed: 0:04:31.
  Batch 1,000  of  3,282.    Elapsed: 0:05:01.
  Batch 1,100  of  3,282.    Elapsed: 0:05:31.
  Batch 1,200  of  3,282.    Elapsed: 0:06:01.
  Batch 1,300  of  3,282.    Elapsed: 0:06:31.
  Batch 1,400  of  3,282.    Elapsed: 0:07:01.
  Batch 1,500  of  3,282.    Elapsed: 0:07:31.
  Batch 1,600  of  3,282.    Elapsed: 0:08:02.
  Batch 1,700  of  3,282.    Elapsed: 0:08:31.
  Batch 1,800  of  3,282.    Elapsed: 0:09:02.
  Batch 1,900  of  3,282.    Elapsed: 0:09:32.
  Batch 2,000  of  3,282.    Elapsed: 0:10:02.
  Batch 2,100  of  3,282.    Elapsed: 0:10:32.


## Accuracy: 0.99