In [8]:
import torch
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gc
from tqdm import tqdm
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AdamW, get_scheduler
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2648)
#load best model
#model.load_state_dict(torch.load("../../../DeepPavlov_model10d/pytorch_model.bin", map_location=device))
model.to(device)

df = pd.read_csv("../data/mergedcleared1307.csv", sep=';', names=['id', 'label'], dtype={'id': str, 'label': str})
df = df[df['label'].notnull()]
#df['g_id'] = df.id.str.slice(start=0, stop=4)
Label_encoder = preprocessing.LabelEncoder()
Label_encoder.fit(df['id'])
df['id_enc'] = Label_encoder.fit_transform(df['id'])
df.columns = ['id', 'description', 'labels']
#data = df.drop(['id', 'g_id'], axis=1, inplace=False)
df.to_csv('../sets/sets10d/df-id-label.csv', index=False)
#data.to_csv('../sets/sets10d/df-label.csv', index=False)
np.save('../sets/sets10d/classes10d.npy', Label_encoder.classes_)
data = df.sample(frac=1).reset_index(drop=True)
train = data[:int(len(df)*0.8)]
test =  data[int(len(df)*0.8):]
train.to_csv('../sets/sets10d/train_with_ids.csv', index=False)
test.to_csv('../sets/sets10d/test_with_ids.csv', index=False)
test.drop(['id'], axis=1, inplace=True)
train.drop(['id'], axis=1, inplace=True)
train.to_csv('../sets/sets10d/train.csv', index=False)
test.to_csv('../sets/sets10d/test.csv', index=False)

dataset = load_dataset('csv', data_files={'train': '../sets/sets10d/train.csv', 'test': '../sets/sets10d/test.csv'})
dataset = dataset.map(lambda e: tokenizer(e['description'], truncation = True, max_length=100, padding='max_length'), batched=True)
pytorch_style_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']  # 
dataset = dataset.remove_columns(set(dataset['train'].features.keys()) - set(pytorch_style_columns))
dataset.set_format(type='torch', columns=pytorch_style_columns, device='cuda')

train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=8)
test_dataloader = DataLoader(dataset['test'], shuffle=False, batch_size=8)
#если есть видеопамять около 10гб, можно поставить 8


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['id'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['id'], axis=1, inplace=True)


Downloading and preparing dataset csv/default to C:/Users/yeril/.cache/huggingface/datasets/csv/default-14a4d5336b7d2ddf/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1999.67it/s]
Extracting data files: 100%|████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 399.93it/s]
                                                                     

Dataset csv downloaded and prepared to C:/Users/yeril/.cache/huggingface/datasets/csv/default-14a4d5336b7d2ddf/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 95.24it/s]
                                                                                                                       

In [9]:
optimizer = AdamW(model.parameters(), lr=2e-6)
num_epochs = 15
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    "constant_with_warmup",
    optimizer=optimizer,
    num_warmup_steps=2000,
    num_training_steps=num_training_steps
)

torch.cuda.empty_cache()
gc.collect()



42

In [18]:
best_f1 = 0.
show_train_loss_every_num_epoch = 0.1

for epoch in range(num_epochs):
    print(40*'-', '\nepoch', epoch+1)
    model.train()
    losses = []

    for i, batch in enumerate(tqdm(train_dataloader)):
        model.train()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        losses.append(loss.item())

        #f i%int(len(train_dataloader)*show_train_loss_every_num_epoch)==int(len(train_dataloader)*show_train_loss_every_num_epoch)-1:
    print(f'train loss [{i*100/len(train_dataloader):.2f}%]: {np.array(losses).mean():.3f}')
    losses = []
    print('\nvalidating')

    f1 = load_metric('f1')
    acc = load_metric('accuracy')
    precision = load_metric('precision')
    recall = load_metric('recall')
    with torch.no_grad():
        model.eval()
                
        for batch in tqdm(test_dataloader):
        ##for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            f1.add_batch(predictions=predictions, references=batch["labels"])    
            acc.add_batch(predictions=predictions, references=batch["labels"])    
            precision.add_batch(predictions=predictions, references=batch["labels"])    
            recall.add_batch(predictions=predictions, references=batch["labels"])    

        print('weighted summary:')
        print('Test acc:', acc.compute()['accuracy'])
        print('Test precision:', precision.compute(average = 'weighted')['precision'])
        print('Test recall:', recall.compute(average = 'weighted')['recall'])
        f1_weighted = f1.compute(average = 'weighted')['f1']
        print('Test f1:', f1_weighted, '\n')

        if f1_weighted > best_f1:
            best_f1 = f1_weighted
            model.save_pretrained("../../../DeepPavlov_model10d_over15")

    model.train()

---------------------------------------- 
epoch 1


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:05<00:00,  1.39it/s]


train loss [100.00%]: 0.464

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:37<00:00,  9.13it/s]


weighted summary:
Test acc: 0.8771568374967809


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.8824715774844861


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.8771568374967809
Test f1: 0.8728268540480849 

---------------------------------------- 
epoch 2


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:39:10<00:00,  1.39it/s]


train loss [100.00%]: 0.399

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:36<00:00,  9.15it/s]


weighted summary:
Test acc: 0.8836595415915529


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.888417385126726


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.8836595415915529
Test f1: 0.8804334943149661 

---------------------------------------- 
epoch 3


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:39:24<00:00,  1.39it/s]


train loss [100.00%]: 0.347

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.8885741265344664


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.8937002150602869


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.8885741265344664
Test f1: 0.8857027738038495 

---------------------------------------- 
epoch 4


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:32<00:00,  1.38it/s]


train loss [100.00%]: 0.301

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:40<00:00,  9.09it/s]


weighted summary:
Test acc: 0.8928448793887888


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.8976570332847337


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.8928448793887888
Test f1: 0.8905906119911222 

---------------------------------------- 
epoch 5


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:49:24<00:00,  1.34it/s]


train loss [100.00%]: 0.263

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:39<00:00,  9.11it/s]


weighted summary:
Test acc: 0.8945188428191261


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.8998392194353556


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.8945188428191261
Test f1: 0.8925878984872464 

---------------------------------------- 
epoch 6


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:43:38<00:00,  1.37it/s]


train loss [100.00%]: 0.229

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.8994548888316594


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.904406800985408


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.8994548888316594
Test f1: 0.8977618642060878 

---------------------------------------- 
epoch 7


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:17<00:00,  1.39it/s]


train loss [100.00%]: 0.201

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9000772598506309


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9047034610205023


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9000772598506309
Test f1: 0.8984078773891271 

---------------------------------------- 
epoch 8


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:25<00:00,  1.38it/s]


train loss [100.00%]: 0.176

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9026955103442356


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9075372324108698


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9026955103442356
Test f1: 0.9011426732451399 

---------------------------------------- 
epoch 9


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:24<00:00,  1.38it/s]


train loss [100.00%]: 0.155

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9034251867113057


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9082483481183024


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9034251867113057
Test f1: 0.9020561032285196 

---------------------------------------- 
epoch 10


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:27<00:00,  1.38it/s]


train loss [100.00%]: 0.136

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9043694737745729


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9097239138216695


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9043694737745729
Test f1: 0.9033401562306322 

---------------------------------------- 
epoch 11


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:23<00:00,  1.38it/s]


train loss [100.00%]: 0.119

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9058932097175724


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9110581217429085


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9058932097175724
Test f1: 0.9049147497321325 

---------------------------------------- 
epoch 12


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:25<00:00,  1.38it/s]


train loss [100.00%]: 0.106

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9065155807365439


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9118305441604269


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9065155807365439
Test f1: 0.9055244841153054 

---------------------------------------- 
epoch 13


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:05<00:00,  1.39it/s]


train loss [100.00%]: 0.093

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.17it/s]


weighted summary:
Test acc: 0.9057429822302343


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.911757203168865


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9057429822302343
Test f1: 0.9047897612372952 

---------------------------------------- 
epoch 14


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:21<00:00,  1.39it/s]


train loss [100.00%]: 0.083

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:35<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9076530174263885


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9128308713999452


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9076530174263885
Test f1: 0.9067911068721427 

---------------------------------------- 
epoch 15


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [4:40:25<00:00,  1.38it/s]


train loss [100.00%]: 0.074

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:36<00:00,  9.16it/s]


weighted summary:
Test acc: 0.9084470770023177


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9135803848846313


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9084470770023177
Test f1: 0.9076070623710335 



In [12]:
from sklearn import preprocessing
from sklearn.metrics import classification_report
Label_encoder = preprocessing.LabelEncoder()
Label_encoder.classes_ = np.load('../sets/sets10d/classes10d.npy', allow_pickle=True)

true = []
preds = []

model.eval()
for batch in tqdm(test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    true += batch["labels"].detach().cpu().numpy().tolist()
    preds += predictions.detach().cpu().numpy().tolist()

print(classification_report(Label_encoder.inverse_transform(true), Label_encoder.inverse_transform(preds)))

100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:32<00:00,  9.20it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

  0207149909       1.00      1.00      1.00        10
  0304999800       0.93      1.00      0.97        14
  0305542000       1.00      1.00      1.00        19
  0305549000       0.90      1.00      0.95         9
  0305599000       1.00      0.90      0.95        10
  0604909900       0.79      0.73      0.76        15
  0701905000       1.00      1.00      1.00        17
  0703200000       1.00      1.00      1.00        15
  0711400000       1.00      1.00      1.00        15
  0711590000       1.00      1.00      1.00         6
  0712909000       1.00      1.00      1.00        11
  0713339000       1.00      1.00      1.00        17
  0801110000       1.00      1.00      1.00        13
  0802310000       0.96      1.00      0.98        23
  0802320000       1.00      0.97      0.98        33
  0803901000       1.00      1.00      1.00        22
  0804300009       1.00      1.00      1.00        18
  0804500001       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
def predict_class(text):
    model.to(torch.device('cpu'))
    inputs = tokenizer(text, truncation = True, max_length=100, padding='max_length', return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class_id = logits.argmax().item()
        return Label_encoder.inverse_transform([predicted_class_id])


In [14]:
print(predict_class("Велосипед"), predict_class("велосипед"), predict_class("Велосипедов"))

['8712007000'] ['8712003000'] ['8712007000']


In [15]:
print(predict_class("презерватив"), predict_class("презервативов"), predict_class("гандоны"), predict_class("гандон"))

['4014100000'] ['4014100000'] ['4016995709'] ['3926909709']


In [17]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12