In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gc
from tqdm import tqdm
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AdamW, get_scheduler
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=660)
model.to(device)

df = pd.read_csv("../data/mergedcleared1307.csv", sep=';', names=['id', 'label'], dtype={'id': str, 'label': str})
df = df[df['label'].notnull()]
df['g_id'] = df.id.str.slice(start=0, stop=4)
Label_encoder = preprocessing.LabelEncoder()
Label_encoder.fit(df['g_id'])
df['g_id_enc'] = Label_encoder.fit_transform(df['g_id'])
df.columns = ['id', 'description', 'g_id', 'labels']
#data = df.drop(['id', 'g_id'], axis=1, inplace=False)
df.to_csv('../sets/df-id-gid-label.csv', index=False)
#data.to_csv('../sets/df-label.csv', index=False)
np.save('../sets/classes4d.npy', Label_encoder.classes_)
data = df.sample(frac=1).reset_index(drop=True)
train = data[:int(len(df)*0.8)]
test =  data[int(len(df)*0.8):]
train.to_csv('../sets/train_with_ids.csv', index=False)
test.to_csv('../sets/test_with_ids.csv', index=False)
test.drop(['id', 'g_id'], axis=1, inplace=True)
train.drop(['id', 'g_id'], axis=1, inplace=True)
train.to_csv('../sets/train.csv', index=False)
test.to_csv('../sets/test.csv', index=False)

dataset = load_dataset('csv', data_files={'train': '../sets/train.csv', 'test': '../sets/test.csv'})
dataset = dataset.map(lambda e: tokenizer(e['description'], truncation = True, max_length=100, padding='max_length'), batched=True)
pytorch_style_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']  # 
dataset = dataset.remove_columns(set(dataset['train'].features.keys()) - set(pytorch_style_columns))
dataset.set_format(type='torch', columns=pytorch_style_columns, device='cuda')

train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=4)
test_dataloader = DataLoader(dataset['test'], shuffle=False, batch_size=4)
#если есть видеопамять около 10гб, можно поставить 8


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['id', 'g_id'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['id', 'g_id'], axis=1, inplace=True)


Downloading and preparing dataset csv/default to C:/Users/yeril/.cache/huggingface/datasets/csv/default-16101415d53253b2/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2061.08it/s]
Extracting data files: 100%|████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 400.12it/s]
                                                                     

Dataset csv downloaded and prepared to C:/Users/yeril/.cache/huggingface/datasets/csv/default-16101415d53253b2/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 83.95it/s]
                                                                                                                       

In [4]:
optimizer = AdamW(model.parameters(), lr=2e-6)
num_epochs = 15
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    "constant_with_warmup",
    optimizer=optimizer,
    num_warmup_steps=2000,
    num_training_steps=num_training_steps
)

torch.cuda.empty_cache()
gc.collect()

20

In [5]:
best_f1 = 0.
show_train_loss_every_num_epoch = 0.1

for epoch in range(num_epochs):
    print(40*'-', '\nepoch', epoch+1)
    model.train()
    losses = []

    for i, batch in enumerate(tqdm(train_dataloader)):
        model.train()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        losses.append(loss.item())

        #f i%int(len(train_dataloader)*show_train_loss_every_num_epoch)==int(len(train_dataloader)*show_train_loss_every_num_epoch)-1:
    print(f'train loss [{i*100/len(train_dataloader):.2f}%]: {np.array(losses).mean():.3f}')
    losses = []
    print('\nvalidating')

    f1 = load_metric('f1')
    acc = load_metric('accuracy')
    precision = load_metric('precision')
    recall = load_metric('recall')
    with torch.no_grad():
        model.eval()
                
        for batch in tqdm(test_dataloader):
        ##for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            f1.add_batch(predictions=predictions, references=batch["labels"])    
            acc.add_batch(predictions=predictions, references=batch["labels"])    
            precision.add_batch(predictions=predictions, references=batch["labels"])    
            recall.add_batch(predictions=predictions, references=batch["labels"])    

        print('weighted summary:')
        print('Test acc:', acc.compute()['accuracy'])
        print('Test precision:', precision.compute(average = 'weighted')['precision'])
        print('Test recall:', recall.compute(average = 'weighted')['recall'])
        f1_weighted = f1.compute(average = 'weighted')['f1']
        print('Test f1:', f1_weighted, '\n')

        if f1_weighted > best_f1:
            best_f1 = f1_weighted
            model.save_pretrained("../../../DeepPavlov_model")

    model.train()

---------------------------------------- 
epoch 1


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:54:51<00:00,  1.65it/s]
  f1 = load_metric('f1')


train loss [100.00%]: 4.249

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:30<00:00,  9.23it/s]


weighted summary:
Test acc: 0.5494463044038115


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.44686225834453636


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.5494463044038115
Test f1: 0.45306600284587395 

---------------------------------------- 
epoch 2


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:51:45<00:00,  1.68it/s]


train loss [100.00%]: 2.247

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:31<00:00,  9.23it/s]


weighted summary:
Test acc: 0.7196111254184908


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.6512836849254209
Test recall: 0.7196111254184908
Test f1: 0.6583117883751676 

---------------------------------------- 
epoch 3


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:51:42<00:00,  1.68it/s]


train loss [100.00%]: 1.405

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:30<00:00,  9.24it/s]


weighted summary:
Test acc: 0.809017941454202


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.769648826402909
Test recall: 0.809017941454202
Test f1: 0.770145492224197 

---------------------------------------- 
epoch 4


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:51:44<00:00,  1.68it/s]


train loss [100.00%]: 0.943

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:29<00:00,  9.25it/s]


weighted summary:
Test acc: 0.8669842904970384


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.8519589276486657
Test recall: 0.8669842904970384
Test f1: 0.8454398400865679 

---------------------------------------- 
epoch 5


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:53:49<00:00,  1.66it/s]


train loss [100.00%]: 0.658

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:29<00:00,  9.25it/s]


weighted summary:
Test acc: 0.9052279165593613


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.8988398246904225
Test recall: 0.9052279165593613
Test f1: 0.8940048680448462 

---------------------------------------- 
epoch 6


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:55:40<00:00,  1.65it/s]


train loss [100.00%]: 0.468

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:50<00:00,  8.96it/s]


weighted summary:
Test acc: 0.9258949266031419


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9236027187844431
Test recall: 0.9258949266031419
Test f1: 0.9197142948325676 

---------------------------------------- 
epoch 7


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:56:24<00:00,  1.64it/s]


train loss [100.00%]: 0.339

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:30<00:00,  9.23it/s]


weighted summary:
Test acc: 0.9388359515838269


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9377031747620691
Test recall: 0.9388359515838269
Test f1: 0.9356008178723766 

---------------------------------------- 
epoch 8


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:53:04<00:00,  1.67it/s]


train loss [100.00%]: 0.248

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:29<00:00,  9.25it/s]


weighted summary:
Test acc: 0.9457464160013735


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9472029017705119
Test recall: 0.9457464160013735
Test f1: 0.9441380244831442 

---------------------------------------- 
epoch 9


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:52:51<00:00,  1.67it/s]


train loss [100.00%]: 0.185

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:31<00:00,  9.23it/s]


weighted summary:
Test acc: 0.9495021031848228
Test precision: 0.9512916947010742


  _warn_prf(average, modifier, msg_start, len(result))


Test recall: 0.9495021031848228
Test f1: 0.9488496749252348 

---------------------------------------- 
epoch 10


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:52:48<00:00,  1.67it/s]


train loss [100.00%]: 0.141

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:31<00:00,  9.23it/s]


weighted summary:
Test acc: 0.9519057429822302


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9525739812292136
Test recall: 0.9519057429822302
Test f1: 0.9513583800832051 

---------------------------------------- 
epoch 11


100%|██████████████████████████████████████████████████████████████████████████| 23298/23298 [3:56:19<00:00,  1.64it/s]


train loss [100.00%]: 0.109

validating


100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:47<00:00,  8.99it/s]


weighted summary:
Test acc: 0.9532792514378917


  _warn_prf(average, modifier, msg_start, len(result))


Test precision: 0.9542291150877494
Test recall: 0.9532792514378917
Test f1: 0.9529606409473039 

---------------------------------------- 
epoch 12


 32%|███████████████████████▌                                                 | 7539/23298 [1:15:26<2:37:42,  1.67it/s]


KeyboardInterrupt: 

In [6]:
from sklearn import preprocessing
from sklearn.metrics import classification_report
Label_encoder = preprocessing.LabelEncoder()
Label_encoder.classes_ = np.load('./cl_classes2610.npy', allow_pickle=True)

true = []
preds = []

model.eval()
for batch in tqdm(test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    true += batch["labels"].detach().cpu().numpy().tolist()
    preds += predictions.detach().cpu().numpy().tolist()

print(classification_report(Label_encoder.inverse_transform(true), Label_encoder.inverse_transform(preds)))

100%|██████████████████████████████████████████████████████████████████████████████| 5825/5825 [10:24<00:00,  9.33it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        0207       0.94      1.00      0.97        17
        0304       1.00      0.91      0.95        11
        0305       0.95      1.00      0.98        42
        0604       1.00      1.00      1.00        11
        0701       1.00      1.00      1.00        12
        0703       1.00      1.00      1.00        18
        0711       0.97      1.00      0.99        35
        0712       1.00      1.00      1.00        13
        0713       1.00      1.00      1.00        21
        0801       0.95      1.00      0.97        19
        0802       1.00      1.00      1.00        47
        0803       1.00      1.00      1.00        22
        0804       0.97      1.00      0.99        38
        0805       1.00      1.00      1.00        69
        0806       1.00      1.00      1.00        13
        0808       1.00      1.00      1.00       139
        0813       0.97      1.00      0.99        34
        0901       1.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
def predict_class(text):
    model.to(torch.device('cpu'))
    inputs = tokenizer(text, truncation = True, max_length=100, padding='max_length', return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class_id = logits.argmax().item()
        return Label_encoder.inverse_transform([predicted_class_id])


In [8]:
print(predict_class("Велосипед"), predict_class("велосипед"), predict_class("Велосипедов"))

['8712'] ['8712'] ['8712']


In [9]:
print(predict_class("презерватив"), predict_class("презервативов"), predict_class("гандоны"), predict_class("гандон"))

['4014'] ['4014'] ['9608'] ['5607']
