# Моя первая попытка работы с трансформерами

Частично прошел курс на платформе [huggingface](https://huggingface.co/) и реализовал классификацию с помощью их моделей.

Взял модель [DeepPavlov/rubert-base-cased-conversational](https://huggingface.co/DeepPavlov/rubert-base-cased-conversational).


## Настройки, импорты, установка

In [None]:
# Install the Hugging Face `transformers` package into the notebook environment.
!pip install transformers

In [None]:
# List the GPUs visible to this session (`-L` prints one line per device).
!nvidia-smi -L

In [None]:
# For data manipulation
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
from pylab import rcParams

# Default figure size applied to the matplotlib plots in this notebook.
rcParams['figure.figsize'] = 10, 5

# Render inline figures as SVG in the notebook output.
%config InlineBackend.figure_format = 'svg' 
%matplotlib inline

import torch
import torch.nn as nn

# For Transformer Models
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [None]:
# Hugging Face Hub checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "DeepPavlov/rubert-base-cased-conversational"
# Directory prefix of df_train.csv / df_test.csv. The trailing slash is required
# because file names are appended by plain string concatenation.
DATA_PATH = '/kaggle/input/scan-classification-challange/'
# Seed for the train/validation split; only referenced by the commented-out split.
RANDOM_STATE = 42

# Batch size shared by the DataLoaders.
BATCH_SIZE = 32
# NOTE(review): not referenced anywhere in the visible notebook; presumably
# intended for DataLoader(num_workers=...) - confirm or remove.
NUM_WORKERS = 2
# Token length every text is padded/truncated to by the tokenizer.
MAX_LEN = 100
# Number of passes over the training data.
NUM_EPOCHS = 10

## Загрузка и первичный анализ данных

In [None]:
# Load the training table and preview its first 20 rows.
train = pd.read_csv(DATA_PATH + 'df_train.csv')
train.head(n=20)

In [None]:
# Load the test table and preview its first rows.
test = pd.read_csv(DATA_PATH + 'df_test.csv')
test.head()

In [None]:
# Bar chart of class frequencies. value_counts() sorts in descending order,
# so the [2:] slice leaves the two most frequent classes out of the plot.
(
    train["class"]
    .value_counts()[2:]
    .plot(kind='bar', figsize=(12, 4), fontsize=10)
)
plt.xlabel("Class", fontsize=10)
plt.ylabel("Counts", fontsize=10);

## Разбиение на обучающую и валидационную выборки, кодирование меток с помощью LabelEncoder

In [None]:
# Fit the encoder on the class column, store the encoded ids in a new
# "label" column, then drop the original "class" column in place.
label_encoder = LabelEncoder().fit(train["class"])
train["label"] = label_encoder.transform(train["class"])
train.drop(columns=["class"], inplace=True)

In [None]:
# train, validation = train_test_split(train, random_state=RANDOM_STATE, test_size=0.2, stratify = train['label'])

## Подготовим текст для обучения сетей. Токенизация

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        text: Indexable sequence of raw texts; each item is passed through
            ``str`` before tokenization.
        target: Indexable sequence of class ids aligned with ``text``; each
            value is converted to a ``torch.long`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True)`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every example is padded/truncated to.
        num_labels: Number of distinct classes. Stored on the instance for
            callers; not used when building items.
    """

    def __init__(self, text, target, tokenizer, max_len, num_labels):
        self.text = text
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len
        # This argument used to be accepted and silently discarded; keep it
        # so the value is available as an attribute.
        self.num_labels = num_labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return it as a dict of tensors.

        Returns:
            A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
            all ``torch.long`` tensors.
        """
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(self.target[item], dtype=torch.long),
        }

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = Dataset(
    text=train.text.values, 
    target=train.label.values, 
    tokenizer=tokenizer, 
    max_len=MAX_LEN,
    num_labels=len(label_encoder.classes_)
)

# Shuffle the training examples every epoch so batches are not drawn in the
# fixed CSV order; the seeded generator keeps the shuffling reproducible.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)

# Validation split is currently disabled together with the commented-out
# train_test_split call.
# valid_dataset = Dataset(
#     text=validation.text.values, 
#     target=validation.label.values, 
#     tokenizer=tokenizer, 
#     max_len=MAX_LEN,
#     num_labels=len(label_encoder.classes_)
# )

# valid_data_loader = torch.utils.data.DataLoader(
#     valid_dataset, 
#     batch_size=BATCH_SIZE
# )

# The test set has no labels; integer zeros act as placeholders so each item
# still has the same structure as a training item.
test_dataset = Dataset(
    text=test.text.values, 
    target=np.zeros(test.shape[0], dtype=np.int64),
    tokenizer=tokenizer, 
    max_len=MAX_LEN,
    num_labels=len(label_encoder.classes_)
)

# Keep the test order fixed so predictions line up with the rows of `test`.
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE
)

# NOTE(review): this is a language-modelling collator and it is not passed to
# any DataLoader in this notebook. For sequence classification the matching
# collator would be DataCollatorWithPadding - confirm whether it is needed.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer);


In [None]:
# Take the first batch from the test loader and show the shape of each tensor.
batch = next(iter(test_data_loader))
{name: tensor.shape for name, tensor in batch.items()}

## Обучим модель на тренировочном наборе

In [None]:
# Load the pretrained checkpoint for sequence classification, with one output
# per class known to the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.classes_),
)

In [None]:
# `transformers.AdamW` is deprecated (and no longer shipped in recent
# releases), so use PyTorch's own implementation instead.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the former transformers
# optimizer so the update rule stays the same (torch defaults: 1e-8 and 0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from transformers import get_scheduler

# One optimizer step per training batch, repeated for every epoch.
num_training_steps = len(train_data_loader) * NUM_EPOCHS
# Linear learning-rate schedule over the whole run, with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

In [None]:
# Prefer the GPU when one is available, then move the model onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before the optimisation loop.
model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_data_loader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains "labels", so the model output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Switch the model to evaluation mode before predicting on the test set.
model.eval()
result = []
for batch in test_data_loader:
    # The test set only carries placeholder labels; leave them out so the
    # model does not compute a meaningless loss against them.
    batch = {k: v.to(device) for k, v in batch.items() if k != "labels"}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class id = index of the largest logit for each example.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    result.append(predictions)
    


In [None]:
# Join the per-batch prediction tensors into one tensor along the first axis.
predictions = torch.cat(result, dim=0)

In [None]:
# Map the predicted ids back to the original class values and attach them
# to the test frame.
test["predictions"] = label_encoder.inverse_transform(
    predictions.cpu().numpy()
)

In [None]:
# Display the test frame, which now includes the "predictions" column.
test

In [None]:
# Build the submission table (a running 0-based id plus the predicted class),
# write it to disk without the index, and preview the first rows.
submission = pd.DataFrame(
    {
        'id': range(len(test)),
        'class': test['predictions'].values,
    },
    columns=['id', 'class'],
)
submission.to_csv('submission.csv', index=False)
submission.head()