# Ноутбук для обучения моделей transformer

Идея обучить три модели на странифицированных фолдах трейна, чтобы сделать потом из них ансамбль

## Подготовка ноутбука

In [1]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-bd73484b-7a93-5bb0-5769-9756e5d8fb5d)


In [2]:
%%capture dev_null
!pip install transformers
!pip install GPUtil

In [3]:
import os

# For data manipulation
import numpy as np
import pandas as pd
import warnings
import pickle

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
from pylab import rcParams

import torch
import torch.nn as nn

# For Transformer Models
from transformers import (
    AdamW,
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    DataCollatorForLanguageModeling, 
    get_scheduler,
    Trainer, 
    TrainingArguments
)

from tqdm.auto import tqdm
import gc
from GPUtil import showUtilization as gpu_usage

In [4]:
warnings.filterwarnings('ignore')

rcParams['figure.figsize'] = 10, 5

%config InlineBackend.figure_format = 'svg' 
%matplotlib inline


## Настройки ноутбука

Здесь описываю настройки модели и параметры обучения

In [5]:
MODEL_NAME = "sberbank-ai/sbert_large_nlu_ru"

DATA_PATH = "/kaggle/input/scan-classification-challange/"
MODEL_PATH = "trained-models/"
ENCODER_PATH = "label_encoder.pkl"

RANDOM_STATE = 13
BATCH_SIZE = 32
NUM_WORKERS = 2
MAX_LEN = 100
NUM_EPOCHS = 5
NUM_FOLDS = 1

DEBUG = False

torch.manual_seed(RANDOM_STATE);
if DEBUG:
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)

## Подготваливаем обучающие наборы

In [6]:
train = pd.read_csv(DATA_PATH+'df_train.csv',)
test = pd.read_csv(DATA_PATH+'df_test.csv',)


In [7]:
if os.path.exists(ENCODER_PATH):
    with open(ENCODER_PATH, "rb") as file_pickle:
        label_encoder = pickle.load(file_pickle)
else:
    label_encoder = LabelEncoder().fit(train["class"])
    with open(ENCODER_PATH, "wb") as file_pickle:
        pickle.dump(label_encoder, file_pickle)

In [8]:
if DEBUG:
    valid = train.iloc[400:500]
    train = train.iloc[:5000]
    test = test.iloc[:100]
    
train.shape

(60000, 2)

## Класс помощник

In [9]:
class ScanClassifier:
    def __init__(self, model_name, label_encoder, max_len):
        self._model_name = model_name
        self._label_encoder = label_encoder
        self._tokenizer = AutoTokenizer.from_pretrained(self._model_name)
        self._model = AutoModelForSequenceClassification.from_pretrained(
            self._model_name,
            num_labels=len(self._label_encoder.classes_)
        )
        self._data_collator = DataCollatorForLanguageModeling(tokenizer=self._tokenizer);
        self._max_len = max_len
        self._is_trained = False
        
    def save(self, path):
        print(f"saving model to {path}")
        torch.save(self._model.state_dict(), path)
    
    def load(self, path):
        print(f"loading model from {path}")
        self._model.load_state_dict(torch.load(path))
    
    def train(self, train_dataset : pd.DataFrame, valid_dataset : pd.DataFrame, num_epochs=NUM_EPOCHS, batch_size=BATCH_SIZE):
        train_data_loader = self._prepare_data_loader(train_dataset, batch_size)
        valid_data_loader = self._prepare_data_loader(valid_dataset, batch_size)
        
        num_training_steps = NUM_EPOCHS * len(train_data_loader)
        
        optimizer = AdamW(self._model.parameters(), lr=5e-5)
        
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )
        
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self._model.to(device)
        
        progress_bar = tqdm(range(num_training_steps))

        for epoch in range(NUM_EPOCHS):
            self._model.train()
            train_loss = 0.0
            min_valid_loss = np.inf
            for batch in train_data_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = self._model(**batch)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                train_loss += loss.detach().item()
            
            valid_loss = 0.0
#             self._model.eval()
#             for batch in valid_data_loader:
#                 batch = {k: v.to(device) for k, v in batch.items()}
#                 outputs = self._model(**batch)
#                 valid_loss += outputs.loss.detach().item()

            print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss / len(train_data_loader)} \t\t Validation Loss: {valid_loss / len(valid_data_loader)}')
#             if min_valid_loss > valid_loss:
#                 print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
#                 min_valid_loss = valid_loss
                # Saving State Dict
                # torch.save(model.state_dict(), 'saved_model.pth')
    
    
    def predict_proba(self, dataset, batch_size=BATCH_SIZE):
        self._model.eval()
        
        data_loader = self._prepare_data_loader(dataset, batch_size, for_train=False)
        
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self._model.to(device)
        
        results = []
        for i,batch in enumerate(data_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = self._model(**batch)

            predictions = outputs.logits
            results.append(predictions)
        
        predictions = torch.cat(results,0).cpu().numpy()
        
        return(predictions)
    
    def predict(self, dataset, batch_size=BATCH_SIZE):
        pred_proba = self.predict_proba(dataset, batch_size=batch_size)
        pred_labels = np.argmax(pred_proba,1)
        pred_class = self._label_encoder.inverse_transform(pred_labels)
        return(pred_class)
        
    def _prepare_data_loader(self, dataset, batch_size, for_train=True):
        if for_train:
            labels = self._label_encoder.transform(dataset["class"])
        else:
            labels = np.zeros(dataset.shape[0])
        
        torch_dataset = Dataset(
            text=dataset.text.values, 
            target=labels, 
            tokenizer=self._tokenizer, 
            max_len=self._max_len,
            num_labels=len(self._label_encoder.classes_)
        )

        data_loader = torch.utils.data.DataLoader(
            torch_dataset, 
            batch_size=batch_size,
            shuffle=for_train,
            num_workers=NUM_WORKERS
        )
        
        return data_loader

        
class Dataset:
    def __init__(self, text, target, tokenizer, max_len, num_labels):
        self.text = text
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len


    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        inputs = self.tokenizer(
            text, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        target = self.target[item]

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(target, dtype=torch.long)
        }
        


In [10]:
gpu_usage()    

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


In [11]:
# del model
# gc.collect()
# torch.cuda.empty_cache()

In [12]:
gc.collect()
torch.cuda.empty_cache()

In [13]:
gpu_usage()   

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


In [14]:
# skf = StratifiedKFold(n_splits=NUM_FOLDS)

In [15]:
# for i, (train_index, valid_index) in enumerate(skf.split(train, train["class"])):
#     model = ScanClassifier(
#         model_name=MODEL_NAME, 
#         label_encoder=label_encoder,
#         max_len=MAX_LEN
#     )
    
#     model.train(
#         train_dataset=train.iloc[train_index],
#         valid_dataset=train.iloc[valid_index],
#     )
    
#     test["pred"] = model.predict(test)
#     display(test.head())
    
# #     model_path = f"{MODEL_PATH}model_{i}_fold.h5"
#     model.save(model_path)
    
    


In [16]:
model = ScanClassifier(
        model_name=MODEL_NAME, 
        label_encoder=label_encoder,
        max_len=MAX_LEN
    )
model.train(train,train.iloc[0:1])

model_path = f"{MODEL_PATH}model_overall.h5"
model.save(model_path)

Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/655 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/9375 [00:00<?, ?it/s]

Epoch 1 		 Training Loss: 0.3989200198630492 		 Validation Loss: 0.0
Epoch 2 		 Training Loss: 0.1913813027198116 		 Validation Loss: 0.0
Epoch 3 		 Training Loss: 0.1305583148173988 		 Validation Loss: 0.0
Epoch 4 		 Training Loss: 0.09981405398802211 		 Validation Loss: 0.0
Epoch 5 		 Training Loss: 0.06161518390754548 		 Validation Loss: 0.0
saving model to trained-models/model_overall.h5


In [17]:
# test["pred"] = model.predict(test)

In [18]:
# test.head()

In [19]:
# gpu_usage()   

In [20]:
# # del model
# gc.collect()
# torch.cuda.empty_cache()

In [21]:
# gpu_usage()   

In [22]:
# model = ScanClassifier(
#     model_name=MODEL_NAME, 
#     label_encoder=label_encoder,
#     max_len=100
# )

In [23]:
# model.train(train,valid)

In [24]:
# gpu_usage()   

In [25]:
test = pd.read_csv(DATA_PATH+'df_test.csv',)

In [26]:
print("Predicting for last model")

pred = model.predict(test)

Predicting for last model


In [27]:
submission = pd.DataFrame({'id':range(pred.shape[0]),
                           'class':pred},
                          columns=['id', 'class'])
submission.to_csv('submission.csv', index=False)
submission.head(20)

Unnamed: 0,id,class
0,0,O
1,1,Утечка нефти
2,2,O
3,3,O
4,4,O
5,5,Утечка нефти
6,6,O
7,7,Уголовное обвинение
8,8,O
9,9,Экстремизм
