# Построение ансамбля

Ноутбук использует модели, натренированные ранее (в других ноутбуках, подключённых к этому).
Суммирует вероятности, предсказанные моделями, и предсказывает класс с наибольшей вероятностью.

У меня было 2 модели в ансамбле.
Но предсказание оказалось точно таким же, какое было у моей лучшей модели.

Использовал код с классом, который написал во время тренировки моделей. Получилось очень удобно вызывать модели:
```
result = []
for model_path in MODELS_PATH:
    model = ScanClassifier(
        model_name=MODEL_NAME, 
        label_encoder=label_encoder,
        max_len=MAX_LEN
    )
    model.load(model_path)
    result += [model.predict_proba(test)]
 ```

In [None]:
!nvidia-smi -L

In [None]:
%%capture dev_null
!pip install transformers
!pip install GPUtil

In [None]:
import os
import functools

# For data manipulation
import numpy as np
import pandas as pd
import warnings
import pickle

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
from pylab import rcParams

import torch
import torch.nn as nn

# For Transformer Models
from transformers import (
    AdamW,
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    DataCollatorForLanguageModeling, 
    get_scheduler,
    Trainer, 
    TrainingArguments
)

from tqdm.auto import tqdm
import gc
from GPUtil import showUtilization as gpu_usage

In [None]:
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Default size applied to matplotlib figures created after this point.
rcParams['figure.figsize'] = 10, 5

%config InlineBackend.figure_format = 'svg' 
%matplotlib inline

In [None]:
# Hugging Face checkpoint that supplies the tokenizer and base architecture.
MODEL_NAME = "sberbank-ai/sbert_large_nlu_ru"

# Input locations: competition data and the pickled label encoder.
DATA_PATH = "/kaggle/input/scan-classification-challange/"
ENCODER_PATH = "../input/scan-training-models-for-ansamble/label_encoder.pkl"

# Fine-tuned weight files, one entry per ensemble member.
MODELS_PATH = [
    "../input/try-another-model-transformer-sber/sb_model.h5",
    "../input/scan-training-models-for-ansamble/trained-models/model_overall.h5",
]

# Seed, data-loading and tokenization settings.
RANDOM_STATE = 42
BATCH_SIZE = 6
NUM_WORKERS = 2
MAX_LEN = 200

# NUM_EPOCHS is the default epoch count of ScanClassifier.train;
# NUM_FOLDS is not referenced by the code visible in this notebook.
NUM_EPOCHS = 3
NUM_FOLDS = 3

DEBUG = True

# Seed PyTorch's default random generator.
torch.manual_seed(RANDOM_STATE)
if DEBUG:
    # Ask CUDA to launch kernels synchronously so errors surface at the
    # failing call (slower, debugging aid only).
    os.environ.update(CUDA_LAUNCH_BLOCKING="1")


In [None]:
# Restore the label encoder object stored at ENCODER_PATH (presumably fitted
# in the training notebook — verify against that notebook).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load encoder files from a trusted source.
with open(ENCODER_PATH, "rb") as file_pickle:
    label_encoder = pickle.load(file_pickle)

In [None]:
class ScanClassifier:
    """Text classifier built on a pretrained Hugging Face transformer.

    Wraps a tokenizer and an ``AutoModelForSequenceClassification`` model
    whose number of output labels equals ``len(label_encoder.classes_)``.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        label_encoder: Fitted encoder exposing ``classes_``, ``transform``
            and ``inverse_transform``.
        max_len: Token length every text is padded/truncated to.
    """

    def __init__(self, model_name, label_encoder, max_len):
        self._model_name = model_name
        self._label_encoder = label_encoder
        self._tokenizer = AutoTokenizer.from_pretrained(self._model_name)
        self._model = AutoModelForSequenceClassification.from_pretrained(
            self._model_name,
            num_labels=len(self._label_encoder.classes_),
        )
        # NOTE(review): this collator is never read inside the class; it is
        # kept only so the attribute continues to exist.
        self._data_collator = DataCollatorForLanguageModeling(tokenizer=self._tokenizer)
        self._max_len = max_len
        self._is_trained = False

    @staticmethod
    def _device():
        """Return the CUDA device when one is available, otherwise the CPU."""
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def save(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self._model.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` previously written by :meth:`save`.

        Weights are mapped to the CPU first so that a file saved on a GPU
        machine also loads on a CPU-only host; ``train`` and
        ``predict_proba`` move the model to the active device themselves.
        """
        state_dict = torch.load(path, map_location="cpu")
        self._model.load_state_dict(state_dict)

    def train(self, train_dataset: pd.DataFrame, valid_dataset: pd.DataFrame, num_epochs=NUM_EPOCHS, batch_size=BATCH_SIZE):
        """Fine-tune the model and report train/validation loss per epoch.

        Args:
            train_dataset: Frame with ``text`` and ``class`` columns.
            valid_dataset: Frame with ``text`` and ``class`` columns.
            num_epochs: Number of passes over ``train_dataset``.
            batch_size: Batch size for both data loaders.
        """
        train_data_loader = self._prepare_data_loader(train_dataset, batch_size)
        valid_data_loader = self._prepare_data_loader(valid_dataset, batch_size)

        # Honour the ``num_epochs`` argument (not the module-level constant)
        # so the scheduler length matches the number of steps actually run.
        num_training_steps = num_epochs * len(train_data_loader)

        optimizer = AdamW(self._model.parameters(), lr=5e-5)

        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )

        device = self._device()
        self._model.to(device)

        progress_bar = tqdm(range(num_training_steps))

        # Tracked across epochs; resetting it per epoch would make every
        # epoch look like an improvement.
        min_valid_loss = np.inf

        for epoch in range(num_epochs):
            self._model.train()
            train_loss = 0.0
            for batch in train_data_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = self._model(**batch)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                train_loss += loss.item()

            valid_loss = 0.0
            self._model.eval()
            # No gradients are needed for validation; skipping the autograd
            # graph saves memory.
            with torch.no_grad():
                for batch in valid_data_loader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = self._model(**batch)
                    valid_loss += outputs.loss.item()

            print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss / len(train_data_loader)} \t\t Validation Loss: {valid_loss / len(valid_data_loader)}')
            if min_valid_loss > valid_loss:
                # NOTE(review): the message mentions saving, but no checkpoint
                # is written here; call ``save`` explicitly to persist weights.
                print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
                min_valid_loss = valid_loss

        self._is_trained = True

    def predict_proba(self, dataset, batch_size=BATCH_SIZE):
        """Return class probabilities for every row of ``dataset``.

        Args:
            dataset: Frame with a ``text`` column.
            batch_size: Batch size used for inference.

        Returns:
            NumPy array of shape ``(len(dataset), n_classes)`` holding the
            softmax of the model logits; rows follow the order of ``dataset``.
        """
        self._model.eval()

        data_loader = self._prepare_data_loader(dataset, batch_size, for_train=False)

        device = self._device()
        self._model.to(device)

        results = []
        with torch.no_grad():
            for batch in data_loader:
                # The loader yields placeholder labels for inference; leaving
                # them out avoids computing a meaningless loss.
                inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
                logits = self._model(**inputs).logits
                # Move each batch to the CPU right away so probabilities do
                # not pile up in GPU memory.
                results.append(torch.nn.functional.softmax(logits, dim=-1).cpu())

        if not results:
            # torch.cat rejects an empty list; return an empty matrix instead.
            return np.empty((0, len(self._label_encoder.classes_)), dtype=np.float32)

        return torch.cat(results, 0).numpy()

    def predict(self, dataset, batch_size=BATCH_SIZE):
        """Return the most probable class (original label values) per row."""
        pred_proba = self.predict_proba(dataset, batch_size=batch_size)
        pred_labels = np.argmax(pred_proba, 1)
        return self._label_encoder.inverse_transform(pred_labels)

    def _prepare_data_loader(self, dataset, batch_size, for_train=True):
        """Wrap ``dataset`` in a ``DataLoader``.

        With ``for_train=True`` the ``class`` column is encoded into integer
        labels and batches are shuffled; otherwise zero placeholder labels
        are used and row order is preserved.
        """
        if for_train:
            labels = self._label_encoder.transform(dataset["class"])
        else:
            labels = np.zeros(dataset.shape[0])

        torch_dataset = Dataset(
            text=dataset.text.values,
            target=labels,
            tokenizer=self._tokenizer,
            max_len=self._max_len,
            num_labels=len(self._label_encoder.classes_),
        )

        data_loader = torch.utils.data.DataLoader(
            torch_dataset,
            batch_size=batch_size,
            shuffle=for_train,
            num_workers=NUM_WORKERS,
        )

        return data_loader

        
class Dataset:
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        text: Indexable collection of raw texts.
        target: Indexable collection of labels aligned with ``text``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding=..., truncation=...)``
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_len: Length passed to the tokenizer as ``max_length``.
        num_labels: Accepted for call compatibility; not stored.
    """

    def __init__(self, text, target, tokenizer, max_len, num_labels):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text = text
        self.target = target

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the ``item``-th sample as a dict of ``torch.long`` tensors."""
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        features = {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "labels": self.target[item],
        }
        return {
            name: torch.tensor(value, dtype=torch.long)
            for name, value in features.items()
        }
        

In [None]:
# Read the test split from the data directory.
test = pd.read_csv(f"{DATA_PATH}df_test.csv")

In [None]:
# test = test.iloc[:100]

In [None]:
# Run every saved checkpoint over the test set and keep one probability
# array per model, in the same order as MODELS_PATH.
result = []
for model_path in MODELS_PATH:
    model = ScanClassifier(MODEL_NAME, label_encoder, MAX_LEN)
    model.load(model_path)
    result.append(model.predict_proba(test))

In [None]:
# Display the raw per-model probability arrays for a quick visual check.
result

In [None]:
# Element-wise sum of the per-model probability arrays (unnormalised
# ensemble score for every sample/class pair).
overall_proba = functools.reduce(lambda total, proba: np.add(total, proba), result)

In [None]:
# Pick, for every sample, the class with the highest probability summed over
# all ensemble members. Using result[0] here would silently reduce the
# ensemble to the first model alone.
pred_labels = np.argmax(overall_proba, axis=1)
# Map integer label ids back to the original class names.
pred_class = label_encoder.inverse_transform(pred_labels)

In [None]:
# test["predictions"] = pred_class

In [None]:
# test

In [None]:
# Build the submission table: sequential ids 0..n-1 paired with the predicted
# class names, then write it next to the notebook and preview the first rows.
submission = pd.DataFrame(
    {"id": range(len(pred_class)), "class": pred_class},
    columns=["id", "class"],
)
submission.to_csv("submission.csv", index=False)
submission.head()