# Overview

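This notebook runs inference with three groups of pretrained RoBERTa regressors (Models 1–3 below) on the CommonLit Readability Prize data and combines their predictions with a linear-regression stacker fitted on the models' training-set predictions.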

In [None]:
!pip install ../input/lama-whl/efficientnet_pytorch-0.7.0/dist/efficientnet_pytorch-0.7.0.tar ../input/lama-whl/log_calls-0.3.2/log_calls-0.3.2/ ../input/lama-whl/sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_htmlhelp-1.0.3-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_applehelp-1.0.2-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_serializinghtml-1.1.4-py2.py3-none-any.whl ../input/lama-whl/importlib_metadata-1.7.0-py2.py3-none-any.whl ../input/lama-whl/poetry_core-1.0.3-py2.py3-none-any.whl ../input/lama-whl/imagesize-1.2.0-py2.py3-none-any.whl ../input/lama-whl/docutils-0.16-py2.py3-none-any.whl ../input/lama-whl/alabaster-0.7.12-py2.py3-none-any.whl ../input/lama-whl/snowballstemmer-2.1.0-py2.py3-none-any.whl ../input/lama-whl/Sphinx-3.5.4-py3-none-any.whl ../input/lama-whl/sphinx_autodoc_typehints-1.11.1-py3-none-any.whl ../input/lama-whl/nbsphinx-0.8.0-py3-none-any.whl ../input/lama-whl/nbsphinx_link-1.3.0-py2.py3-none-any.whl ../input/lama-whl/cssselect-1.1.0-py2.py3-none-any.whl ../input/lama-whl/pyquery-1.4.3-py3-none-any.whl ../input/lama-whl/chuanconggao-html2json-0.2.4.1-0-g99d7fbb/chuanconggao-html2json-99d7fbb/ ../input/lama-whl/json2html-1.3.0/json2html-1.3.0 ../input/lama-whl/lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl ../input/lama-whl/AutoWoE-1.2.1-py3-none-any.whl ../input/lama-whl/LightAutoML-0.2.14-py3-none-any.whl > /dev/null

In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig

from sklearn.model_selection import KFold
from sklearn.svm import SVR

import gc
gc.enable()

In [None]:
import transformers
from transformers import BertTokenizer

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold

import lightgbm as lgb

from fastprogress.fastprogress import  progress_bar
 
from sklearn.metrics import mean_squared_error
from lightautoml.automl.presets.text_presets import TabularAutoML
from lightautoml.tasks import Task

In [None]:
# Inference settings for Model 1.
BATCH_SIZE = 32  # batch size of the Model 1 DataLoaders
MAX_LEN = 248  # LitDataset pads/truncates every excerpt to this many tokens
# NOTE(review): EVAL_SCHEDULE is not referenced anywhere else in the visible
# notebook — presumably a leftover from a training notebook; confirm before removing.
EVAL_SCHEDULE = [(0.5, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1, 1)]
ROBERTA_PATH = "../input/roberta-transformers-pytorch/roberta-base"
TOKENIZER_PATH = "../input/roberta-transformers-pytorch/roberta-base"
# Plain string here; the Model 3 configuration cell later rebinds DEVICE to a torch.device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Echo the selected device as the cell output.
DEVICE

In [None]:
# Competition inputs: the train frame is re-scored by every model to build stacking
# features, the test frame is what gets predicted, and the sample submission is the
# frame whose `target` column is overwritten at the end of the notebook.
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Module-level tokenizer: LitDataset reads this global when it encodes excerpts.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [None]:
class LitDataset(Dataset):
    """Excerpts from a DataFrame, tokenized once at construction time.

    Reads the module-level ``tokenizer`` and ``MAX_LEN``. Each item is a tuple
    ``(input_ids, attention_mask)`` when ``inference_only`` is true, otherwise
    ``(input_ids, attention_mask, target)``.

    Args:
        df: frame with an ``excerpt`` column (and ``target`` unless
            ``inference_only``).
        inference_only: skip reading targets and omit them from the items.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Every excerpt is padded/truncated to exactly MAX_LEN tokens.
        encode_options = dict(
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )
        self.encoded = tokenizer.batch_encode_plus(self.text, **encode_options)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        features = tuple(
            torch.tensor(self.encoded[field][index])
            for field in ('input_ids', 'attention_mask')
        )
        if self.inference_only:
            return features
        return features + (self.target[index],)

# Model 1
Inspired by: https://www.kaggle.com/maunish/clrp-roberta-svm

In [None]:
class LitModel(nn.Module):
    """RoBERTa encoder with learned attention pooling and a linear regression head.

    The encoder is loaded from ``ROBERTA_PATH`` with hidden states exposed. The
    submodule names (``roberta``, ``attention``, ``regressor``) define the
    state-dict keys, so they must match the checkpoints being loaded.
    """

    def __init__(self):
        super().__init__()

        roberta_config = AutoConfig.from_pretrained(ROBERTA_PATH)
        roberta_config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.25,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=roberta_config)

        # Scores every token position; the softmax over dim=1 (the sequence
        # axis) turns the scores into weights that sum to 1 per example.
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        """Return one regression score per example, shape (batch, 1)."""
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by one entry per
        # encoder layer; [-1] is the final layer, shape (batch, seq, hidden).
        token_states = encoder_out.hidden_states[-1]

        # (batch, seq, 1) weights, broadcast against the token states and
        # summed over the sequence axis -> (batch, hidden) context vector.
        token_weights = self.attention(token_states)
        context_vector = (token_weights * token_states).sum(dim=1)

        return self.regressor(context_vector)

In [None]:
def predict(model, data_loader):
    """Run |model| over |data_loader| and return its predictions as an np.array.

    The loader must yield ``(input_ids, attention_mask)`` batches in dataset
    order; the result has one float per row of ``data_loader.dataset``.
    The model is switched to eval mode and gradients are disabled.
    """
    model.eval()

    result = np.zeros(len(data_loader.dataset))
    start = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            # Write this batch into its slice of the preallocated output.
            stop = start + batch_pred.shape[0]
            result[start:stop] = batch_pred.flatten().to("cpu")
            start = stop

    return result

# Inference

In [None]:
### train ###
# Score the training excerpts with each of the NUM_MODELS Model 1 checkpoints.
# The per-checkpoint rows are averaged in the next cell and used as a stacking feature.

NUM_MODELS = 5

# One row per checkpoint, one column per training excerpt.
train_all_predictions = np.zeros((NUM_MODELS, len(train_df)))

# inference_only=True: the dataset yields inputs only, so the targets are not read here.
train_dataset = LitDataset(train_df, inference_only=True)

# shuffle=False / drop_last=False keep predictions aligned row-for-row with train_df.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for model_index in tqdm(range(NUM_MODELS)):
    # Checkpoint files are numbered from 1.
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    train_all_predictions[model_index] = predict(model, train_loader)

    # Drop the model before the next checkpoint is instantiated.
    del model
    gc.collect()

In [None]:
# Average over the checkpoint axis: one Model 1 prediction per training excerpt.
train_model1_predictions = train_all_predictions.mean(axis=0)

In [None]:
### test ###
# Same procedure as the train cell above, applied to the test excerpts.

NUM_MODELS = 5

# One row per checkpoint, one column per test excerpt.
# Note: the name `all_predictions` is rebound to a list in the Model 3 inference cell.
all_predictions = np.zeros((NUM_MODELS, len(test_df)))

test_dataset = LitDataset(test_df, inference_only=True)

# shuffle=False / drop_last=False keep predictions aligned row-for-row with test_df.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for model_index in tqdm(range(NUM_MODELS)):
    # Checkpoint files are numbered from 1.
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[model_index] = predict(model, test_loader)

    # Drop the model before the next checkpoint is instantiated.
    del model
    gc.collect()

In [None]:
# Average over the checkpoint axis: one Model 1 prediction per test excerpt.
model1_predictions = all_predictions.mean(axis=0)

# Model 2
Inspired by: [https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3)

In [None]:
# Aliases for the frames loaded above; config() reads `test` and `train` as
# module-level globals when it builds the Model 2 data loader.
test = test_df
train = train_df

from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer
from transformers import RobertaModel
from IPython.display import clear_output

# Dataset

In [None]:
def convert_examples_to_features(data, tokenizer, max_len, is_test=False):
    """Tokenize one excerpt and right-pad every feature list to ``max_len``.

    Newlines are removed from ``data`` (not replaced by spaces) before it is
    passed to ``tokenizer.encode_plus`` with truncation enabled. The returned
    dict has the keys ``input_ids``, ``token_type_ids`` and ``attention_mask``,
    each padded with zeros. ``is_test`` is accepted but not used.
    """
    text = data.replace('\n', '')
    encoding = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )
    # The same amount of zero padding is appended to each feature list.
    pad = [0] * (max_len - len(encoding['input_ids']))
    return {
        field: encoding[field] + pad
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

class DatasetRetriever(Dataset):
    """Dataset that tokenizes excerpts lazily, one item at a time.

    Args:
        data: DataFrame with an ``excerpt`` column, plus a ``target`` column
            when ``is_test`` is false.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        max_len: length every feature list is padded/truncated to.
        is_test: when true, items carry no ``label`` and ``target`` is not read.

    Each item is a dict of ``torch.long`` tensors under ``input_ids``,
    ``token_type_ids`` and ``attention_mask``; labelled items additionally
    carry ``label`` as a ``torch.double`` scalar.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.excerpts = self.data.excerpt.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len
        # Targets were previously never stored, so indexing a labelled dataset
        # raised AttributeError on self.targets. Only read them when needed so
        # unlabelled (test) frames keep working.
        if not self.is_test:
            self.targets = self.data.target.values.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer,
            self.max_len, self.is_test
        )
        sample = {
            field: torch.tensor(features[field], dtype=torch.long)
            for field in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.double)
        return sample

In [None]:
class CommonLitModel(nn.Module):
    """RoBERTa regressor: encoder output [1] -> LayerNorm -> dropout(s) -> Linear.

    Args:
        model_name: name or path given to ``RobertaModel.from_pretrained``.
        config: object providing ``hidden_size`` and ``initializer_range``.
        multisample_dropout: use five ``Dropout(0.5)`` branches whose regressor
            outputs are averaged, instead of a single ``Dropout(0.3)``.
        output_hidden_states: forwarded to ``RobertaModel.from_pretrained``.

    ``forward`` returns ``(logits, *encoder_outputs[1:])`` and, when ``labels``
    is given, prepends the RMSE loss.
    """

    def __init__(
        self,
        model_name,
        config,
        multisample_dropout=False,
        output_hidden_states=False
    ):
        super().__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name,
            output_hidden_states=output_hidden_states
        )

        self.layer_norm = nn.LayerNorm(config.hidden_size)

        # (number of dropout branches, drop probability)
        branch_count, drop_prob = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts = nn.ModuleList(
            [nn.Dropout(drop_prob) for _ in range(branch_count)]
        )

        self.regressor = nn.Linear(config.hidden_size, 1)
        for head_module in (self.layer_norm, self.regressor):
            self._init_weights(head_module)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                # Embedding: keep the padding row at zero.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the encoder output is what gets normalised and regressed on.
        normed = self.layer_norm(outputs[1])

        # Multi-sample dropout: sum the regressor output of every dropout
        # branch, then divide by the number of branches.
        branches = iter(self.dropouts)
        logits = self.regressor(next(branches)(normed))
        for extra_dropout in branches:
            logits = logits + self.regressor(extra_dropout(normed))
        logits = logits / len(self.dropouts)

        if labels is None:
            return (logits,) + outputs[1:]

        # RMSE between the flattened logits and labels.
        logits = logits.view(-1).to(labels.dtype)
        loss = torch.sqrt(torch.nn.MSELoss()(logits, labels.view(-1)))
        return (loss, logits) + outputs[1:]

In [None]:
def make_model(model_name, num_labels=1):
    """Build a CommonLitModel and its RobertaTokenizer from ``model_name``.

    The RobertaConfig loaded from ``model_name`` is updated with ``num_labels``
    and handed to the model. Returns ``(model, tokenizer)``.
    """
    roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)
    roberta_config = RobertaConfig.from_pretrained(model_name)
    roberta_config.update(dict(num_labels=num_labels))
    return CommonLitModel(model_name, config=roberta_config), roberta_tokenizer

def make_loader(
    data, 
    tokenizer, 
    max_len,
    batch_size,
):
    """Build a sequential, unlabelled DataLoader over ``data``.

    Args:
        data: frame passed to ``DatasetRetriever`` (built with ``is_test=True``).
        tokenizer: tokenizer passed to ``DatasetRetriever``.
        max_len: padded/truncated sequence length.
        batch_size: nominal batch size; the loader uses half of it, but never
            less than 1.

    Returns:
        A DataLoader that yields batches in dataset order and keeps the final
        partial batch.
    """
    test_dataset = DatasetRetriever(data, tokenizer, max_len, is_test=True)

    # batch_size // 2 is 0 for batch_size == 1, which DataLoader rejects;
    # clamp so small batch sizes still produce a usable loader.
    effective_batch_size = max(1, batch_size // 2)

    return DataLoader(
        test_dataset,
        batch_size=effective_batch_size,
        sampler=SequentialSampler(test_dataset),
        pin_memory=False,
        drop_last=False,
        num_workers=0
    )

In [None]:
class Evaluator:
    """Runs a model over a data loader and collects its first output as floats.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments; element 0 of its return value
            holds the predictions.
        scalar: when not None, the forward pass runs under
            ``torch.cuda.amp.autocast()``.
    """

    def __init__(self, model, scalar=None):
        self.model = model
        self.scalar = scalar

    def _model_device(self):
        """Return the device of the model's first parameter (CUDA if it has none)."""
        first_param = next(iter(self.model.parameters()), None)
        if first_param is None:
            return torch.device('cuda')
        return first_param.device

    def evaluate(self, data_loader, tokenizer):
        """Return a flat list with one float prediction per example.

        ``data_loader`` must yield dicts containing ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` tensors. ``tokenizer`` is
        accepted for interface compatibility and is not used.
        """
        preds = []
        self.model.eval()
        # Inputs follow the model instead of being forced onto CUDA.
        device = self._model_device()
        with torch.no_grad():
            for batch_data in data_loader:
                model_inputs = {
                    field: batch_data[field].to(device)
                    for field in ('input_ids', 'attention_mask', 'token_type_ids')
                }

                if self.scalar is not None:
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**model_inputs)
                else:
                    outputs = self.model(**model_inputs)

                # reshape(-1) always gives a 1-D array, so a final batch with a
                # single row still yields a list. squeeze() produced a bare
                # float there, which made `preds += ...` raise TypeError.
                batch_preds = outputs[0].detach().cpu().numpy().reshape(-1)
                preds.extend(batch_preds.tolist())
        return preds

def config(fold, model_name, load_model_path, test_flag = True):
    """Prepare everything needed to run one Model 2 fold on the GPU.

    Seeds torch, builds the model and tokenizer, loads
    ``{load_model_path}/model{fold}.bin`` into the model, builds a loader over
    the module-level ``test`` frame (or ``train`` when ``test_flag`` is false)
    and moves the model to CUDA.

    Returns:
        ``(model, tokenizer, test_loader, scaler)`` where ``scaler`` is None.

    Raises:
        ValueError: if no CUDA device is available.
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(2021)

    max_len, batch_size = 250, 8

    model, tokenizer = make_model(model_name=model_name, num_labels=1)
    state_dict = torch.load(f'{load_model_path}/model{fold}.bin')
    model.load_state_dict(state_dict)

    frame = test if test_flag else train
    test_loader = make_loader(
        frame, tokenizer, max_len=max_len,
        batch_size=batch_size
    )

    if torch.cuda.device_count() < 1:
        raise ValueError('CPU training is not supported')
    print('Model pushed to {} GPU(s), type {}.'.format(
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0))
    )
    model = model.cuda()

    # Mixed precision is disabled: no GradScaler is created.
    scaler = None
    return (model, tokenizer, test_loader, scaler)

# Inference

In [None]:
import time

def run(fold=0, model_name=None, load_model_path=None, test_flag = True):
    """Load one Model 2 fold checkpoint and return its predictions.

    Args:
        fold: fold index forwarded to ``config`` (selects the checkpoint file).
        model_name: pretrained model name/path forwarded to ``config``.
        load_model_path: checkpoint directory forwarded to ``config``.
        test_flag: forwarded to ``config``; selects the test or train frame.

    Returns:
        The list of predictions produced by ``Evaluator.evaluate``.
    """
    model, tokenizer, \
        test_loader, scaler = config(fold, model_name, load_model_path, test_flag)

    evaluator = Evaluator(model, scaler)
    preds = evaluator.evaluate(test_loader, tokenizer)

    # The evaluator holds a reference to the model, so it has to be dropped as
    # well; otherwise the model is still alive when gc.collect() and
    # empty_cache() run and nothing is actually released.
    del evaluator, model, tokenizer, test_loader, scaler
    gc.collect()
    torch.cuda.empty_cache()

    return preds

In [None]:
# Test-set predictions for the three Model 2 checkpoint families; each frame
# gets one column per fold.
pred_df1 = pd.DataFrame()
pred_df2 = pd.DataFrame()
pred_df3 = pd.DataFrame()

# run() defaults to test_flag=True, so these calls score the test frame.
# `fold%5` equals `fold` here because the loop only covers range(5).
for fold in tqdm(range(5)):
    pred_df1[f'fold{fold}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-base/', '../input/commonlit-roberta-base-i/')
    pred_df2[f'fold{fold+5}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-large', '../input/roberta-large-itptfit/')
    pred_df3[f'fold{fold+10}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-large', '../input/commonlit-roberta-large-ii/')

# From here on the pred_df* names hold plain arrays of shape (rows, folds).
pred_df1 = np.array(pred_df1)
pred_df2 = np.array(pred_df2)
pred_df3 = np.array(pred_df3)

# Average each family over its folds, then blend with fixed weights
# 0.5 (pred_df2), 0.3 (pred_df1) and 0.2 (pred_df3).
model2_predictions = (pred_df2.mean(axis=1) * 0.5) + (pred_df1.mean(axis=1) * 0.3) + (pred_df3.mean(axis=1) * 0.2)

In [None]:
#### train ###
# Same three checkpoint families as the test cell, but scored on the training
# frame (test_flag=False) to produce a stacking feature.

train_pred_df1 = pd.DataFrame()
train_pred_df2 = pd.DataFrame()
train_pred_df3 = pd.DataFrame()

# `fold%5` equals `fold` here because the loop only covers range(5).
for fold in tqdm(range(5)):
    train_pred_df1[f'fold{fold}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-base/', '../input/commonlit-roberta-base-i/', test_flag=False)
    train_pred_df2[f'fold{fold+5}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-large', '../input/roberta-large-itptfit/', test_flag=False)
    train_pred_df3[f'fold{fold+10}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-large', '../input/commonlit-roberta-large-ii/', test_flag=False)

# From here on the train_pred_df* names hold plain arrays of shape (rows, folds).
train_pred_df1 = np.array(train_pred_df1)
train_pred_df2 = np.array(train_pred_df2)
train_pred_df3 = np.array(train_pred_df3)

# Same fixed blend weights as for the test predictions: 0.5 / 0.3 / 0.2.
train_model2_predictions = (train_pred_df2.mean(axis=1) * 0.5) + (train_pred_df1.mean(axis=1) * 0.3) + (train_pred_df3.mean(axis=1) * 0.2)

## Model 3 

Inspired by: https://www.kaggle.com/jcesquiveld/best-transformer-representations

In [None]:
import os
import numpy as np
import pandas as pd
import random

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
logging.set_verbosity_error()

In [None]:
# Paths used by Model 3.
INPUT_DIR = '../input/commonlitreadabilityprize'
MODEL_DIR = '../input/roberta-transformers-pytorch/roberta-large'
CHECKPOINT_DIR = '../input/clrp-mean-pooling/'

# Rebinds the notebook-wide DEVICE (a plain string in the Model 1 cells) to a torch.device.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MAX_LENGTH = 248  # excerpts are padded/truncated to this many tokens in get_test_loader
TEST_BATCH_SIZE = 1  # batch size of the Model 3 loaders
HIDDEN_SIZE = 1024  # input width of MeanPoolingModel's linear head

NUM_FOLDS = 5
# Checkpoint file names are built from seed + 1 and fold + 1.
SEEDS = [113]

# Rebinds the `test` / `train` globals by re-reading the competition CSVs.
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))

In [None]:
class MeanPoolingModel(nn.Module):
    """Transformer encoder + attention-masked mean pooling + linear head.

    ``forward`` returns the predictions when ``labels`` is None, otherwise the
    MSE loss between predictions and labels. The submodule names (``model``,
    ``linear``) define the state-dict keys of the checkpoints.
    """

    def __init__(self, model_name):
        super().__init__()

        encoder_config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=encoder_config)
        self.linear = nn.Linear(HIDDEN_SIZE, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        token_states = self.model(input_ids, attention_mask)[0]

        # Broadcast the mask over the hidden axis so masked positions add
        # nothing to the sum and are not counted in the divisor.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_sum = (token_states * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-zero mask.
        token_count = mask.sum(dim=1).clamp(min=1e-9)

        logits = self.linear(masked_sum / token_count)
        preds = logits.squeeze(-1).squeeze(-1)

        if labels is None:
            return preds
        return self.loss(preds.view(-1).float(), labels.view(-1).float())

In [None]:
def get_test_loader(data):
    """Tokenize ``data.excerpt`` and return a sequential DataLoader over it.

    The tokenizer is loaded from ``MODEL_DIR``; every excerpt is
    padded/truncated to ``MAX_LENGTH`` tokens. Batches are
    ``(input_ids, attention_mask)`` tensors of ``TEST_BATCH_SIZE`` rows, in the
    same order as ``data``.
    """
    excerpts = data.excerpt.tolist()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    encoded = tokenizer.batch_encode_plus(
        excerpts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'])

    # SequentialSampler keeps predictions aligned with the rows of `data`.
    return DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=TEST_BATCH_SIZE
    )

# Loaders over the test and train frames; both are consumed by the Model 3 inference cells.
test_dataloader = get_test_loader(test)
train_dataloader = get_test_loader(train)

In [None]:
# Model 3 test inference: for every seed, average the predictions of the
# NUM_FOLDS fold checkpoints; then average over seeds.
all_predictions = []
for seed in SEEDS:

    fold_predictions = []

    for fold in tqdm(range(NUM_FOLDS)):
        # Checkpoint files are named from seed + 1 and fold + 1.
        checkpoint_name = f"model_{seed + 1}_{fold + 1}.pth"

        print(f"\nUsing {checkpoint_name}")

        model_path = CHECKPOINT_DIR + checkpoint_name
        model = MeanPoolingModel(MODEL_DIR)
        # map_location lets the checkpoint load on whatever DEVICE is in use.
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)
        model.eval()

        predictions = []
        # Inference only: without no_grad() every forward pass builds an
        # autograd graph, which wastes memory and time.
        with torch.no_grad():
            for batch in test_dataloader:

                batch = tuple(b.to(DEVICE) for b in batch)

                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         None,
                         }

                # reshape(-1) covers both the 0-d output of a single-row batch
                # and the 1-D output of larger batches.
                preds = model(**inputs).reshape(-1).tolist()
                predictions.extend(preds)

        del model
        gc.collect()

        fold_predictions.append(predictions)
    all_predictions.append(np.mean(fold_predictions, axis=0).tolist())

model3_predictions = np.mean(all_predictions,axis=0)

In [None]:
# Model 3 train inference: same procedure as for the test set, applied to the
# training loader to produce a stacking feature.
train_all_predictions = []
for seed in SEEDS:

    fold_predictions = []

    for fold in tqdm(range(NUM_FOLDS)):
        # Checkpoint files are named from seed + 1 and fold + 1.
        checkpoint_name = f"model_{seed + 1}_{fold + 1}.pth"

        print(f"\nUsing {checkpoint_name}")

        model_path = CHECKPOINT_DIR + checkpoint_name
        model = MeanPoolingModel(MODEL_DIR)
        # map_location lets the checkpoint load on whatever DEVICE is in use.
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)
        model.eval()

        predictions = []
        # Inference only: without no_grad() every forward pass builds an
        # autograd graph, which wastes memory and time.
        with torch.no_grad():
            for batch in train_dataloader:

                batch = tuple(b.to(DEVICE) for b in batch)

                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         None,
                         }

                # reshape(-1) covers both the 0-d output of a single-row batch
                # and the 1-D output of larger batches.
                preds = model(**inputs).reshape(-1).tolist()
                predictions.extend(preds)

        del model
        gc.collect()

        fold_predictions.append(predictions)
    train_all_predictions.append(np.mean(fold_predictions, axis=0).tolist())

train_model3_predictions = np.mean(train_all_predictions,axis=0)

In [None]:
# # predictions = model1_predictions * 0.5 + model2_predictions * 0.3 + model3_predictions * 0.2  # 0.461
# # predictions = model1_predictions * 0.45 + model2_predictions * 0.35 + model3_predictions * 0.2  # 0.461
# predictions = model1_predictions * 0.40 + model2_predictions * 0.25 + model3_predictions * 0.35   #
# predictions

In [None]:
# train_predictions = train_model1_predictions * 0.40 + train_model2_predictions * 0.25 + train_model3_predictions * 0.35   #
# train_predictions


In [None]:
# Stacking features for the training set: one column per base model, one row per excerpt.
train_results = pd.DataFrame(np.vstack((train_model1_predictions, train_model2_predictions, train_model3_predictions)).transpose(), 
                       columns=['model1','model2','model3'])

# Attach the ground-truth target the stacker is fitted against.
train_results['target'] = train_df['target']
train_results.head()

In [None]:
# Stacking features for the test set, with the same column layout as train_results (minus target).
test_results = pd.DataFrame(np.vstack((model1_predictions, model2_predictions, model3_predictions)).transpose(), 
                       columns=['model1','model2','model3'])


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
 
# Linear-regression stacker over the three base-model prediction columns.
regr = LinearRegression()

# Fit on the base models' predictions for the training excerpts.
# NOTE(review): these features come from running every fold checkpoint on the full
# training frame; if the checkpoints were trained on this same data the features are
# largely in-sample rather than out-of-fold, which would bias the weights — confirm.
regr.fit(train_results.drop('target', axis=1), train_results.target)

# Blend the test-set base predictions with the fitted weights.
y_pred = regr.predict(test_results)

# Learned blend weights, in column order model1, model2, model3, and their sum.
print('Coefficients: \n', regr.coef_)
print('Coefficients Sum: \n', sum(regr.coef_))

In [None]:
# Unpack the three learned weights (column order: model1, model2, model3) for inspection.
print('Coefficients: \n', regr.coef_)
M1, M2, M3 = regr.coef_
print(M1, M2, M3)

# Stacking with LAMA

In [None]:
# TIMEOUT = 15_000 # Time in seconds for automl run
# TARGET_NAME = 'target' # Target column name

# def rmse(x, y): return np.sqrt(mean_squared_error(x, y))
# task = Task('reg', metric=rmse)
# roles = {
#     'target': TARGET_NAME,
#         }

In [None]:
# automl = TabularAutoML(task=task,
#                           timeout=TIMEOUT,
#                           general_params={'nested_cv': False, 'use_algos': [['linear_l2']]},
#                           reader_params={'cv': 5},
#                           selection_params={'mode': 1},
#                           )

# oof_pred = automl.fit_predict(train_results, roles=roles)
# print('')
# print(rmse(train_results[TARGET_NAME], oof_pred.data[:, 0]))

In [None]:
# predictions = automl.predict(test_results).data[:, 0]

In [None]:
# Use the linear-regression stack as the final prediction (the LightAutoML stacking
# cells above are commented out).
submission_df.target = y_pred # predictions
submission_df

In [None]:
# Write the submission file without the index column.
submission_df.to_csv("submission.csv", index=False)