In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch

import sys
import warnings
import csv
import collections
import re
import nltk
import time
import random
import os
import matplotlib.pyplot as plt

from transformers import *
# sys.stderr = open('stderr_output.txt', 'w')
warnings.filterwarnings('ignore')


In [None]:
# Locations of the competition data.
TRAIN_PATH = '/kaggle/input/commonlitreadabilityprize/train.csv'
TEST_PATH = '/kaggle/input/commonlitreadabilityprize/test.csv'

SEED = 777

# Seed every random number generator the notebook draws from.
random.seed(SEED)
# PYTHONHASHSEED is the variable the interpreter reads for hash randomisation.
# It is read at interpreter start-up, so setting it here only affects
# processes launched from this kernel, not the running kernel itself.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seeds all visible GPUs; silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# benchmark=True lets cuDNN pick kernels by timing them, which can select
# different algorithms between runs; it must be off for reproducible results.
torch.backends.cudnn.benchmark = False

In [None]:
# Set of English stop words taken from NLTK's stopwords corpus.
ENGLISTH_STOPWORDS = set(nltk.corpus.stopwords.words('english')) 
def preprocess_text(txt):
    """Normalise an excerpt: lower-case it and drop every digit character.

    Stop-word removal is deliberately not applied here.

    Args:
        txt: the raw text (a ``str``).

    Returns:
        The lower-cased text with all characters for which
        ``str.isdigit()`` is true removed.
    """
    lowered = txt.lower()
    kept_symbols = [symbol for symbol in lowered if not symbol.isdigit()]
    return ''.join(kept_symbols)
  


all_data = []  # (id, sentence, score, stderr) -- score/stderr stay as the strings read from the CSV
all_tests = []  # (id, sentence)

# Files handed to the csv module are opened with newline='' so that line
# breaks inside quoted excerpts are parsed by csv itself, and with an explicit
# encoding so the result does not depend on the platform default.
with open(TRAIN_PATH, newline='', encoding='utf-8') as f_dt:
    dt_reader = csv.DictReader(f_dt)
    for dt_row in dt_reader:
        all_data.append((dt_row['id'], preprocess_text(dt_row['excerpt']), dt_row['target'], dt_row['standard_error']))


with open(TEST_PATH, newline='', encoding='utf-8') as f_tst:
    tst_reader = csv.DictReader(f_tst)
    for tst_row in tst_reader:
        all_tests.append((tst_row['id'], preprocess_text(tst_row['excerpt'])))


# Sentence-count statistics (mean, max, min) and number of rows per split.
n_sent_data = [len(nltk.tokenize.sent_tokenize(dt_i[1])) for dt_i in all_data]
n_sent_tests = [len(nltk.tokenize.sent_tokenize(tst_i[1])) for tst_i in all_tests]

print(np.mean(n_sent_data), max(n_sent_data), min(n_sent_data), len(all_data))
print(np.mean(n_sent_tests), max(n_sent_tests), min(n_sent_tests), len(all_tests))


# Shuffle in place (NumPy global RNG) and hold out the last 10% for validation.
total_data_pts = len(all_data)
np.random.shuffle(all_data)
split_idx = int(0.9 * total_data_pts)
training_data = all_data[:split_idx]
val_data = all_data[split_idx:]
print(len(training_data), len(val_data))
print(len(all_data))

In [None]:
# # debug code
# sample_sentences = [
#     'i am training something on a contest', 
#     'which might make me feel good', 
#     'but to be honest i am a bit worried'
# ]


# sample_tokenizer = BertTokenizer.from_pretrained('/kaggle/input/bert-base-uncased')
# sample_model = BertForSequenceClassification.from_pretrained('/kaggle/input/bert-base-uncased')

# for snm, spr in sample_model.named_parameters():
#     print(snm)

# sample_result = sample_tokenizer(sample_sentences, padding=True, return_tensors='pt')
# print(sample_result)
# print(sample_model(sample_result['input_ids'], attention_mask=sample_result['attention_mask']))
# print(sample_model(sample_result['input_ids'], attention_mask=None))

In [None]:
def fold_divide(data, val_fold_idx, n_folds):
    """Split ``data`` into a training part and one validation fold.

    Fold boundaries are placed at ``k * len(data) // n_folds`` so the folds
    tile the whole sequence, differ in size by at most one element, and are
    never empty as long as ``len(data) >= n_folds``.

    Args:
        data: a sliceable sequence (e.g. a list of samples).
        val_fold_idx: zero-based index of the fold used for validation,
            expected to be in ``range(n_folds)``.
        n_folds: total number of folds.

    Returns:
        ``(train, val)`` where ``train`` is a new list holding everything
        outside the validation fold and ``val`` is the slice of ``data``
        forming that fold.
    """
    total_data = len(data)
    fold_low = (val_fold_idx * total_data) // n_folds
    fold_high = ((val_fold_idx + 1) * total_data) // n_folds
    return [*data[:fold_low], *data[fold_high:]], data[fold_low: fold_high]
    

In [None]:
class ProgressBar:
    """Minimal text progress bar that prints ``>`` characters to stdout.

    Args:
        total_dt: amount of work that corresponds to a full bar.
        tabs: width of the bar in characters.
    """

    def __init__(self, total_dt, tabs):
        self.total_data = total_dt
        self.done_data = 0
        self.printed = 0

        self.tabs = tabs

    def print_init(self):
        """Print a ruler of ``tabs`` dashes showing the full bar width."""
        print('-' * self.tabs)

    def update(self, num):
        """Record ``num`` more finished units and print any newly earned ticks.

        The bar never grows beyond ``tabs`` characters, and a total of zero
        is treated as "already complete" instead of dividing by zero.
        """
        self.done_data += num
        if self.total_data > 0:
            fraction = min(self.done_data / self.total_data, 1.0)
        else:
            fraction = 1.0

        target = int(fraction * self.tabs)
        pending = target - self.printed
        if pending > 0:
            print('>' * pending, end='')
            self.printed = target


In [None]:
class TextReorderBatchLoader:
    """Iterable yielding shuffled mini-batches of ``(texts, scores, stds)``.

    Every data item is indexed positionally: ``item[1]`` is the text,
    ``item[2]`` the score and ``item[3]`` the standard error (the latter two
    are converted with ``float``). With the default parameters the text is
    returned unchanged apart from being re-joined sentence by sentence.

    Keyword overrides:
        trunc_lower / trunc_higher: range of the fraction of sentences kept.
        trunc_min: minimum number of sentences kept.
        trunc_shuffle: shuffle the sentences before taking the window.
        iter_percentage: fraction of the data visited per pass.
    """

    # default no re-order
    def __init__(self, data_lst, batch_size, **overrides):
        sample_params = {
            'trunc_lower': 1.0,
            'trunc_higher': 1.0,
            'trunc_min': 1,
            'trunc_shuffle': False,
            'iter_percentage': 1,
            **overrides
        }

        self.sample_lower = sample_params['trunc_lower']
        self.sample_range = sample_params['trunc_higher'] - self.sample_lower
        self.sample_min = sample_params['trunc_min']
        self.sample_shuffle = sample_params['trunc_shuffle']

        self.iter_percentage = sample_params['iter_percentage']

        # Work on a private copy so shuffling never reorders the caller's list.
        self.data_list = [*data_lst]
        self.batch_size = batch_size

        np.random.shuffle(self.data_list)

    def __iter__(self):
        """Reshuffle, then yield batches covering exactly ``len(self)`` items."""
        np.random.shuffle(self.data_list)
        # The list is reshuffled on every pass, so visiting only the front
        # ``len(self)`` items still samples the whole data set over time.
        n_used = len(self)
        for idx in range(0, n_used, self.batch_size):
            # Cap the slice so the final batch cannot reach past ``n_used``.
            batch_items = self.data_list[idx: min(idx + self.batch_size, n_used)]
            all_texts = []
            all_scores = []
            all_stds = []
            for itm_i in batch_items:
                txt_i, scr_i, std_i = itm_i[1], float(itm_i[2]), float(itm_i[3])
                all_texts.append(self.random_sample_text(txt_i))
                all_scores.append(scr_i)
                all_stds.append(std_i)

            yield all_texts, all_scores, all_stds

    def random_sample_text(self, s):
        """Return a random contiguous window of sentences taken from ``s``.

        The number of sentences kept is a random fraction (within the
        configured range) of the sentence count, but at least ``trunc_min``.
        """
        s_sentences = nltk.tokenize.sent_tokenize(s)
        keep_fraction = np.random.random() * self.sample_range + self.sample_lower
        sentences_kept = max(int(keep_fraction * len(s_sentences)), self.sample_min)
        if self.sample_shuffle:
            np.random.shuffle(s_sentences)

        # ``high`` is exclusive in np.random.randint, hence the +1: the window
        # that ends on the last sentence must be reachable too.
        n_starts = max(1, len(s_sentences) - sentences_kept + 1)
        sentences_kept_start = np.random.randint(0, high=n_starts)
        return ' '.join(s_sentences[sentences_kept_start: sentences_kept_start + sentences_kept])

    def __len__(self):
        """Number of items (not batches) visited in one pass."""
        return int(self.iter_percentage * len(self.data_list))
                                            

In [None]:
# debug code
# sample_tokenizer = PARAMS['tokenizer_class'].from_pretrained(PARAMS['token_path'])
# sample_loader = FullReorderBatchLoader(val_data, 10, sample_tokenizer)

# for s_txt_tsr, s_scr, s_std in sample_loader:
#     print(s_txt_tsr, s_scr, s_std, sep='\n')
#     break

In [None]:
class ContinuousWrapper(torch.nn.Module):
    """Single-output regression wrapper around a pretrained model.

    Args:
        pre_trained_cls: class exposing ``from_pretrained(path, num_labels=...)``
            whose instances return an object with a ``logits`` attribute.
        pretrained_model_path: path handed to ``from_pretrained``.
        freeze_param_condition: predicate on a parameter name; parameters for
            which it returns true get ``requires_grad = False``.
    """

    def __init__(self,
                 pre_trained_cls,
                 pretrained_model_path,
                 freeze_param_condition=lambda name: False):
        super().__init__()
        self.pre_model = pre_trained_cls.from_pretrained(pretrained_model_path, num_labels=1)
        for param_name, param in self.pre_model.named_parameters():
            if not freeze_param_condition(param_name):
                continue
            param.requires_grad = False
            print('freezing {0}'.format(param_name))

    def forward(self, x, mask=None):
        """Return the raw logits of the wrapped model for inputs ``x``."""
        outputs = self.pre_model(x, attention_mask=mask)
        return outputs.logits

    def loss(self, x, scores, mask=None, weights=None):
        """Root of the mean squared (optionally weighted) error against ``scores``."""
        predicted = self(x, mask).reshape(scores.shape)
        if weights is None:
            # Unit weights leave the plain RMSE unchanged.
            weights = torch.ones(scores.shape).to(scores.device)
        weighted_error = (scores - predicted) * weights
        return (weighted_error ** 2).mean().sqrt()

    def make_prediction(self, x, mask=None, strategy='none', *strategy_args):
        """Return one prediction per row of ``x``; the strategy arguments are ignored."""
        batch = x.shape[0]
        return self(x, mask).reshape((batch,))

In [None]:
class REstimator(ContinuousWrapper):
    """ContinuousWrapper whose output passes through ``Tanh`` and a ``Linear(1, 1)``.

    The constructor arguments are forwarded unchanged to ``ContinuousWrapper``.
    """

    def __init__(self,
                 pre_trained_cls,
                 pretrained_model_path,
                 freeze_param_condition=lambda name: False):
        super().__init__(pre_trained_cls, pretrained_model_path, freeze_param_condition)
        squash = torch.nn.Tanh()
        rescale = torch.nn.Linear(1, 1)
        self.shrink_and_expand = torch.nn.Sequential(squash, rescale)

    def forward(self, x, mask=None):
        """Apply the tanh + linear head to the parent's raw output."""
        return self.shrink_and_expand(super().forward(x, mask))
    
    
        

In [None]:
class DiscreteEstimator(torch.nn.Module):
    """Regression recast as classification over evenly spaced target values.

    Args:
        pre_trained_cls: class exposing ``from_pretrained(path, num_labels=...)``
            whose instances return an object with a ``logits`` attribute.
        pretrained_model_path: path handed to ``from_pretrained``.
        pred_range: ``(low, high)`` pair; the bin values are
            ``torch.linspace(low, high, divisions)``.
        divisions: number of bins, i.e. classes.
        freeze_param_condition: predicate on a parameter name; parameters for
            which it returns true get ``requires_grad = False``.
    """

    def __init__(self,
                 pre_trained_cls,
                 pretrained_model_path,
                 pred_range,
                 divisions,
                 freeze_param_condition=lambda name: False
                 ):
        super().__init__()
        self.pre_model = pre_trained_cls.from_pretrained(pretrained_model_path, num_labels=divisions)

        for param_name, param in self.pre_model.named_parameters():
            if not freeze_param_condition(param_name):
                continue
            param.requires_grad = False
            print('freezing {0}'.format(param_name))

        # Plain tensor attribute (neither parameter nor buffer); it is moved to
        # the input's device on every use.
        low, high = pred_range[0], pred_range[1]
        self.value_match_vec = torch.linspace(low, high, divisions)

    def forward(self, x, mask=None):
        """Return the per-bin logits (before any softmax)."""
        outputs = self.pre_model(x, attention_mask=mask)
        return outputs.logits

    def loss(self, x, scores, mask=None, weights=None):
        """Cross-entropy between the logits and the bin nearest to each score.

        ``weights`` is forwarded as the ``weight`` argument of
        ``torch.nn.functional.cross_entropy``.
        """
        bin_values = self.value_match_vec.to(x.device)
        logits = self(x, mask)

        # (batch, 1) against (divisions,) broadcasts to (batch, divisions);
        # assumes ``scores`` is one-dimensional.
        score_column = scores.reshape(*scores.shape, 1)
        squared_gap = (score_column - bin_values) ** 2
        nearest_bin = squared_gap.argmin(dim=1)

        return torch.nn.functional.cross_entropy(logits, nearest_bin, weight=weights)

    def make_prediction(self, x, mask=None, strategy='avg', *strategy_args):
        """Turn the logits into one scalar prediction per row.

        ``'avg'`` returns the softmax-weighted mean of the bin values; an
        optional first strategy argument scales the logits before the
        softmax. ``'argmax'`` returns the value of the highest-scoring bin.

        Raises:
            ValueError: for any other ``strategy``.
        """
        bin_values = self.value_match_vec.to(x.device)
        logits = self(x, mask)

        if strategy == 'argmax':
            return bin_values[logits.argmax(dim=1)]

        if strategy != 'avg':
            raise ValueError('unrecognized strategy {0}'.format(strategy))

        softmax_coe = strategy_args[0] if len(strategy_args) > 0 else 1.0
        probabilities = torch.nn.functional.softmax(logits * softmax_coe, dim=1)
        return (probabilities * bin_values).sum(dim=1)
    

In [None]:
class SimTokenizer:
    """Base class for callables that turn a batch of texts into model inputs.

    Calling an instance returns a dict with the parsed batch under
    ``'input_ids'`` and ``None`` under ``'mask'``. Extra positional and
    keyword arguments are accepted and ignored.
    """

    def __call__(self, text_inputs, *args, **kwargs):
        return {
            'input_ids': self.parse(text_inputs),
            'mask': None
        }

    def parse(self, text_inputs):
        """Convert ``text_inputs`` into model inputs; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it would surface as a confusing TypeError.
        raise NotImplementedError('subclasses of SimTokenizer must implement parse()')
        
# inspired by https://www.kaggle.com/weka511/naive-readability
class LRTokenizer(SimTokenizer):
    """Turn raw texts into hand-crafted readability feature vectors.

    For every text the features are, in order: number of sentences, words per
    sentence, (heuristic) syllables per sentence, mean normalised corpus
    frequency of the known words, and one per-sentence count for every stop
    word (stop words in sorted order).
    """

    stop_words = set(nltk.corpus.stopwords.words('english'))

    @classmethod
    def from_pretrained(cls, freq_file_path):
        """Alternate constructor with the same calling style as ``from_pretrained`` elsewhere."""
        return cls(freq_file_path)

    def __init__(self, freq_file_path):
        """Load word counts from a CSV with ``word`` and ``count`` columns.

        Counts are min-max normalised to ``[0, 1]``.

        Raises:
            ValueError: if the file contains no rows.
        """
        self.word_freqs = {}
        # newline='' is what the csv module expects from file objects.
        with open(freq_file_path, newline='', encoding='utf-8') as f_freq:
            for freq_data in csv.DictReader(f_freq):
                self.word_freqs[freq_data['word'].lower()] = int(freq_data['count'])

        if not self.word_freqs:
            raise ValueError('no word frequencies found in {0}'.format(freq_file_path))

        mx_freq = max(self.word_freqs.values())
        mn_freq = min(self.word_freqs.values())
        freq_span = float(mx_freq - mn_freq)
        for word_i in self.word_freqs:
            if freq_span > 0:
                self.word_freqs[word_i] = float(self.word_freqs[word_i] - mn_freq) / freq_span
            else:
                # All counts identical: avoid 0/0 and map everything to 0.
                self.word_freqs[word_i] = 0.0

    def parse(self, text_inputs):
        """Return a tensor with one feature row per input text."""
        return torch.tensor([self.get_features(txt_pc) for txt_pc in text_inputs])

    def get_features(self, txt_ipt):
        """Compute the feature list for a single text."""
        txt_wrds = nltk.tokenize.word_tokenize(txt_ipt)
        txt_sents = nltk.tokenize.sent_tokenize(txt_ipt)

        n_sents = float(len(txt_sents))
        # A text without any sentence would otherwise divide by zero.
        sent_divisor = max(n_sents, 1.0)
        len_sents = float(len(txt_wrds)) / sent_divisor
        syllables_per_sents = float(self.count_syllables(txt_wrds)) / sent_divisor
        freq_avg = self.freq_cnt(txt_wrds)

        stop_word_cnts_per_sent = (np.array(self.count_stop_words(txt_wrds)) / sent_divisor).tolist()

        return [
            n_sents,
            len_sents,
            syllables_per_sents,
            freq_avg,
            *stop_word_cnts_per_sent
        ]

    def count_syllables(self, txt_wrds):
        """Heuristic syllable count summed over ``txt_wrds``.

        Each run of consecutive vowels (``aeiouy``) counts once, and one is
        subtracted for every word longer than one character that ends in ``e``.
        """
        vowels = frozenset('aeiouy')

        cnt = 0
        for wrd in txt_wrds:
            last_vowel = False
            for wc in wrd:
                found_vowel = wc in vowels
                if found_vowel and not last_vowel:
                    cnt += 1
                last_vowel = found_vowel
            if len(wrd) > 1 and wrd[-1] == 'e':
                cnt -= 1
        return cnt

    def freq_cnt(self, all_words):
        """Mean normalised frequency of the words found in the frequency table.

        Returns ``0.0`` when none of the words is in the table.
        """
        all_freqs = [self.word_freqs[wrd_i] for wrd_i in all_words if wrd_i in self.word_freqs]
        if not all_freqs:
            return 0.0
        return float(sum(all_freqs)) / float(len(all_freqs))

    def count_stop_words(self, all_words, stop_word_list=None):
        """Return the occurrence count of each stop word, as floats.

        When ``stop_word_list`` is omitted the class stop words are used in
        sorted order: iterating the set directly would give an order that can
        change between interpreter runs (string hash randomisation), which
        would silently permute the feature columns.
        """
        if stop_word_list is None:
            stop_word_list = sorted(self.stop_words)

        cnter = collections.Counter(all_words)
        return [float(cnter[sw_i]) for sw_i in stop_word_list]
        

        
class FeatureModel(torch.nn.Module):
    """Fully connected network over a feature vector.

    Args:
        dims: layer sizes, input first and output last (at least two entries).
        mask: optional collection of input column indices to leave out; the
            first layer's input size is reduced by ``len(mask)``.

    Raises:
        ValueError: if ``dims`` has fewer than two entries.
    """

    def __init__(self, dims, mask=None):
        super().__init__()
        layer_sizes = list(dims)
        if len(layer_sizes) < 2:
            raise ValueError('At least provide the dim for input and output')

        excluded = [] if mask is None else mask
        input_dim = layer_sizes[0]
        layer_sizes[0] = input_dim - len(excluded)

        # Linear layers separated by Tanhshrink; no activation after the last one.
        stack = []
        last_idx = len(layer_sizes) - 1
        for idx, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:]), start=1):
            stack.append(torch.nn.Linear(n_in, n_out))
            if idx != last_idx:
                stack.append(torch.nn.Tanhshrink())
        self.feature_model = torch.nn.Sequential(*stack)

        self.feature_indices = [col for col in range(input_dim) if col not in excluded]

    def forward(self, x, mask=None):
        """Select the kept columns of ``x`` and run the network (``mask`` is ignored)."""
        kept_columns = x[:, self.feature_indices]
        return self.feature_model(kept_columns)

    def loss(self, x, scores, mask=None, weights=None):
        """Root of the mean squared (optionally weighted) error against ``scores``."""
        predicted = self(x).reshape(scores.shape)
        if weights is None:
            weights = torch.ones(scores.shape).to(scores.device)
        weighted_error = (scores - predicted) * weights
        return (weighted_error ** 2).mean().sqrt()

    def make_prediction(self, x, mask=None, strategy='none', *strategy_args):
        """Return the network output reshaped to one value per row of ``x``."""
        batch = x.shape[0]
        return self(x).reshape((batch,))
        


In [None]:
# debug code
# sample_lrp = LRTokenizer.from_pretrained('/kaggle/input/english-word-frequency/unigram_freq.csv')

# sample_lrp([
#     'hi i have no idea what is happening. but i will try.', 
#     'you guess what happend? i have no idea. just asking. you sure?', 
#     'hey you are a bitch.', 
#     'you are done, i am saying this. you are done. no more words.'
# ])


In [None]:
class ModelScaledWrapper(torch.nn.Module):
    """Wrap a model so that its outputs are multiplied by a fixed coefficient."""

    def __init__(self, wrapped_model, coefficient):
        super().__init__()
        self.wrapped = wrapped_model
        self.coefficient = coefficient

    def make_prediction(self, *call_args):
        """Scaled result of the wrapped model's ``make_prediction``."""
        unscaled = self.wrapped.make_prediction(*call_args)
        return self.coefficient * unscaled

    def forward(self, *call_args):
        """Scaled result of calling the wrapped model."""
        unscaled = self.wrapped(*call_args)
        return self.coefficient * unscaled

In [None]:
class GradBoostModel(torch.nn.Module):
    """Additive ensemble of text regressors built up one sub-model at a time.

    Each sub-model is stored together with its tokenizer and the extra
    arguments for its ``make_prediction``; the ensemble prediction is the sum
    of the (scaled) sub-model predictions.
    """

    def __init__(self, device='cpu'):
        super().__init__()
        self.sub_modules = torch.nn.ModuleList([])
        self.pred_params = []
        self.tokenizers = []
        self.model_device = device
        self.to(self.model_device)

    def add_model(self, sub_model, sub_model_pred_param, sub_model_tokenizer, additional_args):
        """Append ``sub_model``, scaled by its fitted coefficient times the shrinkage.

        ``additional_args`` must provide ``'dt_loader'`` (data used to fit the
        coefficient) and ``'shrinkage'`` (multiplier for the coefficient).
        """
        coefficient = self.fit_coefficient(
            sub_model,
            sub_model_tokenizer,
            sub_model_pred_param,
            additional_args['dt_loader']
        ) * additional_args['shrinkage']

        self.sub_modules.append(ModelScaledWrapper(sub_model, coefficient).to(self.model_device))
        self.pred_params.append(sub_model_pred_param)
        self.tokenizers.append(sub_model_tokenizer)

    def forward(self, txt_input, model_num=0):
        """Sum the predictions of the first ``model_num`` sub-models.

        Non-positive ``model_num`` values are shifted by ``len(self)`` until
        positive, so ``0`` means "all" and ``-1`` "all but the last". An empty
        ensemble predicts zero for every text.
        """
        if len(self) > 0:
            while model_num <= 0:
                model_num = model_num + len(self)

            rt_array = []
            for sub_md, sub_pred_prm, sub_tk in zip(
                        self.sub_modules[0: model_num],
                        self.pred_params[0: model_num],
                        self.tokenizers[0: model_num]
                    ):
                sub_tokenized = sub_tk(txt_input, padding=True, return_tensors='pt')
                rt_array.append(
                    sub_md.make_prediction(
                        sub_tokenized['input_ids'].to(self.model_device),
                        sub_tokenized['attention_mask'].to(self.model_device),
                        *sub_pred_prm
                    )
                )
            rt = sum(rt_array)
        else:
            rt = torch.zeros((len(txt_input), )).to(self.model_device)
        return rt

    def loss(self, txt_input, score, *forward_args):
        """RMSE between the ensemble prediction and ``score``."""
        scr_pred = self(txt_input, *forward_args)
        loss_pred = torch.sqrt(torch.mean(torch.pow(score - scr_pred, 2)))
        return loss_pred

    def next_model_loss(self, t, md, x, msk, scr):
        """Loss of candidate ``md`` on the residual left by the current ensemble."""
        with torch.no_grad():
            curr_md_target = (scr - self(t))
        return md.loss(x, curr_md_target, msk)

    def last_model_added(self, *args):
        """Hook called after the final sub-model is added; nothing to do here."""
        pass

    def to(self, device):
        """Move the ensemble and remember the device for later tensor placement."""
        self.model_device = device
        return super().to(self.model_device)

    def __len__(self):
        return len(self.sub_modules)

    @staticmethod
    def _device_of(module, default):
        """Device of ``module``'s first parameter, or ``default`` if it has none."""
        params = getattr(module, 'parameters', None)
        first_param = next(params(), None) if callable(params) else None
        return default if first_param is None else first_param.device

    def fit_coefficient(self, md, md_tk, md_pred_args, fit_data_loader):
        """Least-squares scale ``c`` for ``md`` on top of the current ensemble.

        Finds ``c`` minimising ``sum((ya - (yb + c * ym)) ** 2)`` where ``ya``
        are the true scores, ``yb`` the current ensemble predictions and
        ``ym`` the predictions of ``md``.

        Returns:
            A zero-dimensional NumPy array holding ``c`` (``0`` when ``md``
            predicts exactly zero everywhere).
        """
        # Feed ``md`` on whatever device it lives on instead of relying on a
        # notebook-level global.
        md_device = self._device_of(md, self.model_device)
        base_scores = []
        md_scores = []
        ans_scores = []

        with torch.no_grad():
            for txt_f, scr_f, _ in fit_data_loader:
                txt_tk = md_tk(txt_f, padding=True, return_tensors='pt')
                txt_vec_f = txt_tk['input_ids'].to(md_device)
                txt_msk_f = txt_tk['attention_mask'].to(md_device)

                base_md_pred = self(txt_f)
                md_pred = md.make_prediction(txt_vec_f, txt_msk_f, *md_pred_args)

                # Collect everything on the CPU so the three vectors can be
                # combined even if the models sit on different devices.
                base_scores.append(base_md_pred.detach().cpu())
                md_scores.append(md_pred.detach().cpu())
                ans_scores.append(torch.tensor(scr_f))

        yb = torch.cat(base_scores, dim=0)
        ym = torch.cat(md_scores, dim=0)
        ya = torch.cat(ans_scores, dim=0)

        # find c such that c minimize [ya - (yb + c * ym)]^2
        denominator = torch.sum(ym * ym)
        if denominator == 0:
            # An all-zero predictor cannot reduce the residual; a zero
            # coefficient avoids propagating NaN through the ensemble.
            cm = torch.zeros(())
        else:
            cm = torch.sum((ya - yb) * ym) / denominator
        return cm.detach().cpu().numpy()
            

In [None]:
class WeightHeadModel(torch.nn.Module):
    """Head producing one (pre-softmax) weight per sub-model prediction.

    A bilinear layer mixes the sub-model predictions with the weight-model
    features; Tanhshrink + Linear layers then map to ``pred_model_num``
    outputs.

    Args:
        pred_model_num: number of sub-model predictions, also the output size.
        w_model_features: size of the feature vector from the weight model.
        output_dims: sizes of the internal layers only; the final output size
            is appended internally and the caller's list is left untouched.
    """

    def __init__(self, pred_model_num, w_model_features, output_dims):
        super().__init__()

        # Build a new list: appending to ``output_dims`` itself would grow the
        # caller's (possibly shared) configuration on every construction.
        layer_dims = [*output_dims, pred_model_num]

        self.bi_ln_interpolate = torch.nn.Bilinear(pred_model_num, w_model_features, layer_dims[0])

        inner_layers = []
        for lyr_i in range(1, len(layer_dims)):
            inner_layers.append(torch.nn.Tanhshrink())
            inner_layers.append(torch.nn.Linear(layer_dims[lyr_i - 1], layer_dims[lyr_i]))

        self.ln_model = torch.nn.Sequential(
            *inner_layers
        )

    def forward(self, preds, features):
        """Return the raw weights for ``preds`` given ``features``."""
        return self.ln_model(self.bi_ln_interpolate(preds, features))


class AvgWeightHeadModel(torch.nn.Module):
    """Parameter-free head that gives every sub-model the same weight.

    Constructor arguments are accepted and ignored.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, preds, features):
        """Return a tensor shaped like ``preds`` filled with ``1 / preds.shape[1]``."""
        n_models = float(preds.shape[1])
        uniform = torch.ones(preds.shape).div(n_models)
        return uniform.to(preds.device)
    

class WeightedAvgModel(torch.nn.Module):
    """Ensemble returning a softmax-weighted average of sub-model predictions.

    The weights come from ``weight_head_cls``, which receives the stacked
    sub-model predictions and the output of ``weighted_model`` applied to the
    texts tokenized by ``weight_tokenizer``.

    Args:
        weight_tokenizer: callable returning a dict with ``'input_ids'``.
        weighted_model: module turning those inputs into features.
        weight_head_cls: class instantiated in ``last_model_added`` as
            ``weight_head_cls(len(self), *weight_head_params)``.
        weight_head_params: extra positional arguments for the head.
        device: device on which the ensemble lives.
    """

    def __init__(self, weight_tokenizer, weighted_model, weight_head_cls, weight_head_params, device='cpu'):
        super().__init__()
        self.sub_modules = torch.nn.ModuleList([])
        self.pred_params = []
        self.tokenizers = []

        self.weight_model_tokenizer = weight_tokenizer
        self.weight_model = weighted_model
        self.weight_head_params = weight_head_params
        self.weight_head_cls = weight_head_cls
        self.weight_model_head = None

        self.model_device = device
        self.to(self.model_device)

    def add_model(self, sub_model, sub_model_pred_param, sub_model_tokenizer, additional_args):
        """Register a sub-model with its prediction arguments and tokenizer.

        ``additional_args`` is accepted for interface parity and not used.
        """
        self.sub_modules.append(sub_model.to(self.model_device))
        self.pred_params.append(sub_model_pred_param)
        self.tokenizers.append(sub_model_tokenizer)

    def loss(self, txt_input, scores, *forward_args):
        """RMSE between the ensemble prediction and ``scores``."""
        scr_pred = self(txt_input, *forward_args)
        loss_pred = torch.sqrt(torch.mean(torch.pow(scores - scr_pred, 2)))
        return loss_pred

    def next_model_loss(self, t, md, x, msk, scr):
        """Loss of candidate ``md`` on the original targets (sub-models train independently)."""
        return md.loss(x, scr, msk)

    def last_model_added(self, tr_dt_loader, v_dt_loader, w_lr, w_epochs):
        """Create the weight head and train it together with the weight model.

        Args:
            tr_dt_loader: iterable of ``(texts, scores, _)`` training batches.
            v_dt_loader: iterable of ``(texts, scores, _)`` validation batches.
            w_lr: learning rate for AdamW.
            w_epochs: number of epochs.
        """
        self.weight_model_head = self.weight_head_cls(len(self), *self.weight_head_params).to(self.model_device)

        w_opt = torch.optim.AdamW(
            [
                {'params': self.weight_model.parameters()},
                {'params': self.weight_model_head.parameters()}
            ],
            lr = w_lr
        )
        for w_epoch_i in range(0, w_epochs):
            print('-----epoch {0}------'.format(w_epoch_i + 1))
            tr_losses = []
            v_losses = []

            self.train()
            for txt_w_train, scr_w_train, _ in tr_dt_loader:
                scr_w_train = torch.tensor(scr_w_train).to(self.model_device)
                w_opt.zero_grad()

                train_w_loss = self.loss(txt_w_train, scr_w_train)
                train_w_loss.backward()
                w_opt.step()
                # Repeat the batch loss once per sample so the epoch mean is
                # weighted by batch size.
                tr_losses.extend([train_w_loss.detach().cpu().numpy()] * len(txt_w_train))

            self.eval()
            for txt_w_val, scr_w_val, _ in v_dt_loader:
                scr_w_val = torch.tensor(scr_w_val).to(self.model_device)
                with torch.no_grad():
                    val_w_loss = self.loss(txt_w_val, scr_w_val)
                    v_losses.extend([val_w_loss.detach().cpu().numpy()] * len(txt_w_val))

            print('weight train loss {0}'.format(np.mean(tr_losses)))
            print('weight val loss {0}'.format(np.mean(v_losses)))
            print()

    def forward(self, txt_input):
        """Predict one score per text.

        Returns zeros when no sub-model has been added.

        Raises:
            ValueError: if the weight head has not been created yet.
        """
        if self.weight_model_head is None:
            raise ValueError('Training not finished, cannot predict')

        if len(self) == 0:
            # Nothing to average: predict zero for every text.
            return torch.zeros((len(txt_input), )).to(self.model_device)

        # Sub-models are fixed here; only the weight model and head get gradients.
        with torch.no_grad():
            score_preds = []

            for sub_md, sub_pred_prm, sub_tk in zip(
                        self.sub_modules,
                        self.pred_params,
                        self.tokenizers
                    ):
                sub_tokenized = sub_tk(txt_input, padding=True, return_tensors='pt')
                score_preds.append(
                    torch.reshape(
                        sub_md.make_prediction(
                            sub_tokenized['input_ids'].to(self.model_device),
                            sub_tokenized['attention_mask'].to(self.model_device),
                            *sub_pred_prm
                        ),
                        (len(txt_input), 1)
                    )
                )

        score_preds = torch.cat(score_preds, dim=1)

        weight_txt_vec = self.weight_model_tokenizer(txt_input, padding=True, return_tensors='pt')['input_ids'].to(self.model_device)
        score_weights = self.weight_model_head(score_preds, self.weight_model(weight_txt_vec))

        final_pred = self.weight_collapse(score_preds, score_weights)
        return final_pred

    @classmethod
    def weight_collapse(cls, preds, ws):
        """Softmax ``ws`` over dim 1 and return the weighted sum of ``preds``."""
        # write it as a func so that it can be called during training as well
        wf = torch.nn.functional.softmax(ws, dim=1)
        ret = torch.sum(
            preds * wf,
            dim=1
        )
        return ret

    def to(self, device):
        """Move the ensemble and remember the device for later tensor placement."""
        self.model_device = device
        return super().to(self.model_device)

    def __len__(self):
        return len(self.sub_modules)

In [None]:
#debug code
# mlst = torch.nn.ModuleList([torch.nn.Linear(3, 3), torch.nn.Linear(5, 3), torch.nn.Linear(3, 4)])
# mlst[0:-1]
# gbs = GradBoostModel()
# print(len(gbs))
# gbs.add_model(torch.nn.Linear(3, 4), 'a', None)
# print(len(gbs))

In [None]:
# Kaggle dataset directory that holds every RoBERTa checkpoint referenced below.
ROBERTA_ROOT = '/kaggle/input/roberta-transformers-pytorch'


def _freeze_unless_contains(trainable_markers):
    """Build the name-based predicate passed as a sub-model's last construct arg.

    Args:
        trainable_markers: iterable of substrings of parameter names.

    Returns:
        Callable ``p_name -> bool`` that is True exactly when ``p_name``
        contains none of ``trainable_markers``.
        NOTE(review): presumably the wrapper freezes every parameter for which
        this returns True -- confirm against ContinuousWrapper.
    """
    # Materialise once; the predicate may be called for every parameter name.
    markers = tuple(trainable_markers)

    def should_freeze(p_name):
        return not any(marker in p_name for marker in markers)

    return should_freeze


def _encoder_layers(first, last_exclusive):
    """Return the markers 'encoder.layer.<i>' for i in [first, last_exclusive)."""
    return ['encoder.layer.{0}'.format(idx) for idx in range(first, last_exclusive)]


def _roberta_sub_model(checkpoint, trainable_markers, epochs):
    """Return the config dict of one ContinuousWrapper RoBERTa sub-model.

    Args:
        checkpoint: directory name of the checkpoint below ``ROBERTA_ROOT``.
        trainable_markers: substrings forwarded to ``_freeze_unless_contains``.
        epochs: value stored under the ``'epochs'`` key.

    Returns:
        A fresh dict with the keys read by the training loop; only the
        checkpoint, the markers and the epoch count differ between sub-models.
    """
    return {
        'model_class': ContinuousWrapper,
        'construct_args': [
            RobertaForSequenceClassification,
            '{0}/{1}'.format(ROBERTA_ROOT, checkpoint),
            _freeze_unless_contains(trainable_markers),
        ],
        'tokenizer_class': RobertaTokenizer,
        # Every sub-model is tokenized with the roberta-base tokenizer files.
        # NOTE(review): relies on the large checkpoints sharing that vocabulary -- confirm.
        'tokenizer_path': '{0}/roberta-base'.format(ROBERTA_ROOT),

        'pred_args': ['none'],

        'epochs': epochs,
        'lr': 0.00001,
        'weight_decay': 0.5,
        'random_sample_decay': 1.0,
        # Previously tried entries (disabled):
        #   'dt_loader': TextReorderBatchLoader(training_data, 10), 'shrinkage': 1.0
        'add_model_params': {},
    }


# Sub-model candidates that were tried earlier and are currently disabled:
#   * ContinuousWrapper + BertForSequenceClassification ('/kaggle/input/bert-base-uncased',
#     BertTokenizer), markers classifier / pooler / encoder.layer,
#     pred_args ['none'], epochs 12, lr 0.00001, weight_decay 0.07
#   * DiscreteEstimator + RobertaForSequenceClassification (distilroberta-base),
#     extra construct args (-5.0, 3.0) and 9, single marker '.',
#     pred_args ['avg', 5], epochs 9, lr 0.00001, weight_decay 0.01
#   * ContinuousWrapper + RobertaForSequenceClassification (distilroberta-base),
#     markers classifier / pooler / encoder.layer.3-5, pred_args ['none'],
#     epochs 15, lr 0.0000001, weight_decay 0.1, 'grad_boost_shrinkage': 1.0
PARAMS = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',

    # Keyword arguments forwarded to the training TextReorderBatchLoader.
    'sent_reorder_loader_params': {
        'trunc_lower': 1.0,
        'trunc_higher': 1.0,
        'trunc_min': 2,
        'trunc_shuffle': False,
        'iter_percentage': 0.7
    },

    'main_model_class': WeightedAvgModel,
    'main_model_construct_args': [
        LRTokenizer('/kaggle/input/english-word-frequency/unigram_freq.csv'),
        FeatureModel([183, 50, 30]),
        AvgWeightHeadModel,
        [],
        # Alternative head that was tried: WeightHeadModel with args [30, [15, 7]]
    ],

    # Trained one after another, in this order, by the training loop.
    'sub_models': [
        _roberta_sub_model(
            'roberta-large-mnli',
            ['classifier', 'pooler', *_encoder_layers(16, 24)],
            epochs=8,
        ),
        _roberta_sub_model(
            'roberta-large',
            ['classifier', 'pooler', *_encoder_layers(12, 24)],
            epochs=12,
        ),
        _roberta_sub_model(
            'roberta-base',
            ['classifier', 'pooler', 'encoder.layer'],
            epochs=16,
        ),
    ],

    'batch_size': 10,
    'val_batch_size': 30,

    # Positional arguments for model.last_model_added.
    # NOTE(review): the meaning of the trailing 0.0001 and 0 is not visible
    # here -- confirm against last_model_added's signature.
    'on_finish_params': [
        TextReorderBatchLoader(
            training_data,
            10
        ),
        TextReorderBatchLoader(
            val_data,
            10
        ),
        0.0001,
        0
    ]
}


In [None]:

# model = DiscreteEstimator(
#     PARAMS['model_class'],
#     PARAMS['model_path'],
#     PARAMS['discrete_pred_range'],
#     PARAMS['discrete_classes'],
#     PARAMS['freeze_param_condition']
# ).to(PARAMS['device'])
# model = ContinuousWrapper(
#     PARAMS['model_class'],
#     PARAMS['model_path'],
#     PARAMS['freeze_param_condition']
# ).to(PARAMS['device'])
# Instantiate the configured main model class with its construct args and
# move the result to the configured device.
main_model_cls = PARAMS['main_model_class']
main_model_args = PARAMS['main_model_construct_args']
model = main_model_cls(*main_model_args).to(PARAMS['device'])

# pre_tokenizer = PARAMS['tokenizer_class'].from_pretrained(PARAMS['token_path'])



In [None]:
# optimizer = torch.optim.AdamW(model.parameters(), lr=PARAMS['lr'], weight_decay=PARAMS['weight_decay'])
# print(list(model.parameters()))

In [None]:
# unreg_loss_fn = lambda scr1, scr2: torch.sqrt(torch.mean(torch.pow(scr2 - scr1, 2)))
# reg_loss_fn = lambda scr1, scr2, std: torch.sqrt(torch.mean(
#                                             torch.pow(scr2 - scr1, 2) / torch.maximum(
#                                                                             torch.pow(std, 2), 
#                                                                             0.01 * torch.ones(std.shape).to(std.device)
#                                                                         )
#                                         ))

# loss_md_fn = lambda base_md, t, diff_c, md, x, scr: md.loss(x, (scr - base_md(t)) * diff_c)
# def loss_md_fn(base_md, t, md, x, scr):
#     with torch.no_grad():
#         curr_md_target = (scr - base_md(t))
#     return md.loss(x, curr_md_target)
            

# for cycle_num in range(1, PARAMS['cycles'] + 1):
#     for fold_num in range(0, PARAMS['folds']):
def _encode_batch(tokenizer, texts, device):
    """Tokenize ``texts`` with padding and move the resulting tensors to ``device``.

    Args:
        tokenizer: callable invoked as ``tokenizer(texts, padding=True,
            return_tensors='pt')``; its result is indexed by ``'input_ids'``
            and ``'attention_mask'``.
        texts: batch of texts as yielded by the batch loader.
        device: target passed to ``Tensor.to``.

    Returns:
        Tuple ``(input_ids, attention_mask)`` located on ``device``.
    """
    encoded = tokenizer(texts, padding=True, return_tensors='pt')
    return encoded['input_ids'].to(device), encoded['attention_mask'].to(device)


# Train the configured sub-models one after another; each one is handed to the
# main model via add_model once all of its epochs are done.
for sub_model_idx, sub_cfg in enumerate(PARAMS['sub_models']):
    print('training sub model {0}'.format(sub_model_idx + 1))
    sub_train_loader = TextReorderBatchLoader(
        training_data,
        PARAMS['batch_size'],
        **PARAMS['sent_reorder_loader_params'],
    )
    sub_val_loader = TextReorderBatchLoader(val_data, PARAMS['val_batch_size'])

    curr_md = sub_cfg['model_class'](*sub_cfg['construct_args']).to(PARAMS['device'])
    curr_tk = sub_cfg['tokenizer_class'].from_pretrained(sub_cfg['tokenizer_path'])

    curr_pred_params = sub_cfg['pred_args']

    curr_opt = torch.optim.AdamW(
        curr_md.parameters(),
        lr=sub_cfg['lr'],
        weight_decay=sub_cfg['weight_decay']
    )

    curr_model_epochs = sub_cfg['epochs']
    curr_add_param = sub_cfg['add_model_params']
    curr_sample_decay = sub_cfg['random_sample_decay']

    for epoch in range(1, curr_model_epochs + 1):
        prog_bar = ProgressBar(len(sub_train_loader), 100)
        prog_bar.print_init()
        print('epoch {0} / {1}'.format(epoch, curr_model_epochs))

        # Per-epoch losses; reset at the start of every epoch.
        loss_stats = {
            'train_md': [],
            'val_md': []
        }

        # Only the sub-model being fitted is put in training mode; the main
        # model stays in eval mode throughout.
        model.eval()
        curr_md.train()
        for text_train, score_train, std_train in sub_train_loader:
            torch.cuda.empty_cache()
            curr_opt.zero_grad()
            text_train_vec, text_train_msk = _encode_batch(
                curr_tk, text_train, PARAMS['device']
            )

            score_train = torch.tensor(score_train).to(PARAMS['device'])
            std_train = torch.tensor(std_train).to(PARAMS['device'])

            # Zero-mean Gaussian noise using the per-sample std yielded by the
            # loader, scaled by the sub-model's 'random_sample_decay'; it is
            # added to the training target below.
            d_score_train = torch.normal(
                torch.zeros(std_train.shape).to(PARAMS['device']),
                std_train
            ) * curr_sample_decay

            train_loss_md = model.next_model_loss(
                text_train,
                curr_md,
                text_train_vec,
                text_train_msk,
                score_train + d_score_train
            )
            train_loss_md.backward()
            curr_opt.step()

            loss_stats['train_md'].append(train_loss_md.detach().cpu().numpy())

            prog_bar.update(score_train.shape[0])

        model.eval()
        curr_md.eval()
        with torch.no_grad():
            # The std yielded by the loader is not needed for validation.
            for text_val, score_val, std_val in sub_val_loader:
                torch.cuda.empty_cache()
                text_val_vec, text_val_msk = _encode_batch(
                    curr_tk, text_val, PARAMS['device']
                )
                score_val = torch.tensor(score_val).to(PARAMS['device'])

                val_loss_md = model.next_model_loss(
                    text_val,
                    curr_md,
                    text_val_vec,
                    text_val_msk,
                    score_val
                )
                loss_stats['val_md'].append(val_loss_md.detach().cpu().numpy())

        print()

        for loss_name in loss_stats:
            print('{0}: {1}'.format(loss_name, np.mean(loss_stats[loss_name])))

    # This sub-model is done training here: drop the gradients left by the last
    # step and the optimizer (with its per-parameter state) so that memory is
    # available before the next checkpoint is loaded.
    for trained_param in curr_md.parameters():
        trained_param.grad = None
    curr_opt = None

    model.add_model(
        curr_md,
        curr_pred_params,
        curr_tk,
        curr_add_param
    )
    print()
        
    

In [None]:
# Forward the configured 'on_finish_params' positionally to last_model_added.
# NOTE(review): presumably this finalises the main model once all sub-models
# have been added -- confirm against the method.
finish_args = PARAMS['on_finish_params']
model.last_model_added(*finish_args)

In [None]:
# can be different from the ones in training

def validation_loss_func(scr1, scr2):
    """Return the root-mean-square difference between two score tensors."""
    return torch.sqrt(torch.mean(torch.pow(scr2 - scr1, 2)))


# Per-sample copies of the loss of the batch each sample belonged to.
validation_results = []

# Batch size equals the size of the validation set.
val_loader = TextReorderBatchLoader(val_data, len(val_data))

base_scores = []   # reference scores yielded by the loader
pred_scores = []   # model outputs for the same samples
pred_losses = []   # absolute error per sample

model.eval()
with torch.no_grad():
    # The std yielded by the loader is not needed here.
    for text_val, score_val, std_val in val_loader:
        torch.cuda.empty_cache()
        score_val = torch.tensor(score_val).to(PARAMS['device'])

        score_pred_val = model(text_val)
        val_loss = validation_loss_func(score_val, score_pred_val)

        validation_results.extend([val_loss.detach().cpu().numpy()] * score_val.shape[0])

        base_scores.extend(
            score_val.detach().cpu().numpy()
        )
        pred_scores.extend(
            score_pred_val.detach().cpu().numpy()
        )
        pred_losses.extend(
            torch.abs(score_val - score_pred_val).detach().cpu().numpy()
        )

# RMSE over every collected sample. Averaging per-batch RMSE values only equals
# this number when the loader yields a single batch, so the reported loss is
# computed from the samples directly and does not depend on the batching.
squared_errors = np.square(
    np.asarray(pred_scores, dtype=np.float64) - np.asarray(base_scores, dtype=np.float64)
)
print('final validation loss {0}'.format(np.sqrt(np.mean(squared_errors))))

fig, ax = plt.subplots()
ax.plot(base_scores, pred_scores, 'b.')
ax.set(xlabel='reference score', ylabel='predicted score',
       title='Validation: predicted vs. reference score')
plt.show()

fig, ax = plt.subplots()
ax.plot(base_scores, pred_losses, 'r.')
ax.set(xlabel='reference score', ylabel='absolute error',
       title='Validation: absolute error vs. reference score')
plt.show()

In [None]:
# Predict every entry of all_tests in fixed-size batches and write the
# (id, target) pairs to submission.csv.
model.eval()
test_preds = []
eval_batch_size = 10
with torch.no_grad():
    for test_idx in range(0, len(all_tests), eval_batch_size):
        torch.cuda.empty_cache()
        # Slice once; every entry unpacks into (id, text).
        test_batch = all_tests[test_idx: test_idx + eval_batch_size]
        test_ids = [t_id for t_id, _ in test_batch]
        test_strs = [t_txt for _, t_txt in test_batch]
        pred = model(test_strs).detach().cpu().numpy()
        test_preds.extend(
            {'id': test_id, 'target': target_value}
            for test_id, target_value in zip(test_ids, pred)
        )

# Writing the file needs no autograd context, so it happens after inference.
# newline='' is what the csv module requires for files handed to a writer;
# without it, extra carriage returns can appear on some platforms.
with open('submission.csv', 'w', newline='') as output_file:
    fieldnames = ['id', 'target']
    result_writer = csv.DictWriter(output_file, fieldnames=fieldnames)

    result_writer.writeheader()
    result_writer.writerows(test_preds)

In [None]:
!cat submission.csv