In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import random
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from joblib import dump, load
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
import gc
gc.enable()
import sys
sys.path.append('../input/senttrans/sentence-transformers-1.2.1')


from sentence_transformers import SentenceTransformer, util
#from sentence_transformers.util import semantic_search
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, exports ``PYTHONHASHSEED``, seeds NumPy
    and PyTorch (``torch.manual_seed`` and ``torch.cuda.manual_seed``), and
    switches cuDNN into deterministic mode.

    Args:
        seed: Integer seed handed to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The remaining seeders all take the seed as their single argument.
    for reseed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        reseed(seed)
    torch.backends.cudnn.deterministic = True

# Fixed seed applied to every random number generator via seed_everything.
SEED = 28
seed_everything(seed=SEED)
# Token cap per excerpt: predict_fast passes this to the tokenizer as
# `max_length` with truncation enabled.
MAX_LENGTH = 256

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU,
# and report which device was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has exactly ``n`` items except possibly the last one, which
    holds whatever remains. An empty ``lst`` yields nothing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))


In [None]:
def rms(y_actual, y_predicted):
  """Return the root-mean-squared error between targets and predictions.

  Args:
    y_actual: Ground-truth values (array-like).
    y_predicted: Predicted values with the same shape as ``y_actual``.

  Returns:
    The RMSE as a scalar. For multi-output inputs this is the unweighted mean
    of the per-output RMSEs, which is what ``squared=False`` returned.
  """
  # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
  # and removed in 1.6. Taking the root of the per-output MSE ourselves gives
  # the same number on every scikit-learn version.
  per_output_mse = mean_squared_error(y_actual, y_predicted, multioutput='raw_values')
  return np.mean(np.sqrt(per_output_mse))

In [None]:
from torch import nn
def predict_fast(model_name=None, data=None, init_model=None, tokenizer=None, num_labels=1, is_multilabel=False, output_logits=False, use_softmax=False, batch_size=32):
  """Run batched sequence-classification inference over ``data``.

  Args:
    model_name: Path or hub id of a checkpoint. When given, the tokenizer,
      config and model are loaded from it and ``init_model``/``tokenizer``
      are ignored.
    data: Sliceable sequence of texts (it is cut into batches with ``chunks``
      and each batch is handed to the tokenizer).
    init_model: Already-instantiated model, used only when ``model_name`` is
      falsy. It is moved to the inference device and put in eval mode.
    tokenizer: Tokenizer paired with ``init_model``.
    num_labels: Size of the classification head requested from the config.
    is_multilabel: With ``output_logits=False``, return the arg-max class
      index per sample instead of the raw scores.
    output_logits: Keep the per-class scores even when ``is_multilabel``.
    use_softmax: Apply a softmax over the class axis before returning scores.
    batch_size: Number of texts per forward pass (default 32).

  Returns:
    A list with exactly one entry per input text: a float for a single-output
    head, a list of per-class scores for a multi-class head, or a class index
    when ``is_multilabel and not output_logits``.
  """
  if model_name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
  else:
    model = init_model
  # Fall back to the CPU instead of crashing when no GPU is attached.
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  model.to(device)
  model.eval()
  y_pred = []
  for batch in tqdm(chunks(data, batch_size)):
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
    inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device)
    }
    with torch.no_grad():
      outputs = model(**inputs)
    scores = outputs[0]
    if use_softmax:
      scores = nn.functional.softmax(scores, dim=-1)
    scores = scores.detach().cpu().numpy()
    if is_multilabel and not output_logits:
      y_pred.extend(np.argmax(scores, axis=-1))
    else:
      # Only drop the class axis of a single-output head. A blanket squeeze()
      # would also drop the batch axis when the batch holds one sample, which
      # either raised on extend() or flattened one row of class scores into
      # several separate entries.
      if scores.ndim > 1 and scores.shape[-1] == 1:
        scores = scores.squeeze(-1)
      y_pred.extend(scores.tolist())
  # One collection per call is enough; a full collection after every batch
  # only slows inference down.
  gc.collect()
  return y_pred

In [None]:
def postprocess_predictions(predictions, bin_predictions, bin_averages, threshold=0.58):
  """Pull regression outputs towards their predicted bin's average.

  A prediction is replaced by the mean of itself and its bin average when it
  lies more than 0.5 away from that average AND the bin classifier's highest
  class probability exceeds ``threshold``. Otherwise it is kept unchanged.

  Args:
    predictions: Sequence of scalar regression predictions.
    bin_predictions: Per-sample pairs ``(bin_index, class_probabilities)``.
    bin_averages: Sequence mapping a bin index to that bin's average target.
    threshold: Minimum top-class probability required to trust the bin.

  Returns:
    A list of adjusted predictions, one per input prediction.
  """
  new_predictions = []
  for idx, p in enumerate(predictions):
    bin_average = bin_averages[bin_predictions[idx][0]]
    # Confidence is the highest class probability. The previous code compared
    # np.argmax (a class index) with the threshold, so the check only tested
    # whether the predicted bin index was non-zero.
    if abs(p - bin_average) > 0.5 and np.max(bin_predictions[idx][1]) > threshold:
      new_predictions.append(np.mean([p, bin_average]))
    else:
      new_predictions.append(p)
  return new_predictions

In [None]:
def perform_bin_postprocessing(predictions, bin_dirs, averages, data):
    """Adjust regression predictions using an ensemble of 8-way bin classifiers.

    Args:
        predictions: Scalar regression predictions, one per item in ``data``.
        bin_dirs: Checkpoint paths of the bin classifiers (loaded with
            ``num_labels=8``).
        averages: Sequence mapping a bin index to that bin's average target.
        data: Texts to classify, passed straight to ``predict_fast``.

    Returns:
        The list produced by ``postprocess_predictions``.
    """
    bin_preds = [
        np.array(predict_fast(bin_dir, data, use_softmax=True, is_multilabel=True, num_labels=8, output_logits=True))
        for bin_dir in bin_dirs
    ]
    # Stack on a NEW leading axis -> (n_models, n_samples, 8), then average
    # over the models. np.vstack concatenated along the sample axis, so the
    # mean collapsed every sample into one 8-vector and the per-sample bins
    # and probabilities were lost.
    final_preds = np.mean(np.stack(bin_preds), axis=0)
    final_bins = [np.argmax(p) for p in final_preds]
    zipped = list(zip(final_bins, final_preds))
    return postprocess_predictions(predictions, zipped, averages)

In [None]:
def make_ridge_predictions(df, ridge_dirs, model_dirs, model_bin_dirs):
  """Stack a regressor and a 4-way classifier per model through a saved estimator.

  For position ``i`` the features are the classifier scores from
  ``model_bin_dirs[i]`` (``num_labels=4``) followed by the regression output
  of ``model_dirs[i]``. They are fed to the joblib estimator stored at
  ``ridge_dirs[i]``. The per-model outputs are averaged.

  Args:
    df: DataFrame whose ``excerpt`` column holds the texts.
    ridge_dirs: Paths of the joblib-serialised stacking estimators.
    model_dirs: Paths of the regression checkpoints.
    model_bin_dirs: Paths of the classification checkpoints.

  Returns:
    Array with the mean stacked prediction per excerpt.
  """
  excerpts = list(map(str, df.excerpt.values))
  stacked_outputs = []
  for position, regressor_dir in enumerate(model_dirs):
    class_scores = predict_fast(model_name=model_bin_dirs[position], data=excerpts, is_multilabel=True, num_labels=4, output_logits=True)
    regression_out = predict_fast(regressor_dir, excerpts)
    # Columns: class scores first, raw regression output last.
    features = np.column_stack([np.array(class_scores), np.array(regression_out)])
    stacker = load(ridge_dirs[position])
    stacked_outputs.append(stacker.predict(features))
  return np.mean(np.vstack(stacked_outputs), axis=0)

In [None]:
def make_ensembler_predictions(fold_predictions, ensembler_dirs, return_mean=True):
  """Apply one saved stacking estimator per fold to that fold's base predictions.

  Args:
    fold_predictions: One entry per fold; each entry is a sequence of arrays
      that are column-stacked into the estimator's feature matrix.
    ensembler_dirs: Paths of the joblib-serialised estimators, aligned with
      ``fold_predictions`` by position.
    return_mean: When true, average the per-fold outputs into one array;
      otherwise return the list of per-fold outputs.

  Returns:
    Either the fold-averaged prediction array or the list of per-fold arrays.
  """
  per_fold = [
      load(ensembler_dirs[fold_idx]).predict(np.column_stack(base_preds))
      for fold_idx, base_preds in enumerate(fold_predictions)
  ]
  if not return_mean:
    return per_fold
  return np.mean(np.vstack(per_fold), axis=0)

In [None]:
def predict_mean(data, models):
  """Average the ``predict_fast`` outputs of several checkpoints.

  Args:
    data: Texts passed to ``predict_fast`` for every checkpoint.
    models: Iterable of checkpoint paths / model names.

  Returns:
    Array holding, for each text, the mean prediction across ``models``.
  """
  stacked = np.vstack([predict_fast(model_name=checkpoint, data=data) for checkpoint in models])
  return np.mean(stacked, axis=0)

In [None]:
from itertools import chain
def predict_bi_encoder(model_dir, train_tx, val_tx, train_scores, top_k=1):
  """Build nearest-neighbour features for ``val_tx`` from a bi-encoder.

  Both text sets are embedded with the SentenceTransformer stored at
  ``model_dir``. For every validation text the ``top_k`` most similar training
  texts are retrieved with ``util.semantic_search``.

  Args:
    model_dir: Path of the SentenceTransformer model.
    train_tx: Corpus texts, aligned with ``train_scores`` by position.
    val_tx: Query texts to build features for.
    train_scores: Target value of each corpus text.
    top_k: Number of neighbours retrieved per query.

  Returns:
    One list per query, laid out as
    ``[score_1, target_1, score_2, target_2, ...]`` in the order the hits are
    returned by ``semantic_search``.
  """
  model = SentenceTransformer(model_dir)

  train_encodings = model.encode(train_tx)
  val_encodings = model.encode(val_tx)
  hits = util.semantic_search(val_encodings, train_encodings, top_k=top_k)

  # Interleave (similarity, neighbour target) pairs into one flat feature row.
  results = [
      list(chain.from_iterable((h['score'], train_scores[h['corpus_id']]) for h in hit))
      for hit in hits
  ]
  # Collect once per call. A full collection after every query row made the
  # loop far slower without freeing anything extra.
  gc.collect()
  return results

In [None]:
# Load the competition's train and test tables from the read-only input directory.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [None]:
# Materialise the columns as plain Python lists: texts as str, targets as float.
train_tx = list(map(str, train_df.excerpt.values))
train_sc = list(map(float, train_df.target.values))
test_values = list(map(str, test_df.excerpt.values))

In [None]:
# One joblib-serialised stacking estimator per fold (folds 0-5), consumed by
# make_ensembler_predictions together with `fold_predictions`.
ensembler_dirs = [
    f'../input/electraensembling/electra_larger_ensemble/model_fold_{fold}/ridge_model.joblib'
    for fold in range(6)
]

In [None]:
# Nearest-neighbour features (top 20 training excerpts per test excerpt) from
# each of the six bi-encoder folds, evaluated in fold order.
bi_deb_0, bi_deb_1, bi_deb_2, bi_deb_3, bi_deb_4, bi_deb_5 = (
    predict_bi_encoder(fold_dir, train_tx, test_values, train_sc, top_k=20)
    for fold_dir in (
        '../input/bideberta/model_fold_0-20210731T211557Z-001/model_fold_0',
        '../input/bideberta/model_fold_1-20210731T202223Z-001/model_fold_1',
        '../input/bideberta/model_fold_2-20210731T211554Z-001/model_fold_2',
        '../input/bideberta/model_fold_3-20210731T211551Z-001/model_fold_3',
        '../input/bideberta/model_fold_4-20210731T204824Z-001/model_fold_4',
        '../input/bideberta/model_fold_5-20210731T202235Z-001/model_fold_5',
    )
)

In [None]:
# One joblib-serialised estimator per fold (folds 0-5), applied to the
# bi-encoder features in `bi_fold`.
bi_ensemble = [
    f'../input/ensemble-bayes/ensembler_bayes/model_fold_{fold}/ridge_model.joblib'
    for fold in range(6)
]

In [None]:
# make_ensembler_predictions expects a sequence of feature blocks per fold, so
# each fold's bi-encoder features are wrapped in a one-element list.
bi_fold = [
    [fold_features]
    for fold_features in (bi_deb_0, bi_deb_1, bi_deb_2, bi_deb_3, bi_deb_4, bi_deb_5)
]

In [None]:
# Fold-averaged output of the per-fold estimators applied to the bi-encoder features.
bi_res = make_ensembler_predictions(bi_fold, bi_ensemble, return_mean=True)

In [None]:
# One joblib-serialised estimator per fold (folds 0-5), applied to the
# seven-model feature rows in `fold_predictions_2`.
ensembler_dirs_2 = [
    f'../input/kernelridge/ensembler_kernelridge_all/model_fold_{fold}/ridge_model.joblib'
    for fold in range(6)
]



In [None]:
# Fold 0: score the test excerpts with each fold-0 checkpoint. Each call loads
# the checkpoint with the default single-output head and returns one value per excerpt.
albi_preds_0 = predict_fast('../input/albertxxlargelowlr/model_fold_0-20210709T232146Z-001/model_fold_0/best', test_values)
alb_preds_0 = predict_fast('../input/albertxxlarge2models/model_fold_0-20210706T104908Z-001/model_fold_0/best', test_values)
deberta_preds_0 = predict_fast('../input/debertalarge/model_fold_0-20210707T114802Z-001/model_fold_0/best', test_values)
deb_preds_0 = predict_fast('../input/debertalargelowlr/model_fold_0-20210709T171339Z-001/model_fold_0/best', test_values)
ro_predictions_0 = predict_fast('../input/robertalargetwomodels/model_fold_0-20210705T095604Z-001/model_fold_0/best', test_values)
electra_preds_0 = predict_fast('../input/electralarge/model_fold_0-20210711T074853Z-001/model_fold_0/best', test_values)
rob_predictions_0 = predict_fast('../input/rolargef0/model_fold_0/best', test_values)

In [None]:
# Fold 1: score the test excerpts with each fold-1 checkpoint (one value per excerpt).
deberta_preds_1 = predict_fast('../input/debertalarge/model_fold_1-20210707T114805Z-001/model_fold_1/best', test_values)
albi_preds_1 = predict_fast('../input/albertxxlargelowlr/model_fold_1-20210709T232149Z-001/model_fold_1/best', test_values)
deb_preds_1 = predict_fast('../input/debertalargelowlr/model_fold_1-20210709T171343Z-001/model_fold_1/best', test_values)
ro_predictions_1 = predict_fast('../input/robertalargetwomodels/model_fold_1-20210705T101950Z-001/model_fold_1/best', test_values)
alb_preds_1 = predict_fast('../input/albertxxlarge2models/model_fold_1-20210706T104911Z-001/model_fold_1/best', test_values)
electra_preds_1 = predict_fast('../input/electralarge/model_fold_1-20210711T074858Z-001/model_fold_1/best', test_values)
rob_predictions_1 = predict_fast('../input/roblargeaugf1/model_fold_1/best', test_values)


In [None]:
# Fold 2: score the test excerpts with each fold-2 checkpoint (one value per excerpt).
deberta_preds_2 = predict_fast('../input/debertalargept2/model_fold_2-20210708T070250Z-001/model_fold_2/best', test_values)
ro_predictions_2 = predict_fast('../input/robertalargetwomodels/model_fold_2-20210705T104857Z-001/model_fold_2/best', test_values)
albi_preds_2 = predict_fast('../input/albertxxlargelowlr/model_fold_2-20210709T232151Z-001/model_fold_2/best', test_values)
deb_preds_2 = predict_fast('../input/debertalargelowlr/model_fold_2-20210709T171347Z-001/model_fold_2/best', test_values)
alb_preds_2 = predict_fast('../input/albertxxlarge2models/model_fold_2-20210706T110402Z-001/model_fold_2/best', test_values)
electra_preds_2 = predict_fast('../input/electralarge/model_fold_2-20210711T074904Z-001/model_fold_2/best', test_values)
rob_predictions_2 = predict_fast('../input/rolargef2/model_fold_2/best', test_values)




In [None]:
# Fold 3: score the test excerpts with each fold-3 checkpoint (one value per excerpt).
alb_preds_3 = predict_fast('../input/albertxxlarge2modelspt2/model_fold_3-20210706T172206Z-001/model_fold_3/best', test_values)
deberta_preds_3 = predict_fast('../input/debertalargept2/model_fold_3-20210708T070253Z-001/model_fold_3/best', test_values)
albi_preds_3 = predict_fast('../input/albertxxlargelowlr/model_fold_3-20210709T232154Z-001/model_fold_3/best', test_values)
deb_preds_3 = predict_fast('../input/debertalargelowlr/model_fold_3-20210709T171354Z-001/model_fold_3/best', test_values)
ro_predictions_3 = predict_fast('../input/robertalargetwomodels/model_fold_3-20210705T112821Z-001/model_fold_3/best', test_values)
electra_preds_3 = predict_fast('../input/electralarge/model_fold_3-20210711T074906Z-001/model_fold_3/best', test_values)
# NOTE(review): this loads '../input/rolargef0/model_fold_0/best', the very same
# checkpoint as rob_predictions_0 in the fold-0 cell, while every other fold
# points at its own fold directory. Looks like a copy-paste slip: confirm
# whether a fold-3 checkpoint was intended before changing the path.
rob_predictions_3 = predict_fast('../input/rolargef0/model_fold_0/best', test_values)

In [None]:
# Fold 4: score the test excerpts with each fold-4 checkpoint (one value per excerpt).
ro_predictions_4 = predict_fast('../input/robertalargetwomodels/model_fold_4-20210705T124132Z-001/model_fold_4/best', test_values)
alb_preds_4 = predict_fast('../input/albertxxlarge2modelspt2/model_fold_4-20210706T172209Z-001/model_fold_4/best', test_values)
deberta_preds_4 = predict_fast('../input/debertalargept2/model_fold_4-20210708T070257Z-001/model_fold_4/best', test_values)
deb_preds_4 = predict_fast('../input/debertalargelowlr/model_fold_4-20210709T183816Z-001/model_fold_4/best', test_values)
rob_predictions_4 = predict_fast('../input/roblargeaugf4/model_fold_4/best', test_values)
albi_preds_4 = predict_fast('../input/albertxxlargelowlr/model_fold_4-20210709T232157Z-001/model_fold_4/best', test_values)
electra_preds_4 = predict_fast('../input/electralarge/model_fold_4-20210711T074913Z-001/model_fold_4/best', test_values)



In [None]:


# Fold 5: score the test excerpts with each fold-5 checkpoint (one value per excerpt).
albi_preds_5 = predict_fast('../input/albertxxlargelowlr/model_fold_5-20210709T232201Z-001/model_fold_5/best', test_values)
deb_preds_5 = predict_fast('../input/debertalargelowlr/model_fold_5-20210709T183818Z-001/model_fold_5/best', test_values)
ro_predictions_5 = predict_fast('../input/robertalargetwomodels/model_fold_5-20210705T124258Z-001/model_fold_5/best', test_values)
rob_predictions_5 = predict_fast('../input/roblargef5/model_fold_5/best', test_values)
deberta_preds_5 = predict_fast('../input/debertalargept2/model_fold_5-20210708T081109Z-001/model_fold_5/best', test_values)
alb_preds_5 = predict_fast('../input/albertxxlarge2modelspt2/model_fold_5-20210706T172212Z-001/model_fold_5/best', test_values)
electra_preds_5 = predict_fast('../input/electralarge/model_fold_5-20210711T074916Z-001/model_fold_5/best', test_values)




In [None]:


# One row per fold holding six base-model prediction arrays. The column order
# within a row is the feature order fed to the matching estimator in
# `ensembler_dirs`.
fold_predictions = [
    [np.array(model_preds) for model_preds in fold_outputs]
    for fold_outputs in (
        (alb_preds_0, deberta_preds_0, albi_preds_0, deb_preds_0, ro_predictions_0, electra_preds_0),
        (alb_preds_1, deberta_preds_1, albi_preds_1, deb_preds_1, ro_predictions_1, electra_preds_1),
        (alb_preds_2, deberta_preds_2, albi_preds_2, deb_preds_2, ro_predictions_2, electra_preds_2),
        (alb_preds_3, deberta_preds_3, albi_preds_3, deb_preds_3, ro_predictions_3, electra_preds_3),
        (alb_preds_4, deberta_preds_4, albi_preds_4, deb_preds_4, ro_predictions_4, electra_preds_4),
        (alb_preds_5, deberta_preds_5, albi_preds_5, deb_preds_5, ro_predictions_5, electra_preds_5),
    )
]



In [None]:
# Same layout as `fold_predictions`, with the extra `rob_predictions_*` model
# as the first column. These rows are fed to the estimators in
# `ensembler_dirs_2`.
fold_predictions_2 = [
    [np.array(model_preds) for model_preds in fold_outputs]
    for fold_outputs in (
        (rob_predictions_0, alb_preds_0, deberta_preds_0, albi_preds_0, deb_preds_0, ro_predictions_0, electra_preds_0),
        (rob_predictions_1, alb_preds_1, deberta_preds_1, albi_preds_1, deb_preds_1, ro_predictions_1, electra_preds_1),
        (rob_predictions_2, alb_preds_2, deberta_preds_2, albi_preds_2, deb_preds_2, ro_predictions_2, electra_preds_2),
        (rob_predictions_3, alb_preds_3, deberta_preds_3, albi_preds_3, deb_preds_3, ro_predictions_3, electra_preds_3),
        (rob_predictions_4, alb_preds_4, deberta_preds_4, albi_preds_4, deb_preds_4, ro_predictions_4, electra_preds_4),
        (rob_predictions_5, alb_preds_5, deberta_preds_5, albi_preds_5, deb_preds_5, ro_predictions_5, electra_preds_5),
    )
]

In [None]:
# Single checkpoint outside the per-fold stacks; presumably trained on all the
# training data, judging by the directory name (confirm).
alball_preds = predict_fast('../input/albertall/albert-xxlarge-all-data', test_values)

In [None]:
# Six more DeBERTa checkpoints (the dataset names suggest bootstrap-trained
# models; confirm), one per fold.
deb_bs_0 = predict_fast('../input/debertabootstrap/model_fold_0-20210728T204503Z-001/model_fold_0/best', test_values)
deb_bs_1 = predict_fast('../input/debertabootstrap/model_fold_1-20210728T204507Z-001/model_fold_1/best', test_values)
deb_bs_2 = predict_fast('../input/debertabspt2/model_fold_2-20210729T062046Z-001/model_fold_2/best', test_values)
deb_bs_3 = predict_fast('../input/debertabspt2/model_fold_3-20210729T062049Z-001/model_fold_3/best', test_values)
deb_bs_4 = predict_fast('../input/debertabspt2/model_fold_4-20210729T062054Z-001/model_fold_4/best', test_values)
deb_bs_5 = predict_fast('../input/debertabspt2/model_fold_5-20210729T062057Z-001/model_fold_5/best', test_values)
# Equal-weight average: each fold contributes 1/6.
bs_mean = np.array(deb_bs_0) * 1/6 + np.array(deb_bs_1) * 1/6 + np.array(deb_bs_2) * 1/6 + np.array(deb_bs_3) * 1/6 + np.array(deb_bs_4) * 1/6 + np.array(deb_bs_5) * 1/6

In [None]:
#mean_alb = np.array(alball_preds) * 0.55 + np.array(bs_mean) * 0.45

In [None]:
# Fold-averaged output of the six-model stack (return_mean defaults to True).
ensemble = make_ensembler_predictions(fold_predictions, ensembler_dirs)

In [None]:
# Fold-averaged output of the seven-model stack (return_mean defaults to True).
ensemble_2 = make_ensembler_predictions(fold_predictions_2, ensembler_dirs_2)
#mean = np.array(ensemble_2) * 0.6 + np.array(bi_res) * 0.4

In [None]:
#preds = np.array(mean) * 0.5 + np.array(mixed) * 0.5

In [None]:
# Final blend. Weights sum to 1: 2/8 six-model stack, 1/8 seven-model stack,
# 2/8 bi-encoder stack, 2/8 `alball_preds`, 1/8 `bs_mean`.
mean_ensemble = np.array(ensemble) * 2./8. + np.array(ensemble_2) * 1./8. + np.array(bi_res) * 2./8. + np.array(alball_preds) * 2./8. + np.array(bs_mean) * 1./8.

In [None]:
#### BOOTSTRAPPING ####

In [None]:
# Force a garbage-collection pass before building the submission.
gc.collect()

In [None]:
# Pair every test id with its blended prediction.
submission_df = pd.DataFrame({'id': test_df.id, 'target': mean_ensemble})


In [None]:
# Write the submission file to the working directory, without the index column.
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table as the cell output for a quick visual check.
submission_df