In [None]:
from IPython.display import clear_output
! pip install datasets
! pip install transformers[torch]
! pip install bayesian-optimization
! pip install cvxopt
! pip install qpsolvers==3.4.0
! pip install accelerate==0.20.3 -U
! pip install git+https://github.com/simonzhang00/ripser-plusplus.git
! pip install git+https://github.com/IlyaTrofimov/RTD.git
clear_output()

In [None]:
import warnings
from itertools import combinations

import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive. Later cells read checkpoints and CSV
# files through the relative paths "gdrive/MyDrive/..." and "./gdrive/MyDrive/...",
# so they presumably expect the working directory to be /content -- TODO confirm.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# load the data
# Seed first so that everything below runs from a fixed RNG state.
set_seed(42)
imdb = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples, with truncation enabled."""
    return tokenizer(examples["text"], truncation=True)
# batched=True: preprocess_function is called on batches of examples rather than one at a time.
tokenized_imdb = imdb.map(preprocess_function, batched=True)
# No padding is requested during tokenization above; the collator built here is
# handed to every Trainer created by load_trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Clear the cell output produced by the steps above.
clear_output()

In [None]:
def load_trainer(number, seed):
    """Load one fine-tuned checkpoint from Google Drive and wrap it in a ``Trainer``.

    Args:
        number: Zero-based index of the ensemble member. The checkpoint is read
            from ``gdrive/MyDrive/results_bert_<number + 1>/`` (a relative path,
            so it is resolved against the current working directory).
        seed: Random seed forwarded to ``TrainingArguments``.

    Returns:
        A ``(trainer, model)`` tuple: the ``Trainer`` built around the loaded
        model, and the model itself.
    """
    # Bug fix: the directory used to be derived from a module-level variable
    # named `i` instead of the `number` argument, so the function only worked
    # when the caller's loop variable happened to be called `i`.
    checkpoint_dir = "gdrive/MyDrive/results_bert_" + str(number + 1) + "/"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        seed=seed,
    )

    # The datasets, tokenizer and collator are the module-level objects created
    # by the data-loading cell.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    return trainer, model

In [None]:
# One seed per ensemble member; seeds[i] is forwarded to load_trainer for member i.
seeds = [42, 109, 121, 122, 123]
trainers = []
models = []
# NOTE(review): keep the loop variable named `i` unless load_trainer is confirmed
# to build its checkpoint path from its own `number` argument rather than from a
# module-level `i`.
for i in range(5):
    t, m = load_trainer(i, seeds[i])
    trainers.append(t)
    models.append(m)

In [None]:
# Split the tokenized IMDB "test" set in half. No seed is passed to
# train_test_split, so the resulting split presumably depends on the RNG state
# at the time this cell runs -- TODO confirm it matches the saved prediction files.
dataset = tokenized_imdb["test"].train_test_split(test_size=0.5)
# train: the original "train" split; val / test: the two halves created above.
# None of these three names is referenced again in the cells visible here.
train, val, test = tokenized_imdb["train"], dataset['train'], dataset['test']

In [None]:
import numpy as np

# Reference labels and the stored outputs of the five models, read from the
# ens_predictions folder on Drive (prediction_1.csv ... prediction_5.csv).
true_labels = np.genfromtxt("./gdrive/MyDrive/ens_predictions/true_labels.csv", delimiter=',')
single_predictions = np.array([
    np.genfromtxt("./gdrive/MyDrive/ens_predictions/prediction_" + str(i + 1) + ".csv", delimiter=',')
    for i in range(5)
])

# Two model-by-model matrices stored next to the first checkpoint; they are
# indexed by model number in get_corr_rtd / get_corr_res.
corr_matrix_rtd = np.genfromtxt("./gdrive/MyDrive/results_bert_1/correlation_rtd.csv", delimiter=',')
corr_matrix_res = np.genfromtxt("./gdrive/MyDrive/results_bert_1/correlation_big.csv", delimiter=',')

In [None]:
from sklearn.metrics import auc, accuracy_score
def accuracy_rejection(N, step, probs, true_labels, predicted_labels):
    """Area under the accuracy-versus-rejection-rate curve.

    Samples are ranked by ``probs`` in descending order. For every rejection
    count ``k = step, 2 * step, ... < N`` the ``N - k`` highest-ranked samples
    are kept and accuracy is measured on them; the point for rate 0 uses all
    samples.

    Args:
        N: Total number of samples (assumed to equal ``len(probs)`` -- TODO confirm).
        step: Increment of the rejection count between curve points.
        probs: Per-sample scores used for ranking; lowest scores are rejected first.
        true_labels: Array of reference labels (must support integer-array indexing).
        predicted_labels: Array of predicted labels (must support integer-array indexing).

    Returns:
        ``auc(rejection_rates, accuracies)``.
    """
    ranked = np.argsort(probs)[::-1]
    rejected_counts = range(step, N, step)

    rates = [0] + [count / N for count in rejected_counts]
    accuracies = [accuracy_score(true_labels, predicted_labels)]
    for count in rejected_counts:
        kept = ranked[:N - count]
        accuracies.append(accuracy_score(true_labels[kept], predicted_labels[kept]))

    return auc(rates, accuracies)

In [None]:
# For every loaded model, copy the weight matrix of the attention output
# projection (out_lin) of transformer layer index 5 into a NumPy array.
attention_weights = [
    model.distilbert.transformer.layer[5].attention.out_lin.weight.cpu().detach().numpy()
    for model in models
]

def weighted_prediction(single_predictions, weights):
    """Blend per-model predictions with normalized weights.

    Args:
        single_predictions: Array-like indexed as ``[model, sample, class]``;
            only class columns 0 and 1 are used.
        weights: NumPy array with one weight per model; it is divided by its
            sum before blending (the caller's array is not modified).

    Returns:
        Array of shape ``(n_samples, 2)`` holding the weighted sum over models
        for class 0 and class 1.
    """
    normalized = weights / weights.sum()
    stacked = np.array(single_predictions)
    class0 = np.dot(normalized, stacked[:, :, 0])
    class1 = np.dot(normalized, stacked[:, :, 1])
    return np.vstack((class0, class1)).T

# Size of the model pool and of each candidate ensemble. These used to be
# hard-coded as five nested loops, which had to be rewritten by hand to try a
# different ensemble size. NOTE: black_box_function takes exactly five weights,
# so ENSEMBLE_SIZE cannot be changed without adapting it as well.
N_MODELS = 5
ENSEMBLE_SIZE = 5

# Every strictly increasing combination of model indices, in lexicographic
# order. Each entry is a list (not a tuple) because it is later used to
# fancy-index single_predictions; a tuple would be read as a multi-axis index.
pairs = [list(members) for members in combinations(range(N_MODELS), ENSEMBLE_SIZE)]

def corr_based_posterior_variance(corr_matrix, weights):
    """Evaluate the quadratic form ``w @ corr_matrix @ w.T`` for a weight vector.

    Args:
        corr_matrix: Square matrix with one row/column per weight.
        weights: One-dimensional weight vector.

    Returns:
        The scalar value of the quadratic form.
    """
    row = np.expand_dims(weights, axis=0)  # shape (1, n)
    quadratic_form = row.dot(corr_matrix).dot(row.T)  # shape (1, 1)
    return quadratic_form[0][0]

def _select_submatrix(matrix, pair):
    """Return the rows and columns of ``matrix`` listed in ``pair`` as a new array."""
    # np.ix_ builds the open mesh so that result[a, b] == matrix[pair[a], pair[b]].
    return matrix[np.ix_(pair, pair)]


def get_corr_rtd(pair):
    """Sub-matrix of the module-level ``corr_matrix_rtd`` for the models in ``pair``.

    Args:
        pair: Sequence of integer model indices.

    Returns:
        Array of shape ``(len(pair), len(pair))``.
    """
    return _select_submatrix(corr_matrix_rtd, pair)


def get_corr_res(pair):
    """Sub-matrix of the module-level ``corr_matrix_res`` for the models in ``pair``.

    Args:
        pair: Sequence of integer model indices.

    Returns:
        Array of shape ``(len(pair), len(pair))``.
    """
    return _select_submatrix(corr_matrix_res, pair)

In [None]:
from bayes_opt import BayesianOptimization, UtilityFunction
from cvxopt import matrix, solvers
from qpsolvers import solve_qp
import rtd

def black_box_function(w1, w2, w3, w4, w5):
    """Accuracy of the five-member ensemble for one set of raw weights.

    The ensemble members are taken from the module-level ``single_predictions``
    using the module-level index list ``pair``; labels come from the
    module-level ``true_labels``.

    Args:
        w1, w2, w3, w4, w5: Raw (unnormalized) member weights; they are divided
            by their sum before blending.

    Returns:
        Accuracy of the arg-max of the blended class-0 / class-1 scores.
    """
    raw = np.array([w1, w2, w3, w4, w5])
    mix = raw / raw.sum()
    members = np.array(single_predictions[pair])
    per_class = [np.dot(mix, members[:, :, column]) for column in (0, 1)]
    blended = np.vstack(per_class).T
    return accuracy_score(np.argmax(blended, axis=1), true_labels)

def _solve_min_quadratic_weights(corr_matrix):
    """Minimize ``w^T corr_matrix w`` subject to ``sum(w) == 1`` and ``0 <= w <= 1``.

    Returns the solution as a tuple with one weight per row of ``corr_matrix``.
    Raises ``ValueError`` when the solver hands back no solution.
    """
    n_members = corr_matrix.shape[0]
    x_sol = solve_qp(
        2 * corr_matrix,
        np.zeros_like(corr_matrix[:, :1]),
        A=np.ones((1, n_members)),
        b=np.ones(1),
        lb=np.zeros(n_members),
        ub=np.ones(n_members),
        # Start from uniform weights (0.2 each for five members).
        initvals=np.ones(n_members) / n_members,
        solver='cvxopt',
        verbose=True,
    )
    if x_sol is None:
        # Previously a missing solution surfaced as an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise ValueError("solve_qp did not return a solution for the given correlation matrix")
    return tuple(x_sol)


def get_opt_weights(method, pair):
    """Compute ensemble weights for the models in ``pair`` with the chosen strategy.

    Args:
        method: ``'accuracy'`` runs Bayesian optimization of ensemble accuracy
            against the module-level ``true_labels``; ``'attention'`` solves a
            quadratic program on ``get_corr_rtd(pair)``; any other value solves
            the same quadratic program on ``get_corr_res(pair)``.
        pair: List of model indices forming the ensemble.

    Returns:
        Tuple with one weight per member of ``pair`` (five values for a
        five-member ensemble), in the order of ``pair``.

    Raises:
        ValueError: If the quadratic-program solver returns no solution.
    """
    if method == 'accuracy':
        names = ['w' + str(k + 1) for k in range(len(pair))]
        # Bug fix: the objective is built from the `pair` argument. It used to
        # be the module-level black_box_function, which reads a global named
        # `pair` and therefore ignored the argument passed here.
        member_predictions = np.array(single_predictions[pair])

        def objective(**raw_weights):
            weights = np.array([raw_weights[name] for name in names])
            weights = weights / weights.sum()
            ensemble_predictions = np.vstack((
                np.dot(weights, member_predictions[:, :, 0]),
                np.dot(weights, member_predictions[:, :, 1]),
            )).T
            return accuracy_score(np.argmax(ensemble_predictions, axis=1), true_labels)

        pbounds = {name: [0.0, 1.0] for name in names}
        optimizer = BayesianOptimization(f = objective,
                                        pbounds = pbounds, verbose = 5,
                                        random_state = 4)
        optimizer.maximize(init_points = 5, n_iter = 100)
        ws = optimizer.max["params"]
        return tuple(ws[name] for name in names)

    # Both remaining strategies solve the same quadratic program and differ
    # only in which matrix they read. Unknown method names fall through to the
    # `corr_matrix_res` variant, as before.
    if method == 'attention':
        corr_matrix = get_corr_rtd(pair)
    else:
        corr_matrix = get_corr_res(pair)
    return _solve_min_quadratic_weights(corr_matrix)

In [None]:
# For every candidate ensemble, compute weights with each strategy, score the
# weighted ensemble against true_labels, and keep the best accuracy.
# NOTE(review): the 'accuracy' strategy tunes its weights against the same
# true_labels that are used for scoring here, so its figure is optimistic.
ans = []
opt_methods = ['accuracy', 'attention', 'results']
# The loop variable must stay named `pair`: black_box_function reads a
# module-level variable of that name.
for pair in pairs:
    accurs = []
    for opt_method in opt_methods:
        member_weights = np.array(get_opt_weights(opt_method, pair))
        current_predictions = weighted_prediction(np.array(single_predictions[pair]), member_weights)
        # Take the arg-max of the blended scores directly. The element-wise
        # sigmoid applied here before was monotonic, so it could not change the
        # arg-max except by collapsing saturated scores into ties.
        predicted_labels = np.argmax(current_predictions, axis=1)
        accurs.append(accuracy_score(true_labels, predicted_labels))
    # dtype=object is required: [float, list] is a ragged sequence, which NumPy
    # >= 1.24 refuses to convert implicitly (ValueError).
    ans.append(np.array([max(accurs), pair], dtype=object))
    print(ans[-1])

|   iter    |  target   |    w1     |    w2     |    w3     |    w4     |    w5     |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9362   [0m | [0m0.967    [0m | [0m0.5472   [0m | [0m0.9727   [0m | [0m0.7148   [0m | [0m0.6977   [0m |
| [0m2        [0m | [0m0.9354   [0m | [0m0.2161   [0m | [0m0.9763   [0m | [0m0.00623  [0m | [0m0.253    [0m | [0m0.4348   [0m |
| [0m3        [0m | [0m0.936    [0m | [0m0.7794   [0m | [0m0.1977   [0m | [0m0.863    [0m | [0m0.9834   [0m | [0m0.1638   [0m |
| [0m4        [0m | [0m0.9357   [0m | [0m0.5973   [0m | [0m0.008986 [0m | [0m0.3866   [0m | [0m0.04416  [0m | [0m0.9567   [0m |
| [95m5        [0m | [95m0.9363   [0m | [95m0.4361   [0m | [95m0.949    [0m | [95m0.7863   [0m | [95m0.8663   [0m | [95m0.1732   [0m |
| [95m6        [0m | [95m0.9368   [0m | [95m0.4742   [0m | [95m0.9403   [0m | [95m0.7261   [0m | [95

In [None]:
# Flatten every (best_accuracy, member_indices) record into one numeric row:
# the accuracy followed by *all* member indices. The previous version copied
# only the first four indices and silently dropped the rest.
ans = np.array([[record[0], *record[1]] for record in ans], dtype=float)

In [None]:
# Display the results array produced by the previous cell.
ans

array([[0.93712, 0.     , 1.     , 2.     , 3.     ],
       [0.93776, 0.     , 1.     , 2.     , 4.     ],
       [0.93744, 0.     , 1.     , 3.     , 4.     ],
       [0.93712, 0.     , 2.     , 3.     , 4.     ],
       [0.93736, 1.     , 2.     , 3.     , 4.     ]])

In [None]:
import pandas as pd
# First column is the accuracy; every remaining column holds one member index.
# The names are derived from the array width, because a fixed list of four
# names makes the DataFrame constructor fail whenever `ans` has a different
# number of columns.
member_columns = ['m' + str(k) for k in range(1, ans.shape[1])]
df = pd.DataFrame(ans, columns=['accur'] + member_columns)
df

Unnamed: 0,accur,m1,m2,m3
0,0.93616,0.0,1.0,2.0
1,0.9372,0.0,1.0,3.0
2,0.93656,0.0,1.0,4.0
3,0.9364,0.0,2.0,3.0
4,0.93632,0.0,2.0,4.0
5,0.93632,0.0,3.0,4.0
6,0.93656,1.0,2.0,3.0
7,0.93768,1.0,2.0,4.0
8,0.93656,1.0,3.0,4.0
9,0.93656,2.0,3.0,4.0
