In [None]:
# !pip install transformers

In [None]:
import json
import logging

import numpy as np

import math


import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.nn.functional as F
import torch.optim as optim

# import transformers
# from transformers import BartTokenizer, BartForConditionalGeneration

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# from matplotlib import pyplot

import time

In [None]:
# Pick the compute device: the first CUDA GPU when one is visible to PyTorch,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Colab-only step: mount Google Drive at /content/drive so the project folder
# stored on Drive can be reached from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder on Drive the working directory, so the relative
# paths used later in the notebook (the events JSON file, the saved
# checkpoints) resolve inside it.
import os
os.chdir("/content/drive/MyDrive/CS_NLP_BART")

In [None]:
class Random_atlas():
    """
    Source of random numbers backed by a pre-generated pool (the "atlas").

    A pool of `size` uniform samples from [0, 1) is drawn once with
    `np.random.rand` and handed out one value at a time; when the pool is
    used up a fresh one is drawn.
    """

    # Number of rejection-sampling draws tried before falling back to an
    # exhaustive enumeration of the values that are still allowed.
    MAX_RETRIES = 200

    def __init__(self, size=10000):
        """
        input: size: int, number of random values generated per pool
        """
        self.size = size
        self.index = 0
        self.atlas = np.random.rand(self.size)

    def sample_atlas(self):
        """
        Return the next value of the pool, refilling the pool when exhausted.

        output: res: float in [0, 1)
        """
        if self.index >= self.size:
            self.index = 0
            self.atlas = np.random.rand(self.size)
        res = self.atlas[self.index]
        self.index += 1
        return res

    def int_in_range(self, range, blacklist):
        """
        Draw a random integer from a half-open range while avoiding a blacklist.

        input: range: tuple (min, max), integers; min is inclusive, max is exclusive
               blacklist: iterable of integers that must not be returned

        output: (value, True) when an integer could be drawn,
                (0, False) when every integer of the range is blacklisted
                (or the range is empty).
        """
        # NOTE: the parameter name `range` shadows the builtin inside this
        # method, so the builtin is deliberately not used below.
        low, high = range
        span = high - low

        # Only distinct blacklist entries that actually fall inside the range
        # reduce the number of available values; duplicates and out-of-range
        # entries must not make a feasible request look infeasible.
        blocked = {value for value in blacklist if low <= value < high}
        if len(blocked) >= span:
            return 0, False  # nothing left to sample

        # Fast path: rejection sampling, which is cheap when the blacklist is
        # small compared to the range.
        attempts = 0
        while attempts < self.MAX_RETRIES:
            candidate = math.floor(self.sample_atlas() * span) + low
            if candidate not in blocked:
                return candidate, True
            attempts += 1

        # Slow path: the blacklist covers almost the whole range, so keep the
        # guarantee of an answer by picking uniformly among the values that
        # are still allowed instead of giving up.
        allowed = [int(value) for value in np.arange(low, high) if int(value) not in blocked]
        pick = math.floor(self.sample_atlas() * len(allowed))
        return allowed[pick], True



In [None]:
def generate_finetunning_data(events, number_of_samples, emb_type="CLS"):
    """
    Build pairs of article embeddings labelled same-event (1) / different-event (0).

    For every article, up to `number_of_samples` partners are drawn from the
    same event (label 1) and up to `number_of_samples` partners from other
    events (label 0). Every pair is stored together with its mirrored pair.

    input: events: list of dicts, each with an "articles" list whose items
                   hold the embedding under the "CLS" key
           number_of_samples: int, sampling attempts per article and per label
           emb_type: str, only "CLS" is implemented; any other value logs an
                     error and yields empty results

    output: (finetunning_data_0, finetunning_data_1, finetunning_lables, debuging_lookup)
            three parallel lists (first embedding, second embedding, label)
            plus a list of (event_idx, article_idx, other_event_idx,
            other_article_idx, label) tuples describing each pair.
    """
    finetunning_data_0 = []
    finetunning_data_1 = []
    finetunning_lables = []
    debuging_lookup = []

    # Validate the embedding type once, before any sampling state is touched.
    if emb_type != "CLS":
        logging.error("Only embedding types are 'CLS','AVG', and a different embedding type is given.")
        return finetunning_data_0, finetunning_data_1, finetunning_lables, debuging_lookup

    # This gives us the randomness
    random_atlas = Random_atlas(size=20000)

    num_events = len(events)

    for (event_idx, event) in enumerate(events):
        print(f"event number {event_idx}")
        articles_list = event["articles"]
        num_articles = len(articles_list)

        # per-article blacklist so that the same same-event pair is not added twice
        inner_blacklist_dict = {i: [i] for i in range(num_articles)}

        for (article_idx, article) in enumerate(articles_list):

            # first we add same-event (positive) samples
            for _ in range(number_of_samples):
                other_article_index, found_random_int = random_atlas.int_in_range(
                    range=(0, num_articles), blacklist=inner_blacklist_dict[article_idx])

                if not found_random_int:
                    # no unused partner left in this event
                    continue

                # both articles now know about each other
                inner_blacklist_dict[article_idx].append(other_article_index)
                inner_blacklist_dict[other_article_index].append(article_idx)

                other_article = articles_list[other_article_index]

                # add the new sample and its mirror
                finetunning_data_0.append(article["CLS"])
                finetunning_data_1.append(other_article["CLS"])
                finetunning_lables.append(1)
                debuging_lookup.append((event_idx, article_idx, event_idx, other_article_index, 1))

                finetunning_data_0.append(other_article["CLS"])
                finetunning_data_1.append(article["CLS"])
                finetunning_lables.append(1)
                debuging_lookup.append((event_idx, other_article_index, event_idx, article_idx, 1))

            # second we add different-event (negative) samples
            for _ in range(number_of_samples):
                # The sampler can fail here: with a single event there is no
                # other event to draw from. Using its placeholder result would
                # pair the article with its own event under label 0.
                other_event_index, found_event = random_atlas.int_in_range(
                    range=(0, num_events), blacklist=[event_idx])
                if not found_event:
                    continue

                # An event without articles has nothing to pair with.
                other_articles = events[other_event_index]["articles"]
                other_article_index, found_article = random_atlas.int_in_range(
                    range=(0, len(other_articles)), blacklist=[])
                if not found_article:
                    continue

                other_article = other_articles[other_article_index]

                # add the new sample and its mirror; duplicates of the same
                # negative pair are not filtered out
                finetunning_data_0.append(article["CLS"])
                finetunning_data_1.append(other_article["CLS"])
                finetunning_lables.append(0)
                debuging_lookup.append((event_idx, article_idx, other_event_index, other_article_index, 0))

                finetunning_data_0.append(other_article["CLS"])
                finetunning_data_1.append(article["CLS"])
                finetunning_lables.append(0)
                debuging_lookup.append((other_event_index, other_article_index, event_idx, article_idx, 0))

    return finetunning_data_0, finetunning_data_1, finetunning_lables, debuging_lookup


In [None]:
class CustomFinetuningDataset(Dataset):
    """
    Dataset of article pairs that looks the embeddings up lazily.

    Each entry of `triplets` is ((event_idx, article_idx),
    (event_idx, article_idx), label); the embeddings themselves are read from
    `events[...]["articles"][...]["CLS"]` only when an item is requested.
    """

    def __init__(self, events, triplets):
        self.triplets = triplets
        self.events = events

    def __len__(self):
        """Number of pairs in the dataset."""
        return len(self.triplets)

    def _cls_embedding(self, location):
        """Return the "CLS" embedding stored at (event index, article index) as a float tensor."""
        event_pos, article_pos = location[0], location[1]
        return torch.FloatTensor(self.events[event_pos]["articles"][article_pos]["CLS"])

    def __getitem__(self, idx):
        """
        input: idx: int, position of the pair

        output: (first embedding, second embedding, label) where the
                embeddings are float tensors and the label is a long tensor
        """
        first_loc, second_loc, pair_label = self.triplets[idx]

        first_emb = self._cls_embedding(first_loc)
        second_emb = self._cls_embedding(second_loc)
        # the label goes through int8 first and is then widened to long
        label_tensor = torch.tensor(pair_label, dtype=torch.int8).to(torch.long)

        return first_emb, second_emb, label_tensor

def generate_finetunning_dataset(events, number_of_samples, emb_type="CLS"):
    """
    Build a CustomFinetuningDataset of same-event / different-event article pairs.

    For every article, up to `number_of_samples` partners are drawn from the
    same event (label 1) and up to `number_of_samples` partners from other
    events (label 0). Every pair is stored together with its mirrored pair.
    Only indices are stored; the dataset reads the embeddings on access.

    input: events: list of dicts, each with an "articles" list
           number_of_samples: int, sampling attempts per article and per label
           emb_type: str, only "CLS" is implemented; any other value logs an
                     error and yields a dataset without pairs

    output: finetunning_dataset: CustomFinetuningDataset over `events` whose
            triplets are ((event_idx, article_idx),
            (other_event_idx, other_article_idx), label)
    """
    triplets = []

    # Validate the embedding type once, before any sampling state is touched.
    if emb_type != "CLS":
        logging.error("Only embedding types are 'CLS','AVG', and a different embedding type is given.")
        return CustomFinetuningDataset(events, triplets)

    # This gives us the randomness
    random_atlas = Random_atlas(size=20000)

    num_events = len(events)

    for (event_idx, event) in enumerate(events):
        print(f"event number {event_idx}")
        num_articles = len(event["articles"])

        # per-article blacklist so that the same same-event pair is not added twice
        inner_blacklist_dict = {i: [i] for i in range(num_articles)}

        for article_idx in range(num_articles):
            this_article = (event_idx, article_idx)

            # first we add same-event (positive) samples
            for _ in range(number_of_samples):
                other_article_index, found_random_int = random_atlas.int_in_range(
                    range=(0, num_articles), blacklist=inner_blacklist_dict[article_idx])

                if not found_random_int:
                    # no unused partner left in this event
                    continue

                # both articles now know about each other
                inner_blacklist_dict[article_idx].append(other_article_index)
                inner_blacklist_dict[other_article_index].append(article_idx)

                # add the new sample and its mirror
                partner = (event_idx, other_article_index)
                triplets.append((this_article, partner, 1))
                triplets.append((partner, this_article, 1))

            # second we add different-event (negative) samples
            for _ in range(number_of_samples):
                # The sampler can fail here: with a single event there is no
                # other event to draw from. Using its placeholder result would
                # pair the article with its own event under label 0.
                other_event_index, found_event = random_atlas.int_in_range(
                    range=(0, num_events), blacklist=[event_idx])
                if not found_event:
                    continue

                # An event without articles has nothing to pair with.
                other_event_num_articles = len(events[other_event_index]["articles"])
                other_article_index, found_article = random_atlas.int_in_range(
                    range=(0, other_event_num_articles), blacklist=[])
                if not found_article:
                    continue

                # add the new sample and its mirror; duplicates of the same
                # negative pair are not filtered out
                partner = (other_event_index, other_article_index)
                triplets.append((this_article, partner, 0))
                triplets.append((partner, this_article, 0))

    finetunning_dataset = CustomFinetuningDataset(events, triplets)

    return finetunning_dataset

In [None]:
class Encoder(nn.Module):
    """
    Feature encoder applied to a single article embedding.

    Consists of one linear layer (1024 ==> 2048) followed by a ReLU.
    """

    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()

        # 1024 ==> 2048
        projection = nn.Linear(1024, 2048)
        self.encoder = nn.Sequential(projection, nn.ReLU())

    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor whose last dimension has size 1024

        output: torch.Tensor with last dimension 2048 (non-negative, post-ReLU)
        """
        return self.encoder(x)


class Net(nn.Module):
    """
    Pair classifier: encodes two inputs with one shared encoder, concatenates
    the two encodings and maps them to a single probability.
    """

    def __init__(self, encoder):
        """
        The constructor of the model.

        input: encoder: nn.Module applied to both inputs (weights are shared);
                        the head below expects the two encodings to
                        concatenate to 4096 features
        """
        super().__init__()

        # the same encoder instance is used for both inputs
        self.encoder = encoder

        # 4096 ==> 2048 ==> 1024 ==> 1
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(4096, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, 1)
        )

    def forward(self, x_1, x_2):
        """
        The forward pass of the model.

        input: x_1: torch.Tensor, batch of first inputs
               x_2: torch.Tensor, batch of second inputs

        output: res: torch.Tensor of shape (batch, 1) with values in [0, 1]
        """
        encoded_1 = self.encoder(x_1)
        encoded_2 = self.encoder(x_2)
        # join the two encodings along the feature dimension
        encoded = torch.cat((encoded_1, encoded_2), 1)
        res = self.fc(encoded)
        # torch.sigmoid replaces the deprecated F.sigmoid
        res = torch.sigmoid(res)
        return res


In [None]:
def create_loader_from_np(X_1, X_2, y=None, train=True, batch_size=32, shuffle=True, num_workers=0):
    """
    Wrap numpy arrays of paired features (and optionally labels) in a DataLoader.

    input: X_1: numpy array, features of the first element of each pair
           X_2: numpy array, features of the second element of each pair
           y: numpy array, the labels; only read when train is True
           train: bool, when True the labels are part of every batch
           batch_size, shuffle, num_workers: forwarded to the DataLoader

    output: loader: torch.utils.data.DataLoader yielding (X_1, X_2, y) batches
            when train is True and (X_1, X_2) batches otherwise
    """
    # features are converted to float tensors
    columns = [torch.from_numpy(X_1).float(), torch.from_numpy(X_2).float()]
    if train:
        # labels are converted to long tensors
        columns.append(torch.from_numpy(y).long())

    return DataLoader(dataset=TensorDataset(*columns),
                      batch_size=batch_size,
                      shuffle=shuffle,
                      pin_memory=True,
                      num_workers=num_workers)

In [None]:
def create_loader_from_list(X_1, X_2, y=None, train=True, batch_size=32, shuffle=True, num_workers=0):
    """
    Wrap lists of paired features (and optionally labels) in a DataLoader.

    input: X_1: list, features of the first element of each pair
           X_2: list, features of the second element of each pair
           y: list, the labels; only read when train is True
           train: bool, when True the labels are part of every batch
           batch_size, shuffle, num_workers: forwarded to the DataLoader

    output: loader: torch.utils.data.DataLoader yielding (X_1, X_2, y) batches
            when train is True and (X_1, X_2) batches otherwise
    """
    # features are converted to float tensors
    columns = [torch.Tensor(X_1).float(), torch.Tensor(X_2).float()]
    if train:
        # labels are built as a tensor first and then cast to long
        columns.append(torch.Tensor(y).long())

    return DataLoader(dataset=TensorDataset(*columns),
                      batch_size=batch_size,
                      shuffle=shuffle,
                      pin_memory=True,
                      num_workers=num_workers)

In [None]:
def finetune_model(encoder, net, train_loader, val_loader,
                   n_epochs=100, lr=0.0002, weight_decay=0.1, save_threshold=0.2):
    """
    Train the pair classifier with binary cross-entropy and checkpoint the best epoch.

    After every epoch the model is evaluated on `val_loader`. Whenever the
    mean validation loss drops below the best value seen so far (starting at
    `save_threshold`), `encoder` and `net` are written to "./encoder.pt" and
    "./model.pt" in the current working directory.

    input: encoder: nn.Module saved next to the model; it is not called here
                    (presumably the encoder used inside `net` - confirm at the call site)
           net: nn.Module called as net(X_1, X_2) and returning probabilities
           train_loader: DataLoader yielding (X_1, X_2, y) batches for training
           val_loader: DataLoader yielding (X_1, X_2, y) batches for validation
           n_epochs: int, number of passes over train_loader
           lr: float, AdamW learning rate
           weight_decay: float, AdamW weight decay
           save_threshold: float, validation loss an epoch has to beat before
                           the first checkpoint is written; pass float("inf")
                           to always checkpoint the first epoch

    output: (train_losses, val_losses): lists with the mean per-batch loss of
            every epoch
    """
    model = net
    # `device` is the module-level torch.device selected earlier in the notebook
    model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    val_losses = []
    train_losses = []

    best_val_loss = save_threshold

    for epoch in range(n_epochs):
        # ---------------- training ----------------
        model.train()
        loss_to_print = 0.0
        e_loss = 0.0
        for i, (X_1, X_2, y) in enumerate(train_loader):
            X_1 = X_1.to(device)
            X_2 = X_2.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            preds = model(X_1, X_2)
            # the model returns (batch, 1); BCELoss wants matching float shapes
            loss = criterion(torch.reshape(preds, (-1,)), y.to(torch.float32))
            loss.backward()
            optimizer.step()

            # read the scalar once; every .item() call forces a device sync
            batch_loss = loss.item()
            e_loss += batch_loss
            loss_to_print += batch_loss
            if i % 1000 == 999:
                # running mean over the last 1000 batches
                print(f"[{epoch + 1}, {i + 1:5d}], loss: {loss_to_print / 1000:.3f}")
                loss_to_print = 0.0
        print(f"[Epoch {epoch + 1:2d}] Epoch Loss: {e_loss / len(train_loader):.5f}", end=" | ")
        train_losses.append(e_loss / len(train_loader))

        # ---------------- validation ----------------
        model.eval()
        val_loss_p = 0.0
        predictions = []
        y_true = []

        with torch.no_grad():  # We don't need to compute gradients for testing
            for (x_val_1, x_val_2, y_val) in val_loader:
                x_val_1 = x_val_1.to(device)
                x_val_2 = x_val_2.to(device)
                y_val = y_val.to(device)

                y_hat = torch.reshape(model(x_val_1, x_val_2), (-1,))
                val_loss_p += criterion(y_hat, y_val.to(torch.float32)).item()

                # Rounding the predictions to 0 or 1, kept one-dimensional so
                # they line up with the labels
                predictions.append((y_hat >= 0.5).long().cpu().numpy())
                y_true.append(y_val.cpu().numpy())

        predictions = np.concatenate(predictions)
        y_true = np.concatenate(y_true)
        val_loss_p = val_loss_p / len(val_loader)
        acc = accuracy_score(y_true, predictions)
        print(f"Val Loss: {val_loss_p:.5f}, Val Accuracy: {acc:.5f}")
        val_losses.append(val_loss_p)

        if val_loss_p < best_val_loss:
            print(f"New Best Loss: {val_loss_p:.5f}")
            best_val_loss = val_loss_p
            # the module objects themselves are passed to torch.save (not
            # their state_dict), overwriting the previous checkpoint
            torch.save(encoder, f"./encoder.pt")
            torch.save(model, f"./model.pt")

    return train_losses, val_losses

In [None]:
# Load the events (with their pre-computed CLS embeddings) from the project
# folder. The context manager closes the file handle once parsing is done.
with open('data_5000_with_BART_embedings_CLS.json', encoding="utf-8") as f:
    events = json.load(f)

In [None]:

# events_train = events[:4500]
# events_val = events[4500:]


# finetunning_data_train_X_0, finetunning_data_train_X_1, finetunning_data_train_y, debuging_lookup_tr = generate_finetunning_data(events[:4500], number_of_samples=5, emb_type="CLS")
# finetunning_data_val_X_0, finetunning_data_val_X_1, finetunning_data_val_y, debuging_lookup_val = generate_finetunning_data(events[4500:], number_of_samples=5, emb_type="CLS")



In [None]:

# train_loader = create_loader_from_list(finetunning_data_train_X_0, finetunning_data_train_X_1, finetunning_data_train_y, batch_size=512, shuffle=True)
# val_loader = create_loader_from_list(finetunning_data_val_X_0, finetunning_data_val_X_1, finetunning_data_val_y, batch_size=512, shuffle=False)



In [None]:
# The first 4500 events are used for training, the remaining ones for validation.
events_train, events_val = events[:4500], events[4500:]

# Settings shared by the training and the validation loader.
_loader_settings = dict(batch_size=512, pin_memory=True, num_workers=4)

# Only the training pairs are shuffled.
train_loader = DataLoader(
    dataset=generate_finetunning_dataset(events_train, number_of_samples=10, emb_type="CLS"),
    shuffle=True,
    **_loader_settings,
)
val_loader = DataLoader(
    dataset=generate_finetunning_dataset(events_val, number_of_samples=10, emb_type="CLS"),
    shuffle=False,
    **_loader_settings,
)

event number 0
event number 1
event number 2
event number 3
event number 4
event number 5
event number 6
event number 7
event number 8
event number 9
event number 10
event number 11
event number 12
event number 13
event number 14
event number 15
event number 16
event number 17
event number 18
event number 19
event number 20
event number 21
event number 22
event number 23
event number 24
event number 25
event number 26
event number 27
event number 28
event number 29
event number 30
event number 31
event number 32
event number 33
event number 34
event number 35
event number 36
event number 37
event number 38
event number 39
event number 40
event number 41
event number 42
event number 43
event number 44
event number 45
event number 46
event number 47
event number 48
event number 49
event number 50
event number 51
event number 52
event number 53
event number 54
event number 55
event number 56
event number 57
event number 58
event number 59
event number 60
event number 61
event number 62
ev

In [None]:
# Display the device selected earlier, as a check before training starts.
device


device(type='cuda', index=0)

In [None]:
# Build a fresh encoder and hand that same instance to the pair classifier.
# The encoder is also passed to finetune_model separately, which saves it to
# its own checkpoint file next to the full model.
encoder = Encoder()
net = Net(encoder)

# Per-epoch mean training / validation losses are returned.
train_losses, val_losses = finetune_model(encoder, net, train_loader, val_loader)


new 2
[1,  1000], loss: 0.592
[1,  2000], loss: 0.403
[1,  3000], loss: 0.354
[1,  4000], loss: 0.329
[1,  5000], loss: 0.295
[1,  6000], loss: 0.278
[1,  7000], loss: 0.260
[1,  8000], loss: 0.250
[1,  9000], loss: 0.243
[Epoch  1] Epoch Loss: 0.33004 | Val Loss: 0.22877, Val Accuracy: 0.90620
[2,  1000], loss: 0.235
[2,  2000], loss: 0.230
[2,  3000], loss: 0.225
[2,  4000], loss: 0.220
[2,  5000], loss: 0.216
[2,  6000], loss: 0.212
[2,  7000], loss: 0.205
[2,  8000], loss: 0.203
[2,  9000], loss: 0.197
[Epoch  2] Epoch Loss: 0.21515 | Val Loss: 0.18702, Val Accuracy: 0.92650
New Best Loss: 0.18702
[3,  1000], loss: 0.189
[3,  2000], loss: 0.186
[3,  3000], loss: 0.183
[3,  4000], loss: 0.182
[3,  5000], loss: 0.178
[3,  6000], loss: 0.177
[3,  7000], loss: 0.175
[3,  8000], loss: 0.172
[3,  9000], loss: 0.170
[Epoch  3] Epoch Loss: 0.17867 | Val Loss: 0.17398, Val Accuracy: 0.93315
New Best Loss: 0.17398
[4,  1000], loss: 0.168
[4,  2000], loss: 0.164
[4,  3000], loss: 0.165
[4,  4