In [1]:
data_path = "../data/annotated_data"
save_model_dir_name = "best_model_albert"
# set to true if colab!
colab = False
train_model = False
if colab:
    data_path = '/content/drive/MyDrive/annotated_data'
    !pip install transformers
    !pip install evaluate
    !pip install seaborn
    !pip install accelerate
    !pip install Cython
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
    except:
        print("probably not in colab")

# Imports

In [2]:
import os
import ast
import json
import torch
torch.manual_seed(0)
import random
random.seed(0)
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AlbertForSequenceClassification,get_scheduler
from sklearn import preprocessing
from tqdm.auto import tqdm
from experiment_utils import EarlyStopper
from sklearn.metrics import accuracy_score, classification_report

# Load Data

In [3]:
# Collect the annotation files in sorted order. os.listdir() returns entries in
# arbitrary order, which would otherwise make the row order of the assembled
# DataFrame (and with it the seeded shuffle/split) depend on the file system.
files = sorted(f for f in os.listdir(data_path) if f.endswith('jsonl'))
# Accumulates one dict per annotated example; filled by the loading loop.
rows = []

FileNotFoundError: [Errno 2] No such file or directory: '../data/annotated_data'

In [None]:
# Parse every JSONL file into flat records: text, label and the list of span texts.
# The accumulator is (re-)initialised here so that re-running this cell does not
# append the same examples a second time.
rows = []
for f in files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(os.path.join(data_path, f), 'r', encoding='utf-8') as json_file:
        for json_str in json_file:
            if not json_str.strip():
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            result = json.loads(json_str)
            spans = result.get("spans")
            if isinstance(spans, str):
                # some records store the span list as its Python-literal string form
                spans = ast.literal_eval(spans)
            rows.append({
                "text": result["text"],
                "label": result["label"],
                # keep only the surface text of each span; missing/empty spans -> []
                "spans": [s["text"] for s in spans] if spans else [],
            })
    print(f"Loaded: {f}")
df = pd.DataFrame(rows)

## Inspecting the data

In [None]:
# Horizontal bar chart of the class distribution; every bar is annotated with
# "<absolute count> (<share in percent>%)".
label_column = df['label']
abs_values = label_column.value_counts(ascending=False)
rel_values = label_column.value_counts(ascending=False, normalize=True).values * 100

plt.figure(figsize=(10,8))
g = sns.countplot(data=df, y="label", order=abs_values.index)
bar_labels = [
    f'{count} ({share:.0f}%)'
    for count, share in zip(abs_values.values, rel_values)
]
g.bar_label(container=g.containers[0], labels=bar_labels)
g.set_title("number of examples per class")

In [None]:
# Character length of every text (vectorised string accessor instead of a
# row-wise apply), followed by its distribution per class.
df["text_len"] = df["text"].str.len()
plt.figure(figsize=(10,8))
g = sns.boxplot(data=df, x="text_len", y="label")
g.set_title("text len per class")
print("text length statistics")
# Summary table per class; as the last expression it is rendered as the cell output.
df.groupby(["label"])["text_len"].agg(["mean","median","min","max"])

# Data Preparation
## Encode Labels

In [None]:
# Fit the encoder on the label column, then store the integer class ids next to
# the original string labels.
le = preprocessing.LabelEncoder().fit(df["label"])
df["label_encoded"] = le.transform(df["label"])

## Train, Validation and Test Split

In [None]:
# 70, 20, 10 split
# Shuffle all rows once (df.sample draws from the global NumPy RNG when no
# random_state is given), then cut the shuffled frame by position. Positional
# slicing replaces np.split, whose use on a DataFrame is deprecated in pandas 2.x.
shuffled_df = df.sample(frac=1)
train_end, val_end = int(.7*len(df)), int(.9*len(df))
# reset_index() without drop keeps the original row id in an "index" column and
# returns a fresh frame instead of mutating a slice in place.
train_df = shuffled_df.iloc[:train_end].reset_index()
val_df = shuffled_df.iloc[train_end:val_end].reset_index()
test_df = shuffled_df.iloc[val_end:].reset_index()

In [None]:
# Distinct span texts that occur in the training split (validation and test
# spans are deliberately not looked at here).
new_spans = {span for span_list in train_df.spans for span in span_list}

# Create Datasets and Tokenize

In [None]:
# Load the pretrained ALBERT tokenizer (with do_lower_case=False) ...
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2", do_lower_case=False)
# ... and extend it with the domain spans that are not vocabulary entries yet.
new_tokens = set(new_spans) - set(tokenizer.get_vocab())
# Add them in sorted order: iterating a set of strings has no stable order across
# interpreter runs, and new tokens receive their ids in the order they are added.
# The call returns the number of tokens actually added, shown as the cell output.
tokenizer.add_tokens(sorted(new_tokens))

In [None]:
# define PyTorch dataset
class TAndIDataSet(Dataset):
    """Map-style dataset that tokenizes all texts up front and serves them by position.

    Each item is a tuple ``(features, text)``: ``features`` holds one tensor per
    tokenizer output field plus a ``"labels"`` tensor of shape ``(1,)``, and
    ``text`` is the raw input string of that example.
    """

    def __init__(self, text_column, encoded_labels, text_tokenizer=None):
        """Tokenize ``text_column`` once and keep texts and labels for lookup.

        Args:
            text_column: iterable of raw text strings (e.g. a pandas Series).
            encoded_labels: iterable of integer class ids aligned with ``text_column``.
            text_tokenizer: callable used to encode the texts. Defaults to the
                notebook-level ``tokenizer`` that is current at construction time.
        """
        if text_tokenizer is None:
            text_tokenizer = tokenizer
        # Materialise as plain lists so __getitem__ indexes by position,
        # independent of whatever index a pandas Series carries.
        self.texts = list(text_column)
        self.labels = list(encoded_labels)
        self.encodings = text_tokenizer(self.texts, truncation=True, padding=True, max_length=512)

    def __getitem__(self, idx):
        """Return ``(features, text)`` for the example at position ``idx``."""
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item, self.texts[idx]

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

In [None]:
# One dataset per split, each built from the split's text and encoded-label columns.
train_dataset, val_dataset, test_dataset = (
    TAndIDataSet(split.text, split.label_encoded)
    for split in (train_df, val_df, test_df)
)
# Wrap every dataset in a dataloader; batch sizes: train 16, validation 32, test 4.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=size)
    for dataset, size in ((train_dataset, 16), (val_dataset, 32), (test_dataset, 4))
)

# Finetuning
## Definitions

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if train_model:
    # model definition: pretrained ALBERT with one output per encoded class
    model = AlbertForSequenceClassification.from_pretrained(
        "albert-base-v2",
        num_labels=len(le.classes_)
    ).to(device)
    # adjust embeddings size for new vocabulary length
    model.resize_token_embeddings(len(tokenizer))
else:
    # Evaluation-only path: load tokenizer and model from the local checkpoint.
    checkpoint_dir = f"./model_checkpoints/{save_model_dir_name}"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, local_files_only=True)
    model = AlbertForSequenceClassification.from_pretrained(checkpoint_dir, local_files_only=True).to(device)
    # The test set was encoded with the tokenizer built earlier in this notebook,
    # whose added-token ids need not match those stored in the checkpoint.
    # Re-encode it with the checkpoint tokenizer (TAndIDataSet picks up the
    # notebook-level `tokenizer` at construction time) before it is evaluated.
    test_dataset = TAndIDataSet(test_df.text, test_df.label_encoded)
    test_dataloader = DataLoader(test_dataset, batch_size=test_dataloader.batch_size)

In [None]:
# Upper bound on the number of passes over the training data.
num_epochs = 15
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
# Linear learning-rate schedule without warm-up, spanning one scheduler step per
# training batch over all epochs.
scheduler_config = dict(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(train_dataloader),
)
lr_scheduler = get_scheduler(**scheduler_config)

## Training Loop

In [None]:
if train_model:
    training_stats = []
    progress_bar = tqdm(range(num_epochs*(len(train_dataloader)+len(val_dataloader))))
    early_stopper = EarlyStopper(patience=5)
    best_val_loss = np.Inf
    for epoch in range(num_epochs):
        print(f"\nRunning Epoch {epoch+1}/{num_epochs}...")
        epoch_train_losses = []
        epoch_val_losses = []
        epoch_train_correct = 0
        epoch_val_correct = 0
        # training loop
        model.train()
        for batch, _ in train_dataloader:
            # move batch to gpu
            batch = {k: v.to("cuda") for k, v in batch.items()}
            # forward pass
            output = model(**batch)
            predictions = torch.argmax(output.logits, dim=-1)
            epoch_train_correct += (predictions == batch["labels"].flatten()).long().sum()
            # compute loss & backprop
            loss = output.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # reset optimizer
            optimizer.zero_grad()
            # save batch_loss
            epoch_train_losses.append(loss.item())
            progress_bar.update(1)
        # validation loop
        model.eval()
        for batch, _ in val_dataloader:
            batch = {k: v.to("cuda") for k, v in batch.items()}
            with torch.no_grad():
                output = model(**batch)
                predictions = torch.argmax(output.logits, dim=-1)
                epoch_val_correct += (predictions == batch["labels"].flatten()).long().sum()
                epoch_val_losses.append(output.loss.item())
            progress_bar.update(1)
        # save epoch metrics
        mean_epoch_val_loss = np.mean(epoch_val_losses)
        epoch_stats = {
            "epoch":epoch,
            "train_loss":np.mean(epoch_train_losses),
            "val_loss":mean_epoch_val_loss,
            "train_accuracy":epoch_train_correct.item()/len(train_dataset),
            "val_accuracy":epoch_val_correct.item()/len(val_dataset)
        }
        print(epoch_stats)
        training_stats.append(epoch_stats)
        # save best model
        if mean_epoch_val_loss < best_val_loss:
            model_path = f"./{save_model_dir_name}"
            model.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)
            best_val_loss = mean_epoch_val_loss

        if early_stopper.early_stop(mean_epoch_val_loss):
            print("Stopping early")
            break

    # if colab make sure to save best model to google drive
    training_stats = pd.DataFrame(training_stats)
    if colab:
        !cp -r $save_model_dir_name /content/drive/MyDrive
else:
    print("training disabled")

In [None]:
if train_model:
    # Long format: one row per (epoch, metric), so seaborn can draw one line per metric.
    loss_curves = training_stats[["epoch","train_loss","val_loss"]].melt(id_vars=["epoch"])
    plt.figure(figsize=(8,8))
    sns.lineplot(data=loss_curves, x="epoch", y="value", hue="variable")

In [None]:
if train_model:
    # Long format: one row per (epoch, metric), so seaborn can draw one line per metric.
    accuracy_curves = training_stats[["epoch","train_accuracy","val_accuracy"]].melt(id_vars=["epoch"])
    plt.figure(figsize=(8,8))
    sns.lineplot(data=accuracy_curves, x="epoch", y="value", hue="variable")

# Testing

In [None]:
# Evaluate on the test split and keep one record per example for later inspection.
progress_bar = tqdm(range(len(test_dataloader)))
test_correct = 0
test_predictions = []
model.eval()
# Softmax over the class dimension, i.e. the last dimension of the logits
# (the number of classes is not a valid dimension index).
softmax = torch.nn.Softmax(dim=-1)
# Take the device from the model so that batches always follow the weights.
device = model.device
with torch.no_grad():
    for batch, texts in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        output = model(**batch, output_attentions=True, output_hidden_states=True)
        local_predictions = torch.argmax(output.logits, dim=-1)
        # top-3 classes per example (capped at the number of classes)
        top_k_predictions = torch.topk(output.logits, k=min(3, output.logits.shape[-1]))
        # convert once per batch instead of once per example
        y_true = batch["labels"].flatten().tolist()
        y_pred = local_predictions.tolist()
        top_k_ids = top_k_predictions.indices.tolist()
        top_k_scores = top_k_predictions.values.tolist()
        for idx, pred in enumerate(y_pred):
            test_correct += int(pred == y_true[idx])
            test_predictions.append({
                "y_hat_enc":pred,
                "y_enc":y_true[idx],
                "text":texts[idx],
                "top_k":top_k_ids[idx],
                "top_k_logits": top_k_scores[idx],
                "in_top_k":y_true[idx] in top_k_ids[idx]
            })
        progress_bar.update(1)
# Divide by the number of examples actually evaluated; len(dataloader) * batch_size
# overcounts whenever the last batch is only partially filled.
num_test_examples = len(test_predictions)
print(f"Test Accuracy: {test_correct/num_test_examples}")
test_predictions = pd.DataFrame(test_predictions)
test_predictions["y"] = le.inverse_transform(test_predictions["y_enc"])
test_predictions["y_hat"] = le.inverse_transform(test_predictions["y_hat_enc"])
print(f"Test top-k Accuracy: {len(test_predictions.loc[test_predictions.in_top_k])/num_test_examples}")

## Wrong classifications

In [None]:
# Misclassified test examples (at most the first 1000 rows are displayed).
test_predictions.query("y_enc != y_hat_enc").head(1000)

In [None]:
print("num. miss-classifications per class")
# Rows whose predicted class id differs from the true one, counted per true class.
misclassified = test_predictions[test_predictions.y_enc.ne(test_predictions.y_hat_enc)]
misclassified.groupby(["y"])["y"].count()

### Confusion Matrix

In [None]:
# Legend for the matrix axes: "<encoded id> : <class name>".
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
for key,value in le_name_mapping.items():
    print(f"{value} : {key}")
# Pin the label set to every class the encoder knows. Otherwise a class that is
# absent from the test split shrinks the matrix (so the axis positions stop
# matching the ids printed above) and makes classification_report reject
# target_names because the number of classes no longer agrees.
all_class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(test_predictions["y_enc"],test_predictions["y_hat_enc"], labels=all_class_ids)
plt.figure(figsize=(8,7))
g = sns.heatmap(cm, annot=True, fmt='g', square = True, cmap = 'Blues_r')
g.set_ylabel("y")
g.set_xlabel("y_hat")

print(classification_report(test_predictions["y_enc"],test_predictions["y_hat_enc"], labels=all_class_ids, target_names=le.classes_))
print("Accuracy score: ", accuracy_score(test_predictions["y_enc"],test_predictions["y_hat_enc"]))
# Share of correctly predicted examples per true class (row-wise); NaN for a
# class without any test example instead of a division-by-zero warning.
row_totals = cm.sum(axis=1)
per_class_accuracy = np.divide(
    cm.diagonal(),
    row_totals,
    out=np.full(len(row_totals), np.nan),
    where=row_totals != 0
)
print("per class accuracy:", per_class_accuracy)

### Detailed inspection of misclassifications

In [None]:
# class of interest
coi = "electric_car"
# Test examples of that class which the model assigned to a different class.
is_coi = test_predictions.y == coi
predicted_other = test_predictions.y_hat != coi
coi_miss = test_predictions.loc[is_coi & predicted_other]
non_top_k_miss_count = len(coi_miss.loc[~coi_miss.in_top_k])
print(f"Total num. miss: {len(coi_miss)}, non-top-k miss: {non_top_k_miss_count}")
# One row per (missed example, top-k position): the logit, the class predicted at
# that position (encoded and decoded) and the position itself as a string.
coi_miss_enhanced = pd.DataFrame([
    {
        "score": row.top_k_logits[position],
        "y_hat_enc": class_id,
        "y_hat": le.inverse_transform([class_id])[0],
        "rank": str(position)
    }
    for _, row in coi_miss.iterrows()
    for position, class_id in enumerate(row.top_k)
])
coi_miss

In [None]:
print(f"rank 0 should be '{coi}'")
# Count how often each class shows up at each top-k position of the missed examples.
coi_miss_enhanced_agg = (
    coi_miss_enhanced
    .groupby(['rank',"y_hat","y_hat_enc"])
    .size()
    .reset_index(name="counts")
)
g = sns.barplot(data=coi_miss_enhanced_agg, y="rank", x="counts", hue="y_hat")
# place the legend outside the axes, to the right of the plot
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))

#### Inspecting the texts of examples whose true class is not among the top-k predictions

In [None]:
# Test examples whose true class is not among the model's top-k predictions.
non_top_k = test_predictions.loc[~test_predictions.in_top_k]
# Pin the label set to every class the encoder knows: this subset usually
# contains only some of the classes, and without `labels` the matrix would be
# built over the classes present, so its axis positions would not match the
# encoded ids printed below. An explicit label set also avoids the error that
# confusion_matrix raises for an empty selection.
cm = confusion_matrix(non_top_k["y_enc"],non_top_k["y_hat_enc"], labels=np.arange(len(le.classes_)))
for key,value in le_name_mapping.items():
    print(f"{value} : {key}")
plt.figure(figsize=(8,7))
g = sns.heatmap(cm, annot=True, fmt='g', square = True, cmap = 'Blues_r')
g.set_ylabel("y")
g.set_xlabel("y_hat")
print(f"Num of non-top-k predictions: {len(non_top_k)}")

In [None]:
# subselect worst class
worst_class = "drone"
worst_class_rows = non_top_k.loc[non_top_k.y == worst_class]
# Print every selected example: row id, true vs. predicted class, the raw text
# and a separator line.
for idx, row in worst_class_rows.iterrows():
    print(
        f"[{idx}]",
        f"class: {row.y}, predicted: {row.y_hat}",
        row.text,
        "--------------",
        sep="\n"
    )