## Fine-tuning a distilbert model using both a softmax loss and contrastive loss

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
import torch
import torch.nn as nn
import transformers
from itertools import combinations
from sentence_transformers import evaluation, InputExample, losses, models, SentencesDataset, SentenceTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch.utils.data import DataLoader

%matplotlib inline

In [2]:
print(torch.cuda.get_device_name(0), torch.cuda.is_available())
torch.cuda.empty_cache()

Tesla K80 True


In [3]:
def output_pairs(lab, strings):
    """returns pairs from a list of strings"""
    remainder = len(strings) % 2
    if remainder != 0:
        strings = strings[:-remainder]
    num_pairs = int(len(strings)/2)
    pairs = []
    for i in range(num_pairs):
        begin = i*2
        end = begin + 2
        pair = strings[begin:end]
        pairs.append(pair)
    return pairs


def return_examples(df_):
    """returns paired reviews from the same class for
    training SBERT model"""
    examples = []
    for lab in label_dict.values():
        lab_df = df_[df_["labels"]==lab]
        lab_df = shuffle(lab_df)
        texts = list(lab_df["text"].values)
        assert len(texts) % 2 == 0
        l = int(len(texts)/2)
        assert l > 0
        labs = [lab] * l
        zipped = list(zip(texts[:l], texts[l:], labs))
        assert len(zipped) == l 
        assert l == len(texts)/2
        for ex in zipped:
            assert type(ex[0]) == type(ex[1]) == str
            pair = [ex[0], ex[1]]
            assert type(pair) == list
            assert type(ex[2]) == int
            ex = InputExample(texts=pair, label=ex[2])
            examples.append(ex)
    return examples


def return_contrast_examples(df_, num_pos, num_neg):
    """single call to get positive and negative examples for
    contrastive loss"""
    pos = return_positive_contrast_examples(df_, num_pos)
    neg = return_negative_contrast_examples(df_, num_neg)
    print("Contrastive loss examples total: {}".format(len(pos + neg)))
    return pos + neg


def return_positive_contrast_examples(df_, n):
    """positive examples for contrastive loss:
    same class, labeled 1"""
    pos_contrast_examples = []
    for lab in label_dict.values():
        lab_df = df_[df_["labels"]==lab]
        lab_df = shuffle(lab_df)
        lab_df = lab_df.sample(n)
        revs = [rev.lower() for rev in lab_df["text"].values]
        pairs = output_pairs(lab, revs)
        for pair in pairs:
            assert type(pair) == list
            assert len(pair) == 2
            assert type(pair[0]) == type(pair[1]) == str
            ex = InputExample(texts=pair, label=1)
            pos_contrast_examples.append(ex)
    return pos_contrast_examples
    
    
def return_negative_contrast_examples(df_, n):
    """negative examples for contrastive loss:
    pairs from all combinations of classes,
    with label = 0 indicating they are not
    of teh same class"""
    neg_contrast_examples = []
    df_dict = {}
    for lab in label_dict.values():
        lab_df = df_[df_["labels"]==lab]
        lab_df = shuffle(lab_df)
        lab_df = lab_df.sample(n)
        df_dict[lab] = lab_df
    for k, comb in enumerate(combinations(df_dict.keys(), 2)):
        lab_i, lab_j = comb
        df_i = df_dict[lab_i].sample(n)
        df_j = df_dict[lab_j].sample(n)
        texts_i = list(df_i["text"].values)
        texts_j = list(df_j["text"].values)
        neg_labs = [0] * len(texts_i)
        assert len(neg_labs) == len(texts_i) == len(texts_j)
        exes = list(zip(texts_i, texts_j, neg_labs))
        for ex in exes:
            assert type(ex[0]) == type(ex[1]) == str
            assert ex[2] == 0
            ex = InputExample(texts=[ex[0].lower(), ex[1].lower()], label=ex[2])
            neg_contrast_examples.append(ex)
    return neg_contrast_examples

### Load data and specify model

In [4]:
train_df = pd.read_json("train_df_final.json")
dev_df = pd.read_json("dev_df_final.json")
test_df = pd.read_json("test_df_final.json")

print(train_df.shape)
print(dev_df.shape)
print(test_df.shape)

classes = sorted(list(set(train_df["occupation"].values)))
label_dict = {}

for i, cat in enumerate(classes):
    label_dict[cat] = i
    
print("Num. occupations: {}. Training set shape: {}. Dev set shape: {}.".format(len(label_dict.keys()), 
                                                                                train_df.shape, dev_df.shape))

(392000, 5)
(98000, 5)
(48419, 5)
Num. occupations: 49. Training set shape: (392000, 5). Dev set shape: (98000, 5).


In [6]:
# FINAL ARCHITECTURE -- commented out to avoid overwriting model.
tuning_contrastive_path = os.path.join(os.getcwd(), "final_proj_model_TUNING_softmax_contrastive/")
if not os.path.isdir(tuning_contrastive_path):
    os.mkdir(tuning_contrastive_path)
    
# model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

# model.save(tuning_contrastive_path)

## Train model

In [3]:
train_df = pd.read_json("train_df_final.json")
dev_df = pd.read_json("dev_df_final.json")

classes = sorted(list(set(train_df["occupation"].values)))

label_dict = {}

for i, cat in enumerate(classes):
    label_dict[cat] = i

assert 49 == len(set(train_df["labels"])) == len(label_dict.keys()) == len(set(dev_df["labels"]))

num_epochs = 5 # trained iteratively

for i in range(num_epochs):
    
    torch.cuda.empty_cache()
        
    model = SentenceTransformer(tuning_contrastive_path)
    
    train_examples = shuffle(return_examples(train_df))
    dev_examples = shuffle(return_examples(dev_df))
    print("Training examples: {}.".format(len(train_examples)))
    print("Dev examples: {}".format(len(dev_examples)))
    
    # Datasets
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)

    dev_dataset = SentencesDataset(dev_examples, model)
    dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=1)    
    
    # Main loss function + evaluator
    softmax_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), 
                                    num_labels=49)
    evaluator = evaluation.LabelAccuracyEvaluator(dev_dataloader, softmax_model=softmax_loss)

    # Option to add contrastive loss--not used for this model
    cl_fraction = 0.5
    if cl_fraction > 0.0:
        """If contrastive loss were used, a number of positive and negative examples would be
        created, and a training step on those examples with contrastive loss would occur before
        training with the softmax loss for each epoch."""
        num_pos = int(train_df.shape[0]/25/4 * cl_fraction)
        num_neg = int(train_df.shape[0]/(49*48)/2 * cl_fraction)
        contrast_train_examples = return_contrast_examples(train_df, num_pos, num_neg)
        contrast_dataset = SentencesDataset(contrast_train_examples, model)
        contrast_dataloader = DataLoader(contrast_dataset, shuffle=True, batch_size=16)
        contrast_loss = losses.ContrastiveLoss(model=model)
        # This exploits sentence_bert's multi-task learning feature
        train_objectives = [(contrast_dataloader, contrast_loss), (train_dataloader, softmax_loss)]
    else:
        train_objectives = [(train_dataloader, softmax_loss)]

    warmup_steps = int(len(train_examples)/10)
    model.fit(train_objectives=train_objectives, epochs=1, warmup_steps=warmup_steps, evaluator=evaluator, evaluation_steps=0, output_path=tuning_contrastive_path)

    model.save(tuning_contrastive_path)
    
    print("Dev set accuracy: {}".format(model.best_score))

In [4]:
train_acc_path = os.path.join(os.getcwd(), "final_results/train_acc/")
if not os.path.isdir(train_acc_path):
    if not os.path.isdir(os.path.join(os.getcwd(), "final_results/")):
        os.mkdir(os.path.join(os.getcwd(), "final_results/"))
    os.mkdir(train_acc_path)

train_acc_evaluator = evaluation.LabelAccuracyEvaluator(train_dataloader, softmax_model=softmax_loss)

single_train_pair = train_df[train_df["labels"]==0].sample(2)
texts_ = [t for t in single_train_pair["text"].values]
assert type(texts_[0]) == type(texts_[1]) == str
single_train_pair = [InputExample(texts=[texts_[0], texts_[1]], label=0)]
# single_train_pair = return_examples(single_train_pair)
single_train_dataset = SentencesDataset(single_train_pair, model)
single_train_dataloader = DataLoader(single_train_dataset, shuffle=True, batch_size=1)

model.fit(train_objectives=[(single_train_dataloader, softmax_loss)], epochs=1, warmup_steps=warmup_steps, 
          evaluator=train_acc_evaluator, evaluation_steps=0, output_path=train_acc_path)

In [12]:
model.best_score # training accuracy

0.19568367346938775

In [26]:
test_df = pd.read_json("test_df_final.json")
test_df.shape

(48419, 5)

In [5]:
test_acc_path = os.path.join(os.getcwd(), "final_results/test_acc/")
if not os.path.isdir(train_acc_path):
    if not os.path.isdir(os.path.join(os.getcwd(), "final_results/")):
        os.mkdir(os.path.join(os.getcwd(), "final_results/"))
    os.mkdir(tests_acc_path)

single_train_pair = train_df[train_df["labels"]==0].sample(2)
texts_ = [t for t in single_train_pair["text"].values]
assert type(texts_[0]) == type(texts_[1]) == str
single_train_pair = [InputExample(texts=[texts_[0], texts_[1]], label=0)]
single_train_dataset = SentencesDataset(single_train_pair, model)
single_train_dataloader = DataLoader(single_train_dataset, shuffle=True, batch_size=1)


def return_paired_examples(df_):
    examples = []
    for lab in label_dict.values():
        lab_df = df_[df_["labels"]==lab]
        lab_df = shuffle(lab_df)
        revs = [rev.lower() for rev in lab_df["text"].values]
        pairs = output_pairs(lab, revs)
        for pair in pairs:
            assert type(pair) == list
            assert len(pair) == 2
            assert type(pair[0]) == type(pair[1]) == str
            ex = InputExample(texts=pair, label=lab)
            examples.append(ex)
    return examples


test_examples = return_paired_examples(test_df)
test_dataset = SentencesDataset(test_examples, model)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=1)

test_acc_evaluator = evaluation.LabelAccuracyEvaluator(test_dataloader, softmax_model=softmax_loss)

model.fit(train_objectives=[(single_train_dataloader, softmax_loss)], epochs=1, warmup_steps=warmup_steps, 
          evaluator=test_acc_evaluator, evaluation_steps=0, output_path=test_acc_path)

print("Test accuracy: {}".format(model.best_score))

In [14]:
model.best_score # test accuracy

0.19055684071381362

### Viewing and saving classification report info

In [6]:
with open("y_true_epoch0.p", "rb") as reader:
    y_true = pickle.load(reader)

with open("y_pred_epoch0.p", "rb") as reader:
    y_pred = pickle.load(reader)

report = classification_report(y_true, y_pred, zero_division=0, output_dict=True)

report_df = pd.DataFrame(report).T.reset_index()

backward_d = {str(value):key for key, value in label_dict.items()}

# backward_d

report_df["occupation"] = [backward_d[str(lab)] if lab in [str(value) for value in label_dict.values()] else "" for lab in report_df["index"]]

report_df = report_df[["index", "occupation", "support", "f1-score", "precision", "recall"]]

report_df.to_json("cs230_classification_report_tuning_softmax_contrastive.json")

report_df