In [1]:
import os
import gc
import random
import pandas as pd
import pickle
import torch
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
print(device)

mps


In [5]:
MAX_LEN = 1024
# MODEL_NAME = "microsoft/DialoGPT-large"
MODEL_NAME = "microsoft/DialoGPT-medium"
# MODEL_NAME = "microsoft/deberta-v3-base"
# MODEL_NAME = "microsoft/deberta-v3-large"

from transformers import AutoTokenizer, AutoModelForSequenceClassification
def initialize_model(modelname, bool_tokenizer=False, bool_model=True):
    if bool_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(modelname, truncation=True, max_len=MAX_LEN, padding='max_length', cache_dir="/Users/stellali/repos/reddit_norm/.cache")
    else:
        tokenizer = None
    if bool_model:
        model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels=2, cache_dir="/Users/stellali/repos/reddit_norm/.cache").to(device)
    else:
        model = None
    if tokenizer and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if model and "DialoGPT" in modelname:
        model.config.pad_token_id = model.config.eos_token_id
    return model, tokenizer
_, tokenizer = initialize_model(MODEL_NAME, bool_tokenizer=True, bool_model=False)

In [6]:
os.listdir("../data/")

FileNotFoundError: [Errno 2] No such file or directory: '../data/'

# Loading and processing data

In [6]:
path_data = "../data/"
with open(path_data+"aggregated_binary_annotations_0_4.pkl","rb") as file:
    raw_data = pickle.load(file)
raw_data = pd.DataFrame(raw_data)

NORM_DIMENSION="formality"
path_synthetic_file = {'supportiveness':'gender_supportive_toxic_synthetic_data.csv',
                      'sarcasm': 'gender_genuine_sarcasm_synthetic_data.csv',
                       'politeness': 'gender_rude_polite_synthetic_data.csv',
                       'humorous': 'gender_humor_serious_synthetic_data.csv',
                       'formality': 'gender_casual_formal_synthetic_data.csv'
                      }

synthetic = pd.read_csv(path_data+path_synthetic_file[NORM_DIMENSION])
synthetic = synthetic.rename({'title1': 'post_title_1', 'description1':'post_description_1', 'comment1':'comment_1',
                              'title2': 'post_title_2', 'description2':'post_description_2', 'comment2':'comment_2'}, axis=1)
synthetic_old = None
if NORM_DIMENSION == "supportiveness":
    synthetic_old = pd.read_csv(path_data+"gender_supportive_toxic_synthetic_data_old.csv")
    synthetic_old = synthetic_old.rename({'title1': 'post_title_1', 'description1':'post_description_1', 'comment1':'comment_1',
                                  'title2': 'post_title_2', 'description2':'post_description_2', 'comment2':'comment_2'}, axis=1)

print("raw", len(raw_data), "synthetic", len(synthetic))
if synthetic_old:
    print("synthetic_old", len(synthetic_old))

raw 100 synthetic 1250


In [7]:
from collections import Counter
print(raw_data.columns)

def process_input_text(d, reverse_label=False):
    
    input1 = d['post_title_1'] +"\n"+ d['post_description_1'] if type(d['post_description_1']) == str else d['post_title_1']
#     input1 += "\nCOMMENT: "+d['comment_1']
#     comment1 = "COMMENT1: "+d['comment_1']
    input2 = d['post_title_2'] +"\n"+ d['post_description_2'] if type(d['post_description_2']) == str else d['post_title_2']
#     input2 += "\nCOMMENT: "+d['comment_2']
    if reverse_label:
        return "Comment 1: "+ d['comment_2']+"\nComment 2: "+ d['comment_1'] + "\nPOST1: "+input2+"\nPOST2: "+input1
    else:
        return "Comment 1: "+ d['comment_1']+"\nComment 2: "+ d['comment_2'] + "\nPOST1: "+input1+"\nPOST2: "+input2
    
def process_data(data:list[dict], balance_label=True)->list[str]:
    '''process each sample to training data'''
    if data is None:
        return None
    processed = []
    num_labels = [0, 0]
    for _, d in data.iterrows():
        if 'annotation' in d:
            try:
                label = int(d['annotation'][NORM_DIMENSION]) -1
            except:
                print(f"label not either 1 or 2 (label: {d['annotation'][NORM_DIMENSION]})")
                continue
        elif 'synthetic_label' in d:
            try:
                label = int(d['synthetic_label']) -1
            except:
                print(f"label not either 1 or 2")
                print(d['synthetic_label'])
                continue
        if balance_label:
            less_num_label = 0 if num_labels[0] < num_labels[1] else 1
            if less_num_label == label:
#                 input_text = input1+input2
                input_text = process_input_text(d)
                instance = [input_text, less_num_label]
            else:
                input_text = process_input_text(d, reverse_label=True)
                instance = [input_text, less_num_label]
            num_labels[less_num_label] += 1
        else:
            input_text = process_input_text(d)
            instance = [input_text, label]
        tokenized = tokenizer(instance[0], max_length=MAX_LEN, padding = 'max_length', truncation=True, return_tensors='pt')
#         tokenized = tokenizer(instance[0], max_length=MAX_LEN, padding = 'max_length', truncation=True, return_tensors='pt').to(device)
        if 'token_type_ids' in tokenized:
            tokenized = {'input_ids':tokenized['input_ids'][0], 'token_type_ids':tokenized['token_type_ids'][0], 'attention_mask':tokenized['attention_mask'][0]}
        else:
            tokenized = {'input_ids':tokenized['input_ids'][0], 'attention_mask':tokenized['attention_mask'][0]}
        instance[0]=tokenized
        processed.append(instance)
    return processed

data_human = process_data(raw_data)
data_synthetic = process_data(synthetic)
data_synthetic_old = process_data(synthetic_old)
# data = []
# for rdata in [raw_data, synthetic]:
#   data += process_data(rdata)
#   # print(data[0])
#   print("size", len(data))
#   print("label distribution", Counter([p[1] for p in data]))

Index(['post_title_1', 'post_description_1', 'comment_1', 'post_title_2',
       'post_description_2', 'comment_2', 'comment_metadata_1',
       'post_metadata_1', 'comment_metadat2', 'post_metadata2', 'annotation'],
      dtype='object')
label not either 1 or 2 (label: tie)
label not either 1 or 2 (label: None)


In [8]:
from torch.utils.data import Dataset, DataLoader
class CustomData(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

# training configuration

In [9]:
# PATH_SAVE
str_modelname=MODEL_NAME.split("/")[-1]
PATH_MODEL_SAVE="/mmfs1/home/chanyoun/models/"+f"{NORM_DIMENSION}-{str_modelname}/"
if not os.path.isdir(PATH_MODEL_SAVE):
    os.mkdir(PATH_MODEL_SAVE)

import logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(PATH_MODEL_SAVE+"training.log"),
        logging.StreamHandler()
    ]
)

In [10]:
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
TEST_RATIO=0.2
VAL_RATIO=0.1

# random.shuffle(data)
# num_test = int(len(data)*TEST_RATIO)
# print(f"data size\ttrain:{len(data)-num_test}\ttest:{num_test}")
# dataloader_train = DataLoader(CustomData(data[num_test:]), batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# dataloader_test = DataLoader(CustomData(data[:num_test]), batch_size=VALID_BATCH_SIZE, shuffle=True)

# data_human
# data_synthetic
if data_synthetic_old is not None:
    data_synthetic_and_old = data_synthetic+data_synthetic_old
    random.shuffle(data_synthetic_and_old)
    num_val = int(len(data_synthetic_and_old)*VAL_RATIO)
    data_dev, data_train  = data_synthetic_and_old[:num_val], data_synthetic_and_old[num_val:]
    print(f"data size\ttrain:{len(data_train)}\tdev:{len(data_dev)}\ttest:{len(data_human)}")
    dataloader_train = DataLoader(CustomData(data_train), batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    dataloader_dev = DataLoader(CustomData(data_dev), batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    dataloader_test = DataLoader(CustomData(data_human), batch_size=VALID_BATCH_SIZE, shuffle=True)
else:
    random.shuffle(data_synthetic)
    num_val = int(len(data_synthetic)*VAL_RATIO)
    data_dev, data_train  = data_synthetic[:num_val], data_synthetic[num_val:]
    print(f"data size\ttrain:{len(data_train)}\tdev:{len(data_dev)}\ttest:{len(data_human)}")
    dataloader_train = DataLoader(CustomData(data_train), batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    dataloader_dev = DataLoader(CustomData(data_dev), batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    dataloader_test = DataLoader(CustomData(data_human), batch_size=VALID_BATCH_SIZE, shuffle=True)


data size	train:1125	dev:125	test:98


In [11]:
def evaluate(dataloder):
    model.eval()
    test_loss, num_example, num_correct = 0, 0, 0
    with torch.no_grad():
        for batch in dataloder:
            input, lbl = batch
            input = {k:v.to(device) for k,v in input.items()}
            lbl = lbl.to(device)
            outputs = model(**input, labels=lbl)
            test_loss += outputs.loss
            num_example += len(lbl)
            _, pred_idx = outputs.logits.max(dim=1)
            num_correct += sum(pred_idx == lbl).item()
    return test_loss/len(dataloder), num_correct/num_example


def train(epoch=0, best_val_acc=0, best_test_acc=0, bool_verbose=True, bool_save=True, min_save_epoch=3):
    model.train()
    train_loss = 0
    num_total_batch = len(dataloader_train)
    num_batch, num_example, num_correct_pred = 0, 0, 0
    for idx_batch, batch in enumerate(dataloader_train):
        input, lbl = batch
        input = {k:v.to(device) for k,v in input.items()}
        lbl = lbl.to(device)
        outputs = model(**input, labels=lbl)
        train_loss += outputs.loss
        num_example += len(lbl)
        num_batch += 1
        _, pred_idx = outputs.logits.max(dim=1)
        num_correct_pred += sum(pred_idx == lbl).item()
        if (idx_batch+1) % REPORT_EVERY == 0:
            avg_train_loss = train_loss / num_batch
            avg_acc = num_correct_pred / num_example
            avg_val_loss, avg_val_acc = evaluate(dataloader_dev)
            avg_test_loss, avg_test_acc = evaluate(dataloader_test)
            if bool_verbose:
                logging.info(f"Epoch {epoch} [{idx_batch+1:>3}/{num_total_batch:<3}] loss: {avg_train_loss.item():<8.3f}acc: {avg_acc:<8.3f}val loss: {avg_val_loss:<8.3f}val acc: {avg_val_acc:<8.3f}test loss: {avg_test_loss:<8.3f}test acc: {avg_test_acc:<8.3f}")
#                 print(f"Epoch {epoch} [{idx_batch+1:>3}/{num_total_batch:<3}] loss: {avg_train_loss.item():<8.3f}acc: {avg_acc:<8.3f}val loss: {avg_val_loss:<8.3f}val acc: {avg_val_acc:<8.3f}test loss: {avg_test_loss:<8.3f}test acc: {avg_test_acc:<8.3f}")
            if avg_val_acc > best_val_acc :
                best_val_acc = avg_val_acc
                best_test_acc = avg_test_acc
                if bool_verbose:
                    logging.info(f"new best val acc found ({avg_val_acc:.3f}), new test acc: {avg_test_acc:.3f}")
#                     print(f"new best val acc found ({avg_val_acc:.3f}), new test acc: {avg_test_acc:.3f}")
                if bool_save and epoch>=min_save_epoch:
                    logging.info(f"saving the model to {PATH_MODEL_SAVE}")
#                     print(f"saving the model to {PATH_MODEL_SAVE}")
                    model.save_pretrained(PATH_MODEL_SAVE, from_pt=True) 
            train_loss, num_batch, num_correct_pred, num_example = 0, 0, 0, 0
            model.train()
        optimizer.zero_grad()
        outputs.loss.backward()
        optimizer.step()
    return best_val_acc, best_test_acc

In [12]:
gc.collect()
torch.cuda.empty_cache()

In [13]:
get_memory_usage()

 Free:47.333GB	Available:47.619GB


In [None]:
# Defining some key variables that will be used later on in the training
NUM_EPOCHS = 15
REPORT_EVERY = 70
LEARNING_RATE = 1e-05
WEIGHT_DECAY= 1e-04

# optimizer
# model, _ = initialize_model("microsoft/deberta-v3-large")
# model, _ = initialize_model("microsoft/deberta-v3-base")
model, _ = initialize_model(MODEL_NAME, bool_tokenizer=False)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

best_val_acc, best_test_acc = 0, 0
for epoch in range(NUM_EPOCHS):
    best_val_acc, best_test_acc = train(epoch+1, best_val_acc, best_test_acc, bool_save=False)
    gc.collect()
    torch.cuda.empty_cache()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialoGPT-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-04-09 21:44:24,222 [INFO] Epoch 1 [ 70/282] loss: 0.724   acc: 0.493   val loss: 0.710   val acc: 0.496   test loss: 0.700   test acc: 0.439   
2024-04-09 21:44:24,231 [INFO] new best val acc found (0.496), new test acc: 0.439
2024-04-09 21:45:48,863 [INFO] Epoch 1 [140/282] loss: 0.705   acc: 0.500   val loss: 0.684   val acc: 0.552   test loss: 0.682   test acc: 0.531   
2024-04-09 21:45:48,864 [INFO] new best val acc found (0.552), new test acc: 0.531
2024-04-09 21:47:13,527 [INFO] Epoch 1 [210/282] loss: 0.707   acc: 0.500   val loss: 0.688   val acc: 0.472   test loss: 0.696   test acc: 0.449   
2024-04-09 21:48:38,256 [INFO] Epoch 1 [280/282] loss: 0.688   acc: 0.546   val loss: 0.689   val acc: 0.528  

In [13]:
# Defining some key variables that will be used later on in the training
NUM_EPOCHS = 8
REPORT_EVERY = 140
LEARNING_RATE = 1e-05
WEIGHT_DECAY= 1e-04

# optimizer
model, _ = initialize_model("microsoft/deberta-v3-large")
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

best_val_acc, best_test_acc = 0, 0
for epoch in range(NUM_EPOCHS):
    best_val_acc, best_test_acc = train(epoch+1, best_val_acc, best_test_acc)
    gc.collect()
    torch.cuda.empty_cache()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 [140/563] loss: 0.701   acc: 0.487   val loss: 0.694   val acc: 0.494   test loss: 0.695   test acc: 0.475   
new best val acc found (0.494), new test acc: 0.475
Epoch 1 [280/563] loss: 0.693   acc: 0.521   val loss: 0.692   val acc: 0.498   test loss: 0.692   test acc: 0.515   
new best val acc found (0.498), new test acc: 0.515
Epoch 1 [420/563] loss: 0.692   acc: 0.537   val loss: 0.668   val acc: 0.602   test loss: 0.660   test acc: 0.596   
new best val acc found (0.602), new test acc: 0.596
Epoch 1 [560/563] loss: 0.630   acc: 0.686   val loss: 0.540   val acc: 0.743   test loss: 0.605   test acc: 0.667   
new best val acc found (0.743), new test acc: 0.667
Epoch 2 [140/563] loss: 0.505   acc: 0.780   val loss: 0.404   val acc: 0.827   test loss: 0.522   test acc: 0.768   
new best val acc found (0.827), new test acc: 0.768
Epoch 2 [280/563] loss: 0.432   acc: 0.804   val loss: 0.431   val acc: 0.795   test loss: 0.569   test acc: 0.768   
Epoch 2 [420/563] loss: 0.385   