In [2]:
def load_data(model, experiment='0_shot', path='../data/):
    if(experiment=='0_shot'):
        df = pd.read_csv(os.path.join(path, '0_shot.csv'))
        if(model=='gpt3'):
            return df['context_input_gpt3'].tolist(), df['decision'].tolist()
        elif(model=='chatGPT'):
            return df['context_input_chatGPT'].tolist(), df['decision'].tolist()
        elif(model=='T5'):
            return df['context_input_T5'].tolist(), df['decision'].tolist()
        else:
            return df['context'].tolist(), df['decision'].tolist()
    elif(experiment=='few_shot'):
        df = pd.read_csv(os.path.join(path, 'few_shot.csv'))
        if(model=='gpt3'):
            return df['context_input_gpt3'].tolist(), df['decision'].tolist()
        else:
            return df['context'].tolist(), df['decision'].tolist()
        
    else:
        df = pd.read_csv(os.path.join(path, '0_shot.csv'))
        if(model=='gpt2'):
            return df['context_input_gpt3'].tolist(), df['decision'].tolist()
        elif(model=='T5'):
            return df['context'].tolist(), df['decision'].tolist()

    


In [3]:
def select_data(context, decision, length):
    context_new, decision_new = [], []
    for c,d in zip(context, decision):
        if(len(c) < length and len(d) < length):
            context_new.append(c)
            decision_new.append(d)
        
    return context_new, decision_new

def print_results(predictions, references):
    results = rouge.compute(predictions=predictions, references=references)
    ic(results)
    results = bleu.compute(predictions=predictions, references=references)
    ic(results)
    results = meteor.compute(predictions=predictions, references=references)
    ic(results)
    results = bertscore.compute(predictions=predictions, references=references, lang="en")
    results = {
        'precision': np.average(results['precision']), 'recall': np.average(results['recall']),
        'f1': np.average(results['f1']), 'hashcode': results['hashcode']
        }
    ic(results)

def store_output(model_name, context, decision, predicted_decision, experiment='0_shot'):
    df = pd.DataFrame(list(zip(context, decision, predicted_decision)), columns =['context', 'decision', 'predicted_decision'])
    if(experiment=='0_shot'):
        df.to_csv('../output/0_shot/'+model_name+'.csv', index=False)
    elif(experiment=='few_shot'):
        df.to_csv('../output/few_shot/'+model_name+'.csv', index=False)
    else:
        df.to_csv('../output/fine_tune/'+model_name+'.csv', index=False)

## GPT2

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
import torch
from transformers import pipeline
from icecream import ic
import numpy as np
import wandb
wandb.init(project="adr_gpt2")
from transformers.integrations import WandbCallback
import logging

import evaluate
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
meteor = evaluate.load('meteor')
bertscore = evaluate.load("bertscore")

In [9]:
# Create a PyTorch dataset
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, data):
        self.examples = []
        for text in data:
            input_ids = tokenizer.encode(text, add_special_tokens=True)
            self.examples.append(torch.tensor(input_ids))
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return self.examples[idx]



In [12]:
logging.getLogger("transformers").setLevel(logging.WARNING)

# Tokenize the data
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
ic('tokenizer loaded')

context, decision = load_data('gpt2', experiment='fine_tune')
context, decision = select_data(context, decision, 2000)
print(len(context), len(decision))
context_train, context_test, decision_train, decision_test = train_test_split(context, decision, test_size=0.20, random_state=42)

train_dataset = [context_train[i] + decision_train[i] for i in range(len(context_train))]
val_dataset = [context_test[i] + decision_test[i] for i in range(len(context_test))]
train_dataset = TextDataset(tokenizer, train_dataset)
val_dataset = TextDataset(tokenizer, val_dataset)
test_dataset = TextDataset(tokenizer, context_test)

model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir='../cache')
ic('Model loaded')

# Fine-tune the model on the dataset
training_args = TrainingArguments(
    output_dir="../checkpoints/", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=20, # number of training epochs
    report_to="wandb",
    logging_steps=1,
    evaluation_strategy="epoch",
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=2,  # batch size for evaluation
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[WandbCallback()],
)

ic| 'tokenizer loaded'


78 78


ic| 'Model loaded'
You are adding a <class 'transformers.integrations.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback


In [None]:
trainer.train()

In [14]:
from transformers import GPT2Config
logging.getLogger("transformers").setLevel(logging.WARNING)

def evaluate(model_name, context, decision, experiment):
    generator = pipeline('text-generation', model='../checkpoints/checkpoint-93', tokenizer='gpt2')
    predicted_decision = []
    for input_text in context:
        output_text = generator(input_text, max_length=500, pad_token_id=tokenizer.eos_token_id)
        output_text = output_text[0]['generated_text']
        predicted_decision.append(output_text[len(input_text):])

    store_output(model_name, context, decision, predicted_decision, experiment)
    print_results(predicted_decision, decision)

evaluate('gpt2', context_test, decision_test, experiment='fine_tune')


ic| results: {'rouge1': 0.1550460300490164,
              'rouge2': 0.021454479691571493,
              'rougeL': 0.08993452079531158,
              'rougeLsum': 0.14206922188863258}
ic| results: {'bleu': 0.009986268716749973,
              'brevity_penalty': 1.0,
              'length_ratio': 2.625189681335357,
              'precisions': [0.12215799614643545,
                             0.018940858136838035,
                             0.003683598293912369,
                             0.0011668611435239206],
              'reference_length': 1977,
              'translation_length': 5190}
ic| results: {'meteor': 0.17607888354746867}
ic| results: {'f1': 0.8146063312888145,
              'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.24.0)',
              'precision': 0.7966932952404022,
              'recall': 0.8341554552316666}


## T5

In [None]:
import pandas as pd
import numpy as np
import os, sys
from icecream import ic
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForSeq2Seq
import torch
import wandb
wandb.init(project="t5_adr")
from transformers.integrations import WandbCallback
import logging

import evaluate
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
meteor = evaluate.load('meteor')
bertscore = evaluate.load("bertscore")

In [5]:
# Create a PyTorch dataset
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, input_ids, output_ids):
        self.input_ids = input_ids['input_ids']
        self.attention_mask = input_ids['attention_mask']
        self.output_ids = output_ids['input_ids']
        self.output_attention_mask = output_ids['attention_mask']
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'decoder_input_ids': self.output_ids[idx][:-1],
            'decoder_attention_mask': self.output_attention_mask[idx][:-1],
            'labels': self.output_ids[idx][1:]
        }

In [10]:
logging.getLogger("transformers").setLevel(logging.WARNING)

model_name = "google/flan-t5-small"
model_max_length = 2000

# Load the data
context, decision = load_data('T5', experiment='fine_tune')
context, decision = select_data(context, decision, model_max_length*4)

# Tokenize the data
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=model_max_length)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
ic('Tokenizing done')

input_ids = tokenizer.batch_encode_plus(context, padding=True, truncation=True, return_tensors="pt")
output_ids = tokenizer.batch_encode_plus(decision, padding=True, truncation=True, return_tensors="pt")
textDataset = TextDataset(tokenizer, input_ids, output_ids)
indices = np.arange(len(context))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
test_context, test_decision = [context[i] for i in test_indices], [decision[i] for i in test_indices]
train_dataset, val_dataset = torch.utils.data.Subset(textDataset, train_indices), torch.utils.data.Subset(textDataset, test_indices)
ic(len(train_dataset), len(val_dataset))

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir='../cache')
ic('Model loaded')

training_args = TrainingArguments(
    output_dir="../checkpoints/" + model_name, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=20, # number of training epochs
    report_to="wandb",
    logging_steps=1,
    evaluation_strategy="epoch",
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=2,  # batch size for evaluation
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[WandbCallback()],
)

ic| 'Tokenizing done'
ic| len(train_dataset): 73, len(val_dataset): 19
ic| 'Model loaded'
You are adding a <class 'transformers.integrations.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback


In [None]:
trainer.train()

In [11]:
def generate_text(prompt, model, tokenizer, max_length=2000):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids=input_ids, max_length=max_length, do_sample=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def evaluate_t5(model_name, context, decision, experiment, model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='../cache', model_max_length=model_max_length)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, cache_dir='../cache')
    
    predicted_decision = []
    for c in context:
        predicted_decision.append(generate_text(c, model, tokenizer))

    store_output(model_name, context, decision, predicted_decision, experiment)
    print(model_name)
    print_results(predicted_decision, decision)

In [12]:
logging.getLogger("transformers").setLevel(logging.WARNING)
model_path = "../checkpoints/google/flan-t5-small/checkpoint-740/"
evaluate_t5(model_name, test_context, test_decision, 'fine_tune', model_path)

google/flan-t5-small


ic| results: {'rouge1': 0.18980749443085573,
              'rouge2': 0.036371821912997385,
              'rougeL': 0.12535303706718598,
              'rougeLsum': 0.1684382640814316}
ic| results: {'bleu': 0.017895622088931726,
              'brevity_penalty': 0.2424245268778062,
              'length_ratio': 0.4137249364586275,
              'precisions': [0.4232081911262799,
                             0.08644536652835408,
                             0.035739313244569026,
                             0.0227111426543648],
              'reference_length': 3541,
              'translation_length': 1465}
ic| results: {'meteor': 0.13891576128316147}
ic| results: {'f1': 0.8311444991513303,
              'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.24.0)',
              'precision': 0.8418689527009663,
              'recall': 0.8217253779110155}
