In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
# from transformers import TextDataset, DataCollatorForLanguageModeling
# import t5tokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import torch
# from transformers import pipeline
import numpy as np
import logging
import torch
import torch.distributed as dist

In [2]:
# Directory handed to `from_pretrained(..., cache_dir=...)` for the tokenizer and model.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
CACHE_DIR = '/scratch/adyansh/cache'

In [3]:
# Load the source table; its 'Context' and 'Decision' columns are read by later cells.
data = pd.read_csv('../ADR-data/context_decision.csv')

In [4]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [5]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized sources with tokenized targets.

    Both ``input_ids`` and ``output_ids`` are indexed with the keys
    ``'input_ids'`` and ``'attention_mask'``. The ``tokenizer`` argument is
    accepted but not used by this class.
    """

    def __init__(self, tokenizer, input_ids, output_ids):
        """Store the source and target id/mask sequences."""
        # Source side.
        self.input_ids = input_ids['input_ids']
        self.attention_mask = input_ids['attention_mask']
        # Target side.
        self.output_ids = output_ids['input_ids']
        self.output_attention_mask = output_ids['attention_mask']

    def __len__(self):
        """Number of examples, taken from the source ids."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return one example as a dict.

        The decoder inputs/mask are the target row without its last
        position; the labels are the same row without its first position.
        """
        target_ids = self.output_ids[idx]
        target_mask = self.output_attention_mask[idx]
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
            decoder_input_ids=target_ids[:-1],
            decoder_attention_mask=target_mask[:-1],
            labels=target_ids[1:],
        )


In [6]:
def get_data(data: pd.DataFrame, max_length = -1):
    """Build prompt and target lists from an ADR table.

    Args:
        data: Frame with 'Context' and 'Decision' columns.
        max_length: Upper bound, in characters (``len`` of the string, not
            tokens), applied to both the full prompt and the decision.
            ``-1`` (the default) disables filtering.

    Returns:
        A ``(context, decision, removed)`` tuple: the prompts, the matching
        decisions, and the ascending positional indices of the rows that
        were dropped by the length filter. ``removed`` is an empty list
        when filtering is disabled or nothing was dropped.
    """
    prompts = [
        f"This is an Architectural Decision Record. Provide a Decision for the Context given below.\n{c}\n## Decision\n"
        for c in data['Context'].tolist()
    ]
    decisions = data['Decision'].tolist()

    # Defined before the filter so the unfiltered call returns an empty list
    # instead of raising UnboundLocalError at the return statement.
    removed = []
    if max_length == -1:
        return prompts, decisions, removed

    kept_prompts = []
    kept_decisions = []
    for i, (c, d) in enumerate(zip(prompts, decisions)):
        if len(c) <= max_length and len(d) <= max_length:
            kept_prompts.append(c)
            kept_decisions.append(d)
        else:
            # Remember the original position so callers can realign outputs.
            removed.append(i)

    return kept_prompts, kept_decisions, removed

In [7]:
# Checkpoint under evaluation and the length budget shared by get_data,
# the tokenizer and generate in the cells of this notebook.
model_name = "google/flan-t5-small"
model_max_length = 2000

# Build the prompts/targets (rows over the budget are dropped and their
# positions recorded in `removed`), then display how many examples remain.
context, decision, removed = get_data(data, max_length=model_max_length)
(len(context), len(decision))

(4856, 4856)

In [8]:
# Load the tokenizer for `model_name`, caching downloaded files under CACHE_DIR.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=model_max_length,
    cache_dir=CACHE_DIR,
)

In [9]:
# Load the seq2seq checkpoint and place it on `device` instead of a hard-coded
# "cuda:0", so the model always shares a device with the input tensors that
# are moved with `.to(device)` — including on machines without a GPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=CACHE_DIR).to(device)

In [10]:
# predicted_decision = []

# for c in tqdm(context):
#     # print(c)
#     input_ids = tokenizer(c, return_tensors="pt").input_ids.to(device)

#     outputs = model.generate(input_ids, max_length=len(input_ids[0])*4, min_length= int(len(input_ids[0])/8))
#     predicted_decision.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
predicted_decision = []

BATCH_SIZE = 64

# Gradients are not needed for generation.
with torch.no_grad():
    for i in tqdm(range(0, len(context), BATCH_SIZE)):
        # Tokenize one batch at a time so that padding=True pads only to the
        # longest prompt in this batch, not the longest prompt in the dataset.
        inputs = tokenizer(context[i:i+BATCH_SIZE], return_tensors="pt", padding=True, truncation=True, max_length=model_max_length, return_attention_mask=True)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # NOTE(review): generate() counts max_length/min_length in tokens, while
        # get_data applied model_max_length to character counts; min_length also
        # forces every output to at least model_max_length/8 tokens — confirm
        # that both are intended.
        outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=model_max_length, min_length= int(model_max_length/8))
        predicted_decision.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

len(predicted_decision)

In [None]:
# Put an empty prediction back at the original row position of every example
# that get_data filtered out, so predictions line up with the rows of `data`.
# `removed` is in ascending order, so inserting front-to-back lands each
# placeholder at its original index.
# The length guard keeps this cell idempotent: re-running it must not insert
# the placeholders a second time.
if len(predicted_decision) == len(context):
    for idx in removed:
        predicted_decision.insert(idx, "")
assert len(predicted_decision) == len(data), "predicted_decision is out of sync with data; re-run the generation cell"

In [16]:
# Replace real newlines in each context with the two-character sequence "\n".
context_2 = [
    raw_context.replace("\n", "\\n")
    for raw_context in data['Context'].tolist()
]

# One row per input row; the prediction column is named after the last
# path segment of `model_name`.
df = pd.DataFrame(
    {
        'Context': context_2,
        'Decision': data['Decision'].tolist(),
        model_name.rsplit('/', 1)[-1]: predicted_decision,
    }
)

df.to_csv('../results/results.csv', index=False)

: 