In [1]:
# %pip install transformers datasets torch

In [None]:
# from transformers import BartForConditionalGeneration, BartTokenizer

# model_name = "facebook/bart-base"
# tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartForConditionalGeneration.from_pretrained(model_name)


In [1]:
import pandas as pd
# Load the TSV dataset
file_path = 'dialogues_eda.tsv'
df = pd.read_csv(file_path, sep='\t')
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,question,answer
0,"Hi, How are you doing?",I am fine. How about yourself?
1,I am fine. How about yourself?,I am pretty good. Thanks for asking.
2,I am pretty good. Thanks for asking.,No problem. So how have you been?
3,No problem. So how have you been?,I have been great. What about you?
4,I have been great. What about you?,I have been good. I am in school right now.


In [8]:
# df.to_dict(orient='records')

In [2]:
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
class DialoguesDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.input_ids = []
        self.labels = []
        self.tokenizer = tokenizer

        self._prepare_data(data, max_length)

    def _prepare_data(self, data, max_length):
        for example in tqdm(data,desc="Processing examples"):
            # Assuming the data is in a dictionary format
            question = example["question"]
            answer = example["answer"]

            input_encodings = self.tokenizer(question, truncation=True, padding="max_length", max_length=max_length)
            label_encodings = self.tokenizer(answer, truncation=True, padding="max_length", max_length=max_length)

            self.input_ids.append(torch.tensor(input_encodings["input_ids"]))
            self.labels.append(torch.tensor(label_encodings["input_ids"]))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "labels": self.labels[idx],
        }


In [3]:

from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = "facebook/bart-base"
# tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartForConditionalGeneration.from_pretrained(model_name)

# Example dataset
# data = [
#     {"question": "What is BART?", "answer": "BART is a transformer model by Facebook AI."},
#     {"question": "How does BART work?", "answer": "BART uses an encoder-decoder structure for NLP tasks."}
# ]
data = df.to_dict(orient='records')

tokenizer = BartTokenizer.from_pretrained(model_name)
train_dataset = DialoguesDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Load model
model = BartForConditionalGeneration.from_pretrained(model_name)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)


Processing examples: 100%|██████████| 3725/3725 [00:04<00:00, 768.14it/s]


In [None]:
from tqdm import tqdm
for batch in tqdm(train_dataloader, desc=f"Test Data"):
    print(batch)
    input_ids = batch["input_ids"]
    labels = batch["labels"]
    break

Test Data:   0%|          | 0/3725 [00:00<?, ?it/s]

{'input_ids': tensor([[    0,   846, 12653,    16,    98,   505,     6,   125,    82,   109,
            45, 15304,     4,     2,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  




: 

In [None]:
# for i, batch in enumerate(tqdm(self.train_loader)):
#                 input_ids, token_type_ids, labels = batch
#                 input_ids = input_ids.to(self.args['device'])
#                 token_type_ids = token_type_ids.to(self.args['device'])
#                 labels = labels.to(self.args['device'])

#                 outputs = self.model(
#                     input_ids=input_ids,
#                     token_type_ids=token_type_ids,
#                     labels=labels
#                 )

In [None]:
from tqdm import tqdm
num_epochs = 3
model.train()

for epoch in range(num_epochs):
    total_loss = 0

    for i,batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

print("Training Completed!")


Epoch 1/3:   0%|          | 1/3725 [00:09<9:31:13,  9.20s/it]

In [5]:
def predict_answer(model, tokenizer, question, context):
    model.eval()
    inputs = tokenizer(question, context, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    predicted_token_ids = outputs.logits.argmax(dim=-1)
    answer = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)
    
    return answer

# Example Usage
question = "What is deep learning?"
context = "Deep learning is a subset of machine learning that uses neural networks to learn from data."

answer = predict_answer(model, tokenizer, question, context)
print("Predicted Answer:", answer)


Predicted Answer: What is deep learning? learning subset of machine learning. uses neural networks to learn from data.
