In [2]:
#!pip install transformers

In [3]:
import pandas as pd
import numpy as np
import os
import atexit
import shutil
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
file_path = 'data/train.csv'
data = pd.read_csv(file_path)

In [5]:
data.head(10)

Unnamed: 0,qtype,Question,Answer
0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."
5,prevention,How to prevent Lymphocytic Choriomeningitis (L...,LCMV infection can be prevented by avoiding co...
6,information,What is (are) Parasites - Cysticercosis ?,Cysticercosis is an infection caused by the la...
7,susceptibility,Who is at risk for Parasites - Cysticercosis? ?,Cysticercosis is an infection caused by the la...
8,exams and tests,How to diagnose Parasites - Cysticercosis ?,"If you think that you may have cysticercosis, ..."
9,treatment,What are the treatments for Parasites - Cystic...,Some people with cysticercosis do not need to ...


In [6]:
# Step 1: Load the pre-trained BlenderBot model and tokenizer
model_name = "facebook/blenderbot-1B-distill"
tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
model = BlenderbotForConditionalGeneration.from_pretrained(model_name)

In [7]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
data.shape

(16407, 3)

In [9]:
def preprocess_data(df):
    # Tokenize inputs and labels
    inputs = tokenizer(df['Question'].tolist(), max_length=128, truncation=True, padding=True, return_tensors="pt")
    targets = tokenizer(df['Answer'].tolist(), max_length=128, truncation=True, padding=True, return_tensors="pt").input_ids

    # Create a dictionary to store the tokenized data
    tokenized_data = {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets
    }
    
    return tokenized_data

# Preprocess the DataFrame
tokenized_data = preprocess_data(data)


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

In [11]:
# Create a custom dataset class
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = tokenized_data['labels']
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Create the dataset
qa_dataset = QADataset(tokenized_data)

# Set up DataLoader
train_loader = DataLoader(qa_dataset, batch_size=4, shuffle=True)


In [12]:
# from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# training_args = Seq2SeqTrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     weight_decay=0.01,
#     save_total_limit=3,
#     num_train_epochs=3,
#     predict_with_generate=True,  # Since we are generating text (answers)
#     logging_dir='./logs',
# )


In [13]:
from transformers import Trainer, TrainingArguments
import tf_keras as keras

# Define training arguments

try:
    training_args = TrainingArguments(
    output_dir="./report",        # Output directory
    evaluation_strategy="epoch",          # Evaluate every epoch
    learning_rate=5e-5,                   # Set a learning rate
    per_device_train_batch_size=4,        # Batch size for training
    num_train_epochs=3,                   # Number of epochs to train
    weight_decay=0.01,                    # Strength of weight decay
     save_steps=10_000,                    # Save every 10,000 steps
    logging_dir="./logs",                 # Directory for storing logs
)
except Exception as e:
    print("An error occurred during training:", e)







In [14]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=qa_dataset
)

# Start training
trainer.train()


  0%|          | 0/12306 [00:00<?, ?it/s]