In [None]:
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForQuestionAnswering,GPT2LMHeadModel,
    TrainingArguments,
    pipeline,
    logging,
    Trainer,
    DataCollatorForLanguageModeling
)
from huggingface_hub import notebook_login, login

from sklearn.model_selection import train_test_split

import pandas as pd

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
import os

import wandb

# Credentials must never live in the notebook. The API key is taken from the
# WANDB_API_KEY environment variable; when the variable is unset, `key=None`
# is passed and wandb is left to use its own credential lookup.
# NOTE(review): the key that was previously hard-coded in this cell has been
# exposed in the file history and should be revoked in the W&B account.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Raw MedQuAD table; `df` keeps the `qtype` column for the exploratory plots
df = pd.read_csv('MedQuAD.csv')  # Replace with your dataset path

# Modelling view of the same data: no question-type column, lower-case column names
dataset = (
    df.drop(columns='qtype')
      .rename(columns={'Question': 'question', 'Answer': 'answer'})
)

In [None]:
import matplotlib.pyplot as plt

# Distinct question types in the raw data, and the number of rows for each
unique_qtypes = df['qtype'].unique()
qtype_distribution = df['qtype'].value_counts()

# Bar chart of the per-type counts
plt.figure(figsize=(8, 5))
qtype_distribution.plot.bar(color='skyblue')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Number of Questions')
plt.xlabel('Question Type')
plt.title('Distribution of Question Types')
plt.show()

# List the question types themselves underneath the chart
print("Unique Question Types:", unique_qtypes)

In [None]:
# Word count of every answer, using whitespace tokenisation
df['Answer_Length_Words'] = df['Answer'].str.split().map(len)

# Histogram of the answer lengths
plt.figure(figsize=(6, 4))
plt.hist(df['Answer_Length_Words'], bins=100, color='salmon', edgecolor='black')
plt.ylabel('Frequency')
plt.xlabel('Number of Words')
plt.title('Answer Length Distribution (Words)')
plt.tight_layout()
plt.show()

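**Text preprocessing.** Common clean-up steps for free text are listed below. The cells that follow apply lowercasing, punctuation removal, stopword removal (question words are kept) and stemming.

* Lowercasing the text.
* Removing punctuation.
* Removing stopwords.
* Removing frequent words.
* Removing rare words.
* Removing emoticons.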

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import string

# Fetch the NLTK resources used by this notebook
nltk.download('stopwords')
nltk.download('wordnet')

# From here on `df` is the modelling frame (an alias of `dataset`, not a copy)
df = dataset

# Translation table that deletes every character in string.punctuation
punctuation_table = str.maketrans('', '', string.punctuation)

# Lower-case each text column, then strip its punctuation
df['question'] = df['question'].str.lower().str.translate(punctuation_table)
df['answer'] = df['answer'].str.lower().str.translate(punctuation_table)
df.head()

In [None]:
# Interrogatives are kept in the text, so they are exempt from stopword removal
question_words = {'who', 'what', 'where', 'when', 'why', 'how'}

# NLTK's English stopword list minus the protected interrogatives
stop_words = set(stopwords.words('english')).difference(question_words)

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    The remaining tokens are re-joined with single spaces. ``stop_words`` is
    read from module scope.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Filter stopwords out of both text columns, then preview the result
df['question'] = df['question'].map(remove_stopwords)
df['answer'] = df['answer'].map(remove_stopwords)
df.head()

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer

# Stemming (not lemmatization): two NLTK stemmers are instantiated here.
# Lancaster is the more aggressive alternative to Porter.
lancaster = LancasterStemmer()
porter = PorterStemmer()

def stem_text(text, stemmer):
    """Stem every whitespace-separated word of ``text``.

    Args:
        text: String to process.
        stemmer: Object exposing a ``stem(word)`` method.

    Returns:
        The stemmed words joined by single spaces.
    """
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(stemmer.stem(word))
    return " ".join(stemmed_words)

# Stem both text columns with the Porter stemmer
df['question'] = df['question'].map(lambda text: stem_text(text, porter))
df['answer'] = df['answer'].map(lambda text: stem_text(text, porter))

# Preview the processed frame
df.head()

In [None]:
# 60 / 20 / 20 split: hold out 20% for test, then a quarter of the remaining
# 80% for validation (same random_state for both cuts)
dataset = df
df_full_train, df_test = train_test_split(dataset, test_size=0.2, random_state=56)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=56)

# Renumber the rows of every split from zero
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

# Wrap each pandas frame as a Dataset and bundle the three splits
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (df_train, df_val, df_test)
)
health_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

In [None]:
# Display the DatasetDict holding the train / validation / test splits
health_dataset_dict

In [None]:
MODEL_NAME = 'gpt2'

# Tokenizer for the base checkpoint; the end-of-sequence token is reused as
# the padding token
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Pretrained language-model weights for the same checkpoint.
# NOTE(review): `device_map="auto"` is requested here and the model is also
# moved with `model.to(device)` in the next cell; `trust_remote_code=True` is
# probably unnecessary for this checkpoint - confirm both before changing.
model = GPT2LMHeadModel.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
)

In [None]:
# Place the model on the selected device, then display its architecture
model = model.to(device)
model

In [None]:
def preprocess_function(data):
    """Tokenize a batch of question/answer pairs as single text sequences.

    Each pair is joined as ``"<question> [SEP] <answer>"`` and the joined
    texts are passed to the module-level ``tokenizer`` with truncation and
    padding enabled and ``max_length=200``.

    Args:
        data: Mapping with parallel ``"question"`` and ``"answer"`` sequences
            of strings.

    Returns:
        Whatever the tokenizer returns for the joined texts.
    """
    joined_pairs = []
    for question_text, answer_text in zip(data["question"], data["answer"]):
        joined_pairs.append(question_text + " [SEP] " + answer_text)

    return tokenizer(
        joined_pairs,
        max_length=200,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

In [None]:
# Run preprocess_function over every split in batched mode
tokenized_dataset = health_dataset_dict.map(preprocess_function, batched=True)

In [None]:
# Display the tokenized splits
tokenized_dataset

In [None]:
# Inspect the field values of the first tokenized training example
tokenized_dataset['train'][0].values()

In [None]:
import os

# Folder that receives checkpoints, logs and the saved model.
# Replace with a directory where you have write permissions.
output_dir = "MEdQuAD/results/gpt2"

# Create the folder (and any missing parents); an existing folder is fine
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Free GPU memory: ask PyTorch to empty its CUDA cache before the Trainer is built
torch.cuda.empty_cache()

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints and logs are written, and how many checkpoints to keep
    output_dir=output_dir,
    logging_dir=os.path.join(output_dir, 'logs'),
    save_total_limit=3,
    push_to_hub=False,
    # Optimisation settings
    num_train_epochs=50,
    learning_rate=0.001,
    weight_decay=0.001,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate once per epoch; log every 10 steps
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
)

# Language-modeling collator with masked-LM disabled (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire model, arguments, data and collator together
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

# Launch fine-tuning
trainer.train()

In [None]:
# Persist the fine-tuned model next to the training checkpoints.
# (The path literal was missing its opening quote, which made this cell a SyntaxError.)
trainer.save_model("MEdQuAD/results/gpt2/trained_model")

In [None]:
import math

import matplotlib.pyplot as plt

# Every record the Trainer logged during training
log_history = trainer.state.log_history

# Epoch number -> last loss value logged for that epoch
train_loss_by_epoch = {}
eval_loss_by_epoch = {}

for log in log_history:
    if 'epoch' not in log:
        continue
    # Round the logged epoch UP rather than truncating it. Training-loss
    # records carry a fractional epoch while an epoch is in progress, and the
    # evaluation record written at the end of that epoch carries the whole
    # number. Truncation filed the training loss one epoch earlier than the
    # matching evaluation loss, shifting the two curves against each other.
    epoch = math.ceil(log['epoch'])
    if 'loss' in log:
        train_loss_by_epoch[epoch] = log['loss']
    if 'eval_loss' in log:
        eval_loss_by_epoch[epoch] = log['eval_loss']

# Common, ordered x-axis; epochs missing from one series are plotted as gaps (NaN)
sorted_epochs = sorted(set(train_loss_by_epoch) | set(eval_loss_by_epoch))
missing = float('nan')
train_losses = [train_loss_by_epoch.get(epoch, missing) for epoch in sorted_epochs]
eval_losses = [eval_loss_by_epoch.get(epoch, missing) for epoch in sorted_epochs]

# Plotting the loss curves
plt.figure(figsize=(10, 5))
plt.plot(sorted_epochs, train_losses, label='Training Loss')
plt.plot(sorted_epochs, eval_losses, label='Evaluation Loss', linestyle='--')
plt.title('Training and Evaluation Loss per Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Re-select the compute device and display it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Reload the tokenizer and fine-tuned weights from a saved checkpoint folder.
# NOTE(review): the step number in this path is hard-coded - confirm it matches
# a checkpoint that exists under the output directory.
last_checkpoint = "MEdQuAD/results/gpt2/checkpoint-123050"
tokenizer = AutoTokenizer.from_pretrained(last_checkpoint)
finetuned_model = GPT2LMHeadModel.from_pretrained(last_checkpoint)

# Switch to evaluation mode
finetuned_model.eval()

In [None]:
# Smoke test: encode one question, generate with default settings, decode
prompt = "Who is at risk for Lymphocytic Choriomeningitis (LCM)?"
inputs = tokenizer(prompt, return_tensors="pt")
print(inputs)

outputs = finetuned_model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

In [None]:
# Select the device, move the in-memory model there and put it in evaluation mode
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device).eval()

def _extract_answer(decoded_text, sep_token="[SEP]"):
    """Return the text that follows the first ``sep_token`` in ``decoded_text``.

    When the separator is absent the text is returned unchanged. Otherwise the
    part after the first separator is taken, trailing whitespace is removed,
    and leading spaces, commas, periods and further separator tokens are
    dropped.
    """
    if sep_token not in decoded_text:
        return decoded_text

    remainder = decoded_text.split(sep_token, 1)[1].rstrip()
    remainder = remainder.lstrip(" ,.")
    # Remove separators as whole prefixes. `str.lstrip(sep_token + ...)` would
    # treat its argument as a set of characters and also eat leading
    # 'S', 'E', 'P', '[' or ']' characters that belong to the answer itself.
    while remainder.startswith(sep_token):
        remainder = remainder[len(sep_token):].lstrip(" ,.")
    return remainder


# Define a function to generate answers
def generate_answer(question, model, tokenizer):
    """Generate an answer for ``question`` and strip the echoed prompt.

    The question is tokenized (truncated to 200 tokens), moved to the
    module-level ``device`` and passed to ``model.generate`` with
    ``max_length=200`` and ``num_beams=1``, without gradient tracking. The first
    returned sequence is decoded with special tokens skipped.

    Args:
        question: Prompt text.
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.

    Returns:
        The decoded text after the first ``[SEP]`` marker, or the whole decoded
        text when no marker was generated.
    """
    inputs = tokenizer(question, return_tensors='pt', max_length=200, truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the device

    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],  # Use attention mask
            max_length=200,
            num_beams=1,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id  # Set pad token ID
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return _extract_answer(decoded)

# Model outputs and gold answers, kept index-aligned
predictions, references = [], []

# Generate one answer per test example
for example in tokenized_dataset['test']:
    generated = generate_answer(example["question"], model, tokenizer)
    predictions.append(generated)
    references.append(example["answer"])

In [None]:
# First question of the test split
tokenized_dataset['test']['question'][0]

In [None]:
# Reference answer collected for the first test example
references[0]

In [None]:
# Generated answer for the first test example
predictions[0]

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

# Score every (reference, prediction) pair, then report the per-metric averages
if not (len(predictions) and len(references)):
    print("No predictions or references to evaluate.")
else:
    # Scorers shared by all pairs
    smooth = SmoothingFunction().method4
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # One list of per-pair scores for each metric
    bleu1_scores, bleu4_scores = [], []
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for ref, pred in zip(references, predictions):
        ref_tokens = ref.split()
        pred_tokens = pred.split()

        # BLEU-1 (unigrams only) and BLEU-4 (uniform weights over 1- to 4-grams)
        bleu1_scores.append(
            sentence_bleu([ref_tokens], pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth)
        )
        bleu4_scores.append(
            sentence_bleu([ref_tokens], pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
        )

        # ROUGE F-measures
        rouge_scores = rouge.score(ref, pred)
        rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
        rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

    # Averages; all zero when no pair was scored
    if bleu1_scores:
        avg_bleu1 = sum(bleu1_scores) / len(bleu1_scores)
        avg_bleu4 = sum(bleu4_scores) / len(bleu4_scores)
        avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
        avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
        avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
    else:
        avg_bleu1 = avg_bleu4 = avg_rouge1 = avg_rouge2 = avg_rougeL = 0

    print(f"BLEU-1 Score: {avg_bleu1}")
    print(f"BLEU-4 Score: {avg_bleu4}")
    print(f"ROUGE-1 Score: {avg_rouge1}")
    print(f"ROUGE-2 Score: {avg_rouge2}")
    print(f"ROUGE-L Score: {avg_rougeL}")

In [None]:
from nltk.translate.meteor_score import meteor_score

# Whitespace tokenisation of references and predictions (a more sophisticated
# tokenizer could be substituted here)
tokenized_references = [ref.split() for ref in references]
tokenized_hypotheses = [hyp.split() for hyp in predictions]

# One METEOR score per reference/hypothesis pair
meteor_scores = [
    meteor_score([ref], hyp)
    for ref, hyp in zip(tokenized_references, tokenized_hypotheses)
]

# Guard the average against an empty score list, mirroring the BLEU/ROUGE
# cell, instead of failing with ZeroDivisionError
if not meteor_scores:
    print("No predictions or references to evaluate.")
else:
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    print(f"METEOR Score: {avg_meteor}")