In [None]:
import nltk
import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset,Dataset, DatasetDict
import evaluate
import nltk
import numpy as np
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from rouge import Rouge

In [None]:
import wandb

# Authenticate with Weights & Biases without embedding a secret in the notebook.
# Called with no explicit key, wandb.login() picks up the WANDB_API_KEY
# environment variable (or previously stored credentials) and otherwise prompts.
# NOTE(review): the API key that used to be committed on this line must be
# treated as leaked and revoked in the W&B account settings.
wandb.login()

In [None]:
# Raw MedQuAD table. `df` keeps every original column (including 'qtype')
# so the exploratory cells below can still use it.
df = pd.read_csv('MedQuAD.csv')  # Replace with your dataset path

# Modelling frame: question-type column removed, text columns renamed.
dataset = df.drop(columns='qtype').rename(columns={'Question': 'question', 'Answer': 'answer'})

In [None]:
import matplotlib.pyplot as plt

# Count how many questions fall under each question type, and collect the
# distinct types in order of first appearance.
qtype_distribution = df['qtype'].value_counts()
unique_qtypes = df['qtype'].unique()

# Bar chart of the per-type counts.
fig, ax = plt.subplots(figsize=(8, 5))
qtype_distribution.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Distribution of Question Types')
ax.set_xlabel('Question Type')
ax.set_ylabel('Number of Questions')
plt.xticks(rotation=45, ha='right')
plt.show()

# List the distinct question types.
print("Unique Question Types:", unique_qtypes)

In [None]:
# Number of whitespace-separated words in every answer.
df['Answer_Length_Words'] = df['Answer'].str.split().map(len)

# Histogram of the answer lengths.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df['Answer_Length_Words'], bins=100, color='salmon', edgecolor='black')
ax.set_title('Answer Length Distribution (Words)')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import string

# Fetch the NLTK corpora used by the preprocessing cells.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# `df` is an alias of `dataset` (not a copy): the edits below, and in the
# following preprocessing cells, modify `dataset` itself.
df = dataset

# Lower-case both text columns, then delete every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)
for column in ('question', 'answer'):
    df[column] = df[column].str.lower().str.translate(punctuation_table)
df.head()

In [None]:
# Interrogatives and copulas that must survive stopword removal.
question_words = {'who', 'what', 'where', 'when', 'why', 'how', 'is', 'are'}

# English stopword list with the protected question words taken out.
stop_words = {word for word in stopwords.words('english') if word not in question_words}

# Remove stopwords
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens listed in ``stop_words``.

    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)

# Strip stopwords from both text columns.
for column in ('question', 'answer'):
    df[column] = df[column].map(remove_stopwords)
df.head()

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer

# Stemming (not lemmatization): build both stemmers up front.
# `porter` is the one applied below; `lancaster` is a more aggressive alternative.
porter, lancaster = PorterStemmer(), LancasterStemmer()

# Function to stem text
def stem_text(text, stemmer):
    """Stem every whitespace-separated token of ``text``.

    Args:
        text: String to process.
        stemmer: Object exposing a ``stem(word)`` method.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmed_tokens = map(stemmer.stem, text.split())
    return " ".join(stemmed_tokens)

# Run the Porter stemmer over both text columns.
for column in ('question', 'answer'):
    df[column] = df[column].apply(stem_text, args=(porter,))

# Show the first processed rows.
df.head()

In [None]:
# Hold out 20% of the rows for testing, then 25% of the remainder for validation.
SPLIT_SEED = 56
df_full_train, df_test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SPLIT_SEED)

In [None]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original one.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

# Wrap each pandas split as a `datasets.Dataset`.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val, df_test)
)

In [None]:
# Bundle the three splits under the keys used by the tokenization and Trainer
# cells below ('train', 'validation', 'test').
health_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [None]:
# Display the DatasetDict as the cell output.
health_dataset_dict

In [None]:
from datasets import Dataset, DatasetDict
from transformers import BloomTokenizerFast, BloomForCausalLM, Trainer, TrainingArguments
# Load the tokenizer and the causal-LM model identified by `model_id`.
model_id = "bigscience/bloom-560m"
tokenizer = BloomTokenizerFast.from_pretrained(model_id)
model = BloomForCausalLM.from_pretrained(model_id)

In [None]:
CUTOFF = 300  # maximum number of tokens per example; longer sequences are truncated


def tokenize_function(examples):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each example becomes the single sequence ``"<question>\\n<answer><eos>"`` so the
    model is trained to continue a question with its answer. (Tokenizing the
    question alone would only teach the model to reproduce questions.)

    Args:
        examples: Batch mapping with parallel ``"question"`` and ``"answer"``
            lists of strings, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output, padded/truncated to ``CUTOFF`` tokens, with an
        added ``"labels"`` entry: a copy of ``input_ids`` in which padding
        positions are replaced by -100 so they are ignored by the loss.
    """
    eos = tokenizer.eos_token or ""
    texts = [
        f"{question}\n{answer}{eos}"
        for question, answer in zip(examples["question"], examples["answer"])
    ]
    inputs = tokenizer(texts, truncation=True, padding="max_length", max_length=CUTOFF)

    # -100 is the default ignore_index of the cross-entropy loss; without this
    # mask the padding tokens would be trained on like real text.
    inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(inputs["input_ids"], inputs["attention_mask"])
    ]
    return inputs

# Tokenize the datasets: apply tokenize_function to every split, one batch at a time.
tokenized_datasets = health_dataset_dict.map(tokenize_function, batched=True)


In [None]:
import os
output_dir = "MEdQuAD/results/bloom"  # Replace with a directory where you have write permissions
# Create the directory (and any missing parents); no error if it already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Define custom compute_loss function
import torch.nn.functional as F  # Make sure this is imported for the loss calculation
class CustomTrainer(Trainer):
    """Trainer whose loss is an explicit next-token cross-entropy over the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the shifted causal-LM cross-entropy loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose ``logits``.
            inputs: Batch mapping containing ``labels`` and, optionally,
                ``attention_mask``.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with transformers
                releases that pass this argument to ``compute_loss``; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Take the device from the logits instead of `model.device`, which
        # wrapped models do not necessarily expose.
        labels = inputs.get('labels').to(logits.device)

        # Exclude padding positions from the loss. masked_fill returns a new
        # tensor, so the caller's batch is left untouched.
        attention_mask = inputs.get('attention_mask')
        if attention_mask is not None:
            labels = labels.masked_fill(attention_mask.to(labels.device) == 0, -100)

        # Shift logits and labels for causal language modeling:
        # position t predicts token t + 1.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # Flatten the tokens to calculate loss; -100 targets are skipped.
        loss = F.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=-100,
        )

        return (loss, outputs) if return_outputs else loss

In [None]:
# Training configuration: evaluate once per epoch, log every 10 steps, and keep
# at most 3 checkpoints under `output_dir`.
# NOTE(review): learning_rate=0.001 and num_train_epochs=50 look aggressive for
# fine-tuning a pretrained language model — confirm they are intentional.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=0.001,
    logging_dir=os.path.join(output_dir, 'logs'),  # Set directory for logs
    logging_strategy="steps",  # Log on a step interval (see logging_steps)
    logging_steps=10,          # Number of steps between logging
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.001,
    save_total_limit=3,
    num_train_epochs=50,
    push_to_hub=False
)

# Initialize the Trainer with the custom loss computation,
# training on the 'train' split and evaluating on 'validation'.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

In [None]:
# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

In [None]:
# Save the trained model next to the checkpoints and logs, under the same
# configurable `output_dir`, instead of a hard-coded absolute Kaggle path that
# does not exist on other machines.
trainer.save_model(os.path.join(output_dir, "trained_model"))

In [None]:
from torch.utils.data import DataLoader
# Function to generate predictions
def generate_predictions(dataset, model, tokenizer, device, batch_size=16,
                         max_new_tokens=200, max_input_length=None):
    """Generate an answer for every question in ``dataset``.

    Args:
        dataset: Iterable of examples with ``'question'`` and ``'answer'`` string
            fields; it is batched with a ``DataLoader``.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode the questions and decode the output.
        device: Device the model and the input tensors are moved to.
        batch_size: Number of questions generated per step.
        max_new_tokens: Maximum number of tokens generated *after* the prompt.
            (A total ``max_length`` would also count the prompt tokens and can
            be shorter than a long prompt.)
        max_input_length: Prompts are truncated to this many tokens; defaults to
            the notebook-level ``CUTOFF``.

    Returns:
        ``(predictions, references)``: the generated continuations, without the
        prompt text, and the matching reference answers, in dataset order.
    """
    if max_input_length is None:
        max_input_length = CUTOFF

    model.to(device)
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size)
    predictions = []
    references = []

    for batch in dataloader:
        # Tokenize the inputs
        inputs = tokenizer(list(batch['question']), return_tensors='pt', padding=True,
                           truncation=True, max_length=max_input_length)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        # Generate predictions without updating the model parameters
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id,
            )

        # A decoder-only model returns prompt + continuation. Keep only the
        # tokens after the (padded) prompt width so the question text is not
        # scored as part of the prediction.
        generated = outputs[:, input_ids.shape[1]:]

        # Decode the generated tokens into strings
        preds = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Extend the predictions and references lists
        predictions.extend(preds)
        references.extend(batch['answer'])

    return predictions, references

# Generate predictions for the test dataset
# (the device is re-derived here: GPU when CUDA is available, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predictions, references = generate_predictions(tokenized_datasets['test'], model, tokenizer, device)

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

# Score the generated answers against the references with BLEU and ROUGE.
if min(len(predictions), len(references)) == 0:
    print("No predictions or references to evaluate.")
else:
    # Scorers: smoothed sentence-level BLEU and stemmed ROUGE-1/2/L.
    smooth = SmoothingFunction().method4
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Per-pair scores, one list per metric.
    bleu1_scores, bleu4_scores = [], []
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for ref, pred in zip(references, predictions):
        # BLEU works on whitespace tokens; the reference is wrapped in a list
        # because sentence_bleu accepts several references per hypothesis.
        ref_tokens = [ref.split()]
        pred_tokens = pred.split()
        bleu1_scores.append(
            sentence_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth)
        )
        bleu4_scores.append(
            sentence_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
        )

        # ROUGE works on the raw strings; keep the F-measure of each variant.
        rouge_scores = rouge.score(ref, pred)
        rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
        rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

    def _average(scores):
        """Arithmetic mean of ``scores``; 0 when the list is empty."""
        return sum(scores) / len(scores) if len(scores) != 0 else 0

    avg_bleu1 = _average(bleu1_scores)
    avg_bleu4 = _average(bleu4_scores)
    avg_rouge1 = _average(rouge1_scores)
    avg_rouge2 = _average(rouge2_scores)
    avg_rougeL = _average(rougeL_scores)

    # Report the corpus-level averages.
    print(f"BLEU-1 Score: {avg_bleu1}")
    print(f"BLEU-4 Score: {avg_bleu4}")
    print(f"ROUGE-1 Score: {avg_rouge1}")
    print(f"ROUGE-2 Score: {avg_rouge2}")
    print(f"ROUGE-L Score: {avg_rougeL}")

In [None]:
from nltk.translate.meteor_score import meteor_score

# Tokenize the sentences (split by space for simplicity, but consider using more sophisticated tokenization if needed)
tokenized_references = [ref.split() for ref in references]
tokenized_hypotheses = [hyp.split() for hyp in predictions]
# Calculate METEOR scores for each reference-hypothesis pair
meteor_scores = [meteor_score([ref], hyp) for ref, hyp in zip(tokenized_references, tokenized_hypotheses)]

# Average the scores. With no predictions/references the list is empty, so
# report that instead of dividing by zero (same guard as the BLEU/ROUGE cell).
if meteor_scores:
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    # Print the results
    print(f"METEOR Score: {avg_meteor}")
else:
    print("No predictions or references to evaluate.")