In [None]:
# Install required libraries (comment this line out once they are installed)
!pip install transformers torch bitsandbytes kagglehub peft

# Import necessary libraries
import kagglehub
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, PeftModel
from huggingface_hub import login

# Set device for model computation
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Function to download and load the COVID-19 Symptoms dataset
def load_covid_symptoms_dataset():
    """Download the Kaggle COVID-19 symptoms dataset and load its CSV.

    Returns:
        A pandas DataFrame read from ``covid-19 symptoms dataset.csv`` inside
        the directory returned by ``kagglehub.dataset_download``.
    """
    csv_name = "covid-19 symptoms dataset.csv"
    dataset_dir = kagglehub.dataset_download("takbiralam/covid19-symptoms-dataset")
    return pd.read_csv(f"{dataset_dir}/{csv_name}")

# Function to map and preprocess the dataset for readability
def preprocess_data(symptoms_df):
    """Return a copy of the dataset with coded columns replaced by readable text.

    The ``bodyPain``, ``runnyNose`` and ``infectionProb`` columns are mapped
    from 0/1 to "no"/"yes"; ``diffBreath`` is mapped from -1/0/1 to a textual
    severity. Values that are not keys of the corresponding mapping become
    NaN (the behaviour of ``Series.map``).

    The input frame is left untouched. Mapping in place would make a second
    call on the same frame turn every coded column into NaN, because the
    already-translated strings are not keys of the mappings.

    Args:
        symptoms_df: DataFrame containing at least the four columns above.

    Returns:
        A new DataFrame with the four columns translated.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    diff_breath_map = {-1: "no difficulty breathing", 0: "moderate difficulty breathing", 1: "severe difficulty breathing"}
    binary_map = {0: "no", 1: "yes"}
    column_maps = {
        'bodyPain': binary_map,
        'runnyNose': binary_map,
        'diffBreath': diff_breath_map,
        'infectionProb': binary_map,
    }

    readable_df = symptoms_df.copy()
    for column, mapping in column_maps.items():
        readable_df[column] = readable_df[column].map(mapping)
    return readable_df

# Function to generate fine-tuning prompts and responses from dataset
def generate_fine_tune_data(symptoms_df):
    """Build prompt/response pairs for fine-tuning from the symptoms table.

    Each row becomes one sentence describing the patient (the prompt) and a
    label (the response): "High" when ``infectionProb`` equals "yes",
    otherwise "Low".

    Args:
        symptoms_df: DataFrame with the columns ``fever``, ``bodyPain``,
            ``age``, ``runnyNose``, ``diffBreath`` and ``infectionProb``.

    Returns:
        A DataFrame with the columns ``prompt`` and ``response``, one row per
        input row. The two columns are present even when the input is empty.

    Raises:
        KeyError: If an expected column is missing.
    """
    records = [
        {
            # The degree sign is written as an escape so that the literal
            # cannot be mangled into mojibake by a wrong file encoding.
            "prompt": (f"Patient has fever: {row['fever']}\u00b0F, body pain: {row['bodyPain']}, "
                       f"age: {row['age']}, runny nose: {row['runnyNose']}, "
                       f"difficulty breathing: {row['diffBreath']}."),
            "response": "High" if row['infectionProb'] == "yes" else "Low",
        }
        # Row dicts keep each column's own value type; iterrows() would coerce
        # a row to a single dtype (e.g. turning an integer age into 30.0).
        for row in symptoms_df.to_dict("records")
    ]
    return pd.DataFrame(records, columns=["prompt", "response"])

# Hugging Face login (replace with your token)
def authenticate_hugging_face(token):
    """Log in to the Hugging Face Hub with the given access token."""
    login(token=token)

# Initialize tokenizer and model
def initialize_model_and_tokenizer(model_name):
    """Load the tokenizer and causal language model for ``model_name``.

    The model is moved to the module-level ``device``. When the tokenizer has
    no padding token, ``[PAD]`` is registered as one and the model's token
    embeddings are resized to the new vocabulary size.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = model.to(device)

    # Nothing more to do when the tokenizer already knows how to pad.
    if tokenizer.pad_token is not None:
        return tokenizer, model

    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

# Tokenize input data
def tokenize_data(fine_tune_data, tokenizer):
    """Tokenize every prompt/response pair of ``fine_tune_data`` row by row.

    Each row is rendered as ``<s>Prompt: ...\\nResponse: ...</s>`` and passed
    to ``tokenizer`` with max-length padding and truncation at 256 tokens.

    Returns:
        The result of ``DataFrame.apply`` over the rows, i.e. one tokenizer
        output per row.
    """
    def _encode_row(row):
        prompt, response = row['prompt'], row['response']
        return tokenizer(
            f"<s>Prompt: {prompt}\nResponse: {response}</s>",
            padding="max_length",
            truncation=True,
            max_length=256,
        )

    encoded_rows = fine_tune_data.apply(_encode_row, axis=1)
    return encoded_rows

# PyTorch Dataset class for fine-tuning
class FineTuneDataset(Dataset):
    """Map-style dataset over a batch of tokenized sequences for causal-LM training.

    Args:
        inputs: Mapping with ``input_ids`` and ``attention_mask`` entries,
            each indexable by example (e.g. the tokenizer output produced
            with ``return_tensors="pt"``).
    """

    # Label value that the loss skips (the default ``ignore_index`` of
    # torch.nn.CrossEntropyLoss, which Hugging Face models rely on).
    IGNORE_INDEX = -100

    def __init__(self, inputs):
        self.input_ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example.

        ``labels`` is a copy of ``input_ids`` in which every position with an
        attention mask of 0 (padding) is replaced by ``IGNORE_INDEX``, so that
        padding does not contribute to the training loss.
        """
        input_ids = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]

        # Clone so that masking the labels never alters the model inputs.
        labels = torch.as_tensor(input_ids).clone()
        labels[torch.as_tensor(attention_mask) == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

# Prepare dataset and tokenize in batch mode
def prepare_dataset(fine_tune_data, tokenizer):
    """Tokenize all prompt/response pairs in one batch and wrap them in a dataset.

    Every example is rendered as ``<s>Prompt: ...\\nResponse: ...</s>``, the
    same template that ``tokenize_data`` uses, so the model is trained on the
    expected answer and not on the prompt alone.

    The tensors are left on the CPU. ``Trainer`` moves each batch to the
    training device itself, and its DataLoader pins memory by default, which
    does not work for tensors that already live on the GPU.

    Args:
        fine_tune_data: DataFrame with ``prompt`` and ``response`` columns.
        tokenizer: Hugging Face tokenizer with a padding token configured.

    Returns:
        A ``FineTuneDataset`` over the tokenized examples.
    """
    texts = [
        f"<s>Prompt: {prompt}\nResponse: {response}</s>"
        for prompt, response in zip(fine_tune_data['prompt'], fine_tune_data['response'])
    ]
    inputs = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    return FineTuneDataset(inputs)

# Configure PEFT (LoRA) settings
def configure_peft(model):
    """Wrap ``model`` with LoRA adapters on its query and value projections.

    The adapter uses rank 8, alpha 32 and dropout 0.1, with no bias training,
    for the ``CAUSAL_LM`` task type.

    Args:
        model: Base causal language model whose attention layers expose
            ``q_proj`` and ``v_proj`` modules.

    Returns:
        The PEFT-wrapped model.
    """
    # Local import so this function works with the file's existing imports.
    from peft import get_peft_model

    peft_config = LoraConfig(
        r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
    )
    # get_peft_model is the documented entry point: it selects the wrapper
    # class that matches task_type instead of the generic PeftModel base.
    return get_peft_model(model, peft_config)

# Set training arguments
def get_training_args():
    """Return the ``TrainingArguments`` used for the LoRA fine-tuning run.

    No evaluation strategy is set, which leaves evaluation disabled (the
    library default): this pipeline never supplies an evaluation dataset, and
    requesting per-epoch evaluation without one makes ``Trainer`` raise.
    Omitting the keyword also sidesteps its rename from
    ``evaluation_strategy`` to ``eval_strategy`` across transformers releases.
    """
    return TrainingArguments(
        output_dir="./finetuned-covid",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_total_limit=2,
    )

# Train the model using Hugging Face Trainer
def train_model(model, train_dataset, tokenizer=None):
    """Fine-tune ``model`` on ``train_dataset`` and save the result.

    Args:
        model: The (PEFT-wrapped) model to train.
        train_dataset: Dataset yielding ``input_ids``, ``attention_mask`` and
            ``labels`` for each example.
        tokenizer: Optional tokenizer to save next to the model. When omitted,
            a module-level ``tokenizer`` variable is used if one exists (as in
            a notebook session); otherwise only the model is saved.
    """
    training_args = get_training_args()
    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
    trainer.train()

    # Save into the directory Trainer writes to, so the two cannot drift apart.
    output_dir = training_args.output_dir
    model.save_pretrained(output_dir)

    # Looking the name up explicitly avoids a NameError when no tokenizer was
    # passed and no global of that name exists.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")

    if tokenizer is not None:
        tokenizer.save_pretrained(output_dir)
        print("Fine-tuning with PEFT completed and models saved.")
    else:
        print("Fine-tuning with PEFT completed and model saved (no tokenizer was provided, so none was saved).")

# Generate responses from fine-tuned model
def generate_response(prompt, model, tokenizer):
    """Generate the model's continuation for ``prompt``.

    The prompt is rendered with the opening part of the training template
    (``<s>Prompt: ...\\nResponse:``) and left open, so the model continues
    with the answer. Closing it with ``</s>`` would mark the sequence as
    finished before any answer is produced.

    Args:
        prompt: Free-text description to classify.
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded sequence (input text followed by the generated tokens)
        with special tokens removed.
    """
    inputs = tokenizer(f"<s>Prompt: {prompt}\nResponse:", return_tensors="pt").to(device)
    # Pass the attention mask explicitly; without it generate() has to guess
    # which positions are padding.
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=100,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Main workflow
def main():
    """Run the whole pipeline: load data, fine-tune with LoRA, try one prompt.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is not set.
    """
    # Local import keeps this function self-contained.
    import os

    # Load and preprocess dataset
    symptoms_df = load_covid_symptoms_dataset()
    symptoms_df = preprocess_data(symptoms_df)

    # Generate fine-tuning data
    fine_tune_data = generate_fine_tune_data(symptoms_df)

    # Authenticate and initialize model.
    # SECURITY: access tokens must never be committed to source. Any token
    # that has appeared in this file should be revoked and replaced.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise RuntimeError(
            "Set the HF_TOKEN environment variable to a Hugging Face access token "
            "with access to meta-llama/Llama-3.2-1B."
        )
    authenticate_hugging_face(hf_token)
    tokenizer, model = initialize_model_and_tokenizer("meta-llama/Llama-3.2-1B")

    # Prepare dataset for fine-tuning (a single batched tokenization pass)
    train_dataset = prepare_dataset(fine_tune_data, tokenizer)

    # Configure PEFT for the model and start training
    model = configure_peft(model)
    train_model(model, train_dataset)

    # train_model() is called without the tokenizer, so store it alongside
    # the fine-tuned weights here.
    tokenizer.save_pretrained("./finetuned-covid")

    # Example test
    test_prompt = "Patient symptoms: fever, dry cough, and fatigue. Is this likely COVID-19?"
    print("Response:", generate_response(test_prompt, model, tokenizer))

# Run the full workflow only when this file is executed directly (as a script
# or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()