In [None]:
!pip install -q peft transformers datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import os
from math import ceil


In [None]:
# Set device: use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed PyTorch's global RNG so that DataLoader shuffling and dropout are
# repeatable when the notebook is re-run from a fresh kernel
seed = 42
torch.manual_seed(seed)

# Set environment variables: turn off parallelism in the Hugging Face tokenizers library
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Model and tokenizer configuration
# The model weights and the tokenizer are loaded from the same checkpoint name
model_name_or_path = tokenizer_name_or_path = "t5-base"

# Token budget: used for padding/truncation during preprocessing and as the
# generation length cap at inference time
max_length = 512

# Optimisation settings
lr = 1e-4        # learning rate handed to AdamW
num_epochs = 5   # number of full passes over the training split
batch_size = 1   # examples per step for both data loaders

In [None]:
# Load dataset: read the symptoms/disease CSV as a single "train" split
dataset = load_dataset("csv", data_files={"train": "disease_with_symptoms.csv"})

# Hold out 10% of the rows for evaluation. The fixed seed keeps the split identical
# across kernel restarts, so per-epoch eval losses are comparable between runs.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# train_test_split names the held-out part "test"; expose it as "validation" instead
dataset["validation"] = dataset.pop("test")

In [None]:
# Load the tokenizer for the configured checkpoint; it is used both to tokenize
# the dataset during preprocessing and to decode token ids back to text later on
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of symptom/disease rows into model inputs and labels.

    Relies on the notebook-level ``tokenizer`` and ``max_length`` globals.

    Args:
        examples: Batch mapping with "Preprocessed_Symptoms" and "Disease"
            entries, each an iterable holding one value per row.

    Returns:
        The tokenizer output for the task-prefixed symptom prompts, with an
        extra "labels" entry holding the target token ids in which every
        padding position has been overwritten with -100.
    """
    prompts = []
    for symptom_text in examples["Preprocessed_Symptoms"]:
        prompts.append(f"translate symptoms to disease: {symptom_text}")

    answers = []
    for disease_name in examples["Disease"]:
        answers.append(f"Disease: {disease_name}")

    # Prompts and targets share the same padding/truncation settings
    tokenizer_kwargs = dict(max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    model_inputs = tokenizer(prompts, **tokenizer_kwargs)
    label_ids = tokenizer(answers, **tokenizer_kwargs)["input_ids"]

    # Overwrite padding with -100 (presumably the loss function's ignore index,
    # so padded positions do not contribute to the training loss — confirm)
    padding_positions = label_ids == tokenizer.pad_token_id
    label_ids[padding_positions] = -100

    model_inputs["labels"] = label_ids
    return model_inputs

In [None]:
# Tokenize every split of the dataset with preprocess_function, in batches.
# The original text columns are dropped so only the tokenizer outputs and labels
# remain, and the on-disk cache is bypassed so tokenization is redone on each run.
processed_datasets = dataset.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    batched=True,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    num_proc=1,
)


Running tokenizer on dataset:   0%|          | 0/477 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/53 [00:00<?, ? examples/s]

In [None]:
# Create data loaders
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

# Pinned host memory only speeds up host-to-GPU copies; requesting it on a
# CPU-only machine just makes DataLoader emit a warning, so enable it only
# when the selected device is a CUDA device.
use_pinned_memory = str(device).startswith("cuda")

# Training batches are reshuffled every epoch; evaluation keeps dataset order
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=use_pinned_memory,
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=use_pinned_memory,
)


In [None]:
# Load the pretrained seq2seq checkpoint, then move it to the selected device.
# NOTE(review): peft is imported at the top of the notebook, but the model is not
# wrapped with get_peft_model here and the optimizer is later given
# model.parameters() — confirm that full fine-tuning is intended.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = model.to(device)

In [None]:
# AdamW over the model's parameters, paired with a linear learning-rate schedule
# that has no warmup phase and spans every optimizer step of the run
# (batches per epoch times number of epochs).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(train_dataloader),
)

In [None]:
# Fine-tune for num_epochs passes over the training split, scoring the
# validation split after each pass and printing the mean losses.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss = total_loss + loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch, not once per epoch
        optimizer.zero_grad()

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    eval_loss = 0
    # NOTE(review): eval_preds is rebuilt every epoch but nothing in the visible
    # notebook reads it afterwards — confirm whether it is still needed.
    eval_preds = []
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            eval_loss = eval_loss + outputs.loss.item()
            # Highest-scoring token at each position of this forward pass, decoded to text
            eval_preds += tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)

    # Mean loss per batch for each split
    train_loss = total_loss / len(train_dataloader)
    eval_loss = eval_loss / len(eval_dataloader)

    # Report the epoch summary
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}")


100%|██████████| 477/477 [02:48<00:00,  2.83it/s]
100%|██████████| 53/53 [00:05<00:00,  9.41it/s]


Epoch 1/5: Train Loss: 1.6018, Eval Loss: 1.2000


100%|██████████| 477/477 [02:44<00:00,  2.90it/s]
100%|██████████| 53/53 [00:05<00:00,  9.37it/s]


Epoch 2/5: Train Loss: 1.1086, Eval Loss: 1.2013


100%|██████████| 477/477 [02:44<00:00,  2.90it/s]
100%|██████████| 53/53 [00:05<00:00,  9.38it/s]


Epoch 3/5: Train Loss: 0.8190, Eval Loss: 1.1965


100%|██████████| 477/477 [02:44<00:00,  2.90it/s]
100%|██████████| 53/53 [00:05<00:00,  9.42it/s]


Epoch 4/5: Train Loss: 0.6555, Eval Loss: 1.1797


100%|██████████| 477/477 [02:44<00:00,  2.90it/s]
100%|██████████| 53/53 [00:05<00:00,  9.40it/s]

Epoch 5/5: Train Loss: 0.5388, Eval Loss: 1.2097





In [None]:
# Inference
# Space-separated symptom terms to run through the fine-tuned model
symptoms = "inattention carelessness hyperactivity executive dysfunction disinhibition emotional dysregulation impulsivity impaired working memory"

# Wrap the symptoms in the same task prefix used by preprocess_function and
# move the encoded prompt to the model's device
inputs = tokenizer(f"translate symptoms to disease: {symptoms}", return_tensors="pt").to(device)

# Generate token ids without gradient tracking, then decode them back to text
with torch.no_grad():
    outputs = model.generate(input_ids=inputs.input_ids, max_length=max_length)
predicted_diseases = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Drop duplicate decoded strings before printing
unique_predictions = list({*predicted_diseases})
print("Predicted diseases:", unique_predictions)

Predicted diseases: ['Disease: Attention deficit hyperactivity disorder']


In [None]:
# Second inference example.
# NOTE(review): this cell repeats the previous inference cell with a different
# symptom string; a shared helper function would remove the duplication.
symptoms = "blackhead whitehead pimple oily skin scarring"

# Wrap the symptoms in the training task prefix and move the encoded prompt to
# the model's device
inputs = tokenizer(f"translate symptoms to disease: {symptoms}", return_tensors="pt").to(device)

# Generate token ids without gradient tracking, then decode them back to text
with torch.no_grad():
    outputs = model.generate(input_ids=inputs.input_ids, max_length=max_length)
predicted_diseases = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Drop duplicate decoded strings before printing
unique_predictions = list({*predicted_diseases})
print("Predicted diseases:", unique_predictions)

Predicted diseases: ['Disease: Acne']
