In [1]:
!pip install -q peft transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import os
from math import ceil


In [4]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

Model and tokenizer configuration

In [5]:
# Model and tokenizer configuration
model_name_or_path = "t5-base"
tokenizer_name_or_path = "t5-base"
max_length = 512
lr = 1e-4
num_epochs = 5
batch_size = 4

Loading Dataset

In [6]:
# Load dataset
dataset = load_dataset("csv", data_files={"train": "disease_with_symptoms.csv"})
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

Pre Processing Dataset

In [7]:
# Preprocess dataset
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
def preprocess_function(examples):
    inputs = [f"translate symptoms to disease: {s}" for s in examples["Preprocessed_Symptoms"]]
    targets = [f"Disease: {d}" for d in examples["Disease"]]

    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

In [9]:
# Map preprocessing function to datasets
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)


Running tokenizer on dataset:   0%|          | 0/477 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/53 [00:00<?, ? examples/s]

In [10]:
# Create data loaders
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)


Model Configuration

In [11]:
# Model configuration
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path).to(device)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

Training Loop

In [13]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation loop
    model.eval()
    eval_loss = 0
    eval_preds = []
    for batch in tqdm(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            eval_loss += outputs.loss.item()
            eval_preds.extend(tokenizer.batch_decode(torch.argmax(outputs.logits, -1), skip_special_tokens=True))

    # Compute metrics
    train_loss = total_loss / len(train_dataloader)
    eval_loss /= len(eval_dataloader)

    # Print metrics
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}")


100%|██████████| 120/120 [02:06<00:00,  1.05s/it]
100%|██████████| 14/14 [00:05<00:00,  2.72it/s]


Epoch 1/5: Train Loss: 1.7364, Eval Loss: 1.2628


100%|██████████| 120/120 [02:05<00:00,  1.05s/it]
100%|██████████| 14/14 [00:05<00:00,  2.73it/s]


Epoch 2/5: Train Loss: 1.1960, Eval Loss: 1.2015


100%|██████████| 120/120 [02:05<00:00,  1.05s/it]
100%|██████████| 14/14 [00:05<00:00,  2.73it/s]


Epoch 3/5: Train Loss: 1.0192, Eval Loss: 1.1726


100%|██████████| 120/120 [02:05<00:00,  1.05s/it]
100%|██████████| 14/14 [00:05<00:00,  2.71it/s]


Epoch 4/5: Train Loss: 0.8911, Eval Loss: 1.1723


100%|██████████| 120/120 [02:05<00:00,  1.05s/it]
100%|██████████| 14/14 [00:05<00:00,  2.72it/s]

Epoch 5/5: Train Loss: 0.8122, Eval Loss: 1.1730





Inferences

In [14]:
# symptoms
symptoms = "inattention carelessness hyperactivity executive dysfunction disinhibition emotional dysregulation impulsivity impaired working memory"

# Tokenize symptoms
inputs = tokenizer(f"translate symptoms to disease: {symptoms}", return_tensors="pt").to(device)

# Generate predictions
with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=max_length)
    predicted_diseases = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Display unique predictions
unique_predictions = list(set(predicted_diseases))
print("Predicted diseases:", unique_predictions)

Predicted diseases: ['Disease: Attention deficit hyperactivity disorder']


In [17]:
symptoms = "blackhead whitehead pimple oily skin scarring"

# Tokenize symptoms
inputs = tokenizer(f"translate symptoms to disease: {symptoms}", return_tensors="pt").to(device)

# Generate predictions
with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=max_length)
    predicted_diseases = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Display unique predictions
unique_predictions = list(set(predicted_diseases))
print("Predicted diseases:", unique_predictions)

Predicted diseases: ['Disease: Acne']


In [16]:
# Evaluate the model
model.eval()
correct = 0
total = 0
for batch in tqdm(eval_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    pred_ids = torch.argmax(outputs.logits, dim=-1)
    label_ids = batch["labels"]
    # Compute accuracy
    correct += (pred_ids == label_ids).sum().item()
    total += label_ids.ne(tokenizer.pad_token_id).sum().item()

accuracy = correct / total * 100
print(f"Accuracy: {accuracy:.2f}% on the evaluation dataset")

100%|██████████| 14/14 [00:05<00:00,  2.80it/s]

Accuracy: 1.39% on the evaluation dataset



