# **Disease Prediction**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TrainingArguments, Trainer
import tensorflow as tf

In [None]:
import pandas as pd

# Read the raw CSV and discard the leftover index column ('Unnamed: 0') in one chained step
df = pd.read_csv("Train_data.csv").drop(columns=['Unnamed: 0'])

# Quick sanity check: first rows, then the remaining column names
print(df.head())
print(df.columns)

       label                                               text
0  Psoriasis  I have been experiencing a skin rash on my arm...
1  Psoriasis  My skin has been peeling, especially on my kne...
2  Psoriasis  I have been experiencing joint pain in my fing...
3  Psoriasis  There is a silver like dusting on my skin, esp...
4  Psoriasis  My nails have small dents or pits in them, and...
Index(['label', 'text'], dtype='object')


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the disease names and store the integer class ids next to them
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])
df['label_encoded'] = encoded_labels

# Show one row per disease so the name -> id mapping can be inspected
label_mapping = df[['label', 'label_encoded']].drop_duplicates()
print(label_mapping)


                                label  label_encoded
0                           Psoriasis             15
50                     Varicose Veins             17
100                           Typhoid             16
150                       Chicken pox              4
200                          Impetigo             10
250                            Dengue              6
300                  Fungal infection              8
350                       Common Cold              5
400                         Pneumonia             14
450             Dimorphic Hemorrhoids              7
500                         Arthritis              1
550                              Acne              0
600           urinary tract infection             23
650                           allergy             18
700   gastroesophageal reflux disease             21
750                     drug reaction             20
800              peptic ulcer disease             22
850                          diabetes         

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and validation sets.
# stratify=... keeps every class at the same proportion in both splits; without it a
# purely random 80/20 split can leave some diseases over- or under-represented in
# the validation set, which skews the reported accuracy.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label_encoded'],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=df['label_encoded'],
)

# Confirm splits
print(f"Train size: {len(train_texts)}, Validation size: {len(val_texts)}")

Train size: 960, Validation size: 240


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the pretrained checkpoint
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both splits are tokenized with the same settings: truncate to at most 128 tokens
# and pad each split to the length of its own longest sequence
tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(list(train_texts), **tokenize_options)
val_encodings = tokenizer(list(val_texts), **tokenize_options)

# Show which fields the tokenizer produced
print(train_encodings.keys())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
import torch

# Turn the token ids and attention masks of each split into PyTorch tensors
train_inputs, train_masks = (
    torch.tensor(train_encodings[field]) for field in ('input_ids', 'attention_mask')
)
val_inputs, val_masks = (
    torch.tensor(val_encodings[field]) for field in ('input_ids', 'attention_mask')
)

# Labels are pandas Series; convert them through plain Python lists
train_labels_tensor = torch.tensor(train_labels.tolist())
val_labels_tensor = torch.tensor(val_labels.tolist())

# Bundle (input_ids, attention_mask, label) per example
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels_tensor)
val_dataset = torch.utils.data.TensorDataset(val_inputs, val_masks, val_labels_tensor)

# Report the (num_examples, sequence_length) shape of each split's input ids
print(f"Train Dataset: {train_inputs.shape}, Validation Dataset: {val_inputs.shape}")

Train Dataset: torch.Size([960, 78]), Validation Dataset: torch.Size([240, 68])


In [None]:
from transformers import AutoModelForSequenceClassification

# Define the BERT model for classification.
# The class names come from the fitted LabelEncoder; position i in `classes_` is the
# name that was encoded as integer i.
class_names = [str(name) for name in label_encoder.classes_]
num_labels = len(class_names)  # Number of unique classes

# Store the id <-> name mapping in the model config so that it is written out with
# save_pretrained(); a reloaded checkpoint can then translate class ids to disease
# names without needing the in-memory LabelEncoder.
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.optim import AdamW, lr_scheduler

# Optimizer
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated in
# favour of the PyTorch implementation (and missing from newer transformers releases).
# eps and weight_decay are set explicitly to the defaults of the old transformers
# optimizer, because torch's own defaults (1e-8 and 0.01) differ.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Scheduler (optional): StepLR is a *step* decay, not a linear one - it multiplies the
# learning rate by `gamma` after every `step_size` calls to scheduler.step().
# It has no effect unless scheduler.step() is actually called (e.g. once per epoch).
scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

# Loss function
# Standalone cross-entropy criterion; note that a Hugging Face classification model
# already returns its own loss when it is called with `labels=`.
loss_function = torch.nn.CrossEntropyLoss()



In [None]:
from torch.utils.data import DataLoader

# Number of examples per mini-batch
batch_size = 16

# Training batches are reshuffled every epoch; validation batches keep dataset order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch
from torch.nn.functional import softmax

# Device setup: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training parameters
epochs = 4
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # ---- Training pass: one optimizer update per mini-batch ----
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device
        batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()

        # Passing labels makes the model compute the classification loss itself
        outputs = model(
            input_ids=batch_inputs,
            attention_mask=batch_masks,
            labels=batch_labels,
        )
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean loss over the number of training batches
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training Loss: {avg_train_loss}")

    # ---- Validation pass: no gradients, accumulate loss and accuracy ----
    model.eval()
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in val_dataloader:
            batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)

            outputs = model(
                input_ids=batch_inputs,
                attention_mask=batch_masks,
                labels=batch_labels,
            )
            loss, logits = outputs.loss, outputs.logits
            val_loss += loss.item()

            # Predicted class = index of the highest class probability
            preds = softmax(logits, dim=1).argmax(dim=1)
            correct += (preds == batch_labels).sum().item()
            total += batch_labels.size(0)

    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = correct / total
    print(f"Validation Loss: {avg_val_loss}, Accuracy: {accuracy}")


Epoch 1/4
Training Loss: 2.941800061861674
Validation Loss: 2.3772061189015705, Accuracy: 0.4875
Epoch 2/4
Training Loss: 1.616729653875033
Validation Loss: 0.9750986973444621, Accuracy: 0.8708333333333333
Epoch 3/4
Training Loss: 0.6028752888242404
Validation Loss: 0.40155888299147285, Accuracy: 0.95
Epoch 4/4
Training Loss: 0.23162327582637468
Validation Loss: 0.2296550914645195, Accuracy: 0.9541666666666667


In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
output_dir = './bert_model'
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

print(f"Model saved to {output_dir}")

Model saved to ./bert_model


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Assign the result instead of ending the cell on a bare `model.to(device)` expression,
# so the notebook does not print the full module repr as cell output.
# eval() makes the inference mode (dropout disabled) explicit.
model = model.to(device).eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def predict(text, model, tokenizer, device, encoder=None):
    """Predict the label(s) for one text or a batch of texts.

    Args:
        text: The input handed to ``tokenizer`` (a string in the example below;
            presumably a list of strings also works, since padding is enabled).
        model: Classification model; it is called with the tokenizer output as
            keyword arguments and its result must expose ``.logits``.
            Assumed to already live on ``device``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``.
        device: Device the encoded inputs are moved to.
        encoder: Optional object with ``inverse_transform`` used to map class ids
            back to label names. Defaults to the notebook-level ``label_encoder``.

    Returns:
        The result of ``inverse_transform`` on the predicted class ids
        (one label per input text).
    """
    # Fall back to the notebook's fitted LabelEncoder when none is supplied
    decoder = label_encoder if encoder is None else encoder

    # Tokenize input text
    encoding = tokenizer(
        text, truncation=True, padding=True, max_length=128, return_tensors='pt'
    ).to(device)

    # Run in eval mode so dropout cannot make predictions random, then restore
    # whatever mode the caller had (so calling this mid-training is harmless).
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encoding).logits
            # argmax over raw logits selects the same class as argmax over softmax
            # probabilities, so no softmax is needed here.
            prediction = torch.argmax(logits, dim=1)
    finally:
        model.train(was_training)

    return decoder.inverse_transform(prediction.cpu().numpy())

# Try the classifier on a single free-text symptom description
text =  "I've been feeling extreme fatigue, headaches, and muscle aches, and I noticed my joints becoming swollen and stiff."
prediction = predict(text=text, model=model, tokenizer=tokenizer, device=device)
print(f"Predicted label: {prediction[0]}")

Predicted label: Arthritis


# **Result: the fine-tuned model gives good results (about 95% validation accuracy)**