In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('Train_data.csv')

# Preview the first five rows (the default for head()).
print(df.head(n=5))

In [None]:
# Naming note: `x` holds the class labels and `y` holds the free-text descriptions,
# so the x_*/y_* splits below are labels/texts respectively.
x = df["label"]
y = df["text"]

# 60/40 split, stratified on the labels so both parts keep the class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
    stratify=x,
)

# Re-assemble each split into a two-column frame for the later cells.
train_df = pd.DataFrame(dict(label=x_train, text=y_train))
test_df = pd.DataFrame(dict(label=x_test, text=y_test))

In [None]:
from transformers import AutoTokenizer

# Checkpoint used for both the tokenizer and the classifier.
# Alternatives that were considered:
#   - dmis-lab/biobert-base-cased-v1.1
#   - dmis-lab/biobert-base-cased-v1.2
#   - 'distilbert-base-uncased' (general-domain model instead of BioBERT)
model_name = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    Inputs are truncated and padded with ``padding="max_length"``. The direct
    ``tokenizer(...)`` calls later in this cell use different settings
    (``padding=True, max_length=128``), and no visible cell calls this helper.

    Args:
        examples: a mapping (or mapping-like object) exposing a ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Cast the text column to str so every entry handed to the tokenizer is a string.
for split_frame in (train_df, test_df):
    split_frame["text"] = split_frame["text"].astype(str)

# Tokenize both splits with identical settings: truncate to at most 128 tokens
# and pad each split's sequences to a common length.
train_encodings = tokenizer(
    list(train_df["text"]),
    truncation=True,
    padding=True,
    max_length=128,
)
test_encodings = tokenizer(
    list(test_df["text"]),
    truncation=True,
    padding=True,
    max_length=128,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only, then reuse it for the test labels.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df["label"])
test_labels = label_encoder.transform(test_df["label"])

# Class name -> integer id, in the order stored in label_encoder.classes_.
label_mapping = {
    class_name: class_id
    for class_id, class_name in enumerate(label_encoder.classes_)
}
print("Label Mapping:", label_mapping)

In [None]:
import torch

class DiseaseDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per sample.

    ``encodings`` is a mapping from field name to a per-sample indexable
    collection; ``labels`` is an indexable collection of the same length.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, with the label under ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and integer labels in a DiseaseDataset.
train_dataset = DiseaseDataset(
    encodings=train_encodings,
    labels=train_labels,
)
test_dataset = DiseaseDataset(
    encodings=test_encodings,
    labels=test_labels,
)

In [None]:
import os
# Set WANDB_DISABLED for this process before the Trainer is created.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Size the classification head to the number of classes known to the label encoder.
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Training configuration.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Log the training loss every 90 steps.
    logging_steps=90,
)

# The held-out split doubles as the per-epoch evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

In [None]:
import matplotlib.pyplot as plt

# Per-log records accumulated by the Trainer during training.
history = trainer.state.log_history

# Training-loss records carry a "loss" key; evaluation records carry "eval_loss".
train_entries = [entry for entry in history if "loss" in entry]
eval_entries = [entry for entry in history if "eval_loss" in entry]

train_losses = [entry["loss"] for entry in train_entries]
eval_losses = [entry["eval_loss"] for entry in eval_entries]

# Use the global step stored with each record as the x position for BOTH series.
# Rebuilding steps as i * logging_steps started at 0 (one interval too early), and
# plotting validation loss against epoch numbers 1..N on the same axis as steps
# squeezed the validation curve onto the far left edge of the figure.
train_steps = [entry["step"] for entry in train_entries]
eval_steps = [entry["step"] for entry in eval_entries]

# Epoch index of each evaluation; kept so any later cell reading it still works.
eval_epochs = list(range(1, len(eval_losses) + 1))

# Plot both curves on a shared "training step" axis.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_steps, train_losses, label="Training Loss", marker="o")
ax.plot(eval_steps, eval_losses, label="Validation Loss", marker="s", linestyle="dashed")
ax.set_xlabel("Training step")
ax.set_ylabel("Loss")
ax.set_title("Training vs Validation Loss")
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Run the fine-tuned model over the held-out split and take the top-scoring class per row.
predictions = trainer.predict(test_dataset)
logits = torch.tensor(predictions.predictions)
preds = torch.argmax(logits, dim=1)

# Map the integer class ids back to the original label values.
predicted_labels = label_encoder.inverse_transform(preds)

# Compare against the true labels and report overall and per-class metrics.
true_labels = test_df["label"]
accuracy = accuracy_score(true_labels, predicted_labels)
print("Test Accuracy:", accuracy)
print("Classification Report:\n", classification_report(true_labels, predicted_labels))

In [None]:
# Prefer CUDA when PyTorch reports it as available, otherwise fall back to the CPU,
# then move the model onto the chosen device.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def predict_disease(text, max_length=128):
    """Return the predicted class index for a single symptom description.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: the description to classify. It is cast to ``str`` first, matching
            the ``astype(str)`` applied to the text column before training, so
            non-string cells do not break the tokenizer.
        max_length: maximum number of tokens kept after truncation. Defaults to
            128, the value used when the training and test splits were tokenized.

    Returns:
        The integer index of the highest-scoring class.
    """
    # Put the model in evaluation mode so the prediction does not depend on
    # whether an earlier cell happened to leave the model in training mode.
    model.eval()

    # Truncate exactly like the training-time tokenization; without truncation a
    # long description can exceed the model's maximum sequence length.
    inputs = tokenizer(
        str(text),
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(device)  # Move to GPU if available

    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return predicted_class

# Integer class id -> original label, the inverse of label_mapping.
reverse_label_mapping = dict((idx, name) for name, idx in label_mapping.items())
results_dict = {}

# Classify every held-out description one at a time and record it next to the
# true label found at the same position in x_test.
for i, symptom_text in enumerate(y_test):
    predicted_label = predict_disease(symptom_text)
    predicted_disease = reverse_label_mapping.get(predicted_label, "Unknown Disease")
    actual_disease = x_test.iloc[i]

    results_dict[i] = {
        "Symptom Description": symptom_text,
        "Predicted Disease": predicted_disease,
        "Actual Disease": actual_disease,
    }

# Print one report per sample, separated by a 50-character rule.
for sample_id, record in results_dict.items():
    print(f"Sample {sample_id}:")
    print(f"  Symptoms: {record['Symptom Description']}")
    print(f"  Predicted: {record['Predicted Disease']}")
    print(f"  Actual: {record['Actual Disease']}")
    print("-" * 50)
