

GENERATING THE DATAASET

In [3]:
import pandas as pd
import random

# Step 1: Define some rare diseases and their symptoms
rare_diseases = {
    "Addison's Disease": ["fatigue", "weight loss", "low blood pressure", "muscle weakness", "darkened skin"],
    "Wilson's Disease": ["jaundice", "tremors", "slurred speech", "abdominal pain", "mood swings"],
    "Creutzfeldt-Jakob Disease": ["memory loss", "blurred vision", "muscle stiffness", "behavioral changes", "difficulty speaking"],
    "Gaucher Disease": ["bone pain", "fatigue", "easy bruising", "enlarged spleen", "nosebleeds"],
    "Pompe Disease": ["muscle weakness", "breathing difficulty", "enlarged heart", "trouble feeding", "fatigue"],
    "Fabry Disease": ["burning pain", "rash", "stomach pain", "kidney problems", "decreased sweating"],
    "Batten Disease": ["seizures", "vision loss", "clumsiness", "personality changes", "speech problems"],
    "Alkaptonuria": ["dark urine", "joint pain", "stiffness", "heart problems", "discoloration of skin"],
    "Prader-Willi Syndrome": ["obesity", "weak muscles", "learning difficulties", "short stature", "insatiable appetite"],
    "Progeria": ["growth failure", "baldness", "stiff joints", "aged appearance", "cardiovascular disease"]
}

# Step 2: Generate a realistic dataset (synthetic)
data = []
for i in range(200):  # 200 samples
    disease = random.choice(list(rare_diseases.keys()))
    symptoms = rare_diseases[disease]
    selected_symptoms = random.sample(symptoms, k=random.randint(2, len(symptoms)))  # random subset of symptoms
    data.append({
        "Symptom_1": selected_symptoms[0] if len(selected_symptoms) > 0 else "",
        "Symptom_2": selected_symptoms[1] if len(selected_symptoms) > 1 else "",
        "Symptom_3": selected_symptoms[2] if len(selected_symptoms) > 2 else "",
        "Symptom_4": selected_symptoms[3] if len(selected_symptoms) > 3 else "",
        "Disease": disease
    })

df = pd.DataFrame(data)

# Step 3: Save dataset
dataset_path = "/content/rare_disease_prediction_dataset.csv"
df.to_csv(dataset_path, index=False)

dataset_path


'/content/rare_disease_prediction_dataset.csv'

1. Dataset Loaded
2. Convert text symptoms into numerical form using TF-IDF vectorization.

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the dataset
df = pd.read_csv("/content/rare_disease_prediction_dataset.csv")

# Step 2: Combine symptom columns into a single text column
df["combined_symptoms"] = df[["Symptom_1", "Symptom_2", "Symptom_3", "Symptom_4"]].fillna("").agg(" ".join, axis=1)

# Step 3: Convert text to numerical form using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["combined_symptoms"])

# Step 4: Target variable
y = df["Disease"]

# Display sample transformed features
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df.head()


Unnamed: 0,abdominal,aged,appearance,appetite,baldness,behavioral,blood,blurred,bone,breathing,...,stomach,sweating,swings,tremors,trouble,urine,vision,weak,weakness,weight
0,0.0,0.0,0.0,0.0,0.0,0.532121,0.0,0.541033,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.457762,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.530094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.724591,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.530094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405747,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


the Random Forest and XGBoost training and evaluation

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("rare_disease_prediction_dataset.csv")

# Combine symptoms
df["combined_symptoms"] = df[["Symptom_1", "Symptom_2", "Symptom_3", "Symptom_4"]].fillna("").agg(" ".join, axis=1)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["combined_symptoms"])

# Encode target
le = LabelEncoder()
y = le.fit_transform(df["Disease"])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Train XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

# Evaluate
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))
print("\nRandom Forest Report:\n", classification_report(y_test, rf_preds))
print("\nXGBoost Report:\n", classification_report(y_test, xgb_preds))


Random Forest Accuracy: 1.0
XGBoost Accuracy: 0.95

Random Forest Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         5
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         3

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


XGBoost Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      0.75      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Save your trained model

In [None]:
from google.colab import files

files.download("rare_disease_model.pkl")
files.download("tfidf_vectorizer.pkl")
files.download("label_encoder.pkl")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# ✅ Fully working Colab model training script for rare disease prediction

!pip install xgboost scikit-learn pandas joblib --quiet

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib
from google.colab import files

# Step 1: Create base dataset
data = {
    "Symptoms": [
        "fatigue weakness weight loss",  # Addison's
        "jaundice tremors speech issues",  # Wilson's
        "memory loss confusion vision problems",  # CJD
        "muscle weakness difficulty swallowing",  # ALS
        "rashes joint pain fever",  # Lupus
        "abdominal pain anemia fatigue",  # Celiac
        "muscle spasms stiffness",  # Stiff Person Syndrome
        "shortness of breath fatigue blue lips",  # Pulmonary Hypertension
    ],
    "Disease": [
        "Addison's Disease",
        "Wilson's Disease",
        "Creutzfeldt-Jakob Disease",
        "ALS",
        "Lupus",
        "Celiac Disease",
        "Stiff Person Syndrome",
        "Pulmonary Hypertension"
    ]
}

df = pd.DataFrame(data)

# ✅ Step 2: Expand dataset (so we have enough samples per class)
df = pd.concat([df] * 10, ignore_index=True)

# Step 3: Encode target
le = LabelEncoder()
df["Disease_Label"] = le.fit_transform(df["Disease"])

# Step 4: TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Symptoms"])
y = df["Disease_Label"]

# ✅ Step 5: Safe split with stratify
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Step 6: Train models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42)

rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Step 7: Evaluate both
rf_acc = accuracy_score(y_test, rf_model.predict(X_test))
xgb_acc = accuracy_score(y_test, xgb_model.predict(X_test))
print(f"Random Forest Accuracy: {rf_acc:.2f}")
print(f"XGBoost Accuracy: {xgb_acc:.2f}")

# Step 8: Choose best model
best_model = rf_model if rf_acc >= xgb_acc else xgb_model
print("✅ Selected Model:", "Random Forest" if rf_acc >= xgb_acc else "XGBoost")

# Step 9: Save
joblib.dump(best_model, "rare_disease_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")
print("✅ Files saved successfully!")

# Step 10: Download
files.download("rare_disease_model.pkl")
files.download("tfidf_vectorizer.pkl")
files.download("label_encoder.pkl")


Random Forest Accuracy: 1.00
XGBoost Accuracy: 1.00
✅ Selected Model: Random Forest
✅ Files saved successfully!


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>