In [1]:
# === Cell 1 — Kaggle setup (fast, no persistence) =========================
# Installs Kaggle CLI and configures your API token for this Colab session.

!pip -q install kaggle

import os
os.makedirs("/root/.kaggle", exist_ok=True)

from google.colab import files
print("→ Upload your kaggle.json (Kaggle API token) from Kaggle: Account → Create API Token.")
uploaded = files.upload()
if "kaggle.json" not in uploaded:
    raise RuntimeError("kaggle.json missing — please upload your Kaggle API token file.")

!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

!kaggle --version
print("✅ Kaggle CLI ready.")

→ Upload your kaggle.json (Kaggle API token) from Kaggle: Account → Create API Token.


Saving kaggle.json to kaggle.json
Kaggle API 1.7.4.5
✅ Kaggle CLI ready.


In [2]:
# === Cell 2 — Download, unzip, load Kaggle triage dataset =================
!mkdir -p /content/data
!kaggle datasets download -d subhro1530/triaging-healthcare-dataset -p /content/data -q
!unzip -o /content/data/triaging-healthcare-dataset.zip -d /content/data > /dev/null

import glob, pandas as pd
csvs = glob.glob("/content/data/**/*.csv", recursive=True) or glob.glob("/content/data/*.csv")
assert csvs, "No CSV found. Check the unzip output above."
print("Using file:", csvs[0])

df = pd.read_csv(csvs[0])
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head(5)

Dataset URL: https://www.kaggle.com/datasets/subhro1530/triaging-healthcare-dataset
License(s): apache-2.0
Using file: /content/data/triaging_healthcare_dataset.csv
Shape: (790, 10)
Columns: ['Disease', 'Age', 'Gender', 'Latitude', 'Longitude', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Severity']


Unnamed: 0,Disease,Age,Gender,Latitude,Longitude,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Severity
0,Common Cold,45,Male,42.387,-95.223,Runny nose,Sneezing,,60.0,
1,Flu,63,Female,41.722,-97.154,Fever,Cough,Fatigue,,80.0
2,Bronchitis,27,Male,44.586,-96.781,Cough,Shortness of breath,,,70.0
3,Migraine,57,Female,43.251,-99.042,Headache,Nausea,,,90.0
4,Allergy,31,Male,45.129,-94.714,Itchy eyes,Skin rash,,50.0,


In [3]:
# === Cell 3 — Prep: text features + 3-class label ========================
import numpy as np
import pandas as pd
import re

# 1) Build a simple text field from Disease + Symptom_1..Symptom_4
#    Symptom columns are every column whose name starts with "symptom_"
#    (case-insensitive), kept in the frame's column order.
_symptom_prefix = re.compile(r"(?i)^symptom_")
sym_cols = list(filter(_symptom_prefix.match, df.columns))
def join_text(row, cols=None):
    """Build the lowercase free-text feature for one record.

    Joins the ``Disease`` value and every usable symptom value with single
    spaces, then strips and lowercases the result.

    A value is skipped when it is missing (NaN/None) or when its stripped,
    lowercased string form is ``"nan"``. The same rule is applied to
    ``Disease`` so that a missing disease no longer injects the literal
    token "nan" into the text.

    Args:
        row: Record supporting ``.get(key, default)`` — e.g. the row object
            handed over by ``DataFrame.apply(..., axis=1)`` or a plain dict.
        cols: Names of the symptom fields to read, in order. Defaults to the
            notebook-level ``sym_cols`` list.

    Returns:
        str: The joined text; empty when no field held a usable value.
    """
    if cols is None:
        cols = sym_cols

    parts = []
    for c in ["Disease", *cols]:
        v = row.get(c, "")
        # pd.notna handles real missing values; the "nan" comparison catches
        # values that were already stringified before reaching this point.
        if pd.notna(v) and str(v).strip().lower() != "nan":
            parts.append(str(v))
    return " ".join(parts).strip().lower()

# Attach the joined text feature and keep only records whose text is non-empty.
df = df.assign(symptoms_text=df.apply(join_text, axis=1))
df = df.loc[df["symptoms_text"].str.len().gt(0)].copy()

# 2) Severity → numeric. Values that cannot be parsed become NaN, and rows
#    without a numeric severity are dropped; the index is renumbered from 0.
# NOTE(review): the Cell 2 preview shows rows whose numeric score sits under
# Symptom_4 with Severity empty — those rows are removed here; confirm that
# discarding them is intended.
df = df.assign(Severity_num=pd.to_numeric(df["Severity"], errors="coerce"))
df = df.dropna(subset=["Severity_num"]).reset_index(drop=True)

# 3) Cutoffs for the 3-way bucketing: the 0.33 and 0.66 quantiles of the
#    numeric severity. Ties at a cutoff can leave the buckets uneven, so the
#    resulting classes are only roughly balanced.
q = df["Severity_num"].quantile([0.33, 0.66]).to_numpy()
def bin_severity(v, cutoffs=None):
    """Map a numeric severity score to "mild", "moderate" or "severe".

    Args:
        v: Numeric severity score.
        cutoffs: Optional two-element sequence ``(low, high)``. Defaults to
            the notebook-level quantile array ``q``.

    Returns:
        str: "mild" when ``v <= low``, "moderate" when ``v <= high``,
        otherwise "severe" (both bounds are inclusive on the lower bucket).

    Raises:
        ValueError: If ``v`` is missing (NaN/None). Without this guard a NaN
            fails both comparisons and would be labelled "severe".
    """
    if pd.isna(v):
        raise ValueError("cannot bin a missing severity value")

    bounds = q if cutoffs is None else cutoffs
    if v <= bounds[0]:
        return "mild"
    if v <= bounds[1]:
        return "moderate"
    return "severe"

# Label every record with its severity bucket.
df = df.assign(severity=df["Severity_num"].map(bin_severity))

# Report the cutoffs and the resulting class sizes, then preview the result.
class_counts = df["severity"].value_counts()
print("Quantile cutoffs (≈):", q)
print("Class counts:\n", class_counts)

preview_cols = ["symptoms_text", "Disease", "Severity_num", "severity"]
df.loc[:, preview_cols].head(8)

Quantile cutoffs (≈): [75. 85.]
Class counts:
 severity
moderate    213
mild        185
severe      123
Name: count, dtype: int64


Unnamed: 0,symptoms_text,Disease,Severity_num,severity
0,flu fever cough fatigue,Flu,80.0,moderate
1,bronchitis cough shortness of breath,Bronchitis,70.0,mild
2,migraine headache nausea,Migraine,90.0,severe
3,flu cough shortness of breath,Flu,80.0,moderate
4,migraine headache nausea,Migraine,90.0,severe
5,bronchitis cough shortness of breath,Bronchitis,70.0,mild
6,flu fever cough fatigue,Flu,80.0,moderate
7,bronchitis cough shortness of breath,Bronchitis,70.0,mild


In [4]:
# === Cell 4 — Train & evaluate baseline ==================================
# Severity model: TF-IDF features of `symptoms_text` feed a RandomForest and
# an XGBoost classifier; their class probabilities are averaged and the
# ensemble is scored on a stratified 20% hold-out.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

# Class names in id order; the integer encoding is derived from this list so
# the two can never drift apart.
labels = ["mild","moderate","severe"]
label_ids = list(range(len(labels)))

X_text = df["symptoms_text"].values
y = df["severity"].map({name: i for i, name in enumerate(labels)}).values

# Cell 4B re-runs train_test_split with these exact arguments to line its
# disease labels up with X_train/X_test — keep the two calls in sync.
# NOTE(review): the Cell 3 preview shows identical symptoms_text rows repeated;
# if duplicates land on both sides of this split the scores below are
# optimistic. Consider de-duplicating or a group-aware split — to be confirmed.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF (uni+bi-grams keeps it simple and effective); fitted on train only.
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=2000)
X_train = tfidf.fit_transform(X_train_text)
X_test  = tfidf.transform(X_test_text)

# Models
rf = RandomForestClassifier(
    n_estimators=400, class_weight="balanced_subsample", random_state=42, n_jobs=-1
).fit(X_train, y_train)

# The hold-out set is deliberately NOT passed as eval_set: with no early
# stopping configured it would not change the fitted model, and keeping test
# data out of fit() prevents it from leaking in if early stopping is added.
xgb = XGBClassifier(
    n_estimators=500, max_depth=6, learning_rate=0.05,
    subsample=0.9, colsample_bytree=0.9, random_state=42,
    objective="multi:softprob", num_class=len(labels), tree_method="hist", n_jobs=-1
).fit(X_train, y_train)

# Simple probability ensemble: unweighted mean of both models' probabilities.
probs = (rf.predict_proba(X_test) + xgb.predict_proba(X_test)) / 2.0
y_pred = np.argmax(probs, axis=1)

# Passing explicit label ids keeps rows/columns aligned with `labels` even if
# a class happens to be absent from y_test or y_pred.
print(classification_report(
    y_test, y_pred, labels=label_ids, target_names=labels, digits=3
))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred, labels=label_ids))

              precision    recall  f1-score   support

        mild      0.943     0.892     0.917        37
    moderate      0.870     0.930     0.899        43
      severe      0.917     0.880     0.898        25

    accuracy                          0.905       105
   macro avg      0.910     0.901     0.905       105
weighted avg      0.907     0.905     0.905       105

Confusion matrix:
 [[33  4  0]
 [ 1 40  2]
 [ 1  2 22]]


In [10]:
# === Cell 4B — Train & evaluate disease classifier =======================
# Fits a logistic-regression disease classifier on the TF-IDF features built
# in Cell 4 and reports per-disease precision/recall on the same hold-out.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Use the same split as Cell 4 by repeating train_test_split with same seed/args
y_dis = df["Disease"].astype(str).values
_, _, y_dis_train, y_dis_test = train_test_split(
    X_text, y_dis, test_size=0.2, random_state=42, stratify=y
)

# Cheap guard against the two splits drifting apart (checks sizes only, not
# row identity — the arguments above must still match Cell 4 exactly).
assert len(y_dis_train) == X_train.shape[0] and len(y_dis_test) == X_test.shape[0], (
    "Disease labels do not line up with X_train/X_test — re-run Cell 4 first."
)

# NOTE(review): symptoms_text starts with the Disease name (see join_text), so
# these features contain the target itself. The scores below likely overstate
# real-world performance, and free-text queries will not carry that token.
# Consider a disease-free text feature for this model — to be confirmed.

# `multi_class` is omitted: it is deprecated in recent scikit-learn releases,
# and the default lbfgs solver already fits a multinomial model for
# multiclass targets.
disease_clf = LogisticRegression(
    max_iter=2000, class_weight="balanced"
).fit(X_train, y_dis_train)

y_dis_pred = disease_clf.predict(X_test)
# zero_division=0 reports the same 0.000 for classes without predictions or
# support, but without emitting one UndefinedMetricWarning per metric.
print(classification_report(y_dis_test, y_dis_pred, digits=3, zero_division=0))
print("Num classes:", len(disease_clf.classes_))



                                              precision    recall  f1-score   support

                                        Acne      1.000     1.000     1.000         1
                            Acoustic Neuroma      0.000     0.000     0.000         1
                           Allergic Rhinitis      0.000     0.000     0.000         0
                         Alzheimer's Disease      1.000     1.000     1.000         1
         Amyotrophic Lateral Sclerosis (ALS)      0.000     0.000     0.000         1
                                      Anemia      1.000     1.000     1.000         2
                                   Arthritis      1.000     1.000     1.000         2
                                      Asthma      1.000     1.000     1.000         1
                        Autoimmune Hepatitis      0.000     0.000     0.000         1
                                Bell's Palsy      0.000     0.000     0.000         0
                              Bladder Cancer      1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# === Cell 5 — Save artifacts (severity + disease) ========================
import os, json, pathlib, joblib

# Output directory for the files written below.
art = pathlib.Path("/content/artifacts")
art.mkdir(exist_ok=True)

# Persist the fitted vectorizer and the three fitted models, one file each.
for _filename, _fitted in (
    ("tfidf.pkl", tfidf),
    ("rf.pkl", rf),
    ("xgb.pkl", xgb),
    ("disease.pkl", disease_clf),
):
    joblib.dump(_fitted, art / _filename)

# include both severity and disease labels in one config
cfg = dict(
    severity_labels=["mild", "moderate", "severe"],
    disease_labels=disease_clf.classes_.tolist(),
)
config_path = art / "config.json"
config_path.write_text(json.dumps(cfg, indent=2))

# List everything now present in the directory (may include older files).
print("Saved:", [entry.name for entry in art.iterdir()])

Saved: ['disease.pkl', 'rf.pkl', 'xgb.pkl', 'config.json', 'tfidf.pkl', 'versions.json']


In [13]:
# === Cell 6 — Copy artifacts to Google Drive =============================
from google.colab import drive
from pathlib import Path
import shutil

drive.mount('/content/drive')

# Local artifact directory and its destination folder on Drive.
SRC = Path('/content/artifacts')
DST = Path('/content/drive/MyDrive/SACA/models/saca-triage-v1')
DST.mkdir(parents=True, exist_ok=True)

# Only these five files are copied; anything else in SRC is left behind.
artifact_names = ('tfidf.pkl', 'rf.pkl', 'xgb.pkl', 'disease.pkl', 'config.json')
for artifact_name in artifact_names:
    source_path = SRC / artifact_name
    target_path = DST / artifact_name
    shutil.copy2(source_path, target_path)

print("Copied to:", DST)
# Show what the destination folder now contains.
[entry.name for entry in DST.iterdir()]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Copied to: /content/drive/MyDrive/SACA/models/saca-triage-v1


['tfidf.pkl', 'rf.pkl', 'xgb.pkl', 'disease.pkl', 'config.json']

In [14]:
# === Cell 7 — Reload & inference helper ==================================
import json, joblib, numpy as np
from pathlib import Path

# Folder the artifacts are read back from.
ART = Path('/content/drive/MyDrive/SACA/models/saca-triage-v1')

# NOTE: joblib.load unpickles arbitrary Python objects — only point ART at
# artifact files you produced yourself or otherwise trust.
tfidf2, rf2, xgb2, dis_clf2 = (
    joblib.load(ART / artifact_file)
    for artifact_file in ('tfidf.pkl', 'rf.pkl', 'xgb.pkl', 'disease.pkl')
)

with (ART / 'config.json').open() as config_file:
    cfg = json.load(config_file)

# Label vocabularies: a plain list for severity, a NumPy array for diseases
# so predict_one can index it with NumPy integer positions.
sev_labels, dis_labels = cfg["severity_labels"], np.array(cfg["disease_labels"])

def predict_one(text: str, topk: int = 3):
    """Predict triage severity and the most likely diseases for one description.

    The text is vectorised with ``tfidf2``. Severity is the argmax of the
    averaged ``rf2`` / ``xgb2`` class probabilities; diseases are ranked by
    ``dis_clf2`` probability.

    NOTE(review): the models were trained on text that includes the disease
    name (see join_text), whereas free-text input usually will not; expect low
    disease probabilities on such input — to be confirmed.

    Args:
        text: Free-text symptom description. Must contain at least one
            non-whitespace character.
        topk: Number of highest-probability diseases to return (0 yields an
            empty list).

    Returns:
        dict with keys:
            "severity"     – predicted label taken from ``sev_labels``;
            "confidence"   – ensemble probability of that label;
            "probs"        – ensemble probabilities, indexed like ``sev_labels``;
            "disease_topk" – list of ``{"disease": str, "p": float}`` in
                             descending probability order.

    Raises:
        TypeError: If ``text`` is not a string.
        ValueError: If ``text`` is blank or ``topk`` is negative. A blank text
            vectorises to all zeros and would otherwise yield a prediction
            that reflects no input at all; a negative ``topk`` would silently
            slice off the tail of the ranking instead of limiting it.
    """
    if not isinstance(text, str):
        raise TypeError(f"text must be a str, got {type(text).__name__}")
    if not text.strip():
        raise ValueError("text must contain a non-empty symptom description")
    if topk < 0:
        raise ValueError("topk must be >= 0")

    X = tfidf2.transform([text])

    # Severity (ensemble): unweighted mean of both models' probabilities.
    p_sev = (rf2.predict_proba(X) + xgb2.predict_proba(X)) / 2.0
    i = int(np.argmax(p_sev, axis=1)[0])
    severity = sev_labels[i]
    sev_conf = float(p_sev[0, i])

    # Disease (multinomial LR): rank classes by descending probability.
    p_dis = dis_clf2.predict_proba(X)[0]
    idx = np.argsort(p_dis)[::-1][:topk]
    top_diseases = [
        # str() turns the NumPy string scalar into a plain Python str.
        {"disease": str(dis_labels[j]), "p": float(p_dis[j])}
        for j in idx
    ]

    return {
        "severity": severity,
        "confidence": sev_conf,
        "probs": p_sev[0].tolist(),
        "disease_topk": top_diseases
    }

# Try it: run the helper on two sample descriptions and print each result.
for sample_text in (
    "chest pain with shortness of breath for 30 minutes",
    "mild headache since yesterday, no fever",
):
    print(predict_one(sample_text))

{'severity': 'mild', 'confidence': 0.6070738231847896, 'probs': [0.6070738231847896, 0.3138391865953202, 0.07908699627348695], 'disease_topk': [{'disease': 'GERD (Gastroesophageal Reflux Disease)', 'p': 0.04094895374834087}, {'disease': 'Heart Attack', 'p': 0.029867445652245424}, {'disease': 'Lung Cancer', 'p': 0.02942652679310206}]}
{'severity': 'mild', 'confidence': 0.48106806936033897, 'probs': [0.48106806936033897, 0.4422124350079627, 0.07671948818111778], 'disease_topk': [{'disease': 'Encephalitis', 'p': 0.03916338821463785}, {'disease': 'Dengue Fever', 'p': 0.03826014324037502}, {'disease': 'Chickenpox', 'p': 0.017224956024392096}]}
