In [31]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
# tiny shim: Path to str, str to str  (no-op)
def as_str(p): 
    from pathlib import Path
    return str(p) if isinstance(p, Path) else p


In [33]:
from pathlib import Path

DATA_DIR    = Path("/mnt/ssd1/saumia/data/text")
WEIGHTS_DIR = Path("weights"); WEIGHTS_DIR.mkdir(exist_ok=True)

MODEL_NAME  = "microsoft/biogpt"  # public and accessible
NUM_LABELS  = 7                   # adjust based on your label_map.json
MAX_LEN     = 256
BATCH_SIZE  = 8
EPOCHS      = 3
LR          = 2e-5


In [34]:
def clean(txt):
    txt = html.unescape(str(txt))
    return re.sub(r"\s+", " ", txt).strip()

# 1) symptom to disease CSV  (already labelled)
sym = pd.read_csv(DATA_DIR / "symptom2disease.csv")
sym = sym.rename(columns={"description": "text"})
sym["text"] = sym["text"].map(clean)
sym["source"] = "sym2dis"

# 2) mtsamples  (long notes to map specialty to category)
mts = pd.read_csv(DATA_DIR / "mtsamples.csv", quoting=1)      # QUOTE_ALL
mts = mts.rename(columns={"medical_specialty": "label",
                          "transcription": "text"})
SPEC_TO_CAT = {
    "Dermatology": "skin",
    "Bariatrics":  "digestive",
    "Cardiovascular / Pulmonary": "respiratory",
}
mts["label"] = mts["label"].map(SPEC_TO_CAT).fillna("other")
mts["text"]  = mts["text"].map(clean)
mts = mts[mts["label"] != "other"]
mts["source"] = "mtsamples"

# --- concatenate & deduplicate ---
df = pd.concat([sym[["label","text","source"]],
                mts[["label","text","source"]]], ignore_index=True)
df = df.drop_duplicates(subset="text").reset_index(drop=True)
print("Label distribution:\n", df["label"].value_counts())


Label distribution:
 label
Psoriasis                          50
Varicose Veins                     50
Typhoid                            50
Impetigo                           50
Fungal infection                   50
Dengue                             50
peptic ulcer disease               50
Hypertension                       50
drug reaction                      50
allergy                            50
urinary tract infection            50
diabetes                           50
Common Cold                        49
Chicken pox                        49
Cervical spondylosis               49
Bronchial Asthma                   49
gastroesophageal reflux disease    48
Pneumonia                          47
Migraine                           47
Arthritis                          46
Acne                               46
Malaria                            44
Dimorphic Hemorrhoids              41
Jaundice                           38
Name: count, dtype: int64


In [35]:
food_df = pd.read_csv(DATA_DIR / "FoodData.csv")
FOOD2ALLERGY = dict(zip(food_df["Food"].str.lower(), food_df["Allergy"]))
with open("food2allergy.json", "w") as f:
    json.dump(FOOD2ALLERGY, f, indent=2)
print("Loaded", len(FOOD2ALLERGY), "food keywords")


Loaded 183 food keywords


In [44]:
label_map_text = {lbl: i for i, lbl in enumerate(sorted(df["label"].unique()))}
with open("label_map_text.json", "w") as f:
    json.dump(label_map_text, f, indent=2)

# Safe label map
df = df[df["label"].isin(label_map_text)].copy()
df["label_id"] = df["label"].map(label_map_text).astype(int)

assert df["label_id"].isnull().sum() == 0, "Some labels are unmapped!"

train_df, val_df = train_test_split(
    df, test_size=0.1, stratify=df["label"], random_state=42
)

print("Train:", len(train_df), "| Val:", len(val_df))


Train: 1037 | Val: 116


In [60]:
try:
    tokenizer = AutoTokenizer.from_pretrained(as_str(MODEL_NAME))
except Exception:
    MODEL_NAME = "distilbert-base-uncased"
    tokenizer  = AutoTokenizer.from_pretrained(as_str(MODEL_NAME))
    print("BioGPT fallback to DistilBERT")

def encode(frame):
    enc = tokenizer(
        frame["text"].tolist(),
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="tf"
    )
    enc["labels"] = tf.convert_to_tensor(frame["label_id"].values, tf.int32)
    return enc

def make_ds(frame):
    enc = encode(frame)
    ds = tf.data.Dataset.from_tensor_slices({
    "input_ids": enc["input_ids"],
    "attention_mask": enc["attention_mask"],
    "labels": enc["labels"]
})

    return ds.shuffle(len(frame), seed=42).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
assert "label_id" in train_df.columns
assert train_df["label_id"].isnull().sum() == 0

train_ds_text = make_ds(train_df)
val_ds_text   = make_ds(val_df)


In [None]:
# %% [6] Build, compile, train
ckpt_path = WEIGHTS_DIR / "text_best_tf.keras"

text_model = TFAutoModelForSequenceClassification.from_pretrained(
    as_str(MODEL_NAME), num_labels=len(label_map_text)
)

#   DON’T pass an explicit loss = …
#   Just compile with an optimizer (+ optional metrics)
text_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-8),
    metrics=["accuracy"]          # accuracy works because model returns logits
)

history = text_model.fit(
    train_ds_text,
    validation_data=val_ds_text,
    epochs=EPOCHS,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint(
            str(ckpt_path),
            save_best_only=True,
            monitor="val_accuracy",
            mode="max"
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            patience=2,
            restore_best_weights=True
        )
    ]
)

tokenizer.save_pretrained(as_str(WEIGHTS_DIR / "text_tokenizer"))
print(" text model saved at", ckpt_path)

# Save the fully trained text model as a .keras package
from pathlib import Path

# ensure the directory exists
Path("models").mkdir(exist_ok=True)

text_model.save("models/text_model.keras")
print(" Text model saved to models/text_model.keras")



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3



Epoch 2/3
Epoch 3/3
✅ text model saved at weights/text_best_tf.keras


In [None]:
print("Train Labels NaN:", train_df["label_id"].isnull().sum())
print("Val Labels NaN:", val_df["label_id"].isnull().sum())


Train Labels NaN: 0
Val Labels NaN: 0


In [55]:
for x, y in train_ds_text:
    print("X keys:", x.keys())
    print("X input_ids shape:", x["input_ids"].shape)
    print("Y (labels) dtype:", y.dtype)
    print("Y (labels) sample:", y.numpy())
    break  # only the first batch


X keys: dict_keys(['input_ids', 'attention_mask'])
X input_ids shape: (8, 256)
Y (labels) dtype: <dtype: 'int32'>
Y (labels) sample: [ 8 22 11 22 19  4 23 13]


NameError: name 'text_model' is not defined