##This is to bring it all together.
load → clean → label → vectorize → train → MLflow log

In [0]:
# install once per cluster (light deps first)
%pip install scikit-learn==1.5.2 mlflow==2.21.3


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%restart_python

In [0]:
dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()


In [0]:
%run "[location]/mimic_mod/io"



In [0]:
%run "[location]/mimic_mod/prep"


In [0]:
%run "[location]/mimic_mod/nlp"


In [0]:
%run "[location]/mimic_mod/train"

In [0]:
# ---- paths & tables (adjust to your volume & tables) ----
notes_tbl = "default.df_anxiety_notes"   # or default.df_anxiety_notes
icd_tbl   = "default.diagnoses_icd"
icd_tbl_d = "default.d_icd_diagnoses"

df_notes = load_table(notes_tbl)
df_icd   = load_table(icd_tbl)
df_icd = df_icd.join(load_table(icd_tbl_d), "icd_code", "left")

# clean/prepare
df_notes = filter_valid_text(df_notes, "text")
df_notes = clean_text(df_notes, "text", "text_clean")

df_labels = anxiety_label_from_icd(df_icd, "long_title")
df_join   = join_notes_labels(df_notes, df_labels, "left")

display(df_join.select("hadm_id","label_anxiety","text_clean").limit(1000))


In [0]:
# collect a manageable slice for modeling (keep compute in check)
pdf = df_join.select("label_anxiety","text_clean").limit(30000).toPandas()
y = pdf["label_anxiety"].astype(int).values
texts = pdf["text_clean"].tolist()

vec, X = tfidf_fit_transform(texts, max_features=30000, ngram_range=(1,2), min_df=3)
clf, report = train_logreg_tfidf(X, y, C=1.0, max_iter=1000)
report




Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

{'0': {'precision': 0.7565095003518649,
  'recall': 0.7147606382978723,
  'f1-score': 0.7350427350427351,
  'support': 1504.0},
 '1': {'precision': 0.7333747669359851,
  'recall': 0.7732634338138925,
  'f1-score': 0.7527910685805422,
  'support': 1526.0},
 'accuracy': 0.7442244224422442,
 'macro avg': {'precision': 0.744942133643925,
  'recall': 0.7440120360558824,
  'f1-score': 0.7439169018116387,
  'support': 3030.0},
 'weighted avg': {'precision': 0.7448581461628773,
  'recall': 0.7442244224422442,
  'f1-score': 0.7439813347056704,
  'support': 3030.0}}

In [0]:
%python
import mlflow

# collect a manageable slice for modeling (keep compute in check)
pdf = df_join.select("label_anxiety", "text_clean").limit(300).toPandas()

# Check class distribution
class_counts = pdf["label_anxiety"].value_counts()
display(class_counts)

# Proceed only if at least two classes are present
if len(class_counts) < 2:
    raise ValueError("Not enough classes in the data for modeling. Please check your label distribution.")

y = pdf["label_anxiety"].astype(int).values
texts = pdf["text_clean"].tolist()

vec, X = tfidf_fit_transform(
    texts,
    max_features=300,
    ngram_range=(1, 2),
    min_df=3
)
clf, report = train_logreg_tfidf(
    X,
    y,
    C=1.0,
    max_iter=1000
)
display(report)
with mlflow.start_run():
    mlflow.sklearn.log_model(
        clf,
        "model",
        input_example=X[0:1]
    )

0    178
1    122
Name: label_anxiety, dtype: int64



Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

{'0': {'precision': 0.7111111111111111,
  'recall': 0.8888888888888888,
  'f1-score': 0.7901234567901234,
  'support': 36.0},
 '1': {'precision': 0.7333333333333333,
  'recall': 0.4583333333333333,
  'f1-score': 0.5641025641025641,
  'support': 24.0},
 'accuracy': 0.7166666666666667,
 'macro avg': {'precision': 0.7222222222222222,
  'recall': 0.673611111111111,
  'f1-score': 0.6771130104463438,
  'support': 60.0},
 'weighted avg': {'precision': 0.7200000000000001,
  'recall': 0.7166666666666667,
  'f1-score': 0.6997150997150997,
  'support': 60.0}}

Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

In [0]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from mlflow.models.signature import infer_signature

pipe = make_pipeline(
    TfidfVectorizer(max_features=300, ngram_range=(1,2), min_df=3),
    LogisticRegression(C=1.0, max_iter=1000, n_jobs=-1)
)

pipe.fit(texts, y)

ex_df = pd.DataFrame({"text": [texts[0]]})
sig = infer_signature(pd.DataFrame({"text": texts[:5]}), pipe.predict(texts[:5]))

with mlflow.start_run():
    mlflow.sklearn.log_model(
        pipe,
        "model",
        input_example=ex_df,
        signature=sig
    )


Uploading artifacts:   0%|          | 0/3 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

In [0]:
spark.version
import mlflow
mlflow.__version__


'2.21.3'