In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import joblib
import os


In [0]:
# =========================
# LOAD FILE FROM WORKSPACE FILES
# =========================

DATA_PATH = "/Workspace/Users/sealex@asu.edu/KaggleV2-May-2016.csv"  

df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [0]:
# =========================
# CLEANING & FEATURE ENGINEERING (SAFE VERSION)
# =========================

df = df.copy()
print("Initial rows:", len(df))
print("Columns:", df.columns.tolist())

# 1) Fix target column name
df = df.rename(columns={"No-show": "NoShow"})

# 2) Clean and map target robustly
df["NoShow"] = df["NoShow"].astype(str).str.strip().str.lower()
print("Unique raw NoShow values:", df["NoShow"].unique())

# Keep only yes/no rows and map to 1/0
df = df[df["NoShow"].isin(["yes", "no"])]
df["NoShow"] = df["NoShow"].map({"yes": 1, "no": 0})

print("Rows after target clean:", len(df))
print("NoShow value counts:")
print(df["NoShow"].value_counts())

# 3) Make Age numeric then filter
df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
print("Age NaNs before drop:", df["Age"].isna().sum())

df = df.dropna(subset=["Age"])
df = df[(df["Age"] >= 0) & (df["Age"] <= 120)]
print("Rows after age filter:", len(df))

# 4) Parse date columns
df["ScheduledDay"] = pd.to_datetime(df["ScheduledDay"], errors="coerce")
df["AppointmentDay"] = pd.to_datetime(df["AppointmentDay"], errors="coerce")

print("\nNa counts before dropping bad dates:")
print(df[["ScheduledDay", "AppointmentDay"]].isna().sum())

# Drop rows with invalid dates
df = df.dropna(subset=["ScheduledDay", "AppointmentDay"])
print("Rows after dropping NaN dates:", len(df))

# 5) DaysBetween
df["DaysBetween"] = (df["AppointmentDay"] - df["ScheduledDay"]).dt.days
print("\nDaysBetween summary before filter:")
print(df["DaysBetween"].describe())

df = df[df["DaysBetween"] >= 0]
print("Rows after DaysBetween >= 0:", len(df))

# 6) Weekday + weekend flag
df["ApptWeekday"] = df["AppointmentDay"].dt.day_name()
df["IsWeekend"] = df["AppointmentDay"].dt.weekday.isin([5, 6]).astype(int)

print("\nFinal row count after cleaning:", len(df))
df.head()


Initial rows: 110527
Columns: ['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show']
Unique raw NoShow values: ['no' 'yes']
Rows after target clean: 110527
NoShow value counts:
NoShow
0    88208
1    22319
Name: count, dtype: int64
Age NaNs before drop: 0
Rows after age filter: 110526

Na counts before dropping bad dates:
ScheduledDay      0
AppointmentDay    0
dtype: int64
Rows after dropping NaN dates: 110526

DaysBetween summary before filter:
count    110526.000000
mean          9.183794
std          15.255034
min          -7.000000
25%          -1.000000
50%           3.000000
75%          14.000000
max         178.000000
Name: DaysBetween, dtype: float64
Rows after DaysBetween >= 0: 71959

Final row count after cleaning: 71959


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,NoShow,DaysBetween,ApptWeekday,IsWeekend
5,95985130000000.0,5626772,F,2016-04-27 08:36:51+00:00,2016-04-29 00:00:00+00:00,76,REPÚBLICA,0,1,0,0,0,0,0,1,Friday,0
6,733688200000000.0,5630279,F,2016-04-27 15:05:12+00:00,2016-04-29 00:00:00+00:00,23,GOIABEIRAS,0,0,0,0,0,0,1,1,Friday,0
7,3449833000000.0,5630575,F,2016-04-27 15:39:58+00:00,2016-04-29 00:00:00+00:00,39,GOIABEIRAS,0,0,0,0,0,0,1,1,Friday,0
9,78124560000000.0,5629123,F,2016-04-27 12:48:25+00:00,2016-04-29 00:00:00+00:00,19,CONQUISTA,0,0,0,0,0,0,0,1,Friday,0
10,734536200000000.0,5630213,F,2016-04-27 14:58:11+00:00,2016-04-29 00:00:00+00:00,30,NOVA PALESTINA,0,0,0,0,0,0,0,1,Friday,0


In [0]:
# =========================
# FEATURE SETUP & TRAIN/TEST SPLIT (CLEAN)
# =========================

from sklearn.model_selection import train_test_split

target_col = "NoShow"

numeric_features = ["Age", "DaysBetween"]
binary_int_cols = ["Scholarship", "Hipertension", "Diabetes",
                   "Alcoholism", "Handcap", "SMS_received", "IsWeekend"]
categorical_features = ["Gender", "Neighbourhood", "ApptWeekday"]

# Keep only existing columns (safety)
numeric_features = [c for c in numeric_features if c in df.columns]
binary_int_cols = [c for c in binary_int_cols if c in df.columns]
categorical_features = [c for c in categorical_features if c in df.columns]

feature_cols = numeric_features + binary_int_cols + categorical_features

print("Numeric features:", numeric_features)
print("Binary features:", binary_int_cols)
print("Categorical features:", categorical_features)
print("Total features:", len(feature_cols))

X = df[feature_cols]
y = df[target_col].astype(int)

print("X shape:", X.shape)
print("y distribution:")
print(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Numeric features: ['Age', 'DaysBetween']
Binary features: ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'IsWeekend']
Categorical features: ['Gender', 'Neighbourhood', 'ApptWeekday']
Total features: 12
X shape: (71959, 12)
y distribution:
NoShow
0    51437
1    20522
Name: count, dtype: int64
Train shape: (57567, 12)
Test shape: (14392, 12)


In [0]:
# =========================
# PIPELINE, TRAINING, EVALUATION
# =========================

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

# numeric (Age, DaysBetween + all 0/1 flags)
all_numeric = numeric_features + binary_int_cols

numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, all_numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

log_reg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1
)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", log_reg)
    ]
)

# Train
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1       :", f1)
print("ROC AUC  :", auc)
print("\nClassification report:\n")
print(classification_report(y_test, y_pred))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))


# =========================
# MODEL 2 — RANDOM FOREST CLASSIFIER
# =========================

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf_clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", rf)
    ]
)

rf_clf.fit(X_train, y_train)

# Predictions
rf_pred = rf_clf.predict(X_test)
rf_proba = rf_clf.predict_proba(X_test)[:, 1]

# Metrics
rf_acc = accuracy_score(y_test, rf_pred)
rf_prec = precision_score(y_test, rf_pred)
rf_rec = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_proba)

print("=== RANDOM FOREST RESULTS ===")
print("Accuracy :", rf_acc)
print("Precision:", rf_prec)
print("Recall   :", rf_rec)
print("F1       :", rf_f1)
print("ROC AUC  :", rf_auc)
print("\nClassification Report:\n")
print(classification_report(y_test, rf_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))

# =========================
# MODEL 3 — GRADIENT BOOSTING CLASSIFIER
# =========================

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    random_state=42
)

gb_clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", gb)
    ]
)

gb_clf.fit(X_train, y_train)

# Predictions
gb_pred = gb_clf.predict(X_test)
gb_proba = gb_clf.predict_proba(X_test)[:, 1]

# Metrics
gb_acc = accuracy_score(y_test, gb_pred)
gb_prec = precision_score(y_test, gb_pred)
gb_rec = recall_score(y_test, gb_pred)
gb_f1 = f1_score(y_test, gb_pred)
gb_auc = roc_auc_score(y_test, gb_proba)

print("=== GRADIENT BOOSTING RESULTS ===")
print("Accuracy :", gb_acc)
print("Precision:", gb_prec)
print("Recall   :", gb_rec)
print("F1       :", gb_f1)
print("ROC AUC  :", gb_auc)
print("\nClassification Report:\n")
print(classification_report(y_test, gb_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, gb_pred))

# =========================
# COMPARE ALL THREE MODELS
# =========================

results = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest", "Gradient Boosting"],
    "Accuracy": [acc, rf_acc, gb_acc],
    "Precision": [prec, rf_prec, gb_prec],
    "Recall": [rec, rf_rec, gb_rec],
    "F1 Score": [f1, rf_f1, gb_f1],
    "ROC AUC": [auc, rf_auc, gb_auc]
})

results



Accuracy : 0.5635769872151195
Precision: 0.3416727272727273
Recall   : 0.5723684210526315
F1       : 0.42790782402768923
ROC AUC  : 0.5898881581789492

Classification report:

              precision    recall  f1-score   support

           0       0.77      0.56      0.65     10288
           1       0.34      0.57      0.43      4104

    accuracy                           0.56     14392
   macro avg       0.55      0.57      0.54     14392
weighted avg       0.65      0.56      0.58     14392

Confusion matrix:
[[5762 4526]
 [1755 2349]]
=== RANDOM FOREST RESULTS ===
Accuracy : 0.6856586992773763
Precision: 0.4074074074074074
Recall   : 0.22514619883040934
F1       : 0.2900188323917137
ROC AUC  : 0.6018400925660661

Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.87      0.80     10288
           1       0.41      0.23      0.29      4104

    accuracy                           0.69     14392
   macro avg       0.57     

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,Logistic Regression,0.563577,0.341673,0.572368,0.427908,0.589888
1,Random Forest,0.685659,0.407407,0.225146,0.290019,0.60184
2,Gradient Boosting,0.714981,0.509259,0.013402,0.026116,0.609097


In [0]:
# =========================
# SAVE BEST MODEL (LOGISTIC REGRESSION) TO WORKSPACE FILES
# =========================

import joblib
import os

best_model = clf  # logistic regression pipeline

# Save in the same Workspace area as your CSV
model_path = "/Workspace/Users/sealex@asu.edu/no_show_model.pkl"

# Create dir if needed
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(
    {
        "pipeline": best_model,
        "feature_cols": feature_cols,
        "numeric_features": numeric_features,
        "binary_int_cols": binary_int_cols,
        "categorical_features": categorical_features,
    },
    model_path
)

print(f"Saved best model to: {model_path}")


Saved best model to: /Workspace/Users/sealex@asu.edu/no_show_model.pkl
