In [1]:
# Environment smoke test - run this cell before anything else.
# The interpreter path is reported *before* TensorFlow is imported so that it is
# still shown when the import itself fails in the active environment.
import sys, os
print("Python executable:", sys.executable)
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
# An empty list here means TensorFlow sees no GPU devices.
visible_gpus = tf.config.list_physical_devices('GPU')
print("GPUs visible:", visible_gpus)


Python executable: C:\venvs\wine-tf\Scripts\python.exe
TensorFlow version: 2.20.0
GPUs visible: []


In [4]:
"""
train_adult_mlp_tf.py

- Expects:
    /mnt/data/adult.data
    /mnt/data/adult.test

- Python env: must have tensorflow, scikit-learn, pandas, numpy, joblib installed.

Run:
  python train_adult_mlp_tf.py
or paste into a Jupyter cell and run.

Outputs saved to: /mnt/data/artifacts/
"""

import os
import time
import numpy as np
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# ----------------- CONFIG -----------------
# Input/output locations. The defaults are the original /mnt/data layout; on a
# machine where that directory does not exist, set the ADULT_TRAIN_PATH,
# ADULT_TEST_PATH and/or ADULT_ARTIFACT_DIR environment variables before
# starting the kernel instead of editing this cell.
TRAIN_PATH = os.environ.get("ADULT_TRAIN_PATH", "/mnt/data/adult.data")
TEST_PATH  = os.environ.get("ADULT_TEST_PATH", "/mnt/data/adult.test")
ARTIFACT_DIR = os.environ.get("ADULT_ARTIFACT_DIR", "/mnt/data/artifacts")
# Create the output directory up front so later save steps cannot fail on it.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# One seed shared by NumPy, TensorFlow, the train/val split and dataset shuffling.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# ----------------- Load helpers -----------------
# Column names applied to the header-less CSV files, in file order.
COLUMNS = [
    "age","workclass","fnlwgt","education","education-num","marital-status",
    "occupation","relationship","race","sex","capital-gain","capital-loss",
    "hours-per-week","native-country","income"
]

def load_adult(path):
    """Read one Adult CSV split into a cleaned DataFrame.

    Args:
        path: location of the comma-separated file (no header row); passed
            straight to ``pandas.read_csv``.

    Returns:
        DataFrame with the columns listed in ``COLUMNS``. "?" fields are NaN,
        fully empty rows and header-like rows are removed, the index is reset,
        and ``income`` is a stripped string without a trailing ".".
    """
    # skipinitialspace=True removes the blank after each comma *before* values
    # are compared against na_values, so the missing-value marker has to be
    # given as "?" (a " ?" marker would never match and "?" would survive as
    # an ordinary category). Lines starting with '|' are treated as comments.
    df = pd.read_csv(
        path,
        header=None,
        names=COLUMNS,
        na_values="?",
        skipinitialspace=True,
        comment='|',
    )
    df = df.dropna(how="all")
    # some test files have a header-like first row; drop rows where age is 'age' string
    df = df[~df['age'].astype(str).str.contains('age', case=False)]
    df = df.reset_index(drop=True)
    # Clean income label (remove surrounding whitespace and the trailing dot)
    df['income'] = df['income'].astype(str).str.strip().str.rstrip('.')
    return df

print("Loading data from:", TRAIN_PATH, TEST_PATH)
# Read the training split first, then the test split.
train_df, test_df = (load_adult(split_path) for split_path in (TRAIN_PATH, TEST_PATH))
print(f"Train rows: {len(train_df)}, Test rows: {len(test_df)}")

# Swap the textual income label for a 0/1 `target` column (1 means ">50K").
# pop() removes `income` in place, so `target` ends up as the last column.
for frame in (train_df, test_df):
    frame['target'] = frame.pop('income').eq(">50K").astype(int)

# ----------------- Feature lists -----------------
numeric_features = [
    "age", "fnlwgt", "education-num",
    "capital-gain", "capital-loss", "hours-per-week",
]
# Every remaining column except the label is treated as categorical; walking
# the frame's columns keeps them in their original order.
categorical_features = [
    name for name in train_df.columns
    if name != 'target' and name not in numeric_features
]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Optional: reduce rare categories to "Other" to avoid huge OHE (uncomment if needed)
def reduce_cardinality(train_df, test_df, cat_cols, min_count=50):
    """Collapse infrequent category values into a single "Other" level.

    Frequencies are counted on ``train_df`` only; any value seen fewer than
    ``min_count`` times there (or not seen at all) is replaced by "Other" in
    both frames. The frames are modified in place and also returned.

    Args:
        train_df: frame used to decide which values are frequent enough.
        test_df: second frame that receives the same mapping.
        cat_cols: names of the columns to process.
        min_count: minimum number of training occurrences for a value to be kept.

    Returns:
        The tuple ``(train_df, test_df)`` (the same objects that were passed in).
    """
    for column in cat_cols:
        frequencies = train_df[column].value_counts()
        frequent_levels = frequencies.loc[frequencies >= min_count].index
        # Train is rewritten before test, both against the same kept levels.
        for frame in (train_df, test_df):
            is_frequent = frame[column].isin(frequent_levels)
            frame[column] = frame[column].where(is_frequent, other="Other")
    return train_df, test_df

# If you face memory issues, uncomment the call below; raising min_count merges more rare categories into "Other" and shrinks the one-hot encoding further:
# train_df, test_df = reduce_cardinality(train_df, test_df, categorical_features, min_count=50)

# ----------------- Preprocessing pipeline -----------------
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises TypeError on current
# releases. Try the new keyword first and fall back for installations < 1.2.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit encode as all zeros.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder)
])

# Columns not named in either feature list are dropped.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
], remainder='drop')

# Separate the feature columns from the label for both frames.
X_train_full, y_train_full = train_df.drop(columns=['target']), train_df['target'].values
X_test, y_test = test_df.drop(columns=['target']), test_df['target'].values

# Fit on the full training frame only, then apply the fitted transform to the
# training and test features alike.
print("Fitting preprocessor...")
preprocessor.fit(X_train_full)
X_train_full_proc, X_test_proc = (
    preprocessor.transform(features) for features in (X_train_full, X_test)
)
print("Processed feature dimension:", X_train_full_proc.shape[1])

# Persist the fitted preprocessor next to the other artifacts.
preprocessor_path = os.path.join(ARTIFACT_DIR, "adult_preprocessor.joblib")
joblib.dump(preprocessor, preprocessor_path)
print("Saved preprocessor to:", preprocessor_path)

# ----------------- Train/Val split -----------------
# Hold out 15% of the processed training rows for validation, keeping the
# class ratio equal in both parts (stratified on the label).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full_proc,
    y_train_full,
    test_size=0.15,
    stratify=y_train_full,
    random_state=RANDOM_SEED,
)
print("Train/Val shapes:", X_train.shape, X_val.shape)

# ----------------- TF dataset helpers -----------------
batch_size = 128
def make_dataset(X, y, batch_size=128, shuffle=False):
    """Wrap a feature array and a label array in a batched, prefetched tf.data pipeline.

    Args:
        X: feature array; cast to float32.
        y: label array; cast to float32.
        batch_size: number of rows per batch.
        shuffle: when true, shuffle with a buffer of at most 10000 rows, seeded
            with RANDOM_SEED, before batching.

    Returns:
        A `tf.data.Dataset` yielding `(features, labels)` batches.
    """
    features = X.astype(np.float32)
    labels = y.astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        n_rows = X.shape[0]
        dataset = dataset.shuffle(buffer_size=min(10000, n_rows), seed=RANDOM_SEED)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Only the training pipeline is shuffled; validation and test keep row order.
train_ds = make_dataset(X_train, y_train, shuffle=True, batch_size=batch_size)
val_ds   = make_dataset(X_val, y_val, shuffle=False, batch_size=batch_size)
test_ds  = make_dataset(X_test_proc, y_test, shuffle=False, batch_size=batch_size)

# ----------------- MLP model -----------------
input_dim = X_train.shape[1]
def build_mlp(input_dim, hidden=[256,128,64], dropout=0.4):
    """Build a feed-forward binary classifier that outputs one probability per row.

    Each entry of `hidden` adds a Dense -> BatchNormalization -> ReLU -> Dropout
    stage. A final single-unit sigmoid layer produces the probability, which is
    squeezed so the model output has shape (batch,).

    Args:
        input_dim: number of input features per row.
        hidden: hidden-layer widths, applied in order (only iterated, never mutated).
        dropout: dropout rate used after every hidden stage.

    Returns:
        An uncompiled `keras.Model` named "adult_mlp".
    """
    inputs = keras.Input(shape=(input_dim,), dtype=tf.float32)
    x = inputs
    for units in hidden:
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation("relu")(x)
        x = layers.Dropout(dropout)(x)
    # The sigmoid is part of the output layer instead of a raw tf.nn call:
    # Keras 3 (the default since TF 2.16) refuses to pass its symbolic tensors
    # to plain TensorFlow functions while a functional model is being built.
    probs = layers.Dense(1, activation="sigmoid")(x)
    # For the same reason squeeze through keras.ops when it exists; older
    # tf.keras has no `ops` namespace, and there tf.squeeze is accepted.
    squeeze = keras.ops.squeeze if hasattr(keras, "ops") else tf.squeeze
    outputs = squeeze(probs, axis=-1)  # produce shape (batch,)
    model = keras.Model(inputs=inputs, outputs=outputs, name="adult_mlp")
    return model

model = build_mlp(input_dim, hidden=[256,128,64], dropout=0.4)
model.summary()

# Metrics tracked by Keras during training; F1 is computed with sklearn afterwards.
training_metrics = [
    keras.metrics.BinaryAccuracy(name="accuracy"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(name="auc"),
]
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    # from_logits=False because the model already emits sigmoid probabilities.
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=training_metrics,
)

# ----------------- Callbacks & Train -----------------
# Stop once val_loss has not improved for 6 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=6, restore_best_weights=True, verbose=1
)
# Halve the learning rate after 3 epochs without val_loss improvement.
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=3, verbose=1
)
callbacks = [early_stopping, lr_on_plateau]

epochs = 30
start_time = time.time()
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks,
    verbose=2,
)
print("Training elapsed (s):", time.time() - start_time)

# ----------------- Evaluate on test set -----------------
# Predicted probabilities for the test rows, flattened to 1-D, then thresholded
# at 0.5 to obtain hard 0/1 predictions.
probs = model.predict(test_ds, verbose=0).ravel()
preds = (probs >= 0.5).astype(int)

# compute sklearn metrics (zero_division=0 reports 0 instead of warning when a
# class is never predicted)
acc  = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds, zero_division=0)
rec  = recall_score(y_test, preds, zero_division=0)
f1   = f1_score(y_test, preds, zero_division=0)
try:
    roc  = roc_auc_score(y_test, probs)
except ValueError as exc:
    # roc_auc_score rejects inputs it cannot score (for example labels with a
    # single class). Keep going with NaN, but say why instead of hiding it.
    print("ROC AUC could not be computed:", exc)
    roc = float("nan")

print("\nTest metrics:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"ROC AUC:   {roc:.4f}\n")

print("Classification report:")
print(classification_report(y_test, preds, target_names=['<=50K','>50K'], zero_division=0))
print("Confusion matrix (actual rows, predicted cols):")
print(confusion_matrix(y_test, preds))

# Write per-row test results (true label, hard prediction, probability) to CSV.
predictions_path = os.path.join(ARTIFACT_DIR, "adult_test_predictions.csv")
pred_df = pd.DataFrame({"true": y_test, "pred": preds, "prob": probs})
pred_df.to_csv(predictions_path, index=False)
print("Saved test predictions CSV to artifacts.")

# ----------------- Save model safely -----------------
# Keras needs an explicit file extension (.keras or .h5); use the native
# .keras format first and only fall back to a SavedModel directory on failure.
model_save_path = os.path.join(ARTIFACT_DIR, "adult_mlp_model.keras")
try:
    model.save(model_save_path)
except Exception as save_error:
    print("model.save() failed:", save_error)
    # Best-effort second attempt: export as a SavedModel folder.
    fallback_dir = os.path.join(ARTIFACT_DIR, "adult_mlp_savedmodel")
    try:
        tf.saved_model.save(model, fallback_dir)
    except Exception as fallback_error:
        print("Fallback saving also failed:", fallback_error)
    else:
        print("Saved model in SavedModel format to:", fallback_dir)
else:
    print("Saved Keras model to:", model_save_path)

print("All artifacts (model, preprocessor, predictions) are in:", ARTIFACT_DIR)


Loading data from: /mnt/data/adult.data /mnt/data/adult.test


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/adult.data'