In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import AdamW
import joblib # For saving the preprocessor

# --- 1. Data Loading and Preparation ---
# Assuming 'Group A Dataset.csv' is available in the current environment
# Load the correct dataset into a new DataFrame
group_a_df = pd.read_csv('Group A Dataset.csv')

# Handle missing values (no longer using 'Total Spend')

# Create the binary classification target using the 'label' column
# Convert the 'label' column to a binary target: 1 for '>50K' and 0 for '<=50K'
group_a_df['target'] = (group_a_df['label'].str.strip() == '>50K').astype(int)

# Define features (X) and target (y) using the correct DataFrame and relevant columns
# Using 'age', 'education_num', and 'hour_per_week' as features for this example
X = group_a_df[['age', 'education_num', 'hour_per_week']]
y = group_a_df['target']

# --- 2. Preprocessing Pipeline ---
# Define which columns are numerical and which are categorical
numerical_features = ['age', 'education_num', 'hour_per_week']
# No categorical features being used in this simplified example
categorical_features = []

# Create preprocessing pipelines for both data types
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
        # Removed the OneHotEncoder for categorical features since none are used
    ])

# --- 3. Train-Test Split ---
# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessor on the training data and transform both sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Save the fitted preprocessor for use in the test script
joblib.dump(preprocessor, 'preprocessor.joblib')

# --- 4. Model Definition ---
def create_model(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Using AdamW optimizer with specified parameters
    optimizer = AdamW(learning_rate=0.001, weight_decay=0.001)

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Get the input shape after preprocessing
input_shape = X_train_processed.shape[1]
model = create_model(input_shape)
print("Model Summary:")
model.summary() # This will show the parameter count

# --- 5. Model Training ---
print("\nStarting model training...")
history = model.fit(
    X_train_processed,
    y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_test_processed, y_test),
    verbose=0 # Set to 1 to see epoch-by-epoch progress
)
print("Training complete.")

# --- 6. Evaluation ---
# Make predictions on the test set
y_pred_proba = model.predict(X_test_processed)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate and present the balanced accuracy
final_bal_acc = balanced_accuracy_score(y_test, y_pred)
print(f"\n--- Final Result ---")
print(f"Balanced Accuracy on Test Set: {final_bal_acc:.4f}")

# --- 7. Save the Final Model ---
# The model is saved in the recommended HDF5 format
model.save("final_model.h5")
print("Model saved as final_model.h5")

Model Summary:



Starting model training...
Training complete.
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step





--- Final Result ---
Balanced Accuracy on Test Set: 0.5383
Model saved as final_model.h5


In [12]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import joblib
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split # Import train_test_split

def calculate_balanced_accuracy(model_path, preprocessor_path, test_data_x, test_data_y):
    """
    Loads a trained model and preprocessor, transforms test data,
    and returns the balanced accuracy.

    Args:
        model_path (str): Path to the saved HDF5 model file.
        preprocessor_path (str): Path to the saved joblib preprocessor.
        test_data_x (pd.DataFrame): Test features.
        test_data_y (pd.Series): Test labels.

    Returns:
        float: The balanced accuracy score.
    """
    # Load the trained model and the preprocessor
    model = load_model(model_path)
    preprocessor = joblib.load(preprocessor_path)

    # Apply the exact same transformations to the hold-out test data
    test_x_processed = preprocessor.transform(test_data_x)

    # Make predictions
    y_pred_proba = model.predict(test_x_processed)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate balanced accuracy
    bal_acc = balanced_accuracy_score(test_data_y, y_pred)

    return bal_acc

# --- Example Usage (simulating the lecturer's device) ---
if __name__ == '__main__':
    # On the lecturer's device, they would load their own hold-out set.
    # Here, we re-create a test set for demonstration.
    # Load the same dataset used for training
    df = pd.read_csv('Group A Dataset.csv')

    # Create the binary classification target using the 'label' column
    df['target'] = (df['label'].str.strip() == '>50K').astype(int)

    # Define features (X) and target (y) using the same columns as the training script
    X = df[['age', 'education_num', 'hour_per_week']]
    y = df['target']

    # Using a different random split to simulate a true hold-out set
    # We only need the hold-out set, so we discard the training parts
    _, hold_out_x, _, hold_out_y = train_test_split(
        X, y, test_size=0.3, random_state=123, stratify=y # Use a different random_state for simulation
    )

    # Define paths to the saved files
    path_to_saved_model = "final_model.h5"
    path_to_preprocessor = "preprocessor.joblib"

    # Calculate the balanced accuracy on the new data
    final_score = calculate_balanced_accuracy(
        path_to_saved_model,
        path_to_preprocessor,
        hold_out_x,
        hold_out_y
    )

    print(f"Balanced accuracy on the hold-out test set is: {final_score:.4f}")



[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Balanced accuracy on the hold-out test set is: 0.5415


**Reasoning**:
Load the dataset and display the head and columns to understand the available features for selection.



In [13]:
group_a_df = pd.read_csv('Group A Dataset.csv')
display(group_a_df.head())
print(group_a_df.columns)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hour_per_week,native_country,label
0,42,Self-emp-not-inc,101709,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,2885,0,40,United-States,<=50K
1,55,Private,202220,HS-grad,9,Married-civ-spouse,Other-service,Wife,Black,Female,2407,0,35,United-States,<=50K
2,34,Private,111985,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,<=50K
3,55,Private,123515,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,33,Private,69748,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hour_per_week', 'native_country',
       'label'],
      dtype='object')


**Reasoning**:
Based on the column names and the goal of improving balanced accuracy for income prediction, I will select potentially relevant numerical and categorical features from the dataset. I will list the selected features.



In [14]:
# Selected numerical features
numerical_features = ['age', 'education_num', 'hour_per_week', 'capital_gain', 'capital_loss']

# Selected categorical features
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

print("Selected Numerical Features:", numerical_features)
print("Selected Categorical Features:", categorical_features)

Selected Numerical Features: ['age', 'education_num', 'hour_per_week', 'capital_gain', 'capital_loss']
Selected Categorical Features: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']


**Reasoning**:
Define numerical and categorical features, create a ColumnTransformer for preprocessing, define features and target, split data, fit and transform data, and save the preprocessor.



In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

# 1. Define numerical and categorical features
numerical_features = ['age', 'education_num', 'hour_per_week', 'capital_gain', 'capital_loss']
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

# 2. Create a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# 3. Define features (X) and target (y)
# Assuming group_a_df is already loaded from the previous step
X = group_a_df[numerical_features + categorical_features]
y = group_a_df['target']

# 4. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Fit the preprocessor on the training data and transform both sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# 6. Save the fitted preprocessor
joblib.dump(preprocessor, 'preprocessor.joblib')

print("Preprocessing pipeline created, data split and transformed, and preprocessor saved.")
print("Shape of X_train_processed:", X_train_processed.shape)
print("Shape of X_test_processed:", X_test_processed.shape)

Preprocessing pipeline created, data split and transformed, and preprocessor saved.
Shape of X_train_processed: (19536, 106)
Shape of X_test_processed: (4885, 106)


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

# Assuming group_a_df is already loaded from a previous successful step
# Recreate the binary classification target using the 'label' column
# Convert the 'label' column to a binary target: 1 for '>50K' and 0 for '<=50K'
group_a_df['target'] = (group_a_df['label'].str.strip() == '>50K').astype(int)

# 1. Define numerical and categorical features
numerical_features = ['age', 'education_num', 'hour_per_week', 'capital_gain', 'capital_loss']
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

# 2. Create a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# 3. Define features (X) and target (y)
X = group_a_df[numerical_features + categorical_features]
y = group_a_df['target']

# 4. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Fit the preprocessor on the training data and transform both sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# 6. Save the fitted preprocessor
joblib.dump(preprocessor, 'preprocessor.joblib')

print("Preprocessing pipeline created, data split and transformed, and preprocessor saved.")
print("Shape of X_train_processed:", X_train_processed.shape)
print("Shape of X_test_processed:", X_test_processed.shape)

Preprocessing pipeline created, data split and transformed, and preprocessor saved.
Shape of X_train_processed: (19536, 106)
Shape of X_test_processed: (4885, 106)


**Reasoning**:
Update the create_model function to accept input_shape and use the Input layer, then instantiate and summarize the model with the new input shape.



In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import AdamW

# --- 4. Model Definition ---
def create_model(input_shape):
    """
    Creates a Sequential neural network model with a specified input shape.

    Args:
        input_shape (int): The number of input features.

    Returns:
        tf.keras.models.Sequential: The compiled Keras model.
    """
    model = Sequential([
        Input(shape=(input_shape,)), # Use the Input layer with dynamic shape
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Using AdamW optimizer with specified parameters
    optimizer = AdamW(learning_rate=0.001, weight_decay=0.001)

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Get the input shape after preprocessing
input_shape = X_train_processed.shape[1]
model = create_model(input_shape)
print("Model Summary:")
model.summary() # This will show the parameter count

Model Summary:


**Reasoning**:
Train the updated model using the preprocessed data, as instructed.



In [18]:
# --- 5. Model Training ---
print("\nStarting model training with expanded features...")
history = model.fit(
    X_train_processed,
    y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_test_processed, y_test),
    verbose=0 # Set to 1 to see epoch-by-epoch progress
)
print("Training complete.")


Starting model training with expanded features...
Training complete.


**Reasoning**:
Use the trained model to predict on the processed test data, convert probabilities to binary predictions, and calculate balanced accuracy.



In [19]:
# --- 6. Evaluation ---
# Make predictions on the test set
y_pred_proba = model.predict(X_test_processed)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate and present the balanced accuracy
final_bal_acc = balanced_accuracy_score(y_test, y_pred)
print(f"\n--- Evaluation on Test Set ---")
print(f"Balanced Accuracy: {final_bal_acc:.4f}")

[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step

--- Evaluation on Test Set ---
Balanced Accuracy: 0.6277
