# Week 12 - Neural Network

The target variable (Y) in the dataset is continuous and takes on a wide range of numeric values, rather than representing discrete categories or labels. This indicates that the problem at hand is a regression task. Therefore, the appropriate approach is to use a deep learning regression model to predict the output based on the given input features.

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Explicit imports for the new model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1

# Other necessary imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error # To calculate MSE on original scale
import time
import os

In [7]:
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# --- Configuration ---
DATA_SIZES = [1000, 10000, 100000]
CONFIGURATIONS_DL = [
    {'name': '1 hidden layer 4 nodes', 'builder': 'simple', 'layers': [4]},
    {'name': '2 hidden layers 4 nodes each', 'builder': 'simple', 'layers': [4, 4]},
    # Add the new configuration
    {'name': '3 hidden layers 64 nodes reg/dropout', 'builder': 'model4'}
]
EPOCHS = 50
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2
RANDOM_STATE = 42
L1_PENALTY = 0.1 # Define penalty for build_model4

# List to store results
dl_results = []

In [None]:
# Function for Simple DL Models
def build_model_simple(n_features, layer_nodes, activation='relu'):
    """Builds the original simple Keras Sequential models."""
    model = Sequential(name=f"{len(layer_nodes)}L_{'_'.join(map(str, layer_nodes))}N")
    model.add(keras.layers.Input(shape=(n_features,)))
    for nodes in layer_nodes:
        model.add(Dense(nodes, activation=activation))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam',
                  loss='mean_squared_error',
                  metrics=['mean_squared_error']) # Track MSE
    return model

# Provided Code in github, Model Builder Function
def build_model4(input_shape_dim):
    """Builds the specified 3-layer network with L1/Dropout."""
    model = Sequential(name="3L_64N_RegDrop") # Give it a unique name
    model.add(Dense(64,
                    activation='relu',
                    kernel_regularizer=l1(L1_PENALTY), # Use defined penalty
                    input_dim=input_shape_dim)) # Use passed dimension
    model.add(Dropout(0.5))
    model.add(Dense(64,
                    activation='relu',
                    kernel_regularizer=l1(L1_PENALTY)))
    model.add(Dropout(0.5))
    model.add(Dense(64,
                    activation='relu',
                    kernel_regularizer=l1(L1_PENALTY)))
    model.add(Dropout(0.5))
    model.add(Dense(1,
                    activation='linear')) # Linear output for regression
    model.compile(optimizer='rmsprop', # As specified
                  loss='mse',          # Use 'mse' for mean squared error loss
                  metrics=['mae', 'mse']) # Track MAE and MSE during training
    return model

In [9]:
# Main Loop for Data Sizes
for size in DATA_SIZES:
    print(f"\n--- Processing Data Size: {size} ---")
    data_file = f'data_{size // 1000}k.csv'

    if not os.path.exists(data_file):
        print(f"ERROR: Data file {data_file} not found. Please generate it first.")
        # Add placeholder results for all configurations for this size
        for config in CONFIGURATIONS_DL:
             dl_results.append({
                'Data size': size,
                'Configuration': config['name'],
                'Training error (MSE)': np.nan,
                'Validation error (MSE)': np.nan,
                'Time of execution (s)': np.nan
            })
        continue

    # Load data
    df = pd.read_csv(data_file)
    X = df[['X1', 'X2', 'X3', 'X4']]
    y = df['Y']

    # Check for extreme values in Y (optional but good practice)
    print(f"  Y variable stats for size {size}:")
    print(y.describe())
    if y.abs().max() > 1e6: # Warning if Y values are very large
         print("  WARNING: Target variable 'Y' has large values. Scaling is crucial.")

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=VALIDATION_SPLIT, random_state=RANDOM_STATE
    )

    # Scale Features (X)
    x_scaler = StandardScaler()
    X_train_scaled = x_scaler.fit_transform(X_train)
    X_val_scaled = x_scaler.transform(X_val)
    n_features = X_train_scaled.shape[1]

    # Scale Target (Y) - IMPORTANT for large Y values
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
    y_val_scaled = y_scaler.transform(y_val.values.reshape(-1, 1))

    # Deep Learning Training Loop
    print("\n Training Deep Learning Models ")
    for config in CONFIGURATIONS_DL:
        print(f"\n    Training DL Model: {config['name']}")

        # Build the appropriate model based on configuration
        if config['builder'] == 'model4':
            model = build_model4(n_features)
        elif config['builder'] == 'simple':
            model = build_model_simple(n_features, config['layers'])
        else:
            print(f"      ERROR: Unknown builder type '{config.get('builder', 'N/A')}'")
            continue # Skip this configuration

        # print(model.summary()) # Optional: uncomment to see model structure

        start_time = time.time()
        # Train on SCALED features and SCALED target
        history = model.fit(
            X_train_scaled, y_train_scaled,
            validation_data=(X_val_scaled, y_val_scaled),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            verbose=0 # Suppress epoch output
        )
        end_time = time.time()
        execution_time = end_time - start_time

        # Get loss from history (which is MSE as defined in compile)
        final_train_loss_scaled = history.history['loss'][-1]
        final_val_loss_scaled = history.history['val_loss'][-1]
        print(f"      DL Training Finished.")
        print(f"      DL Final Training Loss (scaled MSE): {final_train_loss_scaled:.4f}")
        print(f"      DL Final Validation Loss (scaled MSE): {final_val_loss_scaled:.4f}")

        # Calculate MSE on ORIGINAL scale
        # Predict using scaled inputs
        pred_train_scaled = model.predict(X_train_scaled, verbose=0)
        pred_val_scaled = model.predict(X_val_scaled, verbose=0)

        # Inverse transform predictions to get them back to original Y scale
        pred_train_orig = y_scaler.inverse_transform(pred_train_scaled)
        pred_val_orig = y_scaler.inverse_transform(pred_val_scaled)

        # Calculate MSE between original y and inverse-transformed predictions
        final_train_error_orig = mean_squared_error(y_train, pred_train_orig)
        final_val_error_orig = mean_squared_error(y_val, pred_val_orig)

        print(f"      DL Final Training MSE (original scale): {final_train_error_orig:.4f}")
        print(f"      DL Final Validation MSE (original scale): {final_val_error_orig:.4f}")
        print(f"      DL Execution Time: {execution_time:.2f} seconds")

        # Store results (using original scale MSE for reporting)
        dl_results.append({
            'Data size': size,
            'Configuration': config['name'],
            'Training error (MSE)': final_train_error_orig,
            'Validation error (MSE)': final_val_error_orig,
            'Time of execution (s)': execution_time
        })

# Display Results
print("\n" + "="*70)
print("               DEEP LEARNING RESULTS SUMMARY")
print("="*70)
dl_results_df = pd.DataFrame(dl_results)
# Format MSE columns for better readability if they are very large or small
pd.set_option('display.float_format', '{:.2f}'.format)
print(dl_results_df.to_string(index=False))
print("="*70)

# Analysis Section
print("\n Analysis ")
if not dl_results_df.dropna().empty:
    best_dl_model_idx = dl_results_df['Validation error (MSE)'].idxmin()
    best_dl_model_info = dl_results_df.loc[best_dl_model_idx]
    print("\n1. Best Deep Learning Model:")
    print(f"Based on the lowest validation MSE (original scale), the best performing deep learning model is:")
    # Use default float format for printing the best model info row
    pd.reset_option('display.float_format')
    print(best_dl_model_info.to_string())
    pd.set_option('display.float_format', '{:.2f}'.format) # Reset for any later prints
    print(f"\nReasoning: This configuration ('{best_dl_model_info['Configuration']}' on data size {best_dl_model_info['Data size']}) achieved the lowest validation MSE ({best_dl_model_info['Validation error (MSE)']:.4f} on the original scale). This estimates the best performance on unseen data.")
else:
    print("\n1. Best Deep Learning Model: No valid DL results were generated.")

# Reset float format if needed elsewhere
pd.reset_option('display.float_format')


--- Processing Data Size: 1000 ---
  Y variable stats for size 1000:
count      1000.000000
mean      45985.086248
std       29024.553144
min       -5145.718206
25%       21964.626880
50%       44883.537973
75%       70894.279025
max      105203.818034
Name: Y, dtype: float64

  --- Training Deep Learning Models ---

    Training DL Model: 1 hidden layer 4 nodes
      DL Training Finished.
      DL Final Training Loss (scaled MSE): 0.0200
      DL Final Validation Loss (scaled MSE): 0.0224
      DL Final Training MSE (original scale): 16838507.5950
      DL Final Validation MSE (original scale): 19117599.3749
      DL Execution Time: 10.58 seconds

    Training DL Model: 2 hidden layers 4 nodes each
      DL Training Finished.
      DL Final Training Loss (scaled MSE): 0.0434
      DL Final Validation Loss (scaled MSE): 0.0438
      DL Final Training MSE (original scale): 36404138.4583
      DL Final Validation MSE (original scale): 37360560.1825
      DL Execution Time: 11.92 seconds

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


      DL Training Finished.
      DL Final Training Loss (scaled MSE): 1.4236
      DL Final Validation Loss (scaled MSE): 1.3623
      DL Final Training MSE (original scale): 852027467.5871
      DL Final Validation MSE (original scale): 799803699.1818
      DL Execution Time: 12.68 seconds

--- Processing Data Size: 10000 ---
  Y variable stats for size 10000:
count     10000.000000
mean     453605.025645
std      298868.066280
min      -63153.513656
25%      193556.409302
50%      460534.064666
75%      711530.726213
max      998732.979548
Name: Y, dtype: float64

  --- Training Deep Learning Models ---

    Training DL Model: 1 hidden layer 4 nodes
      DL Training Finished.
      DL Final Training Loss (scaled MSE): 0.0001
      DL Final Validation Loss (scaled MSE): 0.0001
      DL Final Training MSE (original scale): 5881283.4417
      DL Final Validation MSE (original scale): 6195782.8417
      DL Execution Time: 45.76 seconds

    Training DL Model: 2 hidden layers 4 nodes ea

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


      DL Training Finished.
      DL Final Training Loss (scaled MSE): 1.4224
      DL Final Validation Loss (scaled MSE): 1.4171
      DL Final Training MSE (original scale): 89418029883.5517
      DL Final Validation MSE (original scale): 88894042681.9784
      DL Execution Time: 55.51 seconds

--- Processing Data Size: 100000 ---
  Y variable stats for size 100000:
count    1.000000e+05
mean     4.495216e+06
std      2.962349e+06
min     -6.427087e+05
25%      1.926356e+06
50%      4.500260e+06
75%      7.068165e+06
max      9.735293e+06
Name: Y, dtype: float64

  --- Training Deep Learning Models ---

    Training DL Model: 1 hidden layer 4 nodes
      DL Training Finished.
      DL Final Training Loss (scaled MSE): 0.0000
      DL Final Validation Loss (scaled MSE): 0.0000
      DL Final Training MSE (original scale): 44551305.7487
      DL Final Validation MSE (original scale): 44301406.1709
      DL Execution Time: 366.74 seconds

    Training DL Model: 2 hidden layers 4 nodes e

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


      DL Training Finished.
      DL Final Training Loss (scaled MSE): 1.4222
      DL Final Validation Loss (scaled MSE): 1.4246
      DL Final Training MSE (original scale): 8772232717850.7393
      DL Final Validation MSE (original scale): 8791813609110.1631
      DL Execution Time: 452.31 seconds

               DEEP LEARNING RESULTS SUMMARY
 Data size                        Configuration  Training error (MSE)  Validation error (MSE)  Time of execution (s)
      1000               1 hidden layer 4 nodes           16838507.60             19117599.37                  10.58
      1000         2 hidden layers 4 nodes each           36404138.46             37360560.18                  11.92
      1000 3 hidden layers 64 nodes reg/dropout          852027467.59            799803699.18                  12.68
     10000               1 hidden layer 4 nodes            5881283.44              6195782.84                  45.76
     10000         2 hidden layers 4 nodes each            3608301.

In the deep learning experiments, the 2-layer, 4-node configuration consistently delivered the best performance across different data sizes. This architecture struck an effective balance between low validation error and reasonable execution time, making it the most reliable among the tested deep learning setups. On the other hand, more complex deep models specifically those with 3 hidden layers, 64 nodes per layer, and regularization/dropout performed very poorly. These models produced extremely high training and validation MSE, likely due to overfitting, improper regularization, or general instability during training.