In [None]:
# Import necessary libraries for project

# System libraries
import os
import datetime

# Data manipulation
import numpy as np # Efficient array operations
import pandas as pd # Data structures for working with structured data

# Data visualization
import matplotlib.pyplot as plt # Plotting and data visualization
import seaborn as sns # Further data visualization

# Preprocessing
from sklearn.preprocessing import MinMaxScaler # Scales data to a range [0, 1]

# Deep learning
import tensorflow as tf # Core machine learning framework
from keras.api.models import Model # Model class
from keras.api.layers import Input, Dense # Layers for model
from keras.api.optimizers import Adam, SGD # Optimizer for model
from keras.api.losses import MeanSquaredError # Loss function for model

In [None]:
# Function definitions for project

# Function to analyze a dataset (statistical analysis)
def analyze_dataset(train_data, train_labels, test_data, test_labels):
    """
    Print a statistical overview of a train/test split.

    The report covers, in order:
    - Shape and dtype of every array
    - Number of NaN entries in every array
    - pandas `describe()` summaries of the features and of the labels

    Parameters:
        train_data (numpy.ndarray): Training feature set
        train_labels (numpy.ndarray): Training labels
        test_data (numpy.ndarray): Testing feature set
        test_labels (numpy.ndarray): Testing labels

    Returns:
        None. The whole report is written to standard output.
    """

    # Report banner
    print("\n🎯 Dataset Analysis 🎯\n")

    # Wrap every array in a DataFrame up front so describe() is available below;
    # label frames get a single column with an empty name
    feature_frames = [pd.DataFrame(train_data), pd.DataFrame(test_data)]
    label_frames = [
        pd.DataFrame(train_labels, columns=['']),
        pd.DataFrame(test_labels, columns=['']),
    ]

    # The raw arrays, in the order they are reported
    arrays = [
        ("Train data", train_data),
        ("Test data", test_data),
        ("Train labels", train_labels),
        ("Test labels", test_labels),
    ]

    # Shapes and dtypes
    print("\n🔹 Dataset Shape & Data Types:\n")
    for title, values in arrays:
        print(f"{title} shape: {values.shape}, Type: {values.dtype}")

    # NaN counts
    print("\n🔹 Missing Values:\n")
    for title, values in arrays:
        print(f"{title} missing values: {np.isnan(values).sum()}")

    # describe() tables: features first, then labels
    print("\n🔹 Statistical Summary:\n")
    for title, frame in zip(("Train Data", "Test Data"), feature_frames):
        print(f"\n{title} Statistics:\n\n", frame.describe())
    for title, frame in zip(("Train Labels", "Test Labels"), label_frames):
        print(f"\n{title} Statistics:\n", frame.describe())


# Function to preprocess a dataset (normalization, reshaping, etc.)
def preprocess_dataset(train_data, train_labels, test_data, test_labels):
    """
    Prepare a train/test split for model training.

    Steps, each reported on standard output:
    - Reshape both label arrays into a single column
    - Print the per-column min/max of the features before scaling
    - Scale the features with a MinMaxScaler fitted on the training features only
    - Print the per-column min/max of the features after scaling
    - Print the (unscaled) label ranges
    - Cast features and labels to float32

    Returns:
    - train_data, train_labels, test_data, test_labels (features scaled,
      labels reshaped, everything float32)
    """

    def _column_range(values):
        # (min, max) pair taken along the sample axis
        return values.min(axis=0), values.max(axis=0)

    # Banner
    print("\n🎯 Preprocessing Steps 🎯\n")

    # Labels become column vectors of shape (n_samples, 1)
    train_labels = np.reshape(train_labels, (-1, 1))
    test_labels = np.reshape(test_labels, (-1, 1))

    print("\n🔹 Shapes After Reshaping:")
    print("Train Labels Shape:", train_labels.shape)
    print("Test Labels Shape:", test_labels.shape)

    # Capture all ranges before anything is scaled
    raw_train_range = _column_range(train_data)
    raw_test_range = _column_range(test_data)
    train_label_range = _column_range(train_labels)
    test_label_range = _column_range(test_labels)

    print("\n🔹 Pre-Normalization Data Ranges:")
    print("Train Data Min:", raw_train_range[0], "\nTrain Data Max:", raw_train_range[1])
    print("Test Data Min:", raw_test_range[0], "\nTest Data Max:", raw_test_range[1])

    # The scaler only ever sees the training features; the same fitted
    # transform is then applied to both splits
    scaler = MinMaxScaler().fit(train_data)
    train_data = scaler.transform(train_data)
    test_data = scaler.transform(test_data)

    scaled_train_range = _column_range(train_data)
    scaled_test_range = _column_range(test_data)

    print("\n🔹 Post-Normalization Data Ranges:")
    print("Post-Normalization Train Data Min:", scaled_train_range[0], "\nPost-Normalization Train Data Max:", scaled_train_range[1])
    print("Post-Normalization Test Data Min:", scaled_test_range[0], "\nPost-Normalization Test Data Max:", scaled_test_range[1])

    # Labels are reported but never scaled
    print("\n🔹 (Optional) Label Ranges:")
    print("Train Labels Min:", train_label_range[0], "\nTrain Labels Max:", train_label_range[1])
    print("Test Labels Min:", test_label_range[0], "\nTest Labels Max:", test_label_range[1])

    # Single cast of all four arrays to float32
    train_data, test_data, train_labels, test_labels = (
        values.astype(np.float32)
        for values in (train_data, test_data, train_labels, test_labels)
    )

    print("\n🔹 Data Types After Conversion:")
    print("Train Data Type:", train_data.dtype)
    print("Test Data Type:", test_data.dtype)
    print("Train Labels Type:", train_labels.dtype)
    print("Test Labels Type:", test_labels.dtype)

    return train_data, train_labels, test_data, test_labels

# Function to visualize a dataset (plotting)
def visualize_dataset(train_data, train_labels, test_data, test_labels):
    """
    Visualize dataset by plotting:
    - Feature distributions (train vs. test, one subplot per feature)
    - Correlation heatmap of the training features
    - Outlier detection (boxplots of the training features)
    - Label distribution (train vs. test)

    Parameters:
        train_data (numpy.ndarray): Training feature set, 2-D (samples x features)
        train_labels (numpy.ndarray): Training labels, 1-D or a single column
        test_data (numpy.ndarray): Testing feature set, 2-D (samples x features)
        test_labels (numpy.ndarray): Testing labels, 1-D or a single column

    Returns:
        None. Figures are displayed with plt.show().
    """

    # Print header for function
    print("\n🎯 Dataset Visualization 🎯\n")

    # Subplot grid: 3 columns and just enough rows for every feature.
    # Ceiling division avoids an extra empty row when the feature count is a multiple of 3.
    num_features = train_data.shape[1]
    grid_cols = 3
    grid_rows = max(1, (num_features + grid_cols - 1) // grid_cols)

    # Feature Distributions
    plt.figure(figsize=(15, num_features * 2))
    for i in range(num_features):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.histplot(train_data[:, i], kde=True, bins=30, color="blue", label="Train")
        sns.histplot(test_data[:, i], kde=True, bins=30, color="orange", label="Test")
        plt.xlabel(f"Feature {i}")
        plt.ylabel("Count")
        plt.legend()
    plt.suptitle("Feature Distributions (Train vs. Test)\n\n")
    plt.tight_layout()
    plt.show()

    # Correlation Heatmap
    plt.figure(figsize=(12, 8))
    corr_matrix = pd.DataFrame(train_data).corr()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Feature Correlation Heatmap\n")
    plt.show()

    # Outlier Detection (Boxplots)
    plt.figure(figsize=(15, num_features * 2))
    for i in range(num_features):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.boxplot(x=train_data[:, i], color="red")
        plt.xlabel(f"Feature {i}")
    plt.suptitle("Feature Outlier Detection (Boxplots)\n\n")
    plt.tight_layout()
    plt.show()

    # Label Distribution
    # Labels coming out of preprocess_dataset have shape (n, 1). seaborn treats 2-D
    # input as wide-form data (one hue level per column), in which case the explicit
    # color/label arguments no longer identify the train and test series. Flattening
    # to 1-D keeps both histograms distinguishable; 1-D labels are unaffected.
    train_label_values = np.ravel(train_labels)
    test_label_values = np.ravel(test_labels)

    plt.figure(figsize=(10, 4))
    sns.histplot(train_label_values, kde=True, bins=30, color="blue", label="Train Labels")
    sns.histplot(test_label_values, kde=True, bins=30, color="orange", label="Test Labels")
    plt.xlabel("Labels")
    plt.ylabel("Count")
    plt.legend()
    plt.title("Label Distribution (Train vs. Test)\n")
    plt.show()


# Function to evaluate a model (actual vs. predicted)
def evaluate_model(model, train_data, train_labels, test_data, test_labels):
    """
    Plot true vs. predicted values for the training and the test split side by side.

    Only the first `min(30, len(train_labels), len(test_labels))` samples of each
    split are drawn.

    Parameters:
        model: Trained model exposing a `predict()` method
        train_data (numpy.ndarray): Training feature set
        train_labels (numpy.ndarray): Training labels
        test_data (numpy.ndarray): Testing feature set
        test_labels (numpy.ndarray): Testing labels

    Returns:
        None. The figure is displayed with plt.show().
    """

    # Banner
    print("\n🎯 Model Evaluation 🎯\n")

    # Run the model on both splits before any plotting starts
    train_preds = model.predict(train_data)
    test_preds = model.predict(test_data)

    # Cap the number of plotted points so the lines stay readable
    num_samples = min(30, len(train_labels), len(test_labels))

    # One row, two panels: left = train, right = test
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        ("Train Data: Actual vs. Predicted", train_labels, train_preds),
        ("Test Data: Actual vs. Predicted", test_labels, test_preds),
    )

    for ax, (title, actual, predicted) in zip(axes, panels):
        ax.plot(actual[:num_samples], "r-", label="True", alpha=0.7)
        ax.plot(predicted[:num_samples], "b-", label="Predicted", alpha=0.7)
        ax.set_title(title)
        ax.set_xlabel("Sample Index")
        ax.set_ylabel("Value")
        ax.legend()
        ax.grid(True, linestyle="--", alpha=0.6)

    # Render
    plt.tight_layout()
    plt.show()


# Function to visualize model training history
def visualize_model_history(model_history):
    """
    Plot every metric recorded during training as one line per metric over the epochs.

    Parameters:
    model_history (History): Object whose `.history` attribute maps metric names
        to per-epoch value lists (as returned by a Keras model's fit method).

    Returns:
    None. The figure is displayed with plt.show().
    """

    # Banner
    print("\n🎯 Training History Visualization 🎯\n")

    # Metric dict -> DataFrame (one column per metric), with friendlier legend
    # labels for the two loss columns; other columns keep their names
    legend_labels = {
        'loss': 'Training Loss',
        'val_loss': 'Validation Loss'
    }
    history_df = pd.DataFrame(model_history.history).rename(columns=legend_labels)

    # Line plot of all columns against the epoch index
    history_df.plot(figsize=(10, 6))
    plt.title('Model Training History')
    plt.xlabel('Epoch')
    plt.ylabel('Metric')
    plt.grid(True)

    # Render
    plt.show()


# Function to calculate accuracy of a model
def calculate_model_accuracy(model, test_data, test_labels, threshold):
    """
    Calculate a tolerance-based accuracy for a regression model.

    A prediction counts as an error when its absolute difference from the actual
    value is strictly greater than `threshold` (for multi-output predictions: when
    any output exceeds it). Accuracy is the fraction of predictions that are not errors.

    Parameters:
        model: Trained model exposing a `predict()` method
        test_data (numpy.ndarray): Testing feature set, passed to `model.predict`
        test_labels (numpy.ndarray): Testing labels, 1-D or one row per sample
        threshold (float): Maximum absolute error still counted as correct

    Returns:
        accuracy (float): 1 - (errors / predictions); NaN when the model
        produced no predictions at all.
    """

    # Print header for function
    print("\n🎯 Model Accuracy Calculation 🎯\n")

    # Predict values
    model_predictions = np.asarray(model.predict(test_data))
    num_predictions = len(model_predictions)

    print(f"\n🔹 Model errors above {threshold} (threshold):\n")

    # Without predictions the ratio below would divide by zero
    if num_predictions == 0:
        print("No predictions were produced; accuracy is undefined.")
        return float("nan")

    # Compare row by row. Both sides are viewed as (n_samples, n_outputs) so that
    # (n,) and (n, 1) layouts can be mixed freely; only the first n labels are used.
    actual_values = np.asarray(test_labels)[:num_predictions]
    abs_errors = np.abs(
        model_predictions.reshape(num_predictions, -1)
        - actual_values.reshape(num_predictions, -1)
    )
    is_error = (abs_errors > threshold).any(axis=1)

    # List every offending sample
    for index in np.flatnonzero(is_error):
        print(f"Prediction: {model_predictions[index]}, Actual: {test_labels[index]}")

    # Calculate accuracy
    num_errors = int(is_error.sum())
    accuracy = 1.0 - (num_errors / num_predictions)

    # Print summary
    print("\n🔹 Model Accuracy Summary:\n")
    print(f"Number of errors: {num_errors}")
    print(f"Accuracy: {accuracy:.2f}")

    return accuracy


# Function to add experiment results to a csv file
def add_experiment_result(
    train_data,
    train_labels,
    test_data,
    test_labels,
    model,
    batch_size,
    epochs,
    model_history,
    threshold,
    accuracy,
    description=None,
    csv_path=None
):
    """
    Extracts experiment parameters and results from model and history,
    then logs them as the newest (first) row of a CSV file and, when the
    `xlsxwriter` package is available, of a formatted Excel copy next to it.

    Parameters:
        train_data, train_labels, test_data, test_labels: Not used by this
            function; kept so existing calls keep working.
        model: Trained model; `name`, `optimizer` and `layers` are read from it.
        batch_size (int): Batch size used for training (logged as given).
        epochs (int): Number of epochs used for training (logged as given).
        model_history (History): Object whose `.history` dict holds a "loss" list
            and optionally a "val_loss" list.
        threshold (float): Error threshold used for the accuracy (logged as given).
        accuracy (float): Accuracy of the model (logged as given).
        description (str, optional): Free-text note stored with the row.
        csv_path (str, optional): Target CSV file. Defaults to the original
            hard-coded project location, which only exists on the author's machine.

    Returns:
        None.
    """

    # Print header for function
    print("\n🎯 Experiment Results Logging 🎯\n")

    # Extract model name
    model_name = model.name

    # Record when the experiment was logged
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Extract training parameters
    learning_rate = getattr(model.optimizer, "learning_rate", None)
    if hasattr(learning_rate, "numpy"):
        learning_rate = learning_rate.numpy()  # Convert Tensor to float

    # Extract optimizer details
    optimizer = type(model.optimizer).__name__

    # Extract model architecture details (activation and units of the first Dense layer)
    dense_layers = [layer for layer in model.layers if layer.__class__.__name__ == "Dense"]
    if dense_layers:
        activation_function = dense_layers[0].activation.__name__
        num_layers = len(dense_layers)
        num_units = dense_layers[0].units
    else:
        activation_function = None
        num_layers = len(model.layers)
        num_units = None

    # Extract evaluation metrics
    loss_history = model_history.history["loss"]
    final_loss = loss_history[-1]
    min_loss = min(loss_history)
    max_loss = max(loss_history)
    final_val_loss = model_history.history.get("val_loss", [None])[-1]

    # Create a dictionary of extracted data
    row_data = {
        "Name": model_name,
        "Timestamp": timestamp,
        "Batch Size": batch_size,
        "Epochs": epochs,
        "Learning Rate": learning_rate,
        "Optimizer": optimizer,
        "Activation Function": activation_function,
        "Number of Layers": num_layers,
        "Number of Units": num_units,
        "Loss": final_loss,
        "Minimum Loss": min_loss,
        "Maximum Loss": max_loss,
        "Validation Loss": final_val_loss,
        "Error Threshold": threshold,
        "Accuracy": accuracy,
        "Description": description
    }

    # Print values being logged
    print("\n🔹 Experiment Results:\n")
    for key, value in row_data.items():
        print(f"  {key}: {value}")

    # Resolve CSV file path (fall back to the original project location)
    if csv_path is None:
        csv_path = os.path.expanduser("/home/saeed/projects/ml/src/mr-engineer-playverse/boston-housing/experiment_results.csv")

    # Load existing CSV or start from an empty table
    try:
        experiment_results = pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        experiment_results = pd.DataFrame(columns=row_data.keys())

    # Put the new row on top of the previous results. Columns that are entirely
    # empty in the old data are dropped first; concat re-creates any column the
    # new row provides, so no manual column alignment is needed.
    new_row = pd.DataFrame([row_data])
    experiment_results = pd.concat(
        [new_row, experiment_results.dropna(axis=1, how="all")],
        ignore_index=True
    )

    # Make sure the target directory exists BEFORE anything is written
    target_dir = os.path.dirname(csv_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Save the CSV first: it is the primary log and must not be lost
    # if the optional Excel export fails
    experiment_results.to_csv(csv_path, index=False)

    # Excel copy lives next to the CSV; only the file extension is swapped
    excel_path = os.path.splitext(csv_path)[0] + ".xlsx"
    try:
        _write_experiment_results_excel(experiment_results, excel_path)
    except ImportError as exc:
        # The Excel engine is an optional dependency; the CSV is already on disk
        print(f"\n⚠️ Skipped Excel export ({exc}); CSV saved to {csv_path}")


def _write_experiment_results_excel(experiment_results, excel_path):
    """Write the results table to a centered, auto-width Excel sheet named "Results"."""
    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
        experiment_results.to_excel(writer, index=False, sheet_name="Results")

        # Get xlsxwriter workbook and worksheet objects
        workbook = writer.book
        worksheet = writer.sheets["Results"]

        # Set column widths based on max length of data in each column
        for col_idx, col in enumerate(experiment_results.columns):
            max_length = max(experiment_results[col].astype(str).map(len).max(), len(col)) + 2
            worksheet.set_column(col_idx, col_idx, max_length)

        # Centered format for data cells, centered + bold for the header
        cell_format = workbook.add_format({'align': 'center', 'valign': 'vcenter'})
        header_format = workbook.add_format({'align': 'center', 'valign': 'vcenter', 'bold': True})

        # Rewrite header with bold formatting
        for col_idx, col in enumerate(experiment_results.columns):
            worksheet.write(0, col_idx, col, header_format)

        # Rewrite data rows with formatting (starting from row 1)
        for row_idx in range(len(experiment_results)):
            for col_idx in range(len(experiment_results.columns)):
                value = experiment_results.iloc[row_idx, col_idx]

                # Convert NaN/Inf to text the worksheet can store
                if pd.isna(value):
                    value = "N/A"
                elif value == np.inf:
                    value = "Infinity"
                elif value == -np.inf:
                    value = "-Infinity"

                worksheet.write(row_idx + 1, col_idx, value, cell_format)


In [None]:
# Load the Boston Housing dataset through Keras; the loader's result is unpacked
# into (features, labels) pairs for the training and the test split
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.boston_housing.load_data()

# Analyze dataset before preprocessing (prints shapes, NaN counts and summary statistics)
analyze_dataset(train_data, train_labels, test_data, test_labels)

# Preprocess dataset
# NOTE: the four names are rebound to the preprocessed arrays here (features scaled with
# a MinMaxScaler fitted on the training features, labels reshaped to one column,
# everything cast to float32); the raw arrays are only recoverable by re-running the load above
train_data, train_labels, test_data, test_labels = preprocess_dataset(train_data, train_labels, test_data, test_labels)

# Analyze dataset after preprocessing (same report, now on the preprocessed arrays)
analyze_dataset(train_data, train_labels, test_data, test_labels)

In [None]:
# Train the current `model`, evaluate it, and log the experiment
print("\n🎯 Model Training 🎯\n")

# NOTE: this cell needs `model` to exist already. Run exactly one of the
# "Create model N" cells (found further down in this notebook) first; whichever
# model cell ran last is the one that gets trained and logged here.

# Set training parameters
batch_size = 8
epochs = 200
threshold = 5.0  # Maximum absolute error still counted as a correct prediction

# Early stopping callback (disabled; EarlyStopping is not among the imports in the import cell)
# early_stop = EarlyStopping(monitor="val_loss", patience=50, restore_best_weights=True)

# Train model and store training history
# NOTE(review): the test split doubles as validation data, so the reported
# validation loss is not an independent hold-out estimate
history = model.fit( # type: ignore
    x=train_data,
    y=train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(test_data, test_labels)
    # callbacks=[early_stop]
)

# Evaluate model after training
evaluate_model(model, train_data, train_labels, test_data, test_labels) # type: ignore

# Visualize model training history
visualize_model_history(history)

# Calculate accuracy of model with the configured threshold, so the value that is
# used here always matches the "Error Threshold" logged below
accuracy = calculate_model_accuracy(model, test_data, test_labels, threshold=threshold) # type: ignore

# Add experiment result to experiment results csv file
add_experiment_result(
    train_data,
    train_labels,
    test_data,
    test_labels,
    model, # type: ignore
    batch_size,
    epochs,
    history,
    threshold,
    accuracy
)

In [None]:
# Create model 1
# Architecture: Input(13) -> Dense(4, relu) -> Dense(1); optimizer Adam() with no arguments, MSE loss.
# This cell rebinds the global `model`, which the training cell then uses.

# Print header for model creation
print("\n🎯 Regression Model Creation 🎯\n")

# Define input layer
# (13 presumably matches the number of feature columns in train_data — confirm)
input_layer = Input(shape=(13,))

# Define first layer
first_layer = Dense(units=4, activation="relu")(input_layer)

# Define output layer (single unit, no activation specified)
output_layer = Dense(units=1)(first_layer)

# Define model
model = Model(inputs=input_layer, outputs=output_layer, name="m1")

# Display model summary
model.summary()

# Compile model
print("\n🎯 Model Compilation 🎯\n")
model.compile(optimizer=Adam(), loss=MeanSquaredError())

In [None]:
# Create model 2
# Architecture: Input(13) -> Dense(8, relu) -> Dense(1); optimizer Adam() with no arguments, MSE loss.
# Same as model 1 except that the hidden layer has 8 units instead of 4.
# This cell rebinds the global `model`, which the training cell then uses.

# Print header for model creation
print("\n🎯 Regression Model Creation 🎯\n")

# Define input layer
# (13 presumably matches the number of feature columns in train_data — confirm)
input_layer = Input(shape=(13,))

# Define first layer
first_layer = Dense(units=8, activation="relu")(input_layer)

# Define output layer (single unit, no activation specified)
output_layer = Dense(units=1)(first_layer)

# Define model
model = Model(inputs=input_layer, outputs=output_layer, name="m2")

# Display model summary
model.summary()

# Compile model
print("\n🎯 Model Compilation 🎯\n")
model.compile(optimizer=Adam(), loss=MeanSquaredError())

In [None]:
# Create model 3
# Architecture: Input(13) -> Dense(8, relu) -> Dense(4, relu) -> Dense(1);
# optimizer Adam() with no arguments, MSE loss.
# This cell rebinds the global `model`, which the training cell then uses.

# Print header for model creation
print("\n🎯 Regression Model Creation 🎯\n")

# Define input layer
# (13 presumably matches the number of feature columns in train_data — confirm)
input_layer = Input(shape=(13,))

# Define first layer
first_layer = Dense(units=8, activation="relu")(input_layer)

# Define second layer
second_layer = Dense(units=4, activation="relu")(first_layer)

# Output layer (single unit, no activation specified)
output_layer = Dense(units=1)(second_layer)

# Define model
model = Model(inputs=input_layer, outputs=output_layer, name="m3")

# Display model summary
model.summary()

# Compile model
print("\n🎯 Model Compilation 🎯\n")
model.compile(optimizer=Adam(), loss=MeanSquaredError())

In [None]:
# Create model 4
# Architecture: Input(13) -> Dense(8, relu) -> Dense(4, relu) -> Dense(1); MSE loss.
# Same layers as model 3, but compiled with SGD (momentum 0.9) instead of Adam.
# This cell rebinds the global `model`, which the training cell then uses.

# Print header for model creation
print("\n🎯 Regression Model Creation 🎯\n")

# Define input layer
# (13 presumably matches the number of feature columns in train_data — confirm)
input_layer = Input(shape=(13,))

# Define first layer
first_layer = Dense(units=8, activation="relu")(input_layer)

# Define second layer
second_layer = Dense(units=4, activation="relu")(first_layer)

# Output layer (single unit, no activation specified)
output_layer = Dense(units=1)(second_layer)

# Define model
model = Model(inputs=input_layer, outputs=output_layer, name="m4")

# Display model summary
model.summary()

# Compile model
# (0.01 is passed positionally to SGD — presumably the learning rate; confirm against the Keras SGD signature)
print("\n🎯 Model Compilation 🎯\n")
model.compile(optimizer=SGD(0.01, momentum=0.9), loss=MeanSquaredError())


In [None]:
# Create model 5
# Architecture: Input(13) -> Dense(8, relu) -> Dense(4, relu) -> Dense(1); MSE loss.
# Same layers as model 3, compiled with Adam and an explicit learning rate of 0.001.
# This cell rebinds the global `model`, which the training cell then uses.

# Print header for model creation
print("\n🎯 Regression Model Creation 🎯\n")

# Define input layer
# (13 presumably matches the number of feature columns in train_data — confirm)
input_layer = Input(shape=(13,))

# Define first layer
first_layer = Dense(units=8, activation="relu")(input_layer)

# Define second layer
second_layer = Dense(units=4, activation="relu")(first_layer)

# Output layer (single unit, no activation specified)
output_layer = Dense(units=1)(second_layer)

# Define model
model = Model(inputs=input_layer, outputs=output_layer, name="m5")

# Display model summary
model.summary()

# Compile model
# NOTE(review): 0.001 appears to be Keras' documented default learning rate for Adam; if so,
# this configuration differs from model 3 only by name — confirm that is intended
print("\n🎯 Model Compilation 🎯\n")
model.compile(optimizer=Adam(learning_rate=0.001), loss=MeanSquaredError())