In [None]:
# 1. Import Libraries

# 1.1. Standard Libraries

# 1.2. Third-party Libraries

# Data manipulation
import numpy as np  # Efficient array operations
import pandas as pd  # Data structures for working with structured data

# Data visualization
import matplotlib.pyplot as plt  # Plotting and data visualization
import seaborn as sns  # Further data visualization

# Preprocessing
from sklearn.preprocessing import MinMaxScaler  # Scales data to a range [0, 1]

# Deep learning
import tensorflow as tf  # Core machine learning framework
from keras.api.models import Model  # Model class
from keras.api.layers import Input, Dense  # Layers for the model
from keras.api.optimizers import Adam  # Optimizer for the model
from keras.api.losses import MeanSquaredError  # Loss function for the model

# 1.3. Local Libraries

In [None]:
# 2. Function Definitions

# 2.1 Dataset Functions

# Function to analyze a dataset
def analyze_dataset(train_data, test_data, train_labels, test_labels):
    """
    Print and plot an exploratory analysis of a train/test dataset.

    The analysis covers, in order:
    - Shape and data types
    - Missing values (NaN counts)
    - Statistical summaries (DataFrame.describe)
    - Feature distributions (train and test histograms overlaid per feature)
    - Correlation heatmap of the training features
    - Outlier detection (one boxplot per training feature)
    - Label distribution (train and test histograms overlaid)

    Parameters:
        train_data (numpy.ndarray): Training feature set, 2-D (samples, features)
        test_data (numpy.ndarray): Testing feature set, indexed with the same
            feature columns as train_data
        train_labels (numpy.ndarray): Training labels
        test_labels (numpy.ndarray): Testing labels

    Returns:
        None. Everything is printed to stdout or shown as matplotlib figures.
    """

    # Convert to DataFrame for better analysis
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)
    train_labels_df = pd.DataFrame(train_labels, columns=[''])
    test_labels_df = pd.DataFrame(test_labels, columns=[''])

    # Dataset Shape and Data Types
    print("\n🔹 Dataset Shape & Data Types:")
    print(f"Train data shape: {train_data.shape}, Type: {train_data.dtype}")
    print(f"Test data shape: {test_data.shape}, Type: {test_data.dtype}")
    print(f"Train labels shape: {train_labels.shape}, Type: {train_labels.dtype}")
    print(f"Test labels shape: {test_labels.shape}, Type: {test_labels.dtype}")

    # Checking for Missing Values
    print("\n🔹 Missing Values:")
    print(f"Train data missing values: {np.isnan(train_data).sum()}")
    print(f"Test data missing values: {np.isnan(test_data).sum()}")
    print(f"Train labels missing values: {np.isnan(train_labels).sum()}")
    print(f"Test labels missing values: {np.isnan(test_labels).sum()}")

    # Summary Statistics (using DataFrame)
    print("\n🔹 Summary Statistics:")
    print("\nTrain Data Statistics:\n\n", train_df.describe())
    print("\nTest Data Statistics:\n\n", test_df.describe())
    print("\nTrain Labels Statistics:\n", train_labels_df.describe())
    print("\nTest Labels Statistics:\n", test_labels_df.describe())

    # Subplot grid shared by the histogram and boxplot figures.
    # Ceiling division gives exactly enough rows for every feature; the previous
    # `(num_features // 3) + 1` added a whole empty row whenever the feature
    # count was a multiple of the column count.
    num_features = train_data.shape[1]
    num_cols = 3
    num_rows = -(-num_features // num_cols)

    # Feature Distributions
    plt.figure(figsize=(15, num_features * 2))
    for i in range(num_features):
        plt.subplot(num_rows, num_cols, i + 1)
        sns.histplot(train_data[:, i], kde=True, bins=30, color="blue", label="Train")
        sns.histplot(test_data[:, i], kde=True, bins=30, color="orange", label="Test")
        plt.xlabel(f"Feature {i}")
        plt.ylabel("Count")
        plt.legend()
    plt.suptitle("Feature Distributions (Train vs. Test)\n\n")
    plt.tight_layout()
    plt.show()

    # Correlation Heatmap (training features only)
    plt.figure(figsize=(12, 8))
    corr_matrix = train_df.corr()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Feature Correlation Heatmap\n")
    plt.show()

    # Outlier Detection (Boxplots, training features only)
    plt.figure(figsize=(15, num_features * 2))
    for i in range(num_features):
        plt.subplot(num_rows, num_cols, i + 1)
        sns.boxplot(x=train_data[:, i], color="red")
        plt.xlabel(f"Feature {i}")
    plt.suptitle("Feature Outlier Detection (Boxplots)\n\n")
    plt.tight_layout()
    plt.show()

    # Label Distribution
    # Labels are flattened so a column vector of shape (n, 1) is drawn as a
    # single series, exactly like a 1-D label array.
    plt.figure(figsize=(10, 4))
    sns.histplot(np.ravel(train_labels), kde=True, bins=30, color="blue", label="Train Labels")
    sns.histplot(np.ravel(test_labels), kde=True, bins=30, color="orange", label="Test Labels")
    plt.xlabel("Labels")
    plt.ylabel("Count")
    plt.legend()
    plt.title("Label Distribution (Train vs. Test)\n")
    plt.show()

In [None]:
# 3. Load the Boston Housing dataset from Keras
# load_data() returns two (features, labels) pairs: the training split, then the test split
train_split, test_split = tf.keras.datasets.boston_housing.load_data()
train_data, train_labels = train_split
test_data, test_labels = test_split

In [None]:
# 4. Run the exploratory analysis (analyze_dataset) on the splits loaded above
analyze_dataset(
    train_data=train_data,
    test_data=test_data,
    train_labels=train_labels,
    test_labels=test_labels,
)

In [None]:

# 5. Preprocessing The Dataset

# 5.1. Reshape Labels

# Convert labels to a 2D array with shape (-1, 1) to ensure compatibility with models.
# The target shape is passed positionally: the `newshape` keyword of np.reshape is
# deprecated in recent NumPy releases, while the positional form works everywhere.
train_labels = np.reshape(train_labels, (-1, 1))
test_labels = np.reshape(test_labels, (-1, 1))
print("\nTrain Labels Shape:", train_labels.shape)
print("Test Labels Shape:", test_labels.shape)

# 5.2 Normalize Data using Min-Max Scaling

# Check min/max values for each set of data
train_data_min, train_data_max = train_data.min(axis=0), train_data.max(axis=0)
test_data_min, test_data_max = test_data.min(axis=0), test_data.max(axis=0)
train_labels_min, train_labels_max = train_labels.min(axis=0), train_labels.max(axis=0)
test_labels_min, test_labels_max = test_labels.min(axis=0), test_labels.max(axis=0)
print("\nTrain Data Min:", train_data_min, "\nTrain Data Max:", train_data_max)
print("Test Data Min:", test_data_min, "\nTest Data Max:", test_data_max)
print("Train Labels Min:", train_labels_min, "\nTrain Labels Max:", train_labels_max)
print("Test Labels Min:", test_labels_min, "\nTest Labels Max:", test_labels_max)

# Fit the scaler on training data only, so the scaling parameters come from the
# training split and are then reused unchanged for the test split
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(train_data)

# Transform both training and test data (labels are left unscaled)
train_data = min_max_scaler.transform(train_data)
test_data = min_max_scaler.transform(test_data)

# Check min/max values for training and test data after normalization
train_min_post, train_max_post = train_data.min(axis=0), train_data.max(axis=0)
test_min_post, test_max_post = test_data.min(axis=0), test_data.max(axis=0)
print("\nPost-Normalization Train Data Min:", train_min_post, "\nPost-Normalization Train Data Max:", train_max_post)
print("Post-Normalization Test Data Min:", test_min_post, "\nPost-Normalization Test Data Max:", test_max_post)

# 5.3. Change Data Types

# Convert dataset values to float32 to optimize memory usage and computation speed
train_data = train_data.astype(np.float32)
test_data = test_data.astype(np.float32)
train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)
print("\nTrain Data Type:", train_data.dtype)
print("Test Data Type:", test_data.dtype)
print("Train Labels Type:", train_labels.dtype)
print("Test Labels Type:", test_labels.dtype)


In [None]:
# 6.1. Create regression model with minimal setting

# Layer objects: a single ReLU hidden layer (4 units) and a one-unit output
# layer with no activation, as needed for regression
hidden_dense = Dense(units=4, activation="relu")
output_dense = Dense(units=1)

# Wire the layers together with the Functional API, starting from a 13-feature input
input_layer = Input(shape=(13,))
first_layer = hidden_dense(input_layer)
output_layer = output_dense(first_layer)

# Build the model from its input and output tensors
regression_model = Model(inputs=input_layer, outputs=output_layer)

# Show the layer-by-layer summary
regression_model.summary()

In [None]:
# 6.2. Create regression model with 32 batch size for input

# Layer objects: a single ReLU hidden layer (4 units) and a one-unit output
# layer with no activation, as needed for regression
hidden_dense = Dense(units=4, activation="relu")
output_dense = Dense(units=1)

# Wire the layers together with the Functional API; the input declares a fixed
# batch size of 32 in addition to the 13 features
input_layer = Input(shape=(13,), batch_size=32)
first_layer = hidden_dense(input_layer)
output_layer = output_dense(first_layer)

# Build the model from its input and output tensors
regression_model = Model(inputs=input_layer, outputs=output_layer)

# Show the layer-by-layer summary
regression_model.summary()

In [None]:
# 6.3. Create regression model with 8 units in the hidden layer

# Layer objects: a single ReLU hidden layer (8 units) and a one-unit output
# layer with no activation, as needed for regression
hidden_dense = Dense(units=8, activation="relu")
output_dense = Dense(units=1)

# Wire the layers together with the Functional API, starting from a 13-feature input
input_layer = Input(shape=(13,))
first_layer = hidden_dense(input_layer)
output_layer = output_dense(first_layer)

# Build the model from its input and output tensors; this is the last definition
# of regression_model, so it is the one the following cells compile and train
regression_model = Model(inputs=input_layer, outputs=output_layer)

# Show the layer-by-layer summary
regression_model.summary()

In [None]:
# 7. Compile the regression model with Adam optimizer and mean squared error loss
# (default-configured objects, equivalent to the "adam" / "mse" string aliases)
regression_model.compile(optimizer=Adam(), loss=MeanSquaredError())

In [None]:
# 8. Plot training labels and predictions before training
# Only the first 30 training samples are drawn.
plt.plot(train_labels[:30], "r-", label="y_true")
plt.plot(regression_model(train_data[:30]), "b-", label="y_pred")
plt.title("Training set: labels vs. predictions (before training)")
plt.xlabel("Sample index")
plt.ylabel("Label value")
plt.legend()  # the label= arguments above are only displayed once a legend is drawn
plt.show()

In [None]:
# 9. Plot test labels and predictions before training
# Only the first 30 test samples are drawn.
plt.plot(test_labels[:30], "r-", label="y_true")
plt.plot(regression_model(test_data[:30]), "b-", label="y_pred")
plt.title("Test set: labels vs. predictions (before training)")
plt.xlabel("Sample index")
plt.ylabel("Label value")
plt.legend()  # the label= arguments above are only displayed once a legend is drawn
plt.show()

In [None]:
# 9. Train the regression model on the training data
# The test split is passed as validation data, so its loss is reported after every epoch.
# The returned History object is kept so the per-epoch losses can be inspected afterwards
# (and so the cell does not echo the object's repr as its output).
history = regression_model.fit(
    x=train_data,
    y=train_labels,
    epochs=100,
    batch_size=8,
    validation_data=(test_data, test_labels),
)

In [None]:
# 10. Plot training labels and predictions after training
# Only the first 30 training samples are drawn.
plt.plot(train_labels[:30], "r-", label="y_true")
plt.plot(regression_model(train_data[:30]), "b-", label="y_pred")
plt.title("Training set: labels vs. predictions (after training)")
plt.xlabel("Sample index")
plt.ylabel("Label value")
plt.legend()  # the label= arguments above are only displayed once a legend is drawn
plt.show()

In [None]:
# 11. Plot test labels and predictions after training
# Only the first 30 test samples are drawn.
plt.plot(test_labels[:30], "r-", label="y_true")
plt.plot(regression_model(test_data[:30]), "b-", label="y_pred")
plt.title("Test set: labels vs. predictions (after training)")
plt.xlabel("Sample index")
plt.ylabel("Label value")
plt.legend()  # the label= arguments above are only displayed once a legend is drawn
plt.show()