In [None]:
# 1. Import Libraries

# 1.1. Standard Libraries

# 1.2. Third-party Libraries

# Data manipulation
import numpy as np  # Efficient array operations
import pandas as pd  # Data structures for working with structured data

# Data visualization
import matplotlib.pyplot as plt  # Plotting and data visualization
import seaborn as sns  # Further data visualization

# Preprocessing
from sklearn.preprocessing import MinMaxScaler  # Scales data to a range [0, 1]

# Deep learning
import tensorflow as tf  # Core machine learning framework
from keras.api.models import Model  # Model class
from keras.api.layers import Input, Dense  # Layers for the model

# 1.3. Local Libraries

2025-02-22 18:50:04.804889: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-22 18:50:04.805435: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-22 18:50:04.807986: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-22 18:50:04.815776: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740237604.829356  258616 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740237604.83

In [None]:
# 2. Function Definitions

# 2.1 Dataset Functions

# Function to analyze a dataset
def analyze_dataset(train_data, test_data, train_labels, test_labels):
    """
    Print a textual summary of a train/test split and draw diagnostic plots.

    The report covers, in order:
    - Shape and data types
    - Missing values (NaN counts)
    - Statistical summaries (pandas ``describe``)
    - Per-feature distributions, train vs. test (histograms with KDE)
    - Correlation heatmap of the training features
    - Outlier detection on the training features (boxplots)
    - Label distribution, train vs. test

    Parameters:
        train_data (numpy.ndarray): Training feature set; indexed as
            ``train_data[:, i]``, so it must be 2D (samples, features).
        test_data (numpy.ndarray): Testing feature set; must have at least as
            many columns as ``train_data``, since the train feature count
            drives the plotting loops.
        train_labels (numpy.ndarray): Training labels (a single label column).
        test_labels (numpy.ndarray): Testing labels (a single label column).

    Returns:
        None. All results are printed or shown as matplotlib figures.
    """

    # Convert to DataFrame for better analysis
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)
    train_labels_df = pd.DataFrame(train_labels, columns=[''])
    test_labels_df = pd.DataFrame(test_labels, columns=[''])

    # Dataset Shape and Data Types
    print("\n🔹 Dataset Shape & Data Types:")
    print(f"Train data shape: {train_data.shape}, Type: {train_data.dtype}")
    print(f"Test data shape: {test_data.shape}, Type: {test_data.dtype}")
    print(f"Train labels shape: {train_labels.shape}, Type: {train_labels.dtype}")
    print(f"Test labels shape: {test_labels.shape}, Type: {test_labels.dtype}")

    # Checking for Missing Values
    print("\n🔹 Missing Values:")
    print(f"Train data missing values: {np.isnan(train_data).sum()}")
    print(f"Test data missing values: {np.isnan(test_data).sum()}")
    print(f"Train labels missing values: {np.isnan(train_labels).sum()}")
    print(f"Test labels missing values: {np.isnan(test_labels).sum()}")

    # Summary Statistics (using DataFrame)
    print("\n🔹 Summary Statistics:")
    print("\nTrain Data Statistics:\n\n", train_df.describe())
    print("\nTest Data Statistics:\n\n", test_df.describe())
    print("\nTrain Labels Statistics:\n", train_labels_df.describe())
    print("\nTest Labels Statistics:\n", test_labels_df.describe())

    # Subplot grid shared by the histogram and boxplot figures.
    # Ceiling division gives exactly enough rows; the previous
    # `(num_features // 3) + 1` reserved an empty extra row whenever the
    # feature count was a multiple of 3.
    num_features = train_data.shape[1]
    grid_cols = 3
    grid_rows = -(-num_features // grid_cols)

    # Feature Distributions
    plt.figure(figsize=(15, num_features * 2))
    for i in range(num_features):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.histplot(train_data[:, i], kde=True, bins=30, color="blue", label="Train")
        sns.histplot(test_data[:, i], kde=True, bins=30, color="orange", label="Test")
        plt.xlabel(f"Feature {i}")
        plt.ylabel("Count")
        plt.legend()
    plt.suptitle("Feature Distributions (Train vs. Test)\n\n")
    plt.tight_layout()
    plt.show()

    # Correlation Heatmap (training features only)
    plt.figure(figsize=(12, 8))
    corr_matrix = train_df.corr()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Feature Correlation Heatmap\n")
    plt.show()

    # Outlier Detection (Boxplots, training features only)
    plt.figure(figsize=(15, num_features * 2))
    for i in range(num_features):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.boxplot(x=train_data[:, i], color="red")
        plt.xlabel(f"Feature {i}")
    plt.suptitle("Feature Outlier Detection (Boxplots)\n\n")
    plt.tight_layout()
    plt.show()

    # Label Distribution
    plt.figure(figsize=(10, 4))
    sns.histplot(train_labels, kde=True, bins=30, color="blue", label="Train Labels")
    sns.histplot(test_labels, kde=True, bins=30, color="orange", label="Test Labels")
    plt.xlabel("Labels")
    plt.ylabel("Count")
    plt.legend()
    plt.title("Label Distribution (Train vs. Test)\n")
    plt.show()

In [3]:
# 3. Load the Boston Housing dataset from Keras
# load_data() returns two (features, labels) pairs: the training split first,
# then the test split. Unpack them in two steps for readability.
train_pair, test_pair = tf.keras.datasets.boston_housing.load_data()
train_data, train_labels = train_pair
test_data, test_labels = test_pair

In [4]:
# 4. Run the exploratory report on the raw (un-normalized) splits
analyze_dataset(
    train_data=train_data,
    test_data=test_data,
    train_labels=train_labels,
    test_labels=test_labels,
)


🔹 Dataset Shape & Data Types:
Train data shape: (404, 13), Type: float64
Test data shape: (102, 13), Type: float64
Train labels shape: (404,), Type: float64
Test labels shape: (102,), Type: float64

🔹 Missing Values:
Train data missing values: 0
Test data missing values: 0
Train labels missing values: 0
Test labels missing values: 0

🔹 Summary Statistics:

Train Data Statistics:

                0           1           2           3           4           5   \
count  404.000000  404.000000  404.000000  404.000000  404.000000  404.000000   
mean     3.745111   11.480198   11.104431    0.061881    0.557356    6.267082   
std      9.240734   23.767711    6.811308    0.241238    0.117293    0.709788   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.081437    0.000000    5.130000    0.000000    0.453000    5.874750   
50%      0.268880    0.000000    9.690000    0.000000    0.538000    6.198500   
75%      3.674808   12.500000   18.100000    0.0

In [5]:

# 5. Preprocessing The Dataset

# 5.1. Reshape Labels

# Convert labels to a 2D array with shape (-1, 1) to ensure compatibility with models.
# The target shape is passed positionally: the `newshape=` keyword is deprecated
# in recent NumPy releases, while the positional form works on every version.
train_labels = np.reshape(train_labels, (-1, 1))
test_labels = np.reshape(test_labels, (-1, 1))
print("\nTrain Labels Shape:", train_labels.shape)
print("Test Labels Shape:", test_labels.shape)

# 5.2 Normalize Data using Min-Max Scaling

# Check per-column min/max values for each set of data before scaling
train_data_min, train_data_max = train_data.min(axis=0), train_data.max(axis=0)
test_data_min, test_data_max = test_data.min(axis=0), test_data.max(axis=0)
train_labels_min, train_labels_max = train_labels.min(axis=0), train_labels.max(axis=0)
test_labels_min, test_labels_max = test_labels.min(axis=0), test_labels.max(axis=0)
print("\nTrain Data Min:", train_data_min, "\nTrain Data Max:", train_data_max)
print("Test Data Min:", test_data_min, "\nTest Data Max:", test_data_max)
print("Train Labels Min:", train_labels_min, "\nTrain Labels Max:", train_labels_max)
print("Test Labels Min:", test_labels_min, "\nTest Labels Max:", test_labels_max)

# Fit the scaler on training data only, so no test-set statistics leak into it
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(train_data)

# Transform both training and test data with the training-set ranges.
# Test values may therefore fall slightly outside [0, 1].
train_data = min_max_scaler.transform(train_data)
test_data = min_max_scaler.transform(test_data)

# Check min/max values for training and test data after normalization
train_min_post, train_max_post = train_data.min(axis=0), train_data.max(axis=0)
test_min_post, test_max_post = test_data.min(axis=0), test_data.max(axis=0)
print("\nPost-Normalization Train Data Min:", train_min_post, "\nPost-Normalization Train Data Max:", train_max_post)
print("Post-Normalization Test Data Min:", test_min_post, "\nPost-Normalization Test Data Max:", test_max_post)

# 5.3. Change Data Types

# Convert dataset values to float32 to optimize memory usage and computation speed
train_data = train_data.astype(np.float32)
test_data = test_data.astype(np.float32)
train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)
print("\nTrain Data Type:", train_data.dtype)
print("Test Data Type:", test_data.dtype)
print("Train Labels Type:", train_labels.dtype)
print("Test Labels Type:", test_labels.dtype)



Train Labels Shape: (404, 1)
Test Labels Shape: (102, 1)

Train Data Min: [6.3200e-03 0.0000e+00 4.6000e-01 0.0000e+00 3.8500e-01 3.5610e+00
 2.9000e+00 1.1296e+00 1.0000e+00 1.8800e+02 1.2600e+01 3.2000e-01
 1.7300e+00] 
Train Data Max: [ 88.9762 100.      27.74     1.       0.871    8.725  100.      10.7103
  24.     711.      22.     396.9     37.97  ]
Test Data Min: [1.3110e-02 0.0000e+00 1.2200e+00 0.0000e+00 3.9200e-01 4.8800e+00
 6.0000e+00 1.4655e+00 1.0000e+00 1.8700e+02 1.3000e+01 2.4650e+01
 1.9200e+00] 
Test Data Max: [ 25.0461  90.      27.74     1.       0.871    8.78   100.      12.1265
  24.     711.      21.2    396.9     31.99  ]
Train Labels Min: [5.] 
Train Labels Max: [50.]
Test Labels Min: [5.6] 
Test Labels Max: [50.]

Post-Normalization Train Data Min: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 
Post-Normalization Train Data Max: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Post-Normalization Test Data Min: [ 7.63179629e-05  0.00000000e+00  2.78592375e-02  0.00000000

In [6]:
# 6. Create regression model

# Input: one vector of 13 features per sample
input_layer = Input(shape=(13,))

# Single hidden layer: 4 units with ReLU activation
hidden_dense = Dense(units=4, activation="relu")
first_layer = hidden_dense(input_layer)

# Output layer: one unit, no activation, since this is a regression target
output_dense = Dense(units=1)
output_layer = output_dense(first_layer)

# Wire input and output tensors together using the Functional API
regression_model = Model(inputs=input_layer, outputs=output_layer)

# Display model summary
regression_model.summary()

W0000 00:00:1740237608.266103  258616 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
