In [2]:
import numpy as np
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy


In [3]:
# Data generator
def generate_data(n_samples, n_features=10):
    X = np.random.normal(size=(n_samples, n_features))
    coef = np.random.uniform(-1, 1, size=n_features)
    logits = X @ coef
    y = (logits > 0).astype(int)
    return X, y

In [4]:

# Define dataset sizes and architectures
dataset_sizes = [1000, 10000, 100000]
architectures = ['1-layer', '2-layer']
results = []

In [5]:

# Run experiments
for size in dataset_sizes:
    X, y = generate_data(size)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    for arch in architectures:
        model = Sequential()
        if arch == '1-layer':
            model.add(Dense(4, activation='relu', input_shape=(X.shape[1],)))
        elif arch == '2-layer':
            model.add(Dense(4, activation='relu', input_shape=(X.shape[1],)))
            model.add(Dense(4, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer=Adam(), loss=BinaryCrossentropy())

        start_time = time.time()
        history = model.fit(X_train, y_train, epochs=10, verbose=0)
        elapsed_time = time.time() - start_time

        train_preds = model.predict(X_train).flatten()
        val_preds = model.predict(X_val).flatten()

        train_error = log_loss(y_train, train_preds)
        val_error = log_loss(y_val, val_preds)

        results.append({
            'Data Size': size,
            'Configuration': arch,
            'Training Error': round(train_error, 4),
            'Validation Error': round(val_error, 4),
            'Time (s)': round(elapsed_time, 2)
        })

# Show results
df_dl_results = pd.DataFrame(results)
print(df_dl_results)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
   Data Size Configuration  Training Error  Validation Error  Time (s)
0       1000       1-layer          0.6450            0.6363      4.11
1       1000       2-layer          0.5723            0.5569      6.16
2      10000       1-layer          0.0584            0.0533      6.38
3      10000       2-layer          0.0271            0.0241      7.70
4     100000       1-layer          0.0116            0.0120     45.10
5     100000       2-layer          0.0064            0.0064     52.83


As dataset size increased, both training and validation errors decreased substantially, indicating improved model generalization. Models with 2 hidden layers consistently outperformed those with 1 hidden layer, achieving lower errors across all dataset sizes. For example, at 100,000 observations, the 2-layer model achieved a validation error of 0.0064, compared to 0.0120 for the 1-layer model. While the 2-layer models required more training time, the gain in accuracy justifies the additional computation. Thus, the 2-layer architecture is superior, especially for larger datasets.