In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import time
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): a blanket filter presumably also hides solver
# non-convergence warnings from the models fitted later in this file --
# confirm that is intended, or narrow the filter to specific categories.
warnings.filterwarnings('ignore')

# Read the dataset from the CSV file in the working directory
pima_df = pd.read_csv("week_11_pima.csv")

# Separate the target column from the feature columns
y = pima_df['outcome']
X = pima_df.drop(columns=['outcome'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits so no validation data leaks in
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Function to train and evaluate models
def train_and_evaluate(X_train, y_train, X_val, y_val, hidden_layers, max_samples=None):
    """Fit an MLP classifier and report its error rates and elapsed time.

    Args:
        X_train: Training features; must support ``[:n]`` slicing.
        y_train: Training labels aligned row-for-row with ``X_train``.
        X_val: Validation features.
        y_val: Validation labels.
        hidden_layers: Forwarded to ``MLPClassifier`` as
            ``hidden_layer_sizes``.
        max_samples: When given, only the first ``max_samples`` training
            rows are used. ``None`` (the default) uses every row.

    Returns:
        A ``(train_error, val_error, execution_time)`` tuple. Each error is
        ``1 - accuracy`` on the corresponding set; ``execution_time`` is the
        elapsed time in seconds spanning model construction, fitting,
        prediction on both sets and scoring.

    Raises:
        ValueError: If ``max_samples`` is given but is less than 1.
    """
    if max_samples is not None:
        # A plain truthiness test would treat 0 like "no limit", and a
        # negative value would silently slice from the end instead.
        if max_samples < 1:
            raise ValueError(
                f"max_samples must be a positive integer or None, got {max_samples!r}"
            )
        # Limit the training data size
        X_train_subset = X_train[:max_samples]
        y_train_subset = y_train[:max_samples]
    else:
        X_train_subset = X_train
        y_train_subset = y_train

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Create and train the model
    model = MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42
    )

    model.fit(X_train_subset, y_train_subset)

    # Calculate training and validation error (1 - accuracy)
    train_pred = model.predict(X_train_subset)
    val_pred = model.predict(X_val)

    train_error = 1 - accuracy_score(y_train_subset, train_pred)
    val_error = 1 - accuracy_score(y_val, val_pred)

    execution_time = time.perf_counter() - start_time

    return train_error, val_error, execution_time

# Run experiments
results = []

# Each configuration is (training-set size, hidden layer sizes, description)
configs = [
    (1000, (4,), "1 hidden layer 4 nodes"),
    (10000, (4,), "1 hidden layer 4 nodes"),
    (100000, (4,), "1 hidden layer 4 nodes"),
    (1000, (4, 4), "2 hidden layers of 4 nodes each"),
    (10000, (4, 4), "2 hidden layers of 4 nodes each"),
    (100000, (4, 4), "2 hidden layers of 4 nodes each")
]

print("Data size | Configuration | Training error | Validation error | Time of execution")
print("-" * 80)

n_available = len(X_train)

for data_size, hidden_layers, config_desc in configs:
    if data_size > n_available:
        # More rows requested than exist: draw row positions with
        # replacement. A freshly seeded generator per configuration keeps
        # the run reproducible and gives every architecture the same
        # resample at a given data size, so their results are comparable.
        # NOTE: resampled rows are duplicates, so they enlarge the training
        # set without adding new information.
        rng = np.random.default_rng(42)
        indices = rng.choice(n_available, size=data_size, replace=True)
        X_fit = X_train_scaled[indices]
        y_fit = y_train.iloc[indices]
        max_samples = None
    else:
        # Enough rows are available: train on the first `data_size` of them
        X_fit = X_train_scaled
        y_fit = y_train
        max_samples = data_size

    train_error, val_error, exec_time = train_and_evaluate(
        X_fit, y_fit, X_val_scaled, y_val,
        hidden_layers, max_samples=max_samples
    )

    print(f"{data_size:8d} | {config_desc:30s} | {train_error:.6f} | {val_error:.6f} | {exec_time:.6f}")

    results.append({
        'Data size': data_size,
        'Configuration': config_desc,
        'Training error': train_error,
        'Validation error': val_error,
        'Time of execution': exec_time
    })

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)
print("\nResults Summary:")
print(results_df)

Data size | Configuration | Training error | Validation error | Time of execution
--------------------------------------------------------------------------------
    1000 | 1 hidden layer 4 nodes         | 0.004000 | 0.015130 | 1.097652
   10000 | 1 hidden layer 4 nodes         | 0.001100 | 0.001515 | 3.505305
  100000 | 1 hidden layer 4 nodes         | 0.000920 | 0.001012 | 14.969793
    1000 | 2 hidden layers of 4 nodes each | 0.002000 | 0.006722 | 1.207694
   10000 | 2 hidden layers of 4 nodes each | 0.000300 | 0.001406 | 3.405355
  100000 | 2 hidden layers of 4 nodes each | 0.000580 | 0.000551 | 22.005155

Results Summary:
   Data size                    Configuration  Training error  \
0       1000           1 hidden layer 4 nodes         0.00400   
1      10000           1 hidden layer 4 nodes         0.00110   
2     100000           1 hidden layer 4 nodes         0.00092   
3       1000  2 hidden layers of 4 nodes each         0.00200   
4      10000  2 hidden layers of 4 node