In [None]:
#  Import the libraries
import time

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential

# Function to train a model
def train_model(X, y, hidden_layers, dataset_name):
    """Train a small dense regression network and report its errors.

    The data is split 80/20 into training and validation sets (fixed
    ``random_state=42``), the features are standardized, and a Sequential
    model with one ReLU ``Dense`` layer per entry of ``hidden_layers`` plus a
    single-unit linear output is trained for 20 epochs with Adam on MSE loss.

    Args:
        X: Feature table accepted by ``train_test_split`` and
            ``StandardScaler`` (the caller passes a pandas DataFrame).
        y: Regression target aligned with ``X``.
        hidden_layers: Sequence of unit counts, one per hidden layer. An
            empty sequence yields a plain linear model.
        dataset_name: Label used in the progress message and in the
            returned record.

    Returns:
        dict with keys ``"Dataset"``, ``"Layers"``, ``"Train Error"``,
        ``"Val Error"`` (both mean squared error) and ``"Time (s)"``. The
        time covers only the ``model.fit`` call, not splitting, scaling or
        prediction.
    """
    print(f"\nTraining model {hidden_layers} on {dataset_name}")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only, then reuse its statistics
    # for the validation split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Declare the input shape with an explicit Input layer. Passing
    # `input_shape=` to the first Dense layer is discouraged for Sequential
    # models and makes Keras emit a warning on every call. It also means the
    # model has a defined input shape even when `hidden_layers` is empty.
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    for units in hidden_layers:
        model.add(Dense(units, activation='relu'))
    model.add(Dense(1))  # Output layer for regression

    model.compile(optimizer='adam', loss='mse')

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=0, validation_data=(X_val, y_val))
    end = time.perf_counter()

    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    train_error = mean_squared_error(y_train, train_pred)
    val_error = mean_squared_error(y_val, val_pred)
    time_taken = end - start

    print(f"Train Error: {train_error:.4f}, Val Error: {val_error:.4f}, Time: {time_taken:.2f}s")
    return {
        "Dataset": dataset_name,
        "Layers": str(hidden_layers),
        "Train Error": train_error,
        "Val Error": val_error,
        "Time (s)": time_taken
    }

# Row-count label -> CSV file holding a dataset of that size
files = {
    "1000": "dataset_1000.csv",
    "10000": "dataset_10000.csv",
    "100000": "dataset_1e+05.csv",
}

# Network shapes to benchmark on every dataset, each paired with the
# description that is appended to the dataset label in the results.
architectures = (
    ([4], "1 hidden layer"),
    ([4, 4], "2 hidden layers"),
)

results = []

# Train every architecture on every dataset; a missing file is reported
# and skipped so the remaining datasets still run.
for size, filename in files.items():
    try:
        data = pd.read_csv(filename)

        # Use the "target" column when it exists; otherwise treat the last
        # column as the target and everything before it as features.
        has_named_target = "target" in data.columns
        y = data["target"] if has_named_target else data.iloc[:, -1]
        X = data.drop(columns=["target"]) if has_named_target else data.iloc[:, :-1]

        for hidden_layers, label in architectures:
            record = train_model(X, y, hidden_layers, f"{size} rows - {label}")
            results.append(record)

    except FileNotFoundError:
        print(f"File not found: {filename} (please ensure it's in your directory)")

# Collect all runs into one table, write it to disk and display it
df_results = pd.DataFrame(results)
df_results.to_csv("deep_learning_results.csv", index=False)
print("\nFinal Results Table:\n", df_results)



Training model [4] on 1000 rows - 1 hidden layer


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
Train Error: 0.1423, Val Error: 0.1387, Time: 4.61s

Training model [4, 4] on 1000 rows - 2 hidden layers


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Train Error: 0.0681, Val Error: 0.0724, Time: 5.14s

Training model [4] on 10000 rows - 1 hidden layer


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Train Error: 0.0432, Val Error: 0.0402, Time: 16.11s

Training model [4, 4] on 10000 rows - 2 hidden layers


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Train Error: 0.0137, Val Error: 0.0135, Time: 15.08s

Training model [4] on 100000 rows - 1 hidden layer


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Train Error: 0.0026, Val Error: 0.0026, Time: 146.00s

Training model [4, 4] on 100000 rows - 2 hidden layers


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step  
Train Error: 0.0020, Val Error: 0.0020, Time: 139.29s

Final Results Table:
                          Dataset  Layers  Train Error  Val Error    Time (s)
0     1000 rows - 1 hidden layer     [4]     0.142268   0.138698    4.611115
1    1000 rows - 2 hidden layers  [4, 4]     0.068090   0.072374    5.142219
2    10000 rows - 1 hidden layer     [4]     0.043241   0.040219   16.112213
3   10000 rows - 2 hidden layers  [4, 4]     0.013719   0.013547   15.076488
4   100000 rows - 1 hidden layer     [4]     0.002596   0.002600  146.001660
5  100000 rows - 2 hidden layers  [4, 4]     0.002023   0.002019  139.287077


The results table shows how neural network performance varies with dataset size and model complexity. As the number of rows increases from 1,000 to 100,000, both training and validation errors decrease significantly, highlighting the positive impact of larger datasets on model accuracy. For example, the validation mean squared error (MSE) drops from about 0.139 with 1,000 rows to about 0.002 with 100,000 rows, showing that more data helps the model learn patterns more effectively and generalize better. Note that these values are squared errors on the scale of the target, not percentages. Additionally, increasing model complexity by using two hidden layers instead of one improves performance at every dataset size. This is most noticeable on the smaller datasets: the validation MSE is nearly halved at 1,000 rows (0.139 to 0.072) and falls by about two-thirds at 10,000 rows (0.040 to 0.014). However, the gain becomes much smaller at 100,000 rows (0.0026 to 0.0020), suggesting diminishing returns from added complexity when ample data is available. In terms of training time, it increases substantially with dataset size, but the number of layers has only a minor effect. Interestingly, for the largest dataset, the two-layer model trained slightly faster than the one-layer model, possibly due to variation in system performance or optimization differences. Overall, the results emphasize the importance of both data quantity and appropriate model complexity for achieving low error rates in neural network training.
