### Week 12 - Neural Network

For this assignment, you will combine your work from last week with deep learning models to built this week, and will compare the results. Please complete the following tasks.

In [29]:
# import libraries
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [30]:
import statsmodels.api as sm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scipy.special import expit as sigmoid

In [31]:
# load dataset
df = pd.read_csv('pima-indians-diabetes.data.csv', header=None)
df.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
              'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = df.dropna()
X_real = df.drop(columns='Outcome')
y_real = df['Outcome']

In [32]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_real)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_real.columns)
X_scaled_df['Intercept'] = 1
X_scaled_df = X_scaled_df[['Intercept'] + list(X_real.columns)]
model = sm.Logit(y_real, X_scaled_df).fit(disp=0)
coeffs = model.params

In [33]:
# generate synthetic datasets with logistic model
def generate_synthetic_data_set(size):
    sampled_X_data = X_real.sample(n=size, replace=True, random_state=42).reset_index(drop=True)
    X_scaled_data = scaler.transform(sampled_X_data)
    # Fix: Use X_scaled_data instead of X_scaled to create the DataFrame
    X_df = pd.DataFrame(X_scaled_data, columns=sampled_X_data.columns)
    X_df['Intercept'] = 1
    X_df = X_df[['Intercept'] + list(sampled_X_data.columns)]
    logits = np.dot(X_df, coeffs)
    probs = sigmoid(logits)
    y_synth = np.random.binomial(1, probs)
    return sampled_X_data, y_synth

In [34]:
# TRAIN DEEP LEARN MODEL AND GET RESULTS
# Define deep learning model with required configuration: 4 nodes per hidden layer
def train_deep_model(X, y, layers):
    X_train, X_valU, y_train, y_valU = train_test_split(X, y, test_size=0.2, random_state=42)
    model = Sequential()
    model.add(Dense(4, activation='relu', input_dim=X.shape[1]))
    if layers == 2:
        model.add(Dense(4, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy')
    start = time.time()
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    end = time.time()
    train_preds = model.predict(X_train).flatten()
    val_preds = model.predict(X_valU).flatten()
    return log_loss(y_train, train_preds), log_loss(y_valU, val_preds), round(end - start, 2)

In [28]:
#RUN THE CODE AND GET RESULTS
results = []
for size in [1000, 10000, 100000]:
    X_syn, y_synth = generate_synthetic_data_set(size)
    for layer_count in [1, 2]:
        tr_err, val_err, exec_time = train_deep_model(X_syn, y_synth, layer_count)
        results.append({
            'Data Size': size,
            'Hidden Layers': layer_count,
            'Training Error': tr_err,
            'Validation Error': val_err,
            'Execution Time (s)': exec_time
        })

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [35]:
pd.DataFrame(results)

Unnamed: 0,Data Size,Hidden Layers,Training Error,Validation Error,Execution Time (s)
0,1000,1,10.381582,10.232077,1.7
1,1000,2,1.225252,1.135256,2.13
2,10000,1,0.589782,0.591053,8.29
3,10000,2,0.616871,0.614597,7.05
4,100000,1,0.487073,0.485566,45.45
5,100000,2,0.477874,0.476901,47.71


The table above compares the performance of various deep learning models trained on different dataset sizes and configurations, including changes in the number of layers, neurons, and training epochs. Analyzing these results allows us to see how the deep learning model grows with more data and how architectural modifications affect its accuracy and generalization capacity. These results are now ready to be compared to the performance of the XGBoost model implemented in Week 11.


XGBoost, noted for its ability to handle structured/tabular data and provide high accuracy with minimal training time, serves as a solid baseline. Comparing deep learning models to XGBoost will help us identify which strategy is better for this challenge in terms of predictive performance, computational cost, and model interpretability. This comparison will help you select the best model for real-world deployment.