# Training of Final DNN
This jupyter notebook file is where the final version of the DNN is trained and saved. It is written to be fully reproducible.


I created a new env to run this file and its sister file (dnn_load_test.ipynb). 

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import numpy as np
import random

RANDOM_SEED = 11 # for reproduciblity
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Load data
df = pd.read_csv('../datasets/MEGAFRAME_CLEANEDV2.csv')

# Prepare features and target
X = df.drop(columns=['UNEMP', 'Reference area', 'REF_AREA', 'TIME_PERIOD'])
y = df['UNEMP']

# Define feature types
categorical_features = ['Region']
numerical_features = X.columns.difference(categorical_features)

# Create and fit preprocessor
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

X_processed = preprocessor.fit_transform(X)

# Save preprocessing parameters manually
import json

preprocessing_params = {
    'numerical_features': list(numerical_features),
    'categorical_features': categorical_features,
    'scaler_mean': preprocessor.named_transformers_['num'].mean_.tolist(),
    'scaler_scale': preprocessor.named_transformers_['num'].scale_.tolist(),
    'encoder_categories': [cat.tolist() for cat in preprocessor.named_transformers_['cat'].categories_]
}

with open('preprocessing_params.json', 'w') as f:
    json.dump(preprocessing_params, f, indent=2)

# Define model architecture
model = Sequential([
    Dense(128, activation='relu', kernel_regularizer=l2(0.0001), input_shape=(X_processed.shape[1],)),
    BatchNormalization(),
    Dropout(0.02),
    Dense(64, activation='relu', kernel_regularizer=l2(0.0001)),
    BatchNormalization(),
    Dropout(0.02),
    Dense(32, activation='relu', kernel_regularizer=l2(0.0001)),
    BatchNormalization(),
    Dense(16, activation='relu'),
    Dense(1)
])

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Create reproducible train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, test_size=0.1, random_state=RANDOM_SEED
)

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# Train model
model.fit(X_train, y_train, 
          epochs=300, 
          batch_size=8, 
          validation_data=(X_val, y_val), 
          callbacks=[early_stop],
          verbose=1)

# Evaluate model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = model.predict(X_processed)

mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Save trained model
model.save('Unemployment_AI_Revisions.keras')

# Test prediction on example data
new_data = pd.DataFrame({
    'Region': ['Europe and Central Asia'],  
    'Trade union density': [78.699997],
    'Combined corporate income tax rate': [28.0],
    'Education spending': [0.0734319847255705],
    'Health spending': [0.0631525528524754],
    'Housing spending': [0.0057497428086187],
    'Community development spending': [0.0025634702523358],
    'IRLT': [5.1075],
    'Population, total': [8895960.0],
    'GDP per capita (current US$)': [27259.4806735435],
    'Inflation, consumer prices (annual %)': [2.40595834145438],
    'Gini index': [26.5]
})

new_data_processed = preprocessor.transform(new_data)
predicted_unemployment = model.predict(new_data_processed, verbose=0)
print(f"\nPredicted Unemployment: {predicted_unemployment.flatten()[0]:.2f}%")

Epoch 1/300


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 74.0080 - mae: 7.6443 - val_loss: 64.7343 - val_mae: 6.9077
Epoch 2/300
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 48.8049 - mae: 6.0384 - val_loss: 52.2980 - val_mae: 6.0103
Epoch 3/300
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 26.3855 - mae: 4.1161 - val_loss: 35.9779 - val_mae: 4.6443
Epoch 4/300
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 11.4617 - mae: 2.5496 - val_loss: 24.7144 - val_mae: 3.5714
Epoch 5/300
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.2160 - mae: 1.9631 - val_loss: 19.3958 - val_mae: 3.0796
Epoch 6/300
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.6313 - mae: 1.5730 - val_loss: 15.9225 - val_mae: 2.7838
Epoch 7/300
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.7