In [None]:
# random forest for comparison, this tells us that maybe DNN is not the best model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest R²:", r2_score(y_test, y_pred_rf))

Random Forest R²: 0.9233577682525691


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

df = pd.read_csv('../datasets/MEGAFRAME_CLEANEDV2.csv')

# Remove both targets and irrelevant columns from features
X = df.drop(columns=['Gini index', 'UNEMP', 'Reference area', 'REF_AREA', 'Region'])
y = df['Gini index']

categorical_features = ['TIME_PERIOD']
numerical_features = X.columns.difference(categorical_features)

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

X_processed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
# Chat GPT gave me the idea to do droupout layers, the lower I used the better the model performed
model = Sequential([
    Dense(128, activation='relu', kernel_regularizer=l2(0.0001), input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.1),
    Dense(64, activation='relu', kernel_regularizer=l2(0.0001)),
    BatchNormalization(),
    Dropout(0.1),
    Dense(32, activation='relu', kernel_regularizer=l2(0.0001)),
    BatchNormalization(),
    Dense(16, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

model.fit(X_train, y_train, epochs=300, batch_size=8, validation_split=0.1, callbacks=[early_stop])

loss, mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {mae:.2f}")

new_data = pd.DataFrame({
    'TIME_PERIOD': [2001],
    'Trade union density': [78.699997],
    'Combined corporate income tax rate': [28.0],
    'Education spending': [0.0734319847255705],
    'Health spending': [0.0631525528524754],
    'Housing spending': [0.0057497428086187],
    'Community development spending': [0.0025634702523358],
    'IRLT': [5.1075],
    'Population, total': [8895960.0],
    'GDP per capita (current US$)': [27259.4806735435],
    'Inflation, consumer prices (annual %)': [2.40595834145438]
})

new_data_processed = preprocessor.transform(new_data)
predicted_gini = model.predict(new_data_processed)
print(f"Predicted Gini: {predicted_gini.flatten()[0]:.2f}")


Epoch 1/300


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 1023.3794 - mae: 31.4452 - val_loss: 957.7689 - val_mae: 30.7362
Epoch 2/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 885.7900 - mae: 29.3088 - val_loss: 900.7255 - val_mae: 29.7871
Epoch 3/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 749.7896 - mae: 26.9808 - val_loss: 790.8032 - val_mae: 27.8689
Epoch 4/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 629.8449 - mae: 24.5815 - val_loss: 605.0908 - val_mae: 24.1056
Epoch 5/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 472.8452 - mae: 20.9160 - val_loss: 410.5809 - val_mae: 19.1492
Epoch 6/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 321.7488 - mae: 16.5507 - val_loss: 238.1116 - val_mae: 13.2952
Epoch 7/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [49]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
MAE: 2.23
MSE: 10.81
RMSE: 3.29
R² Score: 0.69


In [53]:
from sklearn.metrics import mean_absolute_error, r2_score

mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X_processed):
    X_train, X_val = X_processed[train_index], X_processed[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    model = Sequential([
        Dense(128, activation='relu', kernel_regularizer=l2(0.0001), input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Dropout(0.1),
        Dense(64, activation='relu', kernel_regularizer=l2(0.0001)),
        BatchNormalization(),
        Dropout(0.1),
        Dense(32, activation='relu', kernel_regularizer=l2(0.0001)),
        BatchNormalization(),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    
    model.fit(X_train, y_train, epochs=300, batch_size=8, validation_data=(X_val, y_val), callbacks=[early_stop], verbose=0)
    
    y_val_pred = model.predict(X_val).flatten()
    
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    
    mae_scores.append(mae)
    r2_scores.append(r2)

print(f"Mean MAE over 5 folds: {np.mean(mae_scores):.2f} ± {np.std(mae_scores):.2f}")
print(f"Mean R² over 5 folds: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Mean MAE over 5 folds: 1.93 ± 0.16
Mean R² over 5 folds: 0.7293 ± 0.0907


In [54]:
print("Summary Statistics:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())
print("\nData Types:")
print(df.dtypes)

Summary Statistics:
       TIME_PERIOD  Trade union density  Combined corporate income tax rate  \
count   400.000000           400.000000                          400.000000   
mean   2010.605000            32.947000                           25.384909   
std       5.088094            21.854253                            6.892238   
min    2000.000000             7.100000                            9.000000   
25%    2006.750000            16.600000                           20.000000   
50%    2011.000000            25.050000                           25.000000   
75%    2015.000000            49.825000                           30.000000   
max    2020.000000            91.599998                           51.611553   

       Education spending  Health spending  Housing spending  \
count          400.000000       400.000000        400.000000   
mean             0.052006         0.064127          0.003168   
std              0.014906         0.017721          0.003309   
min         