## Apply variable transformations

In [4]:
import numpy
import pandas as pd
import os, sys
parent_dir = os.path.abspath('../../')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from functions.create_panel_dataset import custom_log_transform

In [5]:
# Load the untransformed panel data written to ./output: the main dataset
# and its "accounts" counterpart.
foundational_df_no_transformations = pd.read_csv(
    './output/foundational_no_transformations.csv'
)
foundational_df_no_transformations_accounts = pd.read_csv(
    './output/foundational_no_transformations_accounts.csv'
)

1. Log-transform the energy use variable (via `custom_log_transform`, parameterised by the column's minimum and maximum)

In [6]:
# Work on a copy so the untransformed frame stays available unchanged.
foundational_df_transformed = foundational_df_no_transformations.copy()

# custom_log_transform receives each value together with the observed
# minimum and maximum of the untransformed energy column.
energy_untransformed = foundational_df_no_transformations["energy"]
min_value, max_value = energy_untransformed.min(), energy_untransformed.max()
foundational_df_transformed["energy"] = energy_untransformed.apply(
    lambda energy_value: custom_log_transform(energy_value, min_value, max_value)
)

Same for accounts data

In [7]:
# Work on a copy so the untransformed accounts frame stays available unchanged.
foundational_df_transformed_accounts = foundational_df_no_transformations_accounts.copy()

# custom_log_transform receives each value together with the observed
# minimum and maximum of the untransformed accounts energy column.
energy_untransformed_accounts = foundational_df_no_transformations_accounts["energy"]
min_value, max_value = (
    energy_untransformed_accounts.min(),
    energy_untransformed_accounts.max(),
)
foundational_df_transformed_accounts["energy"] = energy_untransformed_accounts.apply(
    lambda energy_value: custom_log_transform(energy_value, min_value, max_value)
)

2. Transform need satisfaction variables using saturation transformation

In [8]:
# Maps each need-satisfaction column to the saturation value used in the
# transformation log(saturation - value) applied below.
need_satisfaction_variables = {
    "hale": 77,
    "education": 102,
    "socialsupport": 80,
    # "lifesatisfaction": 10,
    # "nutrition": 100.3,
    # "sanitation": 100.7,
    # "incomepoverty": 100.3,
    # "energyaccess": 100.7,
}
for col, saturation_value in need_satisfaction_variables.items():
    if col not in foundational_df_no_transformations.columns:
        print(f"Warning: Column {col} not found in imputed DataFrame.")
        continue

    # Vectorised form of applying numpy.log(saturation - x) to every element.
    foundational_df_transformed[col] = numpy.log(
        saturation_value - foundational_df_no_transformations[col]
    )

    # A value equal to the saturation point yields log(0) = -inf and a value
    # above it yields NaN. isna() does not flag -inf, so test for any
    # non-finite entry; this also still reports NaNs, as before.
    if not numpy.isfinite(foundational_df_transformed[col]).all():
        print(
            f"Warning: NaN or infinite values found in column {col}. Original values might be too close to or exceed the saturation point."
        )



Same for accounts data

In [9]:
# Apply the saturation transformation log(saturation - value) to the accounts
# data, using the saturation values in need_satisfaction_variables.
for col, saturation_value in need_satisfaction_variables.items():
    if col not in foundational_df_no_transformations_accounts.columns:
        print(f"Warning: Column {col} not found in imputed DataFrame.")
        continue

    # Vectorised form of applying numpy.log(saturation - x) to every element.
    foundational_df_transformed_accounts[col] = numpy.log(
        saturation_value - foundational_df_no_transformations_accounts[col]
    )

    # A value equal to the saturation point yields log(0) = -inf and a value
    # above it yields NaN. isna() does not flag -inf, so test for any
    # non-finite entry; this also still reports NaNs, as before.
    if not numpy.isfinite(foundational_df_transformed_accounts[col]).all():
        print(
            f"Warning: NaN or infinite values found in column {col}. Original values might be too close to or exceed the saturation point."
        )



3. Transform provisioning factors.
The foundational economy variables (`material`, `other`, `overlooked`, `providential`) are transformed with `custom_log_transform`, using each column's own minimum and maximum.

In [10]:
# Columns transformed with custom_log_transform, each using its own observed
# minimum and maximum from the untransformed frame.
provisioning_factor_variables = ["material", "other", "overlooked", "providential"]

for col in provisioning_factor_variables:
    untransformed_column = foundational_df_no_transformations[col]
    min_value, max_value = untransformed_column.min(), untransformed_column.max()
    foundational_df_transformed[col] = untransformed_column.apply(
        lambda raw_value: custom_log_transform(raw_value, min_value, max_value)
    )

Same for accounts data

In [11]:
# Transform the same provisioning-factor columns in the accounts data, with
# the minimum and maximum taken from the untransformed accounts column.
for col in provisioning_factor_variables:
    untransformed_column_accounts = foundational_df_no_transformations_accounts[col]
    min_value, max_value = (
        untransformed_column_accounts.min(),
        untransformed_column_accounts.max(),
    )
    foundational_df_transformed_accounts[col] = untransformed_column_accounts.apply(
        lambda raw_value: custom_log_transform(raw_value, min_value, max_value)
    )

Standardize by subtracting the mean and dividing by the standard deviation.

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Columns standardised with StandardScaler.
variables_to_scale = [
    'material', 'other', 'overlooked', 'providential',
    'energy', 'ladder', 'socialsupport', 'hale',
    'freedom', 'charity', 'corruption', 'positive', 'negative',
    'education', 'wdi_hale', 'goveffectiveness', 'gini',
]

# Fit on the transformed main dataset and overwrite the columns with the
# standardised values; the fitted scaler's mean_ and scale_ are read later.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(
    foundational_df_transformed[variables_to_scale]
)
foundational_df_transformed[variables_to_scale] = standardized_values

Same for accounts data

In [14]:
# The accounts data gets its own scaler, fitted on the accounts columns only.
scaler_accounts = StandardScaler()
standardized_values_accounts = scaler_accounts.fit_transform(
    foundational_df_transformed_accounts[variables_to_scale]
)
foundational_df_transformed_accounts[variables_to_scale] = standardized_values_accounts

In [15]:
# Save the main scaler's fitted scale_ and mean_, one row per scaled variable.
scaler_df = pd.DataFrame(
    {
        'variable': variables_to_scale,
        'scale': scaler.scale_,
        'mean': scaler.mean_,
    }
)
scaler_df.to_csv('./output/scaler_values.csv', index=False)

In [16]:
# Save the accounts scaler's fitted scale_ and mean_, one row per scaled variable.
scaler_df_accounts = pd.DataFrame(
    {
        'variable': variables_to_scale,
        'scale': scaler_accounts.scale_,
        'mean': scaler_accounts.mean_,
    }
)
scaler_df_accounts.to_csv('./output/scaler_values_accounts.csv', index=False)

In [17]:
# Preview the first five rows of the transformed and standardised main dataset.
foundational_df_transformed.head(5)

Unnamed: 0,geo,TIME_PERIOD,material,other,overlooked,providential,energy,ladder,socialsupport,hale,freedom,charity,corruption,positive,negative,education,wdi_hale,goveffectiveness,gini
0,AL,2012,0.32441,-2.557666,0.041156,1.808007,-2.368645,-0.980589,1.858332,0.632234,-1.35445,-1.011505,0.647092,-1.127141,0.202938,-0.178674,-0.360672,-1.797876,-0.685966
1,AL,2016,-0.517182,-4.211705,3.545856,1.316092,-2.184553,-2.077999,4.151838,0.264796,-0.444514,-0.021611,0.863043,-0.977443,0.935614,-0.354074,-0.096066,-1.335701,0.464768
2,AL,2020,-2.087624,1.354344,2.896553,-2.020952,-2.283013,-1.139873,3.029019,0.21977,-0.273901,0.129167,0.822297,-1.020214,0.11674,0.458563,-0.718024,-1.596241,-0.588031
3,AT,2008,0.152692,1.45962,0.929579,-1.900301,0.746746,0.855018,-0.486372,-0.050768,0.614709,1.984399,-0.30635,0.615776,-1.20495,-0.318893,0.426401,1.078706,-0.343194
4,AT,2012,0.23337,1.599969,1.018032,-2.142763,0.716622,1.096689,-0.642844,-0.251848,0.906173,0.850282,0.333353,0.573005,-1.434809,-0.312564,0.594233,0.780773,-0.31871


In [18]:
# Write the transformed main dataset without the row index.
foundational_df_transformed.to_csv(
    './output/foundational_transformed.csv',
    index=False,
)

In [19]:
# Write the transformed accounts dataset without the row index.
foundational_df_transformed_accounts.to_csv(
    './output/foundational_transformed_accounts.csv',
    index=False,
)

### Next step: conduct regressions! Open [regressions.ipynb](regressions.ipynb) to proceed