# Provisioning Factors, Master Thesis

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings

## Loading the dataset

Load the data, remove inf values and extract countries, variables

In [None]:
# Load the combined dataset from the repository's data folder; the first CSV
# column (index_col=0) is used as the row index rather than as a data column.
all_in_one = pd.read_csv('../data/all_in_one.csv', index_col=0)

In [None]:
# Turn +/- infinity into NaN so they are handled like any other missing value.
all_in_one_no_inf = all_in_one.replace(to_replace=[np.inf, -np.inf], value=np.nan)

# Everything except the three identifier columns counts as a variable.
all_variables = all_in_one_no_inf.drop(
    ["Year", "Country.Code", "Country.Name"],
    axis="columns",
)

# Distinct country names, in order of first appearance in the data.
all_countries = all_in_one_no_inf['Country.Name'].unique()

Apply basic styles and a color scheme to all plots. Ignore selected warnings.

In [None]:
# Apply the seaborn "dark" theme to every figure created afterwards.
sns.set_theme(style="dark")

# Prepare a color palette for all unique countries in the dataset:
# one "husl" colour per country, keyed by country name.
country_colors = sns.color_palette("husl", len(all_countries))
palette = {country: color for country, color in zip(all_countries, country_colors)}

In [None]:
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

Create panel data set for each outcome variable.
Start with year 1995 where ecirank data is available, end with 2015, the last year for which energy data is available. 
Only keep observations where energy use data is available.
Remove selected columns (selection process?)

In [None]:
# Restrict the panel to the years 1995-2015 (``between`` includes both bounds).
in_selected_years = all_in_one_no_inf['Year'].between(1995, 2015)
all_in_one_selected_years = all_in_one_no_inf.loc[in_selected_years]

# Keep only the observations that report a value for 'energy'.
has_energy = all_in_one_selected_years['energy'].notna()
all_in_one_selected_years_with_energy_only = all_in_one_selected_years[has_energy]

Drop stock variables, only keep flow

In [None]:
# Columns classified as stock variables; they are removed so that only the
# remaining (flow) variables stay in the data set.
stock_variables = [
    'netmigration',
    'wealth',
    'patents',
    'concentration',
    'selfemployed',
    'grosscapital',
    'gdppercap',
    'gnipercap',
    'trade',
    'gini',
    'corruption',
    'mobilecellular',
]
all_in_one_selected_cols = all_in_one_selected_years_with_energy_only.drop(stock_variables, axis="columns")

For cross-sectional analysis filter the data frame by year 2012

In [None]:
# Cross-section: keep only the rows whose 'Year' equals 2012.
is_2012 = all_in_one_selected_cols['Year'].eq(2012)
all_in_one_selected_cols_2012 = all_in_one_selected_cols.loc[is_2012]

Begin subsetting the data. Assign outcome variables and indicators to separate data frames.

In [None]:
from functions.create_panel_dataset import remove_cols_with_few_observations, create_outcome_df_with_metadata, \
    remove_outliers_iqr

In [None]:
# Positional offsets handed to create_outcome_df_with_metadata.
# NOTE(review): 3 presumably skips the identifier columns (Year, Country.Code,
# Country.Name) and 17 presumably marks where the outcome columns start --
# confirm against functions/create_panel_dataset.
indicator_index_offset = 3
outcome_index_offset = 17

# Outcome columns analysed in this notebook (one data subset is built per entry).
outcome_variables = [
    'lifeexpectancy_over_energy',
    'nutrition_over_energy',
    'education_over_energy',
    'sanitation_over_energy',
]

Create separate data frames containing observations with data available for each outcome variable. Remove columns with few observations.
After subsetting is done, create final data frames with metadata.

In [None]:
# For every outcome: keep the rows where that outcome is present, then pass the
# frame through the outlier filter and the sparse-column filter before the
# metadata helper builds the final per-outcome structure.
outcome_dfs_with_metadata = {}
for outcome in outcome_variables:
    cleaned_panel = (
        all_in_one_selected_cols
        .dropna(subset=[outcome])
        .pipe(remove_outliers_iqr)
        .pipe(remove_cols_with_few_observations)
    )
    outcome_dfs_with_metadata[outcome] = create_outcome_df_with_metadata(
        cleaned_panel, outcome, indicator_index_offset, outcome_index_offset
    )

In [None]:
# Same per-outcome preparation as for the full panel, applied to the 2012
# cross-section: drop rows missing the outcome, filter outliers, drop sparse
# columns, then build the per-outcome structure with metadata.
outcome_dfs_with_metadata_2012 = {}
for outcome in outcome_variables:
    cleaned_2012 = (
        all_in_one_selected_cols_2012
        .dropna(subset=[outcome])
        .pipe(remove_outliers_iqr)
        .pipe(remove_cols_with_few_observations)
    )
    outcome_dfs_with_metadata_2012[outcome] = create_outcome_df_with_metadata(
        cleaned_2012, outcome, indicator_index_offset, outcome_index_offset
    )

## Exploratory analysis

Create a summary data frame for each outcome variable containing the number of observations, countries and years.

In [None]:
from functions.exploratory_analysis import create_summary_df

In [None]:
# Split each per-outcome structure into three lookups keyed by outcome name:
# the indicator frame, the outcome data and a summary frame.
summary_dfs, indicator_dfs, outcome_dfs = {}, {}, {}
for outcome in outcome_variables:
    metadata = outcome_dfs_with_metadata[outcome]

    indicator_dfs[outcome] = metadata[f"{outcome}_indicators"]
    outcome_dfs[outcome] = metadata[f"{outcome}_outcome"]
    summary_dfs[outcome] = create_summary_df(
        metadata[f"{outcome}_df"],
        metadata[f"{outcome}_countries"],
        metadata[f"{outcome}_variables"],
    )

In [None]:
# 2012 cross-section: split each per-outcome structure into three lookups keyed
# by outcome name (indicator frame, outcome data, summary frame).
summary_dfs_2012, indicator_dfs_2012, outcome_dfs_2012 = {}, {}, {}
for outcome in outcome_variables:
    metadata_2012 = outcome_dfs_with_metadata_2012[outcome]

    indicator_dfs_2012[outcome] = metadata_2012[f"{outcome}_indicators"]
    outcome_dfs_2012[outcome] = metadata_2012[f"{outcome}_outcome"]
    summary_dfs_2012[outcome] = create_summary_df(
        metadata_2012[f"{outcome}_df"],
        metadata_2012[f"{outcome}_countries"],
        metadata_2012[f"{outcome}_variables"],
    )

In [None]:
# Short aliases for the life-expectancy outcome (full panel): summary frame,
# indicator frame and outcome data, in that order.
lifeexp_summary, lifeexp_indicators, lifeexp_outcomes = (
    per_outcome['lifeexpectancy_over_energy']
    for per_outcome in (summary_dfs, indicator_dfs, outcome_dfs)
)

In [None]:
# Short aliases for the life-expectancy outcome (2012 cross-section): summary
# frame, indicator frame and outcome data, in that order.
lifeexp_summary_2012, lifeexp_indicators_2012, lifeexp_outcomes_2012 = (
    per_outcome['lifeexpectancy_over_energy']
    for per_outcome in (summary_dfs_2012, indicator_dfs_2012, outcome_dfs_2012)
)

Plot the number of observations, countries and years for each outcome variable

In [None]:
from functions.exploratory_analysis import plot_summary_variable

In [None]:
# Plot the 'observations' summary variable for the life-expectancy outcome.
plot_summary_variable(lifeexp_summary, 'observations')
# The two calls below plot the other summary variables; they are left disabled
# and can be uncommented when those charts are needed.
# plot_summary_variable(lifeexp_summary, 'num_countries')
# plot_summary_variable(lifeexp_summary, 'num_years')

Plot histograms for each indicator variable. The chart is big so it's not included by default.

In [None]:
# from functions.exploratory_analysis import plot_histograms

In [None]:
# plot_histograms(lifeexp_indicators, lifeexp_indicators.columns)

Plot outliers. 

In [None]:
from functions.exploratory_analysis import plot_outliers

In [None]:
# Outlier plot for the life-expectancy indicators of the full panel
# (the 2012 cross-section is not plotted here).
plot_outliers(lifeexp_indicators)

Plot correlation matrix for all indicators.

In [None]:
from functions.exploratory_analysis import plot_correlation_matrix

In [None]:
# Correlation matrix of the life-expectancy indicators of the full panel.
plot_correlation_matrix(lifeexp_indicators)

## Cross-validation: Best subset, Lasso, PCA

In [None]:
from sklearn.linear_model import Lasso
from sklearn.impute import KNNImputer
# from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from abess import LinearRegression
from functions.cross_validation import panel_cross_validation, plot_cross_validation_results

In [None]:
# Initialize the models.
# The best-subset estimator is abess's LinearRegression. The regression section
# further down rebinds the bare name ``LinearRegression`` to scikit-learn's
# class, so the abess class is bound under an explicit alias here. That way
# re-running this cell later in the session still builds the best-subset model
# instead of silently creating an ordinary least-squares estimator.
from abess import LinearRegression as AbessLinearRegression

best_subset_model = AbessLinearRegression()
lasso_model = Lasso(max_iter=10000)
pca_model = PCA()

# Combine these into a list; model_names[i] is the label for models[i].
models = [best_subset_model, lasso_model, pca_model]
model_names = ['Best Subset', 'Lasso', 'PCA']

In [None]:
# Cross-validate the candidate models on the full panel for the life-expectancy
# outcome. The helper returns two values, unpacked here as a model and the
# cross-validation results (presumably the selected model -- TODO confirm in
# functions/cross_validation).
lifeexp_model, lifeexp_cv_results = panel_cross_validation(models, model_names, lifeexp_indicators, lifeexp_outcomes)

In [None]:
# Same cross-validation on the 2012 cross-section. The same ``models`` list
# (same estimator instances) is passed as for the full panel.
# NOTE(review): if panel_cross_validation fits these instances in place, the two
# runs share state -- confirm that the helper clones the estimators.
lifeexp_model_2012, lifeexp_cv_results_2012 = panel_cross_validation(models, model_names, lifeexp_indicators_2012,
                                                                     lifeexp_outcomes_2012)

### Analyzing cross-validation results

In [None]:
from functions.cross_validation import filter_and_find_best_model, plot_cross_validation_results

In [None]:
# Optional chart of the cross-validation results (disabled by default).
# plot_cross_validation_results(lifeexp_cv_results)
# Pick the best model from the full-panel cross-validation results and print it.
# NOTE(review): judging by the names, models with zero coefficients are filtered
# out first -- confirm in functions/cross_validation.
best_model_no_zero_coef = filter_and_find_best_model(lifeexp_cv_results)
print(best_model_no_zero_coef)

In [None]:
# Pick the best model from the 2012 cross-section cross-validation results and
# print it (same helper as for the full panel).
best_model_no_zero_coef_2012 = filter_and_find_best_model(lifeexp_cv_results_2012)
print(best_model_no_zero_coef_2012)

### Best model?
For the full panel dataset: Best Model: Lasso, Mean MSE: 0.3953844254470721, n_splits: 7, Lasso alpha: {'alpha': 0.01}
For the cross-section for 2012: Best Model: Lasso, Mean MSE: 0.18459822705946247, n_splits: 10, Lasso alpha: {'alpha': 0.01}

Try removing the "selfemployed" column because it is collinear with "wageworker", and the "concentration" column because it is an outlier.

In [None]:
# Lasso on the full panel: KNN imputation (5 neighbours), standardisation, then
# an L1-penalised regression with alpha=0.01.
lifeexp_lasso = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
    ('model', Lasso(max_iter=10000, alpha=0.01)),
])
lifeexp_lasso.fit(lifeexp_indicators, lifeexp_outcomes)

fitted_lasso = lifeexp_lasso.named_steps['model']
lifeexp_lasso_coef = fitted_lasso.coef_

# (position, coefficient) pairs for every indicator the Lasso did not shrink to zero.
important_features = [
    (position, weight)
    for position, weight in enumerate(lifeexp_lasso_coef)
    if weight != 0
]

print("Important features:", important_features)
print(fitted_lasso.intercept_)

# Keep only the indicator columns at the selected positions.
selected_positions = [position for position, _ in important_features]
indicators_lifeexp_over_energy_important = lifeexp_indicators.iloc[:, selected_positions]
print(len(indicators_lifeexp_over_energy_important.columns), len(lifeexp_indicators.columns))

In [None]:
# Lasso on the 2012 cross-section: KNN imputation (5 neighbours),
# standardisation, then an L1-penalised regression with alpha=0.01.
lifeexp_lasso_2012 = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
    ('model', Lasso(max_iter=10000, alpha=0.01)),
])
lifeexp_lasso_2012.fit(lifeexp_indicators_2012, lifeexp_outcomes_2012)

fitted_lasso_2012 = lifeexp_lasso_2012.named_steps['model']
lifeexp_lasso_coef_2012 = fitted_lasso_2012.coef_

# (position, coefficient) pairs for every indicator the Lasso did not shrink to zero.
important_features_2012 = [
    (position, weight)
    for position, weight in enumerate(lifeexp_lasso_coef_2012)
    if weight != 0
]

print("Important features:", important_features_2012)
print(fitted_lasso_2012.intercept_)

# Keep only the indicator columns at the selected positions.
selected_positions_2012 = [position for position, _ in important_features_2012]
indicators_lifeexp_over_energy_important_2012 = lifeexp_indicators_2012.iloc[:, selected_positions_2012]
print(len(indicators_lifeexp_over_energy_important_2012.columns), len(lifeexp_indicators_2012.columns))

## Panel regression with cross-validation

In [None]:
# Simple linear regression (ordinary least squares) on the Lasso-selected
# indicators of the full panel.
# NOTE: this import rebinds the name ``LinearRegression`` -- imported from
# ``abess`` in the cross-validation section -- to scikit-learn's estimator.
from sklearn.linear_model import LinearRegression

lifeexp_lasso_linear = Pipeline([('imputer', KNNImputer(n_neighbors=5)), ('model', LinearRegression())])
lifeexp_lasso_linear.fit(indicators_lifeexp_over_energy_important, lifeexp_outcomes)
lifeexp_lasso_linear_coef = lifeexp_lasso_linear.named_steps['model'].coef_
print("Linear regression coefficients:", lifeexp_lasso_linear_coef)

# ``score`` returns the plain in-sample R squared, not the adjusted one, so the
# adjusted value is derived explicitly:
#     adj R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# with n observations and p fitted coefficients. It is NaN when there are no
# residual degrees of freedom left.
lifeexp_lasso_linear_r2 = lifeexp_lasso_linear.score(indicators_lifeexp_over_energy_important, lifeexp_outcomes)
n_obs = len(indicators_lifeexp_over_energy_important)
n_features = np.size(lifeexp_lasso_linear_coef)
residual_dof = n_obs - n_features - 1
if residual_dof > 0:
    lifeexp_lasso_linear_adj_r2 = 1 - (1 - lifeexp_lasso_linear_r2) * (n_obs - 1) / residual_dof
else:
    lifeexp_lasso_linear_adj_r2 = float('nan')
print("R squared", lifeexp_lasso_linear_r2)
print("Adjusted r squared", lifeexp_lasso_linear_adj_r2)

In [None]:
# Ordinary least squares on the Lasso-selected indicators of the 2012
# cross-section.
# NOTE(review): ``LinearRegression`` is expected to be scikit-learn's estimator
# here, which holds once the preceding regression cell (which imports it) has run.
lifeexp_lasso_linear_2012 = Pipeline([('imputer', KNNImputer(n_neighbors=5)), ('model', LinearRegression())])
lifeexp_lasso_linear_2012.fit(indicators_lifeexp_over_energy_important_2012, lifeexp_outcomes_2012)
lifeexp_lasso_linear_coef_2012 = lifeexp_lasso_linear_2012.named_steps['model'].coef_
print("Linear regression coefficients:", lifeexp_lasso_linear_coef_2012)

# ``score`` returns the plain in-sample R squared, not the adjusted one, so the
# adjusted value is derived explicitly:
#     adj R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# with n observations and p fitted coefficients. It is NaN when there are no
# residual degrees of freedom left.
lifeexp_lasso_linear_r2_2012 = lifeexp_lasso_linear_2012.score(indicators_lifeexp_over_energy_important_2012,
                                                               lifeexp_outcomes_2012)
n_obs_2012 = len(indicators_lifeexp_over_energy_important_2012)
n_features_2012 = np.size(lifeexp_lasso_linear_coef_2012)
residual_dof_2012 = n_obs_2012 - n_features_2012 - 1
if residual_dof_2012 > 0:
    lifeexp_lasso_linear_adj_r2_2012 = 1 - (1 - lifeexp_lasso_linear_r2_2012) * (n_obs_2012 - 1) / residual_dof_2012
else:
    lifeexp_lasso_linear_adj_r2_2012 = float('nan')
print("R squared", lifeexp_lasso_linear_r2_2012)
print("Adjusted r squared", lifeexp_lasso_linear_adj_r2_2012)

In [None]:
# Preview the 2012 indicator frame (first rows only) to inspect the available
# columns without dumping the whole table into the notebook output.
lifeexp_indicators_2012.head()

Try with selected variables only

In [None]:
# Ordinary least squares on three hand-picked indicators of the 2012
# cross-section.
# NOTE(review): ``LinearRegression`` is expected to be scikit-learn's estimator
# here, which holds once the first regression cell (which imports it) has run.
indicators_lifeexp_over_energy_2012_selected = lifeexp_indicators_2012.loc[:,
                                               ['goveffectiveness', 'onepcincome', 'resourcerents']]
lifeexp_linear_selected_2012 = Pipeline([('imputer', KNNImputer(n_neighbors=5)), ('model', LinearRegression())])
lifeexp_linear_selected_2012.fit(indicators_lifeexp_over_energy_2012_selected, lifeexp_outcomes_2012)
lifeexp_linear_selected_coef_2012 = lifeexp_linear_selected_2012.named_steps['model'].coef_
print("Linear regression coefficients:", lifeexp_linear_selected_coef_2012)

# ``score`` returns the plain in-sample R squared, not the adjusted one, so the
# adjusted value is derived explicitly:
#     adj R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# with n observations and p fitted coefficients. It is NaN when there are no
# residual degrees of freedom left.
lifeexp_linear_selected_r2_2012 = lifeexp_linear_selected_2012.score(indicators_lifeexp_over_energy_2012_selected,
                                                                     lifeexp_outcomes_2012)
n_obs_selected_2012 = len(indicators_lifeexp_over_energy_2012_selected)
n_features_selected_2012 = np.size(lifeexp_linear_selected_coef_2012)
residual_dof_selected_2012 = n_obs_selected_2012 - n_features_selected_2012 - 1
if residual_dof_selected_2012 > 0:
    lifeexp_linear_selected_adj_r2_2012 = (
        1 - (1 - lifeexp_linear_selected_r2_2012) * (n_obs_selected_2012 - 1) / residual_dof_selected_2012
    )
else:
    lifeexp_linear_selected_adj_r2_2012 = float('nan')
print("R squared", lifeexp_linear_selected_r2_2012)
print("Adjusted r squared", lifeexp_linear_selected_adj_r2_2012)

In [None]:
from matplotlib import pyplot as plt

# Column names of the Lasso-selected indicators; they are in the same order as
# the coefficients of the linear regression fitted on those columns.
indicators_lifeexp_over_energy_names = indicators_lifeexp_over_energy_important.columns

# Create a list of (coefficient, feature_name) tuples sorted by coefficient.
# ``np.ravel`` flattens the coefficient array first, so a (1, n_features) array
# (as produced for a two-dimensional target) pairs up with the names exactly
# like a one-dimensional one.
indicators_lifeexp_over_energy_names_sorted = sorted(
    zip(np.ravel(lifeexp_lasso_linear_coef), indicators_lifeexp_over_energy_names))

# Separate the tuples into two parallel sequences (coefficients, names).
lifeexp_lasso_coef_sorted, lifeexp_over_energy_names_sorted = zip(*indicators_lifeexp_over_energy_names_sorted)

# Horizontal bar plot, one bar per selected indicator, drawn on an explicit
# figure/axes pair instead of the implicit pyplot state.
bar_positions = list(range(len(lifeexp_lasso_coef_sorted)))
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, lifeexp_lasso_coef_sorted, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(lifeexp_over_energy_names_sorted)
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature Names')
ax.set_title('Feature Importances')
plt.show()