In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

### Load data, set initial configuration

In [None]:
# Read the combined dataset from CSV; the path is relative to the notebook's working directory.
all_in_one = pd.read_csv(filepath_or_buffer='../data/all_in_one.csv')

In [None]:
# Drop the 'Unnamed: 0' column (presumably a leftover CSV row index - TODO confirm).
all_in_one_no_id = all_in_one.drop(columns=['Unnamed: 0'])

# Treat positive and negative infinity as missing values.
all_in_one_no_id_no_inf = all_in_one_no_id.replace(to_replace=[np.inf, -np.inf], value=np.nan)

# Distinct country names, in order of first appearance.
all_countries = pd.unique(all_in_one_no_id_no_inf['Country.Name'])

# Every column from position 4 onwards is treated as a variable
# (assumes the first four columns are identifiers/metadata - TODO confirm).
first_variable_position = 4
all_variables = all_in_one_no_id_no_inf.columns[first_variable_position:]

### Apply styles

In [None]:
# Use the dark seaborn theme and assign every country its own colour from the "husl" palette.
sns.set_theme(style="dark")
country_colors = sns.color_palette("husl", len(all_countries))
palette = {country: color for country, color in zip(all_countries, country_colors)}

### Create panel data set for each outcome variable

In [None]:
from functions.create_panel_dataset import remove_cols_with_few_observations, create_outcome_df_with_metadata, \
    remove_outliers_iqr

Start with 1995, from which ecirank data is available, and end with 2015, the last year for which energy data is available.
Only keep observations where energy use data is available.
Remove selected columns (**TODO:** document the selection process).

In [None]:
# Restrict to the 1995-2015 window (both endpoints inclusive).
year_in_window = all_in_one_no_id_no_inf['Year'].between(1995, 2015)
all_in_one_filtered = all_in_one_no_id_no_inf.loc[year_in_window]

# Keep only the rows that have a non-missing 'energy' value.
has_energy = all_in_one_filtered['energy'].notna()
all_in_one_filtered_with_energy_only = all_in_one_filtered[has_energy]

# Columns excluded from the analysis.
dropped_columns = [
    'netmigration', 'wealth', 'patents', 'concentration',
    'selfemployed', 'grosscapital', 'gdppercap', 'gnipercap',
]
all_in_one_filtered_selected_cols = all_in_one_filtered_with_energy_only.drop(columns=dropped_columns)

# Cross-sectional view: the same frame restricted to the single year 2012.
is_2012 = all_in_one_filtered_selected_cols['Year'] == 2012
all_in_one_filtered_selected_cols_2012 = all_in_one_filtered_selected_cols.loc[is_2012]

Create separate data frames containing observations with data available for each outcome variable. 

After subsetting is done, create final data frames and utility variables. 

Remove columns with few observations.

In [None]:
outcome_variables = [
    'lifeexpectancy_over_energy',
    'nutrition_over_energy',
    'education_over_energy',
    'sanitation_over_energy',
]

# Build one metadata bundle per outcome variable, in the order listed above.
outcome_dfs_with_metadata = []
for outcome in outcome_variables:
    # Keep the rows where this outcome is observed, then pass the frame through the
    # project helpers `remove_outliers_iqr` and `remove_cols_with_few_observations`.
    observed_rows = all_in_one_filtered_selected_cols.dropna(subset=[outcome])
    cleaned_rows = remove_cols_with_few_observations(remove_outliers_iqr(observed_rows))

    # NOTE(review): the meaning of the positional arguments 3 and 17 is defined by
    # `create_outcome_df_with_metadata` - confirm against that helper.
    outcome_dfs_with_metadata.append(
        create_outcome_df_with_metadata(cleaned_rows, outcome, 3, 17)
    )

# Unpack the four bundles into individually named variables (same order as `outcome_variables`).
(
    outcome_lifeexp_over_energy,
    outcome_nutrition_over_energy,
    outcome_education_over_energy,
    outcome_sanitation_over_energy,
) = outcome_dfs_with_metadata

## Exploratory analysis

In [None]:
from functions.exploratory_analysis import create_summary_df, plot_summary_variable, plot_outliers, \
    plot_correlation_matrix, \
    plot_histograms

In [None]:
# Pull the three life-expectancy entries out of the metadata bundle and summarise them.
lifeexp_bundle = outcome_lifeexp_over_energy
outcome_lifeexp_over_energy_summary_df = create_summary_df(
    lifeexp_bundle["lifeexpectancy_over_energy_df"],
    lifeexp_bundle["lifeexpectancy_over_energy_countries"],
    lifeexp_bundle["lifeexpectancy_over_energy_variables"],
)

In [None]:
# Plot the 'observations' column of the life-expectancy summary frame.
plot_summary_variable(outcome_lifeexp_over_energy_summary_df, 'observations')
# Disabled variants for the other summary columns.
# NOTE(review): these reference `all_lifeexp_over_energy_summary_df`, which is not defined in the
# visible notebook - presumably they should use `outcome_lifeexp_over_energy_summary_df`; confirm
# before re-enabling.
# plot_summary_variable(all_lifeexp_over_energy_summary_df, 'num_countries')
# plot_summary_variable(all_lifeexp_over_energy_summary_df, 'num_years')

## Cross-validation: Best subset, Lasso, PCA

In [None]:
from sklearn.linear_model import Lasso
from sklearn.impute import KNNImputer
# from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from abess import LinearRegression
from functions.cross_validation import panel_cross_validation, plot_cross_validation_results

In [None]:
# Candidate estimators for cross-validation.
# NOTE: `LinearRegression` here is the class imported from `abess` above. A later cell imports
# scikit-learn's class under the same name, so re-running this cell after that import would
# silently build a different model.
best_subset_model, lasso_model, pca_model = LinearRegression(), Lasso(max_iter=10000), PCA()

# Keep display names and estimators aligned by deriving both lists from a single mapping.
models_by_name = {
    'Best Subset': best_subset_model,
    'Lasso': lasso_model,
    'PCA': pca_model,
}
models = list(models_by_name.values())
model_names = list(models_by_name)

In [None]:
# Cross-validate all candidate models on the life-expectancy outcome.
# NOTE(review): `indicators_lifeexp_over_energy_with_energy_no_outliers` is not defined in the
# visible notebook - confirm this cell runs on a fresh kernel (Restart & Run All).
lifeexp_cv_output = panel_cross_validation(
    models,
    model_names,
    indicators_lifeexp_over_energy_with_energy_no_outliers,
    outcome_lifeexp_over_energy,
)
lifeexp_model, lifeexp_cv_results = lifeexp_cv_output

### Analyzing cross-validation results

In [None]:
from functions.cross_validation import filter_and_find_best_model

In [None]:
# plot_model_results(lifeexp_cv_results)
# best_model_no_zero_coef = filter_and_find_best_model(lifeexp_cv_results)
# print(best_model_no_zero_coef)

### Best model?
Lasso with alpha 0.59 and n_splits 5. 
! Lasso with alpha 1 has better MSE but all coefficients are 0.

Try removing the "selfemployed" column because it is collinear with "wageworker", and the "concentration" column because it is an outlier.

In [None]:
# Fit the selected Lasso (alpha=0.59) on KNN-imputed, standardised indicators and keep only the
# features whose coefficient is non-zero.
# NOTE(review): `indicators_lifeexp_over_energy_with_energy_no_outliers`,
# `indicators_lifeexp_over_energy_with_energy_2012_no_outliers` and `indicators_lifeexp_over_energy`
# are not defined in the visible notebook, and `outcome_lifeexp_over_energy` is indexed with string
# keys in the summary cell, so it may not be a plain target vector - confirm on a fresh kernel.
lifeexp_lasso = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
    ('model', Lasso(max_iter=10000, alpha=0.59)),
])

lifeexp_lasso.fit(indicators_lifeexp_over_energy_with_energy_no_outliers, outcome_lifeexp_over_energy)
lifeexp_lasso_coef = lifeexp_lasso.named_steps['model'].coef_

# (column position, coefficient) for every feature the Lasso did not shrink to exactly zero.
important_features = [(i, coef) for i, coef in enumerate(lifeexp_lasso_coef) if coef != 0]

print("Important features:", important_features)
print(lifeexp_lasso.named_steps['model'].intercept_)

# Select the same column positions from the panel data and from the 2012 cross-section.
important_feature_positions = [i for i, _ in important_features]
indicators_lifeexp_over_energy_important = \
    indicators_lifeexp_over_energy_with_energy_no_outliers.iloc[:, important_feature_positions]
indicators_lifeexp_over_energy_important_2012 = \
    indicators_lifeexp_over_energy_with_energy_2012_no_outliers.iloc[:, important_feature_positions]

# Number of retained features vs. number of candidate features.
print(len(indicators_lifeexp_over_energy_important.columns), len(indicators_lifeexp_over_energy.columns))

## Panel regression with cross-validation

In [None]:
# Simple linear regression on the features retained by the Lasso step.
# NOTE: this import rebinds `LinearRegression` to scikit-learn's estimator, shadowing the `abess`
# class of the same name imported earlier; the 2012 regression cell below relies on this binding.
from sklearn.linear_model import LinearRegression

lifeexp_lasso_linear = Pipeline([('imputer', KNNImputer(n_neighbors=5)), ('model', LinearRegression())])
lifeexp_lasso_linear.fit(indicators_lifeexp_over_energy_important, outcome_lifeexp_over_energy)
lifeexp_lasso_linear_coef = lifeexp_lasso_linear.named_steps['model'].coef_
print("Linear regression coefficients:", lifeexp_lasso_linear_coef)

# `Pipeline.score` returns the plain in-sample R^2 of the final estimator, not the adjusted R^2.
# Apply the usual correction for the number of predictors: 1 - (1 - R^2) * (n - 1) / (n - p - 1).
lifeexp_lasso_linear_r2 = lifeexp_lasso_linear.score(indicators_lifeexp_over_energy_important,
                                                     outcome_lifeexp_over_energy)
lifeexp_n_obs, lifeexp_n_features = indicators_lifeexp_over_energy_important.shape
lifeexp_residual_dof = lifeexp_n_obs - lifeexp_n_features - 1
if lifeexp_residual_dof > 0:
    lifeexp_lasso_linear_adj_r2 = 1 - (1 - lifeexp_lasso_linear_r2) * (lifeexp_n_obs - 1) / lifeexp_residual_dof
else:
    # Too few observations for the correction to be defined.
    lifeexp_lasso_linear_adj_r2 = np.nan
print("R squared", lifeexp_lasso_linear_r2)
print("Adjusted r squared", lifeexp_lasso_linear_adj_r2)

In [None]:
# Repeat the same regression on the 2012 cross-section only.
# NOTE(review): `outcome_lifeexp_over_energy_2012` is not defined in the visible notebook - confirm
# it exists on a fresh kernel.
lifeexp_lasso_linear_2012 = Pipeline([('imputer', KNNImputer(n_neighbors=5)), ('model', LinearRegression())])
lifeexp_lasso_linear_2012.fit(indicators_lifeexp_over_energy_important_2012, outcome_lifeexp_over_energy_2012)
lifeexp_lasso_linear_coef_2012 = lifeexp_lasso_linear_2012.named_steps['model'].coef_
print("Linear regression coefficients:", lifeexp_lasso_linear_coef_2012)

# `Pipeline.score` returns the plain in-sample R^2 of the final estimator, not the adjusted R^2.
# Apply the usual correction for the number of predictors: 1 - (1 - R^2) * (n - 1) / (n - p - 1).
lifeexp_lasso_linear_2012_r2 = lifeexp_lasso_linear_2012.score(indicators_lifeexp_over_energy_important_2012,
                                                               outcome_lifeexp_over_energy_2012)
lifeexp_2012_n_obs, lifeexp_2012_n_features = indicators_lifeexp_over_energy_important_2012.shape
lifeexp_2012_residual_dof = lifeexp_2012_n_obs - lifeexp_2012_n_features - 1
if lifeexp_2012_residual_dof > 0:
    lifeexp_lasso_linear_2012_adj_r2 = (
        1 - (1 - lifeexp_lasso_linear_2012_r2) * (lifeexp_2012_n_obs - 1) / lifeexp_2012_residual_dof
    )
else:
    # Too few observations for the correction to be defined.
    lifeexp_lasso_linear_2012_adj_r2 = np.nan
print("R squared", lifeexp_lasso_linear_2012_r2)
print("Adjusted r squared", lifeexp_lasso_linear_2012_adj_r2)

In [None]:
from matplotlib import pyplot as plt

# Feature names are taken from the columns of the reduced indicator frame.
indicators_lifeexp_over_energy_names = indicators_lifeexp_over_energy_important.columns

# Pair each linear-regression coefficient with its feature name and order the pairs by coefficient.
indicators_lifeexp_over_energy_names_sorted = sorted(
    zip(lifeexp_lasso_linear_coef, indicators_lifeexp_over_energy_names))

# Split the ordered pairs back into parallel tuples of coefficients and names.
lifeexp_lasso_coef_sorted, lifeexp_over_energy_names_sorted = zip(*indicators_lifeexp_over_energy_names_sorted)

# Horizontal bar chart: one bar per feature, labelled with the feature name.
bar_positions = range(len(lifeexp_lasso_coef_sorted))
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, lifeexp_lasso_coef_sorted, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(lifeexp_over_energy_names_sorted)
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature Names')
ax.set_title('Feature Importances')
plt.show()