# Get feature importance of random forest models

In [None]:
# Libraries
import os
import pandas as pd
import dask.dataframe as dd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [None]:
# Directories
# dir02: location of the database parquet file read by this notebook
# dir03: root of the random forest outputs
dir02 = '../paper_deficit/output/02_dbase/'
dir03 = '../paper_deficit/output/03_rf/'
# Pass the components separately so os.path.join actually joins them; a single
# pre-concatenated argument only works while dir03 ends with a slash.
dir03p = os.path.join(dir03, 'files_params/')      # parameter/rank csv files (input)
dir03i = os.path.join(dir03, 'files_importance/')  # feature importance csv files (output)

---

In [None]:
# Open the database parquet file as a dask dataframe
dbase = dd.read_parquet(os.path.join(dir02, 'df_dbase.parquet'))

# Explanatory variables: names of the database columns used as model features,
# listed one per line and grouped by their name prefix
vars_exp = [
    # geom90m_*
    'geom90m_convergence',
    'geom90m_cti',
    'geom90m_eastness',
    'geom90m_northness',
    'geom90m_slope',
    'geom90m_spi',
    # soilgrids2017_*
    'soilgrids2017_bdricm',
    'soilgrids2017_bdrlog',
    'soilgrids2017_bdticm',
    # soilgrids2020_*
    'soilgrids2020_cec',
    'soilgrids2020_cfvo',
    'soilgrids2020_clay',
    'soilgrids2020_phh2o',
    'soilgrids2020_sand',
    'soilgrids2020_silt',
    # worldclim_*
    'worldclim_bio1',
    'worldclim_bio3',
    'worldclim_bio4',
    'worldclim_bio5',
    'worldclim_bio6',
    'worldclim_bio12',
    'worldclim_bio13',
    'worldclim_bio14',
    'worldclim_bio15',
    'worldclim_elev',
]

In [None]:
def random_forest_feature_imp(var_tar, scen, n_best=10):
    """
    Calculate feature importances for the best performing models and export
    them as a csv file.

    The hyperparameters are read from ``df_params_rank_{var_tar}_{scen}.csv``
    in ``dir03p``. For each of the ``n_best`` best-ranked parameter sets a
    ``RandomForestRegressor`` is fitted on the training rows of ``dbase`` and
    its ``feature_importances_`` are stored in a column ``rank_{k}``
    (``k = 1 .. n_best``). The columns ``imp_mean``, ``imp_min`` and
    ``imp_max`` summarise each feature across the fitted models. The table is
    written to ``df_feature_imp_{var_tar}_{scen}.csv`` in ``dir03i``.

    Parameters
    ----------
    var_tar : str
        Name of the target column in ``dbase``.
    scen : str
        Scenario suffix; rows where ``dbase['train_' + scen]`` equals True
        are used for training.
    n_best : int, default 10
        Number of best-ranked parameter sets to fit.

    Raises
    ------
    ValueError
        If ``n_best`` is smaller than 1 or the parameter file holds fewer
        than ``n_best`` parameter sets.
    """
    if n_best < 1:
        raise ValueError(f"n_best must be at least 1, got {n_best}")

    # Get dataframe with parameters and ranks
    params_file = os.path.join(dir03p, f'df_params_rank_{var_tar}_{scen}.csv')
    df_params = pd.read_csv(params_file)

    # Select the best models by position after a stable sort instead of
    # matching rank_test_score == k: tied scores share a rank and leave the
    # following rank unused, so an equality filter can return zero or several
    # rows and make .item() fail. Without ties, position k is exactly rank k.
    # Ties are broken by the row order of the csv file.
    df_best = df_params.sort_values('rank_test_score', kind='mergesort') \
        .head(n_best)
    if len(df_best) < n_best:
        raise ValueError(
            f"{params_file} holds {len(df_best)} parameter sets, "
            f"but {n_best} are required")

    # Construct output file name and make sure its directory exists before
    # the (expensive) model fits, so the export cannot fail on a missing folder
    output_file = os.path.join(dir03i, f'df_feature_imp_{var_tar}_{scen}.csv')
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Get training data. It is materialised once as a pandas dataframe here;
    # passing the dask collections to fit() would convert them to in-memory
    # arrays again for every single model.
    df_train = dbase[dbase['train_' + scen] == True][[var_tar, *vars_exp]] \
        .compute()

    # Split training data in features and target/label
    X_train = df_train[vars_exp]
    y_train = df_train[var_tar]

    # Dataframe to store feature importances
    df_imp = pd.DataFrame(dict(var_exp=X_train.columns))

    # Calculate feature importances for each of the best performing models
    rank_cols = []
    for pos in range(n_best):
        # Single-row dataframe: keeps each column's dtype, so .item() returns
        # a plain Python scalar (e.g. an int for n_estimators)
        df_params_rank = df_best.iloc[[pos]]

        rfr = RandomForestRegressor(
            min_samples_leaf=df_params_rank.min_samples_leaf.item(),
            max_features=df_params_rank.max_features.item(),
            n_estimators=df_params_rank.n_estimators.item(),
            random_state=df_params_rank.random_state.item(),
            n_jobs=-1)
        rfr.fit(X_train, y_train)

        col = f"rank_{pos + 1}"
        df_imp[col] = rfr.feature_importances_
        rank_cols.append(col)

    # Calculate mean, min, and max importance value for each feature
    df_imp_sel = df_imp[rank_cols]
    df_imp = df_imp.assign(imp_mean=df_imp_sel.mean(axis=1),
                           imp_min=df_imp_sel.min(axis=1),
                           imp_max=df_imp_sel.max(axis=1))

    # Export dataframe as csv file
    df_imp.to_csv(output_file, index=False, mode='w')

In [None]:
# Agbc
# Calculate feature importance of 10 best performing models
for var_tar in ['agbc_min', 'agbc_mean', 'agbc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_feature_imp(var_tar, scen)

In [None]:
# Bgbc
# Calculate feature importance of 10 best performing models
for var_tar in ['bgbc_min', 'bgbc_mean', 'bgbc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_feature_imp(var_tar, scen)

In [None]:
# Soc
# Calculate feature importance of 10 best performing models
for var_tar in ['soc_min', 'soc_mean', 'soc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_feature_imp(var_tar, scen)