# Feature Selection
Src: https://towardsdatascience.com/the-5-feature-selection-algorithms-every-data-scientist-need-to-know-3a6b566efd2

https://heartbeat.fritz.ai/hands-on-with-feature-selection-techniques-filter-methods-f248e0436ce5

https://pbiecek.github.io/ema/doItYourselfWithPython.html

## Why do we need Feature Selection?
1. Curse of Dimensionality - Overfitting

As the number of features (or dimensions) grows, the amount of data we need to generalize accurately grows exponentially.

If the number of features is bigger than the number of samples, we will be able to train the data perfectly, but not generalize it to new samples (overfit).

2. Explainability

We want the models to be simple and explainable.

3. Garbage information

We want to remove unnecessary information. 

## Methods
- **Filter based**: filter features based on some metrics (ex: correlation, chi-square)

- **Wrapped-based**: selection of features is treated as a search problem (ex: recursive feature elimination)

- **Embedded**: use of algorithms that have built-in feature selection methods (ex: lasso and RF)

Import libraries

In [1]:
import pandas as pd
import glob
import numpy as np
root = '../'

In [2]:
suicide = pd.read_csv(root +'CSV/Suicide/suicide_rates_08_18.csv', index_col=0)
suicide

Unnamed: 0,MUNCOD,RATE_08,RATE_09,RATE_10,RATE_11,RATE_12,RATE_13,RATE_14,RATE_15,RATE_16,RATE_17,RATE_18
0,110001,20.344224,8.212203,8.189337,4.127456,12.464166,7.773632,3.898332,11.728829,7.841292,11.793844,4.316485
1,110002,9.458389,2.338060,4.427031,4.368243,9.703818,1.974938,4.860976,4.789226,5.665936,9.315758,1.883807
2,110003,0.000000,14.936520,0.000000,0.000000,0.000000,15.396459,0.000000,31.471282,15.900779,0.000000,18.389114
3,110004,5.110972,7.626311,2.544497,1.266480,5.042229,1.164646,5.776607,6.878683,10.241588,5.649271,9.432516
4,110005,0.000000,0.000000,11.743981,0.000000,11.868028,0.000000,0.000000,11.119760,11.136478,5.576001,6.081245
...,...,...,...,...,...,...,...,...,...,...,...,...
5376,522200,7.874636,0.000000,15.937525,15.817779,15.702285,0.000000,7.494566,0.000000,0.000000,7.312614,0.000000
5377,522205,0.000000,0.000000,0.000000,0.000000,13.199578,12.605572,0.000000,12.238404,0.000000,0.000000,0.000000
5378,522220,67.249496,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5379,522230,0.000000,18.660198,0.000000,0.000000,0.000000,18.315018,0.000000,0.000000,0.000000,0.000000,0.000000


Read dataset

In [3]:
disease = ""
path = root + "CSV/TabNet/Internacoes_Rate/"
all_files = glob.glob(path + "*.csv")
# all_files = all_files + glob.glob(path + "Groups/" + "*.csv")
suicide = pd.read_csv(root +'CSV/Suicide/suicide_rates_08_18.csv', index_col=0)

years = ["08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18"]

final_df = pd.DataFrame()


for i in range(len(years)-1):
    col_year_suicide = "RATE_" + years[i+1]
    year_df = suicide[[col_year_suicide, "MUNCOD"]]
    year_df = year_df.rename(columns={col_year_suicide: "RATE"})
    for file in all_files:
        file_name = file.split("/")[-1]
        disease = file_name.split(".csv")[0]

        col_year_disease = "RATE_" + years[i]
        disease_df = pd.read_csv(file, sep=',', index_col=0)
        disease_df = disease_df[[col_year_disease, "MUNCOD"]]
        disease_df = disease_df.rename(columns={col_year_disease: disease})
        
        year_df = pd.merge(disease_df, year_df, left_on="MUNCOD", right_on="MUNCOD", how='right')
        
    year_df = year_df.drop("MUNCOD", axis=1)
    final_df = pd.concat([final_df, year_df])

final_df = final_df[final_df["RATE"] > 0]
final_df


Unnamed: 0,DIFTERIA,NEOPLASIA_MALIGNA_DO_ESTÔMAGO,FRATURA_DO_CRÂNIO_E_DOS_OSSOS_DA_FACE,DOENÇA_DE_HODGKIN,OUTRAS_DOENÇAS_CEREBROVASCULARES,FEBRES_RECORRENTES,ESTADO_INFEC_ASSINT_VÍRUS_DA_IMUNODEF_HUMANA_[HIV],PNEUMOCONIOSE,NEOPLASIA_MALIGNA_DA_PRÓSTATA,OUTRAS_NEOPL_MALIG_ÓRG_RESPIRAT_E_INTRATORÁCICOS,...,OUTRAS_INFESTAÇÕES_POR_TREMATÓDEOS,ASMA,CERATITE_E_OUTROS_TRANSTORNOS_ESCLERÓTICA_E_CÓRNEA,INFECÇÕES_PELO_VÍRUS_DO_HERPES,NEOPLASIA_MALIGNA_DO_LÁBIO_CAVIDADE_ORAL_E_FARINGE,TUBERCULOSE_PULMONAR,AUSÊNCIA_ATRESIA_E_ESTENOSE_DO_INTESTINO_DELGADO,OUTRAS_DOENÇAS_DO_APARELHO_DIGESTIVO,OUTRAS_TUBERCULOSES_RESPIRATÓRIAS,RATE
0,0.000000,0.000000,16.275379,0.000000,0.000000,0.0,,,0.000000,0.000000,...,,752.736298,0.000000,0.000000,0.000000,0.000000,,28.481914,0.000000,8.212203
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,...,,66.394101,0.000000,0.000000,0.000000,0.000000,,52.416396,,6.922331
2,0.000000,4.483288,4.219565,1.318614,2.637228,0.0,,0.000000,3.692119,0.263723,...,,30.591847,5.274456,3.428397,2.373505,0.527446,0.0,11.603804,10.021467,5.485478
5,0.000000,9.163263,53.670541,0.000000,0.000000,0.0,,0.000000,3.927113,1.309038,...,0.0,261.807519,1.309038,1.309038,3.927113,26.180752,0.0,71.997068,1.309038,9.090437
7,0.331787,5.308595,10.948978,0.000000,3.981446,0.0,,0.331787,4.645021,5.308595,...,,14.930424,2.654298,0.663574,6.635744,11.612552,0.0,65.362079,5.640382,6.536930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5372,,,0.000000,0.000000,0.000000,,,0.000000,0.000000,0.000000,...,,0.000000,0.000000,0.000000,0.000000,0.000000,,24.727992,,25.000000
5373,,,0.000000,,0.000000,,,,29.761905,,...,,0.000000,0.000000,,29.761905,0.000000,,119.047619,0.000000,61.671292
5374,,,25.926886,,0.000000,,,,0.000000,,...,,51.853772,0.000000,,0.000000,,,51.853772,103.707545,26.295030
5377,,,0.000000,33.772374,0.000000,,,,0.000000,,...,,0.000000,,,0.000000,,,67.544748,0.000000,34.867503


Defining X and y

In [10]:
N = 250
print(len(final_df))
Z = final_df.dropna(thresh=N) # At least N non null items

print(len(Z))

X = Z.drop(columns="RATE")
X = X.fillna(0)
y = Z["RATE"]
num_feat = 10

29028
8887


## 1. Filter Based

### 1.1. Correlation Feature Selection

In [11]:
def cor_feature_selector(X,y,n):
    cor_list = []
    for i in list(X.columns):
        cor = np.corrcoef(X[i], y)[0,1]
        cor_list.append([i, cor])
    cor_ranking = sorted(cor_list, key=lambda a : abs(a[1]),reverse=True)
    cor_feature = [x[0] for x in cor_ranking[:n]]
    cor_support = [True if i in cor_feature else False for i in X.columns]
    return cor_support, cor_feature
cor_support, cor_feature = cor_feature_selector(X,y,num_feat)
print(str(len(cor_feature)), 'selected features')
print(cor_feature)

10 selected features
['BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC', 'NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES', 'TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT', 'TRANSTORNOS_DE_HUMOR_[AFETIVOS]', 'COLELITÍASE_E_COLECISTITE', 'NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_ESPEC', 'DOENÇAS_DO_APÊNDICE', 'FLEBITE_TROMBOFLEBITE_EMBOLIA_E_TROMBOSE_VENOSA', 'TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS', 'TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL']


## 2. Wrapped-based

### 2.1. Recursive Feature Elimination
"The goal of recursive feature elimination (RFE) is to select features by **recursively considering smaller and smaller sets of features**. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached." (`sklearn` documentation)

In [12]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer 
rfe_selector = RFE(estimator=RandomForestRegressor(), n_features_to_select=num_feat, step=10, verbose=5)
rfe_selector.fit(X, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')
print(rfe_feature)

Fitting estimator with 313 features.
Fitting estimator with 303 features.
Fitting estimator with 293 features.
Fitting estimator with 283 features.
Fitting estimator with 273 features.
Fitting estimator with 263 features.
Fitting estimator with 253 features.
Fitting estimator with 243 features.
Fitting estimator with 233 features.
Fitting estimator with 223 features.
Fitting estimator with 213 features.
Fitting estimator with 203 features.
Fitting estimator with 193 features.
Fitting estimator with 183 features.
Fitting estimator with 173 features.
Fitting estimator with 163 features.
Fitting estimator with 153 features.
Fitting estimator with 143 features.
Fitting estimator with 133 features.
Fitting estimator with 123 features.
Fitting estimator with 113 features.
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
Fittin

### 3.2 Tree-Based: SelectFromModel

In [13]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

embeded_rf_selector = SelectFromModel(RandomForestRegressor())
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
print(embeded_rf_feature)

120 selected features
['NEOPLASIA_MALIGNA_DO_ESTÔMAGO', 'FRATURA_DO_CRÂNIO_E_DOS_OSSOS_DA_FACE', 'DOENÇA_DE_HODGKIN', 'OUTRAS_NEOPL_MALIG_ÓRG_RESPIRAT_E_INTRATORÁCICOS', 'OUTROS_TRANST_ENDÓCRINOS_NUTRICIONAIS_METABÓLICOS', 'TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT', 'EMBOLIA_PULMONAR', 'NEOPLASIA_MALIGNA_DO_PÂNCREAS', 'OUTRAS_DOENÇAS_GLOMERULARES', 'INFARTO_AGUDO_DO_MIOCÁRDIO', 'PNEUMONIA', 'INSUFICIÊNCIA_RENAL', 'COLELITÍASE_E_COLECISTITE', 'HEMORRAGIA_INTRACRANIANA', 'OUTRAS_DOENÇAS_INFECCIOSAS_INTESTINAIS', 'OUTRAS_DOENÇAS_SIST_OSTEOMUSCULAR_E_TEC_CONJUNTIVO', 'FRATURA_DO_PESCOÇO_TÓRAX_OU_PELVE', 'OUTR_SIST_SINAIS_ACHAD_ANORM_EX_CLÍN_LABORAT_NCOP', 'UROLITÍASE', 'OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO', 'OUTRAS_DOENÇAS_DO_ESÔFAGO_ESTÔMAGO_E_DUODENO', 'OUTRAS_DOENÇAS_DO_APARELHO_URINÁRIO', 'SEQÜEL_TRAUM_ENVEN_E_OUTR_CONSEQ_CAUSAS_EXTERNAS', 'TRANSTORNOS_DO_TECIDO_MOLE', 'MALFORMAÇÕES_CONGÊNITAS_DO_APARELHO_CIRCULATÓRIO', 'OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO', 'ÚLCERA_GÁS

# All together

In [14]:
feature_selection_df = pd.DataFrame({'Feature':X.columns, 'Pearson':cor_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feat+5)

Unnamed: 0,Feature,Pearson,RFE,Random Forest,Total
1,TRANSTORNOS_DE_HUMOR_[AFETIVOS],True,True,True,3
2,TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS,True,True,True,3
3,NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES,True,True,True,3
4,COLELITÍASE_E_COLECISTITE,True,True,True,3
5,BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,True,True,True,3
6,TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...,True,False,True,2
7,TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT,True,False,True,2
8,RESTANTE_DE_OUTRAS_DOENÇAS_BACTERIANAS,False,True,True,2
9,OUTR_SIST_SINAIS_ACHAD_ANORM_EX_CLÍN_LABORAT_NCOP,False,True,True,2
10,OUTRAS_DOENÇAS_DO_FÍGADO,False,True,True,2


In [19]:
feature_selection_df.to_csv('')

8887