# Regression
Features = rates from selected diseases (FeatureSelection) + suicide rate of the current year

Target = suicide rate of the following year

Import libraries

In [19]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from skopt import gp_minimize
import glob
import time
import os
# Relative path prefix from which the CSV locations used below are built.
root = "../../"

In [20]:
# Two-digit state (UF) code -> state abbreviation. The code is what remains of
# MUNCOD after dividing by 10000 (see the "Get UF from MUNCOD" cell).
dict_uf_cod = {
    11: 'RO', 12: 'AC', 13: 'AM', 14: 'RR', 15: 'PA', 16: 'AP', 17: 'TO',
    21: 'MA', 22: 'PI', 23: 'CE', 24: 'RN', 25: 'PB', 26: 'PE', 27: 'AL', 28: 'SE', 29: 'BA',
    31: 'MG', 32: 'ES', 33: 'RJ', 35: 'SP',
    41: 'PR', 42: 'SC', 43: 'RS',
    50: 'MS', 51: 'MT', 52: 'GO', 53: 'DF',
}

Prepare dataset

In [21]:
disease = ""
path = root + "CSV/TabNet/Internacoes_Rate/"
# glob returns paths in a file-system dependent order; sorting keeps
# disease_list (and the column order of the table built from it) reproducible.
all_files = sorted(glob.glob(path + "*.csv"))
suicide = pd.read_csv(root +'CSV/Suicide/suicide_rates_08_18.csv', index_col=0)

# Two-digit year suffixes "08" .. "18" and the columns every usable disease file must have.
years = [str(x).zfill(2) for x in range(8,19)]
columns = ["RATE_" + year for year in years]
columns.append("MUNCOD")

# Keep only the diseases whose CSV has exactly the expected set of columns.
expected_columns = set(columns)
disease_list = []

for file in all_files:
    file_name = os.path.basename(file)
    disease = file_name.split(".csv")[0]
    disease_df = pd.read_csv(file, sep=',', index_col=0)
    if set(disease_df.columns) == expected_columns:
        disease_list.append(disease)
print(len(disease_list), " diseases")

313  diseases


In [22]:
# Build the modelling table: one row per (municipality, year) with the disease
# rates and suicide rate of that year ("PREVIOUS") as features and the suicide
# rate of the following year ("RATE") as target.

# Read every disease file once up front instead of once per year.
disease_frames = {
    disease_name: pd.read_csv(path + disease_name + ".csv", sep=',', index_col=0)
    for disease_name in disease_list
}

year_frames = []
for i in range(len(years)-1):
    col_year_suicide = "RATE_" + years[i+1]
    col_year_prev = "RATE_" + years[i]
    year_df = suicide[[col_year_prev, col_year_suicide, "MUNCOD"]]
    year_df = year_df.rename(columns={col_year_suicide: "RATE", col_year_prev: "PREVIOUS"})
    for disease in disease_list:
        col_year_disease = "RATE_" + years[i]
        disease_df = disease_frames[disease][[col_year_disease, "MUNCOD"]]
        disease_df = disease_df.rename(columns={col_year_disease: disease})

        # Right join: every municipality of the suicide table is kept, with NaN
        # where the disease file has no matching MUNCOD.
        year_df = pd.merge(disease_df, year_df, on="MUNCOD", how='right')

    year_frames.append(year_df)

# Concatenate once (not inside the loop) and renumber the rows so the index
# labels are unique across years.
final_df = pd.concat(year_frames, ignore_index=True)

final_df.head()

Unnamed: 0,DIFTERIA,MUNCOD,NEOPLASIA_MALIGNA_DO_ESTÔMAGO,FRATURA_DO_CRÂNIO_E_DOS_OSSOS_DA_FACE,DOENÇA_DE_HODGKIN,OUTRAS_DOENÇAS_CEREBROVASCULARES,FEBRES_RECORRENTES,ESTADO_INFEC_ASSINT_VÍRUS_DA_IMUNODEF_HUMANA_[HIV],PNEUMOCONIOSE,NEOPLASIA_MALIGNA_DA_PRÓSTATA,...,ASMA,CERATITE_E_OUTROS_TRANSTORNOS_ESCLERÓTICA_E_CÓRNEA,INFECÇÕES_PELO_VÍRUS_DO_HERPES,NEOPLASIA_MALIGNA_DO_LÁBIO_CAVIDADE_ORAL_E_FARINGE,TUBERCULOSE_PULMONAR,AUSÊNCIA_ATRESIA_E_ESTENOSE_DO_INTESTINO_DELGADO,OUTRAS_DOENÇAS_DO_APARELHO_DIGESTIVO,OUTRAS_TUBERCULOSES_RESPIRATÓRIAS,PREVIOUS,RATE
0,0.0,110001,0.0,16.275379,0.0,0.0,0.0,,,0.0,...,752.736298,0.0,0.0,0.0,0.0,,28.481914,0.0,20.344224,8.212203
1,0.0,110009,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,66.394101,0.0,0.0,0.0,0.0,,52.416396,,0.0,6.922331
2,0.0,110020,4.483288,4.219565,1.318614,2.637228,0.0,,0.0,3.692119,...,30.591847,5.274456,3.428397,2.373505,0.527446,0.0,11.603804,10.021467,4.483288,5.485478
3,0.0,110120,0.0,0.0,,0.0,,,,0.0,...,9.387027,0.0,,0.0,0.0,,9.387027,0.0,0.0,0.0
4,0.0,110143,0.0,12.53447,,0.0,,,,0.0,...,12.53447,0.0,,0.0,,,25.06894,,0.0,0.0


Get UF from MUNCOD

In [23]:
# The state (UF) code is MUNCOD with its last four digits removed.
uf_codes = (final_df['MUNCOD'] // 10000).astype(int)
# Assign the result explicitly: an in-place replace on a selected column does
# not reliably write back into the parent frame.
uf_labels = uf_codes.replace(dict_uf_cod)
# One indicator column per state abbreviation.
dummy = pd.get_dummies(uf_labels)
final_df = pd.concat([final_df, dummy], axis=1)
final_df = final_df.drop(['MUNCOD'], axis=1)
final_df.head()

Unnamed: 0,DIFTERIA,NEOPLASIA_MALIGNA_DO_ESTÔMAGO,FRATURA_DO_CRÂNIO_E_DOS_OSSOS_DA_FACE,DOENÇA_DE_HODGKIN,OUTRAS_DOENÇAS_CEREBROVASCULARES,FEBRES_RECORRENTES,ESTADO_INFEC_ASSINT_VÍRUS_DA_IMUNODEF_HUMANA_[HIV],PNEUMOCONIOSE,NEOPLASIA_MALIGNA_DA_PRÓSTATA,OUTRAS_NEOPL_MALIG_ÓRG_RESPIRAT_E_INTRATORÁCICOS,...,PR,RJ,RN,RO,RR,RS,SC,SE,SP,TO
0,0.0,0.0,16.275379,0.0,0.0,0.0,,,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
2,0.0,4.483288,4.219565,1.318614,2.637228,0.0,,0.0,3.692119,0.263723,...,0,0,0,1,0,0,0,0,0,0
3,0.0,0.0,0.0,,0.0,,,,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,0.0,0.0,12.53447,,0.0,,,,0.0,,...,0,0,0,1,0,0,0,0,0,0


Define X and y

In [30]:
# Settings: share of non-null columns a row needs to be kept, and the values
# passed to the feature selectors in the cells below.
percentage_valid_values = 0.8
num_feat = 20
corr_min_value = 0.15

print("Shape before null values drop:", final_df.shape)
# N = minimum number of non-null values a row must have to survive the filter.
N = int(percentage_valid_values * final_df.shape[1])
print("Minimum non-null values: " + str(N) + "/" + str(final_df.shape[1]))
final_zeros_df = final_df.dropna(thresh=N)

# Target is the RATE column; every other column is a feature, with the
# remaining gaps filled with 0.
y = final_zeros_df["RATE"]
X = final_zeros_df.drop(columns="RATE").fillna(0)

X.shape

Shape before null values drop: (53810, 342)
Minimum non-null values: 273/342


(11920, 341)

Define some functions

In [31]:
def calcUpMetrics(y_pred,y_test,X_test):
    """Accuracy of the predicted direction of change relative to the PREVIOUS column.

    A row counts as "up" when its value is strictly greater than
    X_test["PREVIOUS"]; the real and predicted up/down labels are then
    compared with metrics.accuracy_score.

    y_pred: predicted values, one per row of X_test.
    y_test: real target values.
    X_test: frame that contains a "PREVIOUS" column.
    """
    frame = pd.DataFrame({"Pred": y_pred, "Real": y_test, "Previous": X_test["PREVIOUS"]})
    went_up = (frame["Real"] > frame["Previous"]).astype(int)
    predicted_up = (frame["Pred"] > frame["Previous"]).astype(int)
    return metrics.accuracy_score(went_up, predicted_up)

In [32]:
def get_average_feat_importance(feature_importance_list):
    """Average per-fold feature importances into a single ranking.

    Parameters
    ----------
    feature_importance_list : list of pandas.DataFrame
        One frame per fold, each with a "Feature" and an "Importance" column.

    Returns
    -------
    pandas.DataFrame
        Columns "Feature" and "Avg_importance", sorted by descending
        "Avg_importance". Folds are combined with an inner merge on "Feature",
        so only features present in every fold appear in the result. An empty
        input list gives an empty frame with the same two columns.
    """
    if len(feature_importance_list) == 0:
        return pd.DataFrame(columns=["Feature", "Avg_importance"])

    merged = None
    fold_columns = []
    for fold_idx, feat_df in enumerate(feature_importance_list):
        # Name each fold's importance column after the fold number so the
        # columns do not collide when merged.
        fold_df = feat_df.rename(columns={"Importance": fold_idx})
        fold_columns.append(fold_idx)
        if merged is None:
            merged = fold_df
        else:
            merged = pd.merge(merged, fold_df, on="Feature")

    # Sum only the numeric per-fold columns: including the string "Feature"
    # column in a row-wise sum raises TypeError on pandas >= 2.0.
    merged["Avg_importance"] = merged[fold_columns].sum(axis=1) / len(fold_columns)
    final_feat_df = merged[["Feature", "Avg_importance"]]
    return final_feat_df.sort_values(by=["Avg_importance"], ascending=False)

Correlation-based feature selection (applied to the training part of each K-fold)

In [33]:
def cor_feature_selector(X,y,corr_min_value):
    """Select the columns of X whose correlation with y is strong enough.

    For each column the correlation coefficient with y is taken from
    np.corrcoef; the column is kept when the absolute value is strictly
    greater than corr_min_value. The number of kept columns is printed.

    Returns the list of kept column names, in the column order of X.
    """
    cor_feature = []
    for column_name in X.columns:
        coefficient = np.corrcoef(X[column_name], y)[0, 1]
        if abs(coefficient) > corr_min_value:
            cor_feature.append(column_name)
    print(len(cor_feature), "selected features (correlation)")
    return cor_feature

First model: no bayesian optimization

In [None]:
from sklearn.model_selection import KFold

# First model: 5-fold cross-validation of a RandomForestRegressor with default
# hyper-parameters. Correlation-based feature selection is fitted on each
# training fold, and every fold is compared with a baseline that predicts the
# PREVIOUS column unchanged.
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
score_columns = ["rmse_train", "rmse_test", "rmse_baseline_train", "rmse_baseline_test", "up_down_train", "up_down_test"]
fold_scores = []  # one dict per fold; DataFrame.append no longer exists in pandas >= 2.0
feature_importance_list = []
for count, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("=============== RUN {} ===============".format(count))
    # Keep the unfiltered fold frames: the baseline and the up/down metric need
    # the PREVIOUS column even if feature selection does not retain it.
    X_train_full, X_test_full = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train_full,y_train,corr_min_value)
    X_train = X_train_full[selected_cor_features]
    X_test = X_test_full[selected_cor_features]

    print("# Fitting the model")
    regressor = RandomForestRegressor(random_state=42)
    regressor.fit(X_train,y_train)

    print("# Making predictions")
    y_pred = regressor.predict(X_test)
    y_pred_train = regressor.predict(X_train)

    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train_full["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test_full["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train_full)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test_full)

    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Accuracy Train:", up_down_train)
    print("Up/Down Accuracy Test:", up_down_test)
    fold_scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test})

    print("# Get Feature Importance")
    importance = regressor.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))

    print("\n")

# Build the per-fold score table once, after all folds have run.
scores = pd.DataFrame(fold_scores, columns=score_columns)
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

num_feat = 50
corr_min_value = 0.05

## Average Metrics
rmse_train             1.660836
rmse_test              4.453015
rmse_baseline_train    5.611418
rmse_baseline_test     5.609371
up_down_train          0.861829
up_down_test           0.661829
dtype: float64
## Average Feature Importance
                                              Feature  Avg_importance
0                                            PREVIOUS        0.147144
1                                                  RS        0.027659
2   BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC        0.016462
3       TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.018203
4                  OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE        0.011945
5                                 DOENÇAS_DO_APÊNDICE        0.012571
6                              PARTO_ÚNICO_ESPONTÂNEO        0.010764
7                                      OUTRAS_ANEMIAS        0.009904
8                                      OUTRAS_HÉRNIAS        0.011222
9                                  LEIOMIOMA_DO_ÚTERO        0.010401
10                  OUTRAS_DOENÇAS_DO_SISTEMA_NERVOSO        0.009605
11                      LEPTOSPIROSE_NÃO_ESPECIFICADA        0.008508
12                                         UROLITÍASE        0.011273
13     PANCREATITE_AGUDA_E_OUTRAS_DOENÇAS_DO_PÂNCREAS        0.010887
14  LUXAÇÕES_ENTORSE_DISTENSÃO_REG_ESP_E_MÚLTIP_CORPO        0.010263
15               OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO        0.008905
16                                INSUFICIÊNCIA_RENAL        0.009223
17  NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...        0.009562
18         OUTRAS_COMPLICAÇÕES_DA_GRAVIDEZ_E_DO_PARTO        0.010295
19  TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT        0.010612

Total run time: 334.182599067688

Second model: with bayesian optimization

In [34]:
from sklearn.model_selection import KFold

# Second model: same 5-fold protocol as the first model, but the random-forest
# hyper-parameters are tuned per fold with gp_minimize on an inner hold-out
# split of the training fold.
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
score_columns = ["rmse_train", "rmse_test", "rmse_baseline_train", "rmse_baseline_test", "up_down_train", "up_down_test"]
fold_scores = []  # one dict per fold; DataFrame.append no longer exists in pandas >= 2.0
feature_importance_list = []
# Search space handed to gp_minimize; identical for every fold.
space = [
    (5,1000), #n_estimators
    (3,30), #max_depth
    (2,200), #min_samples_leaf
    (0.25,1.00) #max_features
]
for count, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("=============== RUN {} ===============".format(count))
    # Keep the unfiltered fold frames: the baseline and the up/down metric need
    # the PREVIOUS column even if feature selection does not retain it.
    X_train_full, X_test_full = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train_full,y_train,corr_min_value)
    X_train = X_train_full[selected_cor_features]
    X_test = X_test_full[selected_cor_features]

    print("# Bayesian Optimization")
    # The inner split is seeded, so it is the same for every candidate; compute
    # it once per fold rather than on every objective call.
    xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

    def train_model(params):
        """Objective for gp_minimize: RMSE on the inner validation split for one candidate."""
        n_estimators, max_depth, min_samples_leaf, max_features = params

        model = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=n_estimators,
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))

    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)

    print("# Fitting the model")
    # Refit on the whole training fold with the best candidate found.
    regressor = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=resultado_gp.x[0],
                                  max_depth=resultado_gp.x[1],min_samples_leaf=resultado_gp.x[2],max_features=resultado_gp.x[3])
    regressor.fit(X_train,y_train)

    print("# Making predictions")
    y_pred = regressor.predict(X_test)
    y_pred_train = regressor.predict(X_train)

    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train_full["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test_full["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train_full)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test_full)

    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Accuracy Train:", up_down_train)
    print("Up/Down Accuracy Test:", up_down_test)
    fold_scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test})

    print("# Get Feature Importance")
    importance = regressor.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))

    print("\n")

# Build the per-fold score table once, after all folds have run.
scores = pd.DataFrame(fold_scores, columns=score_columns)
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: correlation
22 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 2.1048701218974437
RMSE Test: 4.4528920471160545
RMSE Baseline Train: 5.56172744734071
RMSE Baseline Test: 5.806578274984156
Up/Down Accuracy Train: 0.8456375838926175
Up/Down Accuracy Test: 0.6669463087248322
# Get Feature Importance


# Feature Selection: correlation
22 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.8880160493189444
RMSE Test: 4.480173868786643
RMSE Baseline Train: 5.5877710566138745
RMSE Baseline Test: 5.705686508475829
Up/Down Accuracy Train: 0.7125629194630873
Up/Down Accuracy Test: 0.6619127516778524
# Get Feature Importance


# Feature Selection: correlation
24 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 2.078710902443866
RMSE T



# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 2.3591377014461474
RMSE Test: 4.413704366868539
RMSE Baseline Train: 5.636156820789467
RMSE Baseline Test: 5.512036395326456
Up/Down Accuracy Train: 0.8129194630872483
Up/Down Accuracy Test: 0.6656879194630873
# Get Feature Importance


# Feature Selection: correlation
21 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 2.152486449749439
RMSE Test: 4.52279500733125
RMSE Baseline Train: 5.598847489060014
RMSE Baseline Test: 5.662086826825915
Up/Down Accuracy Train: 0.8390310402684564
Up/Down Accuracy Test: 0.6610738255033557
# Get Feature Importance


# Average Metrics
rmse_train             2.516644
rmse_test              4.456299
rmse_baseline_train    5.611418
rmse_baseline_test     5.609371
up_down_train          0.811640
up_down_test           0.663507
dtype: float64
# Average Feature Importance
                                

In [41]:
# Mean of the RATE column over every row of final_df (i.e. before the
# null-threshold row filter); presumably a scale reference for the RMSE values.
final_df['RATE'].mean()

7.395308554428299

corr_min_value = 0.10

rmse_train             2.291120
rmse_test              4.423865
rmse_baseline_train    5.611418
rmse_baseline_test     5.609371
up_down_train          0.827118
up_down_test           0.665268
dtype: float64

                                              Feature  Avg_importance
0                                            PREVIOUS        0.140409
2                                                  RS        0.037808
1   BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC        0.035476
3       TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.026530
4                                 DOENÇAS_DO_APÊNDICE        0.023946
9   NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES        0.022797
5                  OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE        0.021415
10                          COLELITÍASE_E_COLECISTITE        0.021033
6                                          UROLITÍASE        0.020818
8                     TRANSTORNOS_DE_HUMOR_[AFETIVOS]        0.020434
16  TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT        0.019928
12          OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO        0.019202
7                OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO        0.019121
11                             PARTO_ÚNICO_ESPONTÂNEO        0.019048
14  TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...        0.018920
13  LUXAÇÕES_ENTORSE_DISTENSÃO_REG_ESP_E_MÚLTIP_CORPO        0.018874
15  NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...        0.018755
21               OUTRAS_DOENÇAS_DO_APARELHO_DIGESTIVO        0.017547
24     PANCREATITE_AGUDA_E_OUTRAS_DOENÇAS_DO_PÂNCREAS        0.017538
18                     OUTROS_TRANSTRONOS_ARTICULARES        0.017464

corr_min_value = 0.15
                                              Feature  Avg_importance
0                                            PREVIOUS        0.164639
1   BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC        0.064926
2       TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.050187
3                                 DOENÇAS_DO_APÊNDICE        0.049416
7   NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES        0.045299
5                           COLELITÍASE_E_COLECISTITE        0.043995
4                  OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE        0.042728
8                     TRANSTORNOS_DE_HUMOR_[AFETIVOS]        0.041694
18                                                 RS        0.040865
13  TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT        0.040292
9   NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...        0.040208
6                              PARTO_ÚNICO_ESPONTÂNEO        0.040160
10  TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...        0.039784
11    FLEBITE_TROMBOFLEBITE_EMBOLIA_E_TROMBOSE_VENOSA        0.038593
12                         INFARTO_AGUDO_DO_MIOCÁRDIO        0.036548
14                                INSUFICIÊNCIA_RENAL        0.035963
15                         NEOPLASIA_MALIGNA_DO_CÓLON        0.035847
16                       NEOPLASIA_MALIGNA_DO_ESÔFAGO        0.033213
17                      NEOPLASIA_MALIGNA_DO_PÂNCREAS        0.027446


num_feat = 50
corr_min_value = 0.05

=============== SUMMARY ===============
## Average Metrics
rmse_train             2.349230
rmse_test              4.416373
rmse_baseline_train    5.561727
rmse_baseline_test     5.806578
up_down_train          0.840080
up_down_test           0.661493
dtype: float64
## Average Feature Importance
                                               Feature  Avg_importance
101                                           PREVIOUS        0.136908
109                                                 RS        0.034660
92   BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC        0.029692
25       TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.018840
34                  OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE        0.015009
85                                 DOENÇAS_DO_APÊNDICE        0.014734
15                                          UROLITÍASE        0.012732
60   NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES        0.012553
20                OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO        0.012021
12                           COLELITÍASE_E_COLECISTITE        0.011681
57                     TRANSTORNOS_DE_HUMOR_[AFETIVOS]        0.011537
4    TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT        0.010996
86   TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...        0.010835
96                                      OUTRAS_HÉRNIAS        0.010786
67                              PARTO_ÚNICO_ESPONTÂNEO        0.010717
50   NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...        0.010554
93                       LEPTOSPIROSE_NÃO_ESPECIFICADA        0.010484
16           OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO        0.010393
32                                  LEIOMIOMA_DO_ÚTERO        0.010226
29                                   DIABETES_MELLITUS        0.010103


Third model: with RFE feature selection

In [None]:
def rfe_feature_selector(X,y,n_features_to_select):
    """Select features with recursive feature elimination (RFE).

    An RFE selector wrapping a RandomForestRegressor seeded with 42 is fitted
    on (X, y) with step=10 and asked to keep n_features_to_select columns.
    The number of kept columns is printed.

    Returns the names of the supported columns as a list, in the column order of X.
    """
    forest = RandomForestRegressor(random_state=42)
    selector = RFE(estimator=forest, n_features_to_select=n_features_to_select, step=10, verbose=0)
    selector.fit(X, y)
    keep_mask = selector.get_support()
    rfe_feature = X.columns[keep_mask].tolist()
    print(len(rfe_feature), 'selected features (RFE)')
    return rfe_feature

In [None]:
from sklearn.model_selection import KFold

# Third model: correlation filter followed by RFE feature selection on each
# training fold, then a random forest whose hyper-parameters are tuned per fold
# with gp_minimize on an inner hold-out split.
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
score_columns = ["rmse_train", "rmse_test", "rmse_baseline_train", "rmse_baseline_test", "up_down_train", "up_down_test"]
fold_scores = []  # one dict per fold; DataFrame.append no longer exists in pandas >= 2.0
feature_importance_list = []
# Search space handed to gp_minimize; identical for every fold.
space = [
    (5,1000), #n_estimators
    (3,30), #max_depth
    (2,200), #min_samples_leaf
    (0.25,1.00) #max_features
]
for count, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("=============== RUN {} ===============".format(count))
    # Keep the unfiltered fold frames: the baseline and the up/down metric need
    # the PREVIOUS column even if feature selection does not retain it.
    X_train_full, X_test_full = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train_full,y_train,corr_min_value)
    X_train = X_train_full[selected_cor_features]
    X_test = X_test_full[selected_cor_features]

    print("# Feature Selection: RFE")
    selected_rfe_features = rfe_feature_selector(X_train,y_train,num_feat)
    X_train = X_train[selected_rfe_features]
    X_test = X_test[selected_rfe_features]

    print("# Bayesian Optimization")
    # The inner split is seeded, so it is the same for every candidate; compute
    # it once per fold rather than on every objective call.
    xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

    def train_model(params):
        """Objective for gp_minimize: RMSE on the inner validation split for one candidate."""
        n_estimators, max_depth, min_samples_leaf, max_features = params

        model = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=n_estimators,
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))

    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)

    print("# Fitting the model")
    # Refit on the whole training fold with the best candidate found.
    regressor = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=resultado_gp.x[0],
                                  max_depth=resultado_gp.x[1],min_samples_leaf=resultado_gp.x[2],max_features=resultado_gp.x[3])
    regressor.fit(X_train,y_train)

    print("# Making predictions")
    y_pred = regressor.predict(X_test)
    y_pred_train = regressor.predict(X_train)

    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train_full["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test_full["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train_full)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test_full)

    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Accuracy Train:", up_down_train)
    print("Up/Down Accuracy Test:", up_down_test)
    fold_scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test})

    print("# Get Feature Importance")
    importance = regressor.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))

    print("\n")

# Build the per-fold score table once, after all folds have run.
scores = pd.DataFrame(fold_scores, columns=score_columns)
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

num_feat = 50
corr_min_value = 0.05

=============== SUMMARY ===============
## Average Metrics
rmse_train             1.965664
rmse_test              4.437701
rmse_baseline_train    5.561727
rmse_baseline_test     5.806578
up_down_train          0.849413
up_down_test           0.654782
dtype: float64
## Average Feature Importance
                                              Feature  Avg_importance
48                                           PREVIOUS        0.121668
42  BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC        0.036775
49                                                 RS        0.028701
10      TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.025071
40                                DOENÇAS_DO_APÊNDICE        0.023713
15                 OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE        0.022220
26  NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES        0.021232
5                                          UROLITÍASE        0.020304
23                    TRANSTORNOS_DE_HUMOR_[AFETIVOS]        0.020008
3                           COLELITÍASE_E_COLECISTITE        0.019774
30                             PARTO_ÚNICO_ESPONTÂNEO        0.019582
8                OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO        0.019507
45                                     OUTRAS_HÉRNIAS        0.018659
44  LUXAÇÕES_ENTORSE_DISTENSÃO_REG_ESP_E_MÚLTIP_CORPO        0.018452
11                                  DIABETES_MELLITUS        0.018390
22  NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...        0.018381
6           OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO        0.018362
1   TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT        0.018163
25                                     OUTRAS_ANEMIAS        0.018000
14                                 LEIOMIOMA_DO_ÚTERO        0.017224




num_feat = 20
corr_min_value = 0.15

=============== SUMMARY ===============
# Average Metrics
rmse_train             2.105260
rmse_test              4.475640
rmse_baseline_train    5.561727
rmse_baseline_test     5.806578
up_down_train          0.844694
up_down_test           0.665268
dtype: float64
# Average Feature Importance
                                              Feature  Avg_importance
19                                           PREVIOUS        0.149098
18  BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC        0.069476
5       TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.054571
16                                DOENÇAS_DO_APÊNDICE        0.051521
11                    TRANSTORNOS_DE_HUMOR_[AFETIVOS]        0.048656
13  NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES        0.047402
6                  OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE        0.045853
10  NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...        0.045354
4                           COLELITÍASE_E_COLECISTITE        0.045267
14                             PARTO_ÚNICO_ESPONTÂNEO        0.044370
0   TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT        0.043200
17  TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...        0.043174
8                                    FRATURA_DO_FÊMUR        0.043116
2                          INFARTO_AGUDO_DO_MIOCÁRDIO        0.041894
9     FLEBITE_TROMBOFLEBITE_EMBOLIA_E_TROMBOSE_VENOSA        0.041592
3                                 INSUFICIÊNCIA_RENAL        0.041085
15                         NEOPLASIA_MALIGNA_DO_CÓLON        0.039870
7   TRANST_DISCAIS_CERVIC_E_OUTR_TRANST_DISC_INTER...        0.037363
12                       NEOPLASIA_MALIGNA_DO_ESÔFAGO        0.037084
1                       NEOPLASIA_MALIGNA_DO_PÂNCREAS        0.030058

num_feat = 20
corr_min_value = 0.1

=============== SUMMARY ===============
# Average Metrics
rmse_train             2.712058
rmse_test              4.472363
rmse_baseline_train    5.611418
rmse_baseline_test     5.609371
up_down_train          0.795952
up_down_test           0.662668
dtype: float64
# Average Feature Importance
                                              Feature  Avg_importance
0                                            PREVIOUS        0.185482
1   BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC        0.068365
2       TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.050610
3   NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...        0.042919
4                  OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE        0.044063
5                                 DOENÇAS_DO_APÊNDICE        0.051152
6                                          UROLITÍASE        0.040872
7                              PARTO_ÚNICO_ESPONTÂNEO        0.041154
8                           NEOPLASIA_MALIGNA_DA_MAMA        0.036887
9   LUXAÇÕES_ENTORSE_DISTENSÃO_REG_ESP_E_MÚLTIP_CORPO        0.037709
10                          COLELITÍASE_E_COLECISTITE        0.042670
11          OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO        0.038076
12    FLEBITE_TROMBOFLEBITE_EMBOLIA_E_TROMBOSE_VENOSA        0.037100
13     PANCREATITE_AGUDA_E_OUTRAS_DOENÇAS_DO_PÂNCREAS        0.035638

In [None]:
def from_model_feature_selector(X,y,random_state=42):
    """Select features with SelectFromModel around a random forest.

    A SelectFromModel selector wrapping a RandomForestRegressor is fitted on
    (X, y) and the columns it supports are returned. The number of kept
    columns is printed.

    X: feature frame.
    y: target values.
    random_state: seed for the forest, so repeated runs pick the same
        features (defaults to 42, the seed used by rfe_feature_selector).

    Returns the names of the supported columns as a list, in the column order of X.
    """
    embeded_rf_selector = SelectFromModel(RandomForestRegressor(random_state=random_state))
    embeded_rf_selector.fit(X, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
    print(str(len(embeded_rf_feature)), 'selected features')
    return embeded_rf_feature

In [None]:
from sklearn.model_selection import KFold

# Fourth model: correlation filter followed by SelectFromModel feature
# selection on each training fold, then a random forest whose hyper-parameters
# are tuned per fold with gp_minimize on an inner hold-out split.
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
score_columns = ["rmse_train", "rmse_test", "rmse_baseline_train", "rmse_baseline_test", "up_down_train", "up_down_test"]
fold_scores = []  # one dict per fold; DataFrame.append no longer exists in pandas >= 2.0
feature_importance_list = []
# Search space handed to gp_minimize; identical for every fold.
space = [
    (5,1000), #n_estimators
    (3,30), #max_depth
    (2,200), #min_samples_leaf
    (0.25,1.00) #max_features
]
for count, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("=============== RUN {} ===============".format(count))
    # Keep the unfiltered fold frames: the baseline and the up/down metric need
    # the PREVIOUS column even if feature selection does not retain it.
    X_train_full, X_test_full = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train_full,y_train,corr_min_value)
    X_train = X_train_full[selected_cor_features]
    X_test = X_test_full[selected_cor_features]

    print("# Feature Selection: From Model")
    selected_from_model_features = from_model_feature_selector(X_train,y_train)
    X_train = X_train[selected_from_model_features]
    X_test = X_test[selected_from_model_features]

    print("# Bayesian Optimization")
    # The inner split is seeded, so it is the same for every candidate; compute
    # it once per fold rather than on every objective call.
    xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

    def train_model(params):
        """Objective for gp_minimize: RMSE on the inner validation split for one candidate."""
        n_estimators, max_depth, min_samples_leaf, max_features = params

        model = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=n_estimators,
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))

    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)

    print("# Fitting the model")
    # Refit on the whole training fold with the best candidate found.
    regressor = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=resultado_gp.x[0],
                                  max_depth=resultado_gp.x[1],min_samples_leaf=resultado_gp.x[2],max_features=resultado_gp.x[3])
    regressor.fit(X_train,y_train)

    print("# Making predictions")
    y_pred = regressor.predict(X_test)
    y_pred_train = regressor.predict(X_train)

    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train_full["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test_full["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train_full)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test_full)

    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Accuracy Train:", up_down_train)
    print("Up/Down Accuracy Test:", up_down_test)
    fold_scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test})

    print("# Get Feature Importance")
    importance = regressor.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))

    print("\n")

# Build the per-fold score table once, after all folds have run.
scores = pd.DataFrame(fold_scores, columns=score_columns)
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

In [None]:
# Display the per-fold feature-importance frames collected by whichever
# cross-validation cell was run last.
feature_importance_list

In [None]:
# Scratch inspection of the correlation filter on a single fold.
# NOTE: relies on train_index / test_index left over from the last
# cross-validation cell that was run.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# cor_feature_selector returns only the selected names (a single list), so the
# per-feature correlations are recomputed here as [name, coefficient] pairs.
selected_cor_features = cor_feature_selector(X_train,y_train,corr_min_value)
cor_list = [[feature, np.corrcoef(X_train[feature], y_train)[0,1]] for feature in X_train.columns]
cor_list