# Regression
Features = rates from selected diseases (FeatureSelection) + suicide rate 

Target = suicide rate of the following year

Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import glob
root = "../../"

Prepare dataset

In [2]:
disease = ""
path = root + "CSV/TabNet/Internacoes_Rate/"
all_files = glob.glob(path + "*.csv")
# all_files = all_files + glob.glob(path + "Groups/" + "*.csv")
suicide = pd.read_csv(root +'CSV/Suicide/suicide_rates_08_18.csv', index_col=0)

years = ["08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18"]

final_df = pd.DataFrame()

for i in range(len(years)-1):
    col_year_suicide = "RATE_" + years[i+1]
    col_year_prev = "RATE_" + years[i]
    year_df = suicide[[col_year_prev,col_year_suicide, "MUNCOD"]]
    year_df = year_df.rename(columns={col_year_suicide: "RATE"})
    year_df = year_df.rename(columns={col_year_prev: "PREVIOUS"})
    for file in all_files:
        file_name = file.split("/")[-1]
        disease = file_name.split(".csv")[0]

        disease_df = pd.read_csv(file, sep=',', index_col=0)
        disease_df = disease_df[[col_year_prev, "MUNCOD"]]
        disease_df = disease_df.rename(columns={col_year_prev: disease})
        
        year_df = pd.merge(disease_df, year_df, left_on="MUNCOD", right_on="MUNCOD", how='right')
        
    year_df = year_df.drop("MUNCOD", axis=1)
    final_df = pd.concat([final_df, year_df])

final_df = final_df[(final_df["RATE"] > 0) | (final_df["PREVIOUS"] > 0)]
final_df

Unnamed: 0,DIFTERIA,NEOPLASIA_MALIGNA_DO_ESTÔMAGO,FRATURA_DO_CRÂNIO_E_DOS_OSSOS_DA_FACE,DOENÇA_DE_HODGKIN,OUTRAS_DOENÇAS_CEREBROVASCULARES,FEBRES_RECORRENTES,ESTADO_INFEC_ASSINT_VÍRUS_DA_IMUNODEF_HUMANA_[HIV],PNEUMOCONIOSE,NEOPLASIA_MALIGNA_DA_PRÓSTATA,OUTRAS_NEOPL_MALIG_ÓRG_RESPIRAT_E_INTRATORÁCICOS,...,ASMA,CERATITE_E_OUTROS_TRANSTORNOS_ESCLERÓTICA_E_CÓRNEA,INFECÇÕES_PELO_VÍRUS_DO_HERPES,NEOPLASIA_MALIGNA_DO_LÁBIO_CAVIDADE_ORAL_E_FARINGE,TUBERCULOSE_PULMONAR,AUSÊNCIA_ATRESIA_E_ESTENOSE_DO_INTESTINO_DELGADO,OUTRAS_DOENÇAS_DO_APARELHO_DIGESTIVO,OUTRAS_TUBERCULOSES_RESPIRATÓRIAS,PREVIOUS,RATE
0,0.000000,0.000000,16.275379,0.000000,0.000000,0.0,,,0.000000,0.000000,...,752.736298,0.000000,0.000000,0.000000,0.000000,,28.481914,0.000000,20.344224,8.212203
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,...,66.394101,0.000000,0.000000,0.000000,0.000000,,52.416396,,0.000000,6.922331
2,0.000000,4.483288,4.219565,1.318614,2.637228,0.0,,0.000000,3.692119,0.263723,...,30.591847,5.274456,3.428397,2.373505,0.527446,0.0,11.603804,10.021467,4.483288,5.485478
5,0.000000,9.163263,53.670541,0.000000,0.000000,0.0,,0.000000,3.927113,1.309038,...,261.807519,1.309038,1.309038,3.927113,26.180752,0.0,71.997068,1.309038,5.236150,9.090437
7,0.331787,5.308595,10.948978,0.000000,3.981446,0.0,,0.331787,4.645021,5.308595,...,14.930424,2.654298,0.663574,6.635744,11.612552,0.0,65.362079,5.640382,5.972170,6.536930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5373,,,0.000000,,0.000000,,,,29.761905,,...,0.000000,0.000000,,29.761905,0.000000,,119.047619,0.000000,0.000000,61.671292
5374,,,25.926886,,0.000000,,,,0.000000,,...,51.853772,0.000000,,0.000000,,,51.853772,103.707545,0.000000,26.295030
5376,,,14.511682,,0.000000,,,,0.000000,,...,87.070091,,,0.000000,14.511682,,0.000000,0.000000,14.511682,0.000000
5377,,,0.000000,33.772374,0.000000,,,,0.000000,,...,0.000000,,,0.000000,,,67.544748,0.000000,0.000000,34.867503


In [3]:
final_df.shape

(38331, 315)

Define X and y

In [4]:
percentage_valid_values = 0.8
N = int(final_df.shape[1]*0.8)
print(N)
final_zeros_df = final_df.dropna(thresh=N) # At least N non null items

X = final_zeros_df.drop(columns="RATE")
X = X.fillna(0)
y = final_zeros_df["RATE"]

num_feat = 50
corr_min_value = 0.05
X.shape

252


(9097, 314)

Split data into train and test sets

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Feature Selection

In [6]:
def cor_feature_selector(X,y,n):
    cor_list = []
    for i in list(X.columns):
        cor = np.corrcoef(X[i], y)[0,1]
        cor_list.append([i, cor])
    cor_ranking = sorted(cor_list, key=lambda a : abs(a[1]),reverse=True)
    cor_feature = [x[0] for x in cor_ranking[:n]]
    cor_support = [True if i in cor_feature else False for i in X.columns]
    return cor_support, cor_feature, cor_ranking
cor_support, cor_feature, cor_ranking = cor_feature_selector(X_train,y_train,num_feat)
print(str(len(cor_feature)), 'selected features')
print(cor_feature)

50 selected features
['PREVIOUS', 'BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC', 'NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES', 'TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT', 'COLELITÍASE_E_COLECISTITE', 'NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_ESPEC', 'TRANSTORNOS_DE_HUMOR_[AFETIVOS]', 'DOENÇAS_DO_APÊNDICE', 'TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS', 'FLEBITE_TROMBOFLEBITE_EMBOLIA_E_TROMBOSE_VENOSA', 'NEOPLASIA_MALIGNA_DO_CÓLON', 'TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL', 'INSUFICIÊNCIA_RENAL', 'NEOPLASIA_MALIGNA_DO_ESÔFAGO', 'TRANSTORNOS_DOS_NERVOS_RAÍZES_E_PLEXOS_NERVOSOS', 'INFARTO_AGUDO_DO_MIOCÁRDIO', 'OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE', 'NEOPLASIA_MALIGNA_DO_PÂNCREAS', 'EMBOLIA_E_TROMBOSE_ARTERIAIS', 'OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO', 'NEOPL_MALIG_JUNÇÃO_RETOSSIGM_RETO_ÂNUS_CANAL_ANAL', 'TRANST_DISCAIS_CERVIC_E_OUTR_TRANST_DISC_INTERVERT', 'FRATURA_DO_FÊMUR', 'EMBOLIA_PULMONAR', 'ACID_VASCULAR_CEREBR_ISQUÊM_TRANSIT_E_SÍNDR_CORREL

In [7]:
cor_ranking_filtered = list(filter(lambda a: abs(a[1]) > corr_min_value, cor_ranking))
diseases_filtered = [d[0] for d in cor_ranking_filtered]
X_train = X_train[diseases_filtered]
cor_support = [True if i in cor_feature else False for i in X_train.columns]
X_train.shape

(6094, 123)

### RFE

In [8]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
rfe_selector = RFE(estimator=RandomForestRegressor(), n_features_to_select=num_feat, step=10, verbose=5)
rfe_selector.fit(X_train, y_train)
rfe_support = rfe_selector.get_support()
rfe_feature = X_train.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')
print(rfe_feature)

Fitting estimator with 123 features.
Fitting estimator with 113 features.
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
50 selected features
['PREVIOUS', 'BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC', 'NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES', 'TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT', 'COLELITÍASE_E_COLECISTITE', 'NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_ESPEC', 'TRANSTORNOS_DE_HUMOR_[AFETIVOS]', 'DOENÇAS_DO_APÊNDICE', 'TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS', 'NEOPLASIA_MALIGNA_DO_CÓLON', 'TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL', 'INSUFICIÊNCIA_RENAL', 'NEOPLASIA_MALIGNA_DO_ESÔFAGO', 'TRANSTORNOS_DOS_NERVOS_RAÍZES_E_PLEXOS_NERVOSOS', 'INFARTO_AGUDO_DO_MIOCÁRDIO', 'NEOPLASIA_MALIGNA_DO_PÂNCREAS', 'OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO', 'NEOPL_MALIG_JUNÇÃO_RETOSSIG

### SelectFromModel

In [9]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

embeded_rf_selector = SelectFromModel(RandomForestRegressor())
embeded_rf_selector.fit(X_train, y_train)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X_train.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
print(embeded_rf_feature)

30 selected features
['PREVIOUS', 'BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC', 'NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES', 'TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT', 'COLELITÍASE_E_COLECISTITE', 'NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_ESPEC', 'TRANSTORNOS_DE_HUMOR_[AFETIVOS]', 'DOENÇAS_DO_APÊNDICE', 'TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS', 'NEOPLASIA_MALIGNA_DO_CÓLON', 'TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL', 'INSUFICIÊNCIA_RENAL', 'OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO', 'PANCREATITE_AGUDA_E_OUTRAS_DOENÇAS_DO_PÂNCREAS', 'PARTO_ÚNICO_ESPONTÂNEO', 'LUXAÇÕES_ENTORSE_DISTENSÃO_REG_ESP_E_MÚLTIP_CORPO', 'LEPTOSPIROSE_NÃO_ESPECIFICADA', 'TRANSTORNOS_DO_TECIDO_MOLE', 'LINFOMA_NÃO-HODGKIN', 'OUTRAS_ANEMIAS', 'OUTRAS_COMPLICAÇÕES_DA_GRAVIDEZ_E_DO_PARTO', 'INFECÇÕES_PELO_VÍRUS_DO_HERPES', 'LESÕES_ESMAG_AMPUT_TRAUMÁT_REG_ESP_E_MÚLTIP_CORPO', 'OUTRAS_HÉRNIAS', 'LEIOMIOMA_DO_ÚTERO', 'PROLAPSO_GENITAL_FEMININO', 'DESNUTRIÇÃO', 'SEQÜEL_TRAUM_E

In [10]:
feature_selection_df = pd.DataFrame({'Feature':X_train.columns, 'Pearson':cor_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feat)

Unnamed: 0,Feature,Pearson,RFE,Random Forest,Total
1,TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...,True,True,True,3
2,TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT,True,True,True,3
3,TRANSTORNOS_DO_TECIDO_MOLE,True,True,True,3
4,TRANSTORNOS_DE_HUMOR_[AFETIVOS],True,True,True,3
5,TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS,True,True,True,3
6,PREVIOUS,True,True,True,3
7,PARTO_ÚNICO_ESPONTÂNEO,True,True,True,3
8,PANCREATITE_AGUDA_E_OUTRAS_DOENÇAS_DO_PÂNCREAS,True,True,True,3
9,OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO,True,True,True,3
10,NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...,True,True,True,3


In [11]:
selected_diseases = feature_selection_df[feature_selection_df['Total'] == 3]['Feature']
selected_diseases

1     TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...
2     TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT
3                            TRANSTORNOS_DO_TECIDO_MOLE
4                       TRANSTORNOS_DE_HUMOR_[AFETIVOS]
5         TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS
6                                              PREVIOUS
7                                PARTO_ÚNICO_ESPONTÂNEO
8        PANCREATITE_AGUDA_E_OUTRAS_DOENÇAS_DO_PÂNCREAS
9             OUTRAS_DOENÇAS_DOS_INTESTINOS_E_PERITÔNIO
10    NEOPL_MALIG_OUTR_LOCALIZ_MAL_DEF_SECUN_E_NÃO_E...
11                           NEOPLASIA_MALIGNA_DO_CÓLON
12    NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES
13    LUXAÇÕES_ENTORSE_DISTENSÃO_REG_ESP_E_MÚLTIP_CORPO
14                        LEPTOSPIROSE_NÃO_ESPECIFICADA
15                                  INSUFICIÊNCIA_RENAL
16                                  DOENÇAS_DO_APÊNDICE
17                            COLELITÍASE_E_COLECISTITE
18    BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR

In [12]:
X_train = X_train[selected_diseases]
X_test = X_test[selected_diseases]

## Model

In [13]:
# Bayesian Optimization
def train_model(params):
    n_estimators = params[0]
    max_depth = params[1]
    min_samples_leaf = params[2]
    max_features = params[3]
    
    xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33)
    
    model = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=n_estimators, 
                                  max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)
    
    model.fit(xf_train_params,yf_train_params)
    yf_pred = model.predict(xf_val)
    
    return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))

space = [
    (5,1000), #n_estimators
    (3,30), #max_depth
    (2,200), #min_samples_leaf
    (0.25,1.00) #max_features
]

from skopt import gp_minimize

resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)

In [14]:
resultado_gp.x

[21, 9, 50, 0.7624476391190937]

Fit the RandomForest regression model

In [15]:
regressor = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=resultado_gp.x[0], 
                                  max_depth=resultado_gp.x[1],min_samples_leaf=resultado_gp.x[2],max_features=resultado_gp.x[3])
regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=9, max_features=0.7624476391190937,
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=50, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=21, n_jobs=-1,
                      oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

Calculate metrics corresponding to the regression model

In [16]:
def calcMetrics(y_train, y_pred_train, y_test, y_pred):
    print("Root Mean Squared Error Train: " + str(np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))))
    print("Root Mean Squared Error Test: " + str(np.sqrt(metrics.mean_squared_error(y_test,y_pred))))
    #print("MAPE Test: " + str(np.mean(np.abs((y_test - y_pred) / y_test)) * 100))
def calcUpMetrics(y_pred,y_test,X_test):
    up_df = pd.DataFrame({"Pred": y_pred, "Real": y_test, "Previous": X_test["PREVIOUS"]})
    up_df["UP"] = up_df["Previous"] < up_df["Real"]
    up_df["UP_PRED"] = up_df["Previous"] < up_df["Pred"]
    up_df["UP"] = up_df["UP"].astype(int)
    up_df["UP_PRED"] = up_df["UP_PRED"].astype(int)
    print("Accuracy UP/DOWN: ", metrics.accuracy_score(up_df["UP"], up_df["UP_PRED"]))

In [17]:
y_pred = regressor.predict(X_test) 
y_pred_train = regressor.predict(X_train)
print("# Regression Model")
calcMetrics(y_train, y_pred_train, y_test, y_pred)
calcUpMetrics(y_pred,y_test,X_test)

# Regression Model
Root Mean Squared Error Train: 3.9207938104010998
Root Mean Squared Error Test: 4.2173272439201295
Accuracy UP/DOWN:  0.6796536796536796


We also define a baseline model consisting of predicting the previous year suicide rate

In [18]:
y_pred = X_test["PREVIOUS"]
y_pred_train = X_train["PREVIOUS"]
print("# Baseline Model")
calcMetrics(y_train, y_pred_train, y_test, y_pred)

# Baseline Model
Root Mean Squared Error Train: 5.139031942273962
Root Mean Squared Error Test: 5.297373961033153


K-fold cross validation

In [19]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=4)

rmse_regression = []
rmse_baseline = []
accuracy_up_down = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    regressor = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=resultado_gp.x[0], 
                                  max_depth=resultado_gp.x[1],min_samples_leaf=resultado_gp.x[2],max_features=resultado_gp.x[3])

    regressor.fit(X_train,y_train)
    
    rmse_regression.append(
        np.sqrt(metrics.mean_squared_error(regressor.predict(X_test),y_test))
    )
    
    up_df = pd.DataFrame({"Pred": regressor.predict(X_test), "Real": y_test, "Previous": X_test["PREVIOUS"]})
    up_df["UP"] = up_df["Previous"] < up_df["Real"]
    up_df["UP_PRED"] = up_df["Previous"] < up_df["Pred"]
    up_df["UP"] = up_df["UP"].astype(int)
    up_df["UP_PRED"] = up_df["UP_PRED"].astype(int)
    accuracy_up_down.append(metrics.accuracy_score(up_df["UP"], up_df["UP_PRED"]))
    
    rmse_baseline.append(
        np.sqrt(metrics.mean_squared_error(X_test["PREVIOUS"],y_test))
    )
    
print("Mean RMSE for the regression model was:", np.mean(rmse_regression))
print("Mean accuracy (up/down) for the regression model was:", np.mean(accuracy_up_down))
print("Mean RMSE for the baseline model was:", np.mean(rmse_baseline))

Mean RMSE for the regression model was: 4.172974096206556
Mean accuracy (up/down) for the regression model was: 0.6771459499163984
Mean RMSE for the baseline model was: 5.185396544959999
