# Classification

- Target = 0 if suicide rates remain the same or decrease
- Target = 1 if suicide rates increase

- Features: disease rates

Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import RFE,SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from skopt import gp_minimize
import time
import glob
root = "../../"

In [2]:
dict_uf_cod = {11: 'RO',
12: 'AC',
13: 'AM',
14: 'RR',
15: 'PA',
16: 'AP',
17: 'TO',
21: 'MA',
22: 'PI',
23: 'CE',
24: 'RN',
25: 'PB',
26: 'PE',
27: 'AL',
28: 'SE',
29: 'BA',
31: 'MG',
32: 'ES',
33: 'RJ',
35: 'SP',
41: 'PR',
42: 'SC',
43: 'RS',
50: 'MS',
51: 'MT',
52: 'GO',
53: 'DF'}

Prepare dataset

In [3]:
disease = ""
path = root + "CSV/TabNet/Internacoes_Rate/"
all_files = glob.glob(path + "*.csv")
suicide = pd.read_csv(root +'CSV/Suicide/suicide_rates_08_18.csv', index_col=0)

years = [str(x).zfill(2) for x in range(8,19)]
columns = ["RATE_" + year for year in years]
columns.append("MUNCOD")

disease_list = []

for file in all_files:
    file_name = file.split("\\")[-1]
    disease = file_name.split(".csv")[0]
    disease_df = pd.read_csv(file, sep=',', index_col=0)
    if(set(disease_df.columns) == set(columns)):
        disease_list.append(disease)
print(len(disease_list), " diseases")

313  diseases


First option: features = difference between (disease rate in t-1) and (disease rate in t-2)

Second option: features = (disease rate in t-1)

In [5]:
other_df = pd.DataFrame()

for i in range(0,len(years)-1):
    col_year_suicide = "RATE_" + years[i+1]
    col_year_suicide_prev = "RATE_" + years[i]
    year_df = suicide[[col_year_suicide_prev, col_year_suicide, "MUNCOD"]]
    # RATE = suicide rate for the current year, PREVIOUS = suicide rate for the previous year
    year_df = year_df.rename(columns={col_year_suicide: "RATE", col_year_suicide_prev: "PREVIOUS"})
    for disease in disease_list:
        col_year_disease = "RATE_" + years[i]
        disease_df = pd.read_csv(path + disease + ".csv", sep=',', index_col=0)
        disease_df = disease_df[[col_year_disease, "MUNCOD"]]
        
        disease_df[disease] = disease_df[col_year_disease] 
        disease_df = disease_df.drop(columns={col_year_disease})
        
        year_df = pd.merge(disease_df, year_df, left_on="MUNCOD", right_on="MUNCOD", how='right')
        
    other_df = pd.concat([other_df, year_df])

other_df.head()

Unnamed: 0,MUNCOD,ÚLCERA_GÁSTRICA_E_DUODENAL,VEIAS_VARICOSAS_DAS_EXTREMIDADES_INFERIORES,VARICELA_E_HERPES_ZOSTER,UROLITÍASE,TÉTANO_NEONATAL,TUBERC_INTEST_PERITÔNIO_GLÂNGL_MESENTÉRICOS,TUBERCULOSE_ÓSSEA_E_DAS_ARTICULAÇÕES,TUBERCULOSE_PULMONAR,TUBERCULOSE_MILIAR,...,ANCILOSTOMÍASE,AMEBÍASE,ALGUNS_TRANSTORNOS_ENVOLVENDO_MECANISMO_IMUNITÁRIO,AFECÇ_HEMORRÁG_E_OUTR_DOENÇ_SANG_E_ÓRG_HEMATOPOÉT,ACID_VASCULAR_CEREBR_NÃO_ESPEC_HEMORRÁG_OU_ISQUÊM,ACID_VASCULAR_CEREBR_ISQUÊM_TRANSIT_E_SÍNDR_CORREL,ABORTO_POR_RAZÕES_MÉDICAS,ABORTO_ESPONTÂNEO,PREVIOUS,RATE
0,110001,28.481914,40.688449,0.0,142.40957,,,,0.0,,...,,0.0,,44.757293,40.688449,32.550759,,126.134191,20.344224,8.212203
1,110002,9.458389,54.385737,0.0,15.369882,,0.0,,0.0,0.0,...,,0.0,0.0,1.182299,39.015855,0.0,0.0,1.182299,9.458389,2.33806
2,110003,0.0,14.755792,0.0,29.511583,,,,,,...,,,,0.0,118.046333,0.0,,,0.0,14.93652
3,110004,5.110972,21.721631,0.0,63.88715,,,,0.0,0.0,...,,,0.0,3.833229,106.052669,3.833229,0.0,2.555486,5.110972,7.626311
4,110005,5.958055,41.706387,5.958055,113.203051,,,,0.0,,...,,,,5.958055,95.328885,0.0,0.0,5.958055,0.0,0.0


In [6]:
df = other_df.copy()

In [7]:
percentage_valid_values = 0.8
num_feat = 30

print("Shape before null values drop:", df.shape)
N = int(df.shape[1]*percentage_valid_values)
print("Minimum non-null values: " + str(N) + "/" + str(df.shape[1]))
df = df.dropna(thresh=N) # At least N non null items
df = df.fillna(0)

Shape before null values drop: (53810, 316)
Minimum non-null values: 252/316


In [8]:
a = df["RATE"] - df["PREVIOUS"] 
def get_target(x):   
    diff = x["RATE"] - x["PREVIOUS"] 
    if diff > 0:
        return 1
    else:
        return 0
df["TARGET"] = df.apply(get_target, axis=1)

Get UF from MUNCOD

In [9]:
df['UF'] = df['MUNCOD'] / 10000
df['UF'] = df['UF'].astype(int)
df['UF'].replace(dict_uf_cod, inplace=True)
dummy = pd.get_dummies(df['UF'])
df = pd.concat([df, dummy], axis=1)
df = df.drop(['RATE',"PREVIOUS", 'MUNCOD', 'UF'], axis=1)
df.head()

Unnamed: 0,ÚLCERA_GÁSTRICA_E_DUODENAL,VEIAS_VARICOSAS_DAS_EXTREMIDADES_INFERIORES,VARICELA_E_HERPES_ZOSTER,UROLITÍASE,TÉTANO_NEONATAL,TUBERC_INTEST_PERITÔNIO_GLÂNGL_MESENTÉRICOS,TUBERCULOSE_ÓSSEA_E_DAS_ARTICULAÇÕES,TUBERCULOSE_PULMONAR,TUBERCULOSE_MILIAR,TUBERCULOSE_DO_SISTEMA_NERVOSO,...,PR,RJ,RN,RO,RR,RS,SC,SE,SP,TO
0,28.481914,40.688449,0.0,142.40957,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,9.458389,54.385737,0.0,15.369882,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
3,5.110972,21.721631,0.0,63.88715,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
8,0.0,48.921969,0.0,24.460985,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
9,2.466639,9.866555,0.0,12.333194,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
unique, counts = np.unique(df["TARGET"].values, return_counts=True)
dict(zip(unique, counts))

{0: 5376, 1: 4414}

Define X and y

In [15]:
X = df.drop(columns=["TARGET"])
y = df["TARGET"]

X.shape

(9790, 340)

Define some functions

In [16]:
def get_average_feat_importance(feature_importance_list):
    final_feat_df = pd.DataFrame()
    for i, feat_df in enumerate(feature_importance_list):
        feat_df = feat_df.rename(columns={"Importance": i})
        if final_feat_df.empty:
            final_feat_df = feat_df
        else:
            final_feat_df = pd.merge(final_feat_df, feat_df, on="Feature", how="right")
    final_feat_df = final_feat_df.fillna(0)
    final_feat_df["Avg_importance"] = final_feat_df.sum(axis=1)/(final_feat_df.shape[1] -1)
    final_feat_df = final_feat_df[["Feature", "Avg_importance"]]
    return final_feat_df

Create K-folds

In [17]:
def sfm_feature_selector(X_train,y_train,max_features):
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100),max_features=max_features)
    embeded_rf_selector.fit(X_train, y_train)

    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X_train.loc[:,embeded_rf_support].columns.tolist()
    print(str(len(embeded_rf_feature)), 'selected features (SFM)')
    return embeded_rf_feature

First model: no bayesian optimization

In [18]:
from sklearn.model_selection import StratifiedKFold
start = time.time()
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"accuracy_train": [], "accuracy_test": []})
feature_importance_list = []
count = 1
for train_index, test_index in kf.split(X,y):
    if(count > 1):
        break
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: SFM")
    selected_sfm_features = sfm_feature_selector(X_train,y_train,num_feat)
    X_train = X_train[selected_sfm_features]
    X_test = X_test[selected_sfm_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        n_estimators = params[0]
        max_depth = params[1]
        min_samples_leaf = params[2]
        max_features = params[3]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = RandomForestClassifier(n_jobs=-1, random_state=42,n_estimators=n_estimators, 
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return metrics.accuracy_score(yf_val, yf_pred)
    space = [
        (5,1000), #n_estimators
        (3,30), #max_depth
        (2,200), #min_samples_leaf
        (0.25,1.00) #max_features
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    n_estimators = resultado_gp.x[0]
    max_depth = resultado_gp.x[1]
    min_samples_leaf = resultado_gp.x[2]
    max_features = resultado_gp.x[3]
    
    print("# Fitting the model")
    classifier = RandomForestClassifier(random_state=42,n_estimators=n_estimators, 
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)
    classifier.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = classifier.predict(X_test) 
    y_pred_train = classifier.predict(X_train)
    
    print("# Calculating metrics")
    print("Train")
    print(metrics.confusion_matrix(y_train, y_pred_train))
    print(metrics.classification_report(y_train, y_pred_train, digits=3))
    print("Test")
    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred, digits=3))
    
    print("# Get Feature Importance")
    importance = classifier.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: SFM
30 selected features (SFM)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
Train
[[4106  194]
 [ 378 3154]]
              precision    recall  f1-score   support

           0      0.916     0.955     0.935      4300
           1      0.942     0.893     0.917      3532

    accuracy                          0.927      7832
   macro avg      0.929     0.924     0.926      7832
weighted avg      0.928     0.927     0.927      7832

Test
[[679 397]
 [541 341]]
              precision    recall  f1-score   support

           0      0.557     0.631     0.591      1076
           1      0.462     0.387     0.421       882

    accuracy                          0.521      1958
   macro avg      0.509     0.509     0.506      1958
weighted avg      0.514     0.521     0.515      1958

# Get Feature Importance


# Average Feature Importance
                                              Feature  Avg_importance
22                  