# Classification

- Features: disease rates + state
- Target = 1: municipality is in a high risk cluster (SatScan)
- Target = 0: municipality isn't in a high risk cluster (SatScan)

Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import RFE,SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from skopt import gp_minimize
import time
import glob
root = "../../"

In [2]:
dict_uf_cod = {11: 'RO',
12: 'AC',
13: 'AM',
14: 'RR',
15: 'PA',
16: 'AP',
17: 'TO',
21: 'MA',
22: 'PI',
23: 'CE',
24: 'RN',
25: 'PB',
26: 'PE',
27: 'AL',
28: 'SE',
29: 'BA',
31: 'MG',
32: 'ES',
33: 'RJ',
35: 'SP',
41: 'PR',
42: 'SC',
43: 'RS',
50: 'MS',
51: 'MT',
52: 'GO',
53: 'DF'}

Prepare dataset

In [3]:
disease = ""
path = root + "CSV/TabNet/Internacoes_Rate/"
all_files = glob.glob(path + "*.csv")

years = [str(x).zfill(2) for x in range(8,19)]
columns = ["RATE_" + year for year in years]
columns.append("MUNCOD")

disease_list = []

for file in all_files:
    file_name = file.split("\\")[-1]
    disease = file_name.split(".csv")[0]
    disease_df = pd.read_csv(file, sep=',', index_col=0)
    if(set(disease_df.columns) == set(columns)):
        disease_list.append(disease)
print(len(disease_list), " diseases")

313  diseases


In [4]:
initial_df = pd.DataFrame()
percentage_valid_values = 0.8
for i in range(0,len(years)-1):
    year_df = pd.read_csv(root + "CSV/SatScan/MuncodRisk/cluster_risk_20" + years[i+1] + ".csv", index_col=0)
    for disease in disease_list:
        col_year_disease = "RATE_" + years[i]
        disease_df = pd.read_csv(path + disease + ".csv", sep=',', index_col=0)
        disease_df = disease_df[[col_year_disease, "MUNCOD"]]
        disease_df[disease] = disease_df[col_year_disease] 
        disease_df = disease_df.drop(columns={col_year_disease})
        year_df = pd.merge(disease_df, year_df, left_on="MUNCOD", right_on="MUNCOD", how='right')
    N = int(year_df.shape[1]*percentage_valid_values)
    year_df = year_df.dropna(thresh=N)
    initial_df = pd.concat([initial_df, year_df])

initial_df.head()

Unnamed: 0,MUNCOD,ÚLCERA_GÁSTRICA_E_DUODENAL,VEIAS_VARICOSAS_DAS_EXTREMIDADES_INFERIORES,VARICELA_E_HERPES_ZOSTER,UROLITÍASE,TÉTANO_NEONATAL,TUBERC_INTEST_PERITÔNIO_GLÂNGL_MESENTÉRICOS,TUBERCULOSE_ÓSSEA_E_DAS_ARTICULAÇÕES,TUBERCULOSE_PULMONAR,TUBERCULOSE_MILIAR,...,ANEMIA_POR_DEFICIÊNCIA_DE_FERRO,ANCILOSTOMÍASE,AMEBÍASE,ALGUNS_TRANSTORNOS_ENVOLVENDO_MECANISMO_IMUNITÁRIO,AFECÇ_HEMORRÁG_E_OUTR_DOENÇ_SANG_E_ÓRG_HEMATOPOÉT,ACID_VASCULAR_CEREBR_NÃO_ESPEC_HEMORRÁG_OU_ISQUÊM,ACID_VASCULAR_CEREBR_ISQUÊM_TRANSIT_E_SÍNDR_CORREL,ABORTO_POR_RAZÕES_MÉDICAS,ABORTO_ESPONTÂNEO,RISK
0,110001,28.481914,40.688449,0.0,142.40957,,,,0.0,,...,0.0,,0.0,,44.757293,40.688449,32.550759,,126.134191,1
1,110002,9.458389,54.385737,0.0,15.369882,,0.0,,0.0,0.0,...,0.0,,0.0,0.0,1.182299,39.015855,0.0,0.0,1.182299,0
3,110004,5.110972,21.721631,0.0,63.88715,,,,0.0,0.0,...,2.555486,,,0.0,3.833229,106.052669,3.833229,0.0,2.555486,1
8,110009,0.0,48.921969,0.0,24.460985,,,,0.0,,...,0.0,,0.0,,3.494426,0.0,17.472132,0.0,0.0,1
9,110010,2.466639,9.866555,0.0,12.333194,,0.0,0.0,0.0,0.0,...,41.932858,0.0,2.466639,,2.466639,36.999581,0.0,0.0,197.331097,0


In [5]:
df = initial_df.copy()
df = df.fillna(0)

Checking the number of 0s and 1s in TARGET column

In [6]:
unique, counts = np.unique(df["RISK"].values, return_counts=True)
dict(zip(unique, counts))

{0: 6032, 1: 3318}

Get UF from MUNCOD

In [7]:
df['UF'] = df['MUNCOD'] / 10000
df['UF'] = df['UF'].astype(int)
df['UF'].replace(dict_uf_cod, inplace=True)
dummy = pd.get_dummies(df['UF'])
df = pd.concat([df, dummy], axis=1)
df = df.drop(['MUNCOD', 'UF'], axis=1)
df.head()

Unnamed: 0,ÚLCERA_GÁSTRICA_E_DUODENAL,VEIAS_VARICOSAS_DAS_EXTREMIDADES_INFERIORES,VARICELA_E_HERPES_ZOSTER,UROLITÍASE,TÉTANO_NEONATAL,TUBERC_INTEST_PERITÔNIO_GLÂNGL_MESENTÉRICOS,TUBERCULOSE_ÓSSEA_E_DAS_ARTICULAÇÕES,TUBERCULOSE_PULMONAR,TUBERCULOSE_MILIAR,TUBERCULOSE_DO_SISTEMA_NERVOSO,...,PR,RJ,RN,RO,RR,RS,SC,SE,SP,TO
0,28.481914,40.688449,0.0,142.40957,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,9.458389,54.385737,0.0,15.369882,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
3,5.110972,21.721631,0.0,63.88715,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
8,0.0,48.921969,0.0,24.460985,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
9,2.466639,9.866555,0.0,12.333194,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


Define X and y

In [8]:
X = df.drop(columns=["RISK"])
y = df["RISK"]

X.shape

(9350, 340)

Define some functions

In [9]:
def get_average_feat_importance(feature_importance_list):
    final_feat_df = pd.DataFrame()
    for i, feat_df in enumerate(feature_importance_list):
        feat_df = feat_df.rename(columns={"Importance": i})
        if final_feat_df.empty:
            final_feat_df = feat_df
        else:
            final_feat_df = pd.merge(final_feat_df, feat_df, on="Feature", how="right")
    final_feat_df = final_feat_df.fillna(0)
    final_feat_df["Avg_importance"] = final_feat_df.sum(axis=1)/(final_feat_df.shape[1] -1)
    final_feat_df = final_feat_df[["Feature", "Avg_importance"]]
    return final_feat_df.sort_values(by="Avg_importance", ascending=False)

In [10]:
def sfm_feature_selector(X_train,y_train,max_features):
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100),max_features=max_features)
    embeded_rf_selector.fit(X_train, y_train)

    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X_train.loc[:,embeded_rf_support].columns.tolist()
    print(str(len(embeded_rf_feature)), 'selected features (SFM)')
    return embeded_rf_feature

In [11]:
def get_average(scores_list):
    scores_avg_test = []
    for i in range(len(scores_list[0])):
        arr_list = [x[i] for x in scores_list]
        avg_result = np.array(arr_list).mean(axis=0)
        scores_avg_test.append(avg_result)
    return scores_avg_test

First model

In [13]:
from sklearn.model_selection import StratifiedKFold
start = time.time()
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"accuracy_train": [], "accuracy_test": [], "scores_train": [], "scores_test": []})
feature_importance_list = []
num_feat = 30
count = 1
for train_index, test_index in kf.split(X,y):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: SFM")
    selected_sfm_features = sfm_feature_selector(X_train,y_train,num_feat)
    X_train = X_train[selected_sfm_features]
    X_test = X_test[selected_sfm_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        n_estimators = params[0]
        max_depth = params[1]
        min_samples_leaf = params[2]
        max_features = params[3]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = RandomForestClassifier(n_jobs=-1, random_state=42,n_estimators=n_estimators, 
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return metrics.accuracy_score(yf_val, yf_pred)
    space = [
        (5,1000), #n_estimators
        (3,30), #max_depth
        (2,200), #min_samples_leaf
        (0.25,1.00) #max_features
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    n_estimators = resultado_gp.x[0]
    max_depth = resultado_gp.x[1]
    min_samples_leaf = resultado_gp.x[2]
    max_features = resultado_gp.x[3]
    
    print("# Fitting the model")
    classifier = RandomForestClassifier(random_state=42,n_estimators=n_estimators, 
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)
    classifier.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = classifier.predict(X_test) 
    y_pred_train = classifier.predict(X_train)
    
    print("# Calculating metrics")
    print("Train")
    print(metrics.confusion_matrix(y_train, y_pred_train))
    print(metrics.classification_report(y_train, y_pred_train, digits=3))
    print("Test")
    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred, digits=3))
    scores = scores.append({"accuracy_train": metrics.accuracy_score(y_train,y_pred_train),
                  "accuracy_test": metrics.accuracy_score(y_test,y_pred),
                  "scores_train": metrics.precision_recall_fscore_support(y_train,y_pred_train),
                  "scores_test": metrics.precision_recall_fscore_support(y_test,y_pred)}, ignore_index=True)
    
    print("# Get Feature Importance")
    importance = classifier.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))

print("# Average Metrics")
print("## Train")
avg_accuracy_train = scores["accuracy_train"].mean()
avg_score_train = get_average(scores["scores_train"])
print("accuracy_train:", avg_accuracy_train)
print("precision_train:", avg_score_train[0].mean())
print("recall_train:", avg_score_train[1].mean())
print("f1_score_train:", avg_score_train[2].mean())
print("## Test")
avg_accuracy_test = scores["accuracy_test"].mean()
avg_score_test = get_average(scores["scores_test"])
print("accuracy_test:", avg_accuracy_test)
print("precision_test:", avg_score_test[0].mean())
print("recall_test:", avg_score_test[1].mean())
print("f1_score_test:", avg_score_test[2].mean())

end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: SFM
30 selected features (SFM)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
Train
[[4409  417]
 [1311 1343]]
              precision    recall  f1-score   support

           0      0.771     0.914     0.836      4826
           1      0.763     0.506     0.609      2654

    accuracy                          0.769      7480
   macro avg      0.767     0.710     0.722      7480
weighted avg      0.768     0.769     0.755      7480

Test
[[1099  107]
 [ 327  337]]
              precision    recall  f1-score   support

           0      0.771     0.911     0.835      1206
           1      0.759     0.508     0.608       664

    accuracy                          0.768      1870
   macro avg      0.765     0.709     0.722      1870
weighted avg      0.767     0.768     0.755      1870

# Get Feature Importance


# Feature Selection: SFM
30 selected features (SFM)
# Bayesian Optimization
# Fitting the model
# Making prediction



# Fitting the model
# Making predictions
# Calculating metrics
Train
[[4377  448]
 [1156 1499]]
              precision    recall  f1-score   support

           0      0.791     0.907     0.845      4825
           1      0.770     0.565     0.651      2655

    accuracy                          0.786      7480
   macro avg      0.780     0.736     0.748      7480
weighted avg      0.784     0.786     0.776      7480

Test
[[1092  115]
 [ 301  362]]
              precision    recall  f1-score   support

           0      0.784     0.905     0.840      1207
           1      0.759     0.546     0.635       663

    accuracy                          0.778      1870
   macro avg      0.771     0.725     0.738      1870
weighted avg      0.775     0.778     0.767      1870

# Get Feature Importance


# Feature Selection: SFM
30 selected features (SFM)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
Train
[[4415  410]
 [1293 1362]]
              precision    recall  f1-score   support

           0      0.773     0.915     0.838      4825
           1      0.769     0.513     0.615      2655

    accuracy                          0.772      7480
   macro avg      0.771     0.714     0.727      7480
weighted avg      0.772     0.772     0.759      7480

Test
[[1092  115]
 [ 338  325]]
              precision    recall  f1-score   support

           0      0.764     0.905     0.828      1207
           1      0.739     0.490     0.589       663

    accuracy                          0.758      1870
   macro avg      0.751     0.697     0.709      1870
weighted avg      0.755     0.758     0.744      1870

# Get Feature Importance


# Average Feature Importance
                                              Feature  Avg_importance
0                                                  RS        0.481585
1       TRANSTORNOS_DE

In [None]:
from sklearn.model_selection import StratifiedKFold
start = time.time()
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"accuracy_train": [], "accuracy_test": [], "scores_train": [], "scores_test": []})
feature_importance_list = []
num_feat = 30
count = 1
for train_index, test_index in kf.split(X,y):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: SFM")
    selected_sfm_features = sfm_feature_selector(X_train,y_train,num_feat)
    X_train = X_train[selected_sfm_features]
    X_test = X_test[selected_sfm_features]
    
    
    print("# Fitting the model")
    classifier = RandomForestClassifier(random_state=42,n_estimators=n_estimators, 
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)
    classifier.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = classifier.predict(X_test) 
    y_pred_train = classifier.predict(X_train)
    
    print("# Calculating metrics")
    print("Train")
    print(metrics.confusion_matrix(y_train, y_pred_train))
    print(metrics.classification_report(y_train, y_pred_train, digits=3))
    print("Test")
    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred, digits=3))
    scores = scores.append({"accuracy_train": metrics.accuracy_score(y_train,y_pred_train),
                  "accuracy_test": metrics.accuracy_score(y_test,y_pred),
                  "scores_train": metrics.precision_recall_fscore_support(y_train,y_pred_train),
                  "scores_test": metrics.precision_recall_fscore_support(y_test,y_pred)}, ignore_index=True)
      
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Metrics")
print("## Train")
avg_accuracy_train = scores["accuracy_train"].mean()
avg_score_train = get_average(scores["scores_train"])
print("accuracy_train:", avg_accuracy_train)
print("precision_train:", avg_score_train[0].mean())
print("recall_train:", avg_score_train[1].mean())
print("f1_score_train:", avg_score_train[2].mean())
print("## Test")
avg_accuracy_test = scores["accuracy_test"].mean()
avg_score_test = get_average(scores["scores_test"])
print("accuracy_test:", avg_accuracy_test)
print("precision_test:", avg_score_test[0].mean())
print("recall_test:", avg_score_test[1].mean())
print("f1_score_test:", avg_score_test[2].mean())

end = time.time()

print("\nTotal run time:", end-start)