# Spearman Correlation
The goal is to measure the Spearman rank correlation, across municipalities, between the rate of each disease and the suicide rate.

Import libraries

In [1]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr

# Base directory prepended to the CSV paths read below.
root = "../"

Read the suicide rates CSV

In [2]:
# Load the suicide-rate table: one row per MUNCOD, one RATE_<yy> column per year.
suicide = pd.read_csv(
    root + 'CSV/Suicide/suicide_rates_08_18.csv',
    index_col=0,  # the first CSV column is used as the row index
    sep=',',
)
suicide.head()

Unnamed: 0,MUNCOD,RATE_08,RATE_09,RATE_10,RATE_11,RATE_12,RATE_13,RATE_14,RATE_15,RATE_16,RATE_17,RATE_18
0,110001,20.344224,8.212203,8.189337,4.127456,12.464166,7.773632,3.898332,11.728829,7.841292,11.793844,4.316485
1,110002,9.458389,2.33806,4.427031,4.368243,9.703818,1.974938,4.860976,4.789226,5.665936,9.315758,1.883807
2,110003,0.0,14.93652,0.0,0.0,0.0,15.396459,0.0,31.471282,15.900779,0.0,18.389114
3,110004,5.110972,7.626311,2.544497,1.26648,5.042229,1.164646,5.776607,6.878683,10.241588,5.649271,9.432516
4,110005,0.0,0.0,11.743981,0.0,11.868028,0.0,0.0,11.11976,11.136478,5.576001,6.081245


Suicide rates for São Paulo (MUNCOD 355030)

In [3]:
# Show the row(s) whose municipality code is 355030.
suicide.loc[suicide["MUNCOD"].eq(355030)]

Unnamed: 0,MUNCOD,RATE_08,RATE_09,RATE_10,RATE_11,RATE_12,RATE_13,RATE_14,RATE_15,RATE_16,RATE_17,RATE_18
3685,355030,4.549487,4.683992,4.722364,4.763117,4.931138,4.593181,4.488944,4.411829,3.164932,3.559948,1.741006


In [4]:
# For every disease CSV, correlate the per-municipality mean disease rate with
# the per-municipality mean suicide rate (Spearman rank correlation).
# Results are collected in four parallel lists: diseases_list, corrs_list,
# p_value_list and size_list.

# Diseases with this many or fewer municipalities with a non-zero mean rate are skipped.
MIN_MUNICIPALITIES = 2500

disease = ""
path = root + "CSV/TabNet/Internacoes_Rate/"
# glob returns files in filesystem-dependent order; sort so that the order of
# the result lists is reproducible between runs and machines.
all_files = sorted(glob.glob(path + "*.csv"))

# Not used in this cell; kept because other cells may reference it.
years = ["08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18"]

suicide_df = suicide.copy()
# Mean over all rate columns: row sum divided by the number of columns other than MUNCOD.
suicide_df["SUICIDE"] = suicide_df.drop(columns="MUNCOD").sum(axis=1)/(len(suicide_df.columns) - 1)
suicide_df = suicide_df[["MUNCOD", "SUICIDE"]]

diseases_list = []
corrs_list = []
p_value_list = []
size_list = []

for file in all_files:
    # Take the disease name from the file name in a platform-independent way
    # (splitting on "\\" only works with Windows path separators).
    file_name = os.path.basename(file)
    disease = os.path.splitext(file_name)[0]
    disease_df = pd.read_csv(file, sep=',', index_col=0)
    # Mean over all rate columns, computed the same way as for the suicide table.
    disease_df["Total"] = disease_df.drop(columns="MUNCOD").sum(axis=1)/(len(disease_df.columns) - 1)
    disease_df = disease_df[["MUNCOD", "Total"]]
    # Drop municipalities whose mean rate for this disease is zero.
    disease_df = disease_df[(disease_df["Total"] != 0)]
    if disease_df.shape[0] > MIN_MUNICIPALITIES:
        # Default (inner) merge: keeps only municipalities present in both
        # tables, so the recorded size can be smaller than the count tested above.
        final_df = pd.merge(disease_df, suicide_df, on="MUNCOD")
        size_list.append(final_df.shape[0])
        corr, p_value = spearmanr(final_df["Total"], final_df["SUICIDE"])
        diseases_list.append(disease)
        corrs_list.append(corr)
        p_value_list.append(p_value)

In [5]:
# Assemble the per-disease results into one table and show the 50 diseases
# with the highest correlation, strongest first.
corrs_data = {
    'Doenças': diseases_list,
    'Correlação com suicidio': corrs_list,
    'P value': p_value_list,
    "Qtd Municipios": size_list,
}
corrs_df = pd.DataFrame(data=corrs_data)
(
    corrs_df
    .sort_values(by=['Correlação com suicidio'], ascending=False)
    .reset_index(drop=True)
    .head(50)
)

Unnamed: 0,Doenças,Correlação com suicidio,P value,Qtd Municipios
0,TRANSTORNOS_DE_HUMOR_[AFETIVOS],0.43442,1.360525e-154,3357
1,TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁL...,0.397686,6.620686e-131,3445
2,NEOPLASIA_MALIGNA_DO_CÓLON,0.37222,2.34218e-105,3189
3,NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES,0.37146,5.702495e-89,2696
4,TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT,0.369459,1.2016119999999999e-90,2780
5,BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,0.368975,9.287829e-156,4833
6,NEOPL_MALIG_JUNÇÃO_RETOSSIGM_RETO_ÂNUS_CANAL_ANAL,0.352598,9.779801e-86,2903
7,OUTRAS_NEOPLASIAS_MALIGNAS_DA_PELE,0.352155,8.054332e-95,3226
8,TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS,0.346575,2.8394560000000003e-120,4252
9,FLEBITE_TROMBOFLEBITE_EMBOLIA_E_TROMBOSE_VENOSA,0.34461,4.8381430000000004e-99,3532
