# Time Series

## 1 - Data Prep

In [3]:
import glob
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Import Statsmodels
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic

%matplotlib inline
root = "../"

In [4]:
# Load the per-municipality population table; the first CSV column is the index.
population_path = root + 'CSV/Population/population_08_18.csv'
population_raw = pd.read_csv(population_path, index_col=[0])

# Relabel every column except the MUNCOD key as "20" + the second
# underscore-separated token of its original name.
year_labels = {
    name: "20" + name.split("_")[1]
    for name in population_raw.columns
    if name != "MUNCOD"
}
population = population_raw.rename(columns=year_labels).astype(int)
population.head()

Unnamed: 0,2018,MUNCOD,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008
0,23167,110001,25437,25506,25578,25652,25728,24069,24228,24422,24354,24577
1,106168,110002,107345,105896,104401,102860,101269,92747,91570,90354,85541,84581
2,5438,110003,6224,6289,6355,6424,6495,6132,6221,6309,6695,6777
3,84813,110004,88507,87877,87226,86556,85863,79330,78959,78601,78675,78263
4,16444,110005,17934,17959,17986,18013,18041,16852,16939,17030,16622,16784


In [5]:

# Build one wide table of monthly hospitalization rates per 100,000 population,
# with one column per (month, year, disease) and one row per municipality.
path = root + 'CSV/TabNet/Internacoes/'
df_diseases = pd.DataFrame()
list_of_diseases = [
    "TRANSTORNOS_DE_HUMOR_[AFETIVOS]",
    "TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL",
    "NEOPLASIA_MALIGNA_DO_CÓLON",
    "NEOPL_MALIG_JUNÇÃO_RETOSSIGM_RETO_ÂNUS_CANAL_ANAL",
    "BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC",
]

# Portuguese month abbreviations used in the column headers -> English month
# names that datetime.strptime can parse with "%B".
month_dic = {
    "Jan": "January",
    "Fev": "February",
    "Mar": "March",
    "Abr": "April",
    "Mai": "May",
    "Jun": "June",
    "Jul": "July",
    "Ago": "August",
    "Set": "September",
    "Out": "October",
    "Nov": "November",
    "Dez": "December"
}

for disease in list_of_diseases:
    df_disease = pd.read_csv(path + disease + ".csv", sep=";")
    # The municipality code is the first space-separated token of "Município".
    df_disease["MUNCOD"] = [x.split(" ")[0] for x in df_disease["Município"]]
    df_disease = df_disease[df_disease["MUNCOD"] != "Total"]
    # Keep only the key and the columns whose header mentions 2017 or 2018.
    df_disease = df_disease.filter(regex='MUNCOD|2017|2018', axis=1)
    # "-" marks an empty cell in the export; treat it as zero.
    df_disease = df_disease.replace("-", 0)
    df_disease = df_disease.astype(int)
    # NOTE(review): the reason for excluding December 2018 is not visible here - confirm.
    df_disease = df_disease.drop(columns="2018/Dez")

    month_cols = [col for col in df_disease.columns if col != "MUNCOD"]
    years = sorted({col.split("/")[0] for col in month_cols})

    # Attach the population of every needed year with a single merge instead
    # of re-merging the population table once per month column.
    df_disease = pd.merge(population[years + ["MUNCOD"]], df_disease, on="MUNCOD")

    renamed = {}
    for col in month_cols:
        parts = col.split("/")
        ano = parts[0]
        mes = parts[1]
        # Convert the count into a rate per 100,000 population of that year.
        df_disease[col] = df_disease[col] / df_disease[ano] * 100000
        month_number = datetime.strptime(month_dic[mes], "%B").month
        renamed[col] = str(month_number) + "/" + str(ano) + "/" + disease
    df_disease = df_disease.drop(columns=years).rename(columns=renamed)

    # Accumulate diseases side by side; the inner join keeps only
    # municipalities present for every disease processed so far.
    if df_diseases.empty:
        df_diseases = df_disease
    else:
        df_diseases = pd.merge(df_disease, df_diseases, on="MUNCOD")

df_diseases.head()

Unnamed: 0,MUNCOD,1/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,2/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,3/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,4/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,5/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,6/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,7/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,8/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,9/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,...,2/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],3/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],4/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],5/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],6/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],7/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],8/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],9/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],10/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS],11/2018/TRANSTORNOS_DE_HUMOR_[AFETIVOS]
0,110001,7.862562,19.656406,39.312812,31.45025,23.587687,23.587687,35.381531,51.106656,23.587687,...,0.0,0.0,0.0,4.316485,0.0,0.0,0.0,0.0,0.0,0.0
1,110002,3.726303,7.452606,9.315758,8.384182,13.973636,11.178909,8.384182,4.657879,9.315758,...,0.0,0.0,0.941903,2.82571,0.0,0.0,0.941903,2.82571,0.941903,1.883807
2,110004,4.519417,7.908979,6.779125,5.649271,2.259708,6.779125,7.908979,2.259708,5.649271,...,0.0,0.0,1.179065,0.0,0.0,0.0,1.179065,0.0,1.179065,0.0
3,110005,11.152002,5.576001,11.152002,16.728003,5.576001,16.728003,16.728003,11.152002,5.576001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110006,10.830129,10.830129,21.660259,10.830129,16.245194,10.830129,54.150647,5.415065,0.0,...,6.162569,12.325137,0.0,18.487706,6.162569,0.0,0.0,0.0,0.0,0.0


In [6]:
# Build one wide table of monthly suicide rates per 100,000 population,
# with one "month/year" column per month and one row per municipality.
path = root + 'CSV/TabNet/Suicides/'
# glob returns matches in arbitrary, OS-dependent order; sort so that the
# "last two files" selected below are the same on every machine.
all_files = sorted(glob.glob(path + "*"))
df_suicides = pd.DataFrame()
# NOTE(review): not used in this cell; kept only so the notebook's global
# state is unchanged.
list_of_diseases = ["CANCER","TRANSTORNOS_MENTAIS_E_COMPORTAMENTAIS", "DPOC"]
# Portuguese month names used as column headers -> English month names that
# datetime.strptime can parse with "%B".
month_dic = {
    "Janeiro": "January",
    "Fevereiro": "February",
    "Março": "March",
    "Abril": "April",
    "Maio": "May",
    "Junho": "June",
    "Julho": "July",
    "Agosto": "August",
    "Setembro": "September",
    "Outubro": "October",
    "Novembro": "November",
    "Dezembro": "December"
}
for file in all_files[-2:]:
    # os.path.basename handles both "/" and "\\" separators, unlike splitting
    # on a backslash, which raises IndexError on POSIX paths.
    file_name = os.path.basename(file)
    # The year is the second underscore-separated token, up to the first dot.
    year = file_name.split("_")[1].split(".")[0]
    df_year = pd.read_csv(file, sep=";")
    # .copy() so the column assignments below write to an independent frame
    # rather than a filtered view (avoids SettingWithCopyWarning).
    df_year = df_year[df_year["Município"] != "Total"].copy()
    # The municipality code is the first space-separated token of "Município".
    df_year["MUNCOD"] = [x.split(" ")[0] for x in df_year["Município"]]
    df_year["MUNCOD"] = df_year["MUNCOD"].astype(int)
    df_year = df_year.drop(columns=["Município", "Total"])
    # "-" marks an empty cell in the export; treat it as zero.
    df_year = df_year.replace("-", 0)
    df_year = df_year.astype(int)

    month_cols = [col for col in df_year.columns if col != "MUNCOD"]

    # Attach this year's population with a single merge instead of
    # re-merging the population table once per month column.
    df_year = pd.merge(population[[year, "MUNCOD"]], df_year, on="MUNCOD")

    renamed = {}
    for col in month_cols:
        # Convert the count into a rate per 100,000 population of that year.
        df_year[col] = df_year[col] / df_year[year] * 100000
        month_number = datetime.strptime(month_dic[col], "%B").month
        renamed[col] = str(month_number) + "/" + str(year)
    df_year = df_year.drop(columns=year).rename(columns=renamed)

    # Accumulate years side by side; the inner join keeps only municipalities
    # present in every processed year.
    if df_suicides.empty:
        df_suicides = df_year
    else:
        df_suicides = pd.merge(df_year, df_suicides, on="MUNCOD")
df_suicides.head()

Unnamed: 0,MUNCOD,1/2018,2/2018,3/2018,4/2018,5/2018,6/2018,7/2018,8/2018,9/2018,...,3/2017,4/2017,5/2017,6/2017,7/2017,8/2017,9/2017,10/2017,11/2017,12/2017
0,110001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.931281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.931281
1,110002,0.0,0.941903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.863152,0.0,0.0,0.0,0.931576,0.931576,1.863152,0.0,1.863152,0.931576
2,110004,0.0,2.358129,0.0,1.179065,0.0,1.179065,0.0,1.179065,0.0,...,0.0,1.129854,0.0,1.129854,0.0,1.129854,1.129854,1.129854,0.0,0.0
3,110005,0.0,0.0,0.0,6.081245,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.576001,0.0,0.0,0.0,0.0
4,110006,0.0,6.162569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.415065,5.415065,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Join hospitalization and suicide rates per municipality, then drop the
# MUNCOD key so that only float-typed rate columns remain.
df = (
    df_diseases
    .merge(df_suicides, on="MUNCOD")
    .drop(columns=["MUNCOD"])
    .astype(float)
)
df.head()

Unnamed: 0,1/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,2/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,3/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,4/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,5/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,6/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,7/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,8/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,9/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,10/2017/BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC,...,3/2017,4/2017,5/2017,6/2017,7/2017,8/2017,9/2017,10/2017,11/2017,12/2017
0,7.862562,19.656406,39.312812,31.45025,23.587687,23.587687,35.381531,51.106656,23.587687,47.175374,...,0.0,3.931281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.931281
1,3.726303,7.452606,9.315758,8.384182,13.973636,11.178909,8.384182,4.657879,9.315758,5.589455,...,1.863152,0.0,0.0,0.0,0.931576,0.931576,1.863152,0.0,1.863152,0.931576
2,4.519417,7.908979,6.779125,5.649271,2.259708,6.779125,7.908979,2.259708,5.649271,4.519417,...,0.0,1.129854,0.0,1.129854,0.0,1.129854,1.129854,1.129854,0.0,0.0
3,11.152002,5.576001,11.152002,16.728003,5.576001,16.728003,16.728003,11.152002,5.576001,27.880004,...,0.0,0.0,0.0,0.0,0.0,5.576001,0.0,0.0,0.0,0.0
4,10.830129,10.830129,21.660259,10.830129,16.245194,10.830129,54.150647,5.415065,0.0,5.415065,...,0.0,5.415065,5.415065,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Keep only the rows whose "12/2018" value is strictly positive.
has_positive_target = df["12/2018"] > 0
df = df.loc[has_positive_target]

In [None]:
# Separate the target column "12/2018" (yf) from the predictor columns (xf).
target_col = "12/2018"
yf = df[target_col]
xf = df.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for evaluation; the fixed random_state makes
# the split repeatable.
split = train_test_split(xf, yf, test_size=1/3, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression with default settings on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows.
y_pred = regressor.predict(X=X_test)

In [None]:
from sklearn import metrics

# Report the hold-out RMSE and the regressor's score (R^2) on the test split.
mse = metrics.mean_squared_error(y_test, y_pred)
r_squared = regressor.score(X_test, y_test)  # R^2
print(f"Root Mean Squared Error: {np.sqrt(mse)!s}")
print(f"Score: {r_squared!s}")

In [None]:
# Predicted vs. observed target values on the held-out rows. Axis labels and
# a title are set so the figure can be read on its own and told apart from
# the later plot of the model refit on selected features.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Observed 12/2018 value (y_test)")
ax.set_ylabel("Predicted 12/2018 value (y_pred)")
ax.set_title("Linear regression on all features: predicted vs. observed");

In [None]:
from sklearn.linear_model import LassoCV

# Fit a cross-validated Lasso on the training split, then print the chosen
# regularization strength and the training-set score.
reg = LassoCV().fit(X_train, y_train)
print(reg.alpha_)
print(reg.score(X_train, y_train))

# Label each coefficient with its feature name and count how many the Lasso
# kept (non-zero) versus shrank to exactly zero.
coef = pd.Series(reg.coef_, index=X_train.columns)
n_kept = sum(coef != 0)
n_dropped = sum(coef == 0)
print(f"Lasso picked {n_kept!s} variables and eliminated the other {n_dropped!s} variables")
imp_coef = coef.sort_values(ascending=False)

In [None]:
# Keep every feature the Lasso retained, i.e. every non-zero coefficient.
# The previous `v > 0` test also threw away features with a negative
# coefficient, although the Lasso had kept them (it only eliminates features
# whose coefficient is exactly zero). Order follows imp_coef (descending).
feats = imp_coef[imp_coef != 0].index.tolist()

In [None]:
# Rebuild predictors and target, keeping only the columns listed in `feats`.
target_col = "12/2018"
yf = df[target_col]
xf = df.drop(columns=[target_col])[feats]

In [None]:
from sklearn.model_selection import train_test_split

# Same one-third hold-out and seed as before, now on the reduced feature set.
split = train_test_split(xf, yf, test_size=1/3, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.linear_model import LinearRegression

# Refit a default linear regression on the current training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows with the refit model.
y_pred = regressor.predict(X=X_test)

In [None]:
from sklearn import metrics

# Report the hold-out RMSE and the regressor's score (R^2) on the test split.
mse = metrics.mean_squared_error(y_test, y_pred)
r_squared = regressor.score(X_test, y_test)  # R^2
print(f"Root Mean Squared Error: {np.sqrt(mse)!s}")
print(f"Score: {r_squared!s}")

In [None]:
# Predicted vs. observed target values on the held-out rows for the model
# refit on the `feats` columns. Axis labels and a title are set so the figure
# can be read on its own and told apart from the earlier all-features plot.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Observed 12/2018 value (y_test)")
ax.set_ylabel("Predicted 12/2018 value (y_pred)")
ax.set_title("Linear regression on Lasso-selected features: predicted vs. observed");