# Time Series

## 1 - Data Prep

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import numpy as np
%matplotlib inline

from datetime import datetime
root = "../"

In [2]:
population = pd.read_csv(root + 'CSV/Population/population_08_18.csv', index_col=[0])
for col in population:
    if col != "MUNCOD":
        year = "20" + col.split("_")[1]
        population = population.rename(columns={col: year})
population = population.astype(int)
population.head()

Unnamed: 0,2018,MUNCOD,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008
0,23167,110001,25437,25506,25578,25652,25728,24069,24228,24422,24354,24577
1,106168,110002,107345,105896,104401,102860,101269,92747,91570,90354,85541,84581
2,5438,110003,6224,6289,6355,6424,6495,6132,6221,6309,6695,6777
3,84813,110004,88507,87877,87226,86556,85863,79330,78959,78601,78675,78263
4,16444,110005,17934,17959,17986,18013,18041,16852,16939,17030,16622,16784


In [4]:
path = root + 'CSV/TabNet/Internacoes/'
suicide_path = root + 'CSV/TabNet/Suicides/'
df_final = pd.DataFrame()
list_of_diseases = ['TRANSTORNOS_DE_HUMOR_[AFETIVOS]',
 'TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL',
 'NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES',
 'ARTROSE',
 'TRANST_DISCAIS_CERVIC_E_OUTR_TRANST_DISC_INTERVERT',
 'TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS',
 'COLELITÍASE_E_COLECISTITE',
 'TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT',
 'NEOPLASIA_MALIGNA_DA_PELE',
 'OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO',
 'NEOPLASIA_MALIGNA_DO_CÓLON',
 'NEOPLASIA_MALIGNA_DO_PÂNCREAS',
 'TRANSTORNOS_DOS_NERVOS_RAÍZES_E_PLEXOS_NERVOSOS',
 'EMBOLIA_E_TROMBOSE_ARTERIAIS',
 'BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC']
month_dic = {
    "Jan": "January",
    "Fev": "February",
    "Mar": "March",
    "Abr": "April",
    "Mai": "May",
    "Jun": "June",
    "Jul": "July",
    "Ago": "August",
    "Set": "September",
    "Out": "October",
    "Nov": "November",
    "Dez": "December"
}

def prepare_df(df):
    df["MUNCOD"] = [x.split(" ")[0] for x in df["Município"]]
    df = df[df["MUNCOD"] != "Total"]
    df = df.filter(regex="^((?!2007|2019|2020).)*$")
    df = df.drop(columns=["Município"])
    df = df.replace("-",0)
    df = df.astype(int)
    return df
for disease in list_of_diseases:
    df_disease = pd.read_csv(path + disease + ".csv", sep=";")
    df_disease = prepare_df(df_disease)
    df_disease = df_disease.drop(columns=["Total"])
    years = [str(x) for x in range(2008,2019)]
    for year in years:
        suicide_df = pd.read_csv(suicide_path + "suicide_"+ year + ".csv", sep=";")
        suicide_df = prepare_df(suicide_df)
        suicide_df = suicide_df[["Total", "MUNCOD"]]
        count = df_disease.filter(regex=year, axis=1).sum(axis=1)
        df_disease = df_disease.filter(regex="^((?!" + year+ ").)*$")
        df_disease["QTD_" + year] = count
        df_disease = pd.merge(suicide_df, df_disease, on="MUNCOD")
        df_pop = population[["MUNCOD", year]]
        df_disease = pd.merge(df_disease, df_pop, on="MUNCOD")
        df_disease["QTD_" + year] = df_disease["QTD_" + year] / df_disease[year] * 100000
        df_disease["SUICIDE_" + year] = df_disease["Total"] / df_disease[year] * 100000
        df_disease = df_disease.drop(columns=[year, "Total"])
    years.pop()
    df_all_years = pd.DataFrame()
    for year in years:
        next_year = str(int(year) + 1)
        df_med = df_disease[["MUNCOD", "QTD_" + year, "SUICIDE_" + year, "SUICIDE_" + next_year]].copy()
        df_med["YEAR"] = year
        df_med.columns = ["MUNCOD", disease, "PREVIOUS", "TARGET", "YEAR"]
        df_all_years = pd.concat([df_all_years,df_med])
    if df_final.empty:
        df_final = df_all_years
    else:
        df_all_years = df_all_years[["MUNCOD", disease, "YEAR"]]
        df_final = pd.merge(df_final, df_all_years, on=["MUNCOD", "YEAR"])
df_final.head()

Unnamed: 0,MUNCOD,TRANSTORNOS_DE_HUMOR_[AFETIVOS],PREVIOUS,TARGET,YEAR,TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL,NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES,ARTROSE,TRANST_DISCAIS_CERVIC_E_OUTR_TRANST_DISC_INTERVERT,TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS,COLELITÍASE_E_COLECISTITE,TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT,NEOPLASIA_MALIGNA_DA_PELE,OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO,NEOPLASIA_MALIGNA_DO_CÓLON,NEOPLASIA_MALIGNA_DO_PÂNCREAS,TRANSTORNOS_DOS_NERVOS_RAÍZES_E_PLEXOS_NERVOSOS,EMBOLIA_E_TROMBOSE_ARTERIAIS,BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC
0,110001,8.13769,20.344224,8.212203,2008,0.0,0.0,0.0,0.0,20.344224,146.478415,0.0,0.0,20.344224,0.0,0.0,28.481914,0.0,252.268381
1,110002,1.182299,9.458389,2.33806,2008,1.182299,0.0,0.0,1.182299,9.458389,102.85998,0.0,0.0,21.281375,5.911493,0.0,14.187583,5.911493,55.568035
2,110004,1.277743,5.110972,7.626311,2008,0.0,1.277743,1.277743,5.110972,22.999374,98.386211,1.277743,0.0,47.276491,3.833229,0.0,1.277743,0.0,61.331664
3,110080,0.0,5.831924,5.69898,2008,0.0,0.0,0.0,0.0,17.495772,64.151163,0.0,0.0,11.663848,0.0,0.0,34.991544,5.831924,34.991544
4,110010,4.933277,2.466639,7.359796,2008,2.466639,0.0,0.0,0.0,12.333194,36.999581,0.0,0.0,0.0,0.0,0.0,2.466639,2.466639,66.599245


In [5]:
df = df_final.drop(columns=["MUNCOD"])
df = df.astype(float)
df.head()

Unnamed: 0,TRANSTORNOS_DE_HUMOR_[AFETIVOS],PREVIOUS,TARGET,YEAR,TRANST_MENTAIS_E_COMPORTAMENTAIS_DEV_USO_DE_ÁLCOOL,NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES,ARTROSE,TRANST_DISCAIS_CERVIC_E_OUTR_TRANST_DISC_INTERVERT,TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS,COLELITÍASE_E_COLECISTITE,TRANST_MENTAIS_COMPORT_DEV_USO_OUTR_SUBST_PSICOAT,NEOPLASIA_MALIGNA_DA_PELE,OUTRAS_DOENÇAS_ISQUÊMICAS_DO_CORAÇÃO,NEOPLASIA_MALIGNA_DO_CÓLON,NEOPLASIA_MALIGNA_DO_PÂNCREAS,TRANSTORNOS_DOS_NERVOS_RAÍZES_E_PLEXOS_NERVOSOS,EMBOLIA_E_TROMBOSE_ARTERIAIS,BRONQUITE_ENFISEMA_E_OUTR_DOENÇ_PULM_OBSTR_CRÔNIC
0,8.13769,20.344224,8.212203,2008.0,0.0,0.0,0.0,0.0,20.344224,146.478415,0.0,0.0,20.344224,0.0,0.0,28.481914,0.0,252.268381
1,1.182299,9.458389,2.33806,2008.0,1.182299,0.0,0.0,1.182299,9.458389,102.85998,0.0,0.0,21.281375,5.911493,0.0,14.187583,5.911493,55.568035
2,1.277743,5.110972,7.626311,2008.0,0.0,1.277743,1.277743,5.110972,22.999374,98.386211,1.277743,0.0,47.276491,3.833229,0.0,1.277743,0.0,61.331664
3,0.0,5.831924,5.69898,2008.0,0.0,0.0,0.0,0.0,17.495772,64.151163,0.0,0.0,11.663848,0.0,0.0,34.991544,5.831924,34.991544
4,4.933277,2.466639,7.359796,2008.0,2.466639,0.0,0.0,0.0,12.333194,36.999581,0.0,0.0,0.0,0.0,0.0,2.466639,2.466639,66.599245


In [6]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=["TARGET", "YEAR"])
y = df["TARGET"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
from sklearn.linear_model import LassoCV
reg = LassoCV()
reg.fit(X_train,y_train)
print(reg.alpha_)
print(reg.score(X_train,y_train))
coef = pd.Series(reg.coef_, index=X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other "+ str(sum(coef==0)) + " variables")
imp_coef = coef.sort_values(ascending=False)

0.35148607332715687
0.30362873575957794
Lasso picked 14 variables and eliminated the other 2 variables


In [8]:
feats = list(imp_coef[imp_coef > 0].index)
X_train = X_train[feats]
X_test = X_test[feats]

In [18]:
# # Grid Search
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV

# regressor = RandomForestRegressor()
# #parameters = [{'n_estimators' : [150,200,250,300], 'max_features' : ['auto','sqrt','log2']}]
# parameters = {'bootstrap': [True, False],
#  'max_depth': [10, 20, 40, 60, 80, 100, None],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 4],
#  'min_samples_split': [2, 5, 10],
#  'n_estimators': [200, 400, 600, 800, 1000]}
# grid_search = GridSearchCV(estimator = regressor, param_grid = parameters,n_jobs=-1)
# grid_search = grid_search.fit(X_train, y_train)
# best_parameters = grid_search.best_params_

In [20]:
#best_parameters

In [9]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 200, max_features = 'sqrt', max_depth=7,min_samples_leaf=2) 
regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=7, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [10]:
def mape(y_pred,y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [11]:
from sklearn import metrics
y_pred = regressor.predict(X_test) 
y_pred_train = regressor.predict(X_train) 
print("Root Mean Squared Error Train: " + str(np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))))
print("Root Mean Squared Error Test: " + str(np.sqrt(metrics.mean_squared_error(y_test,y_pred))))
print("Score Test: "+ str(regressor.score(X_test,y_test))) #R^2
print("MAPE Test:" + str(mape(y_pred,y_test)))

Root Mean Squared Error Train: 4.113140479302953
Root Mean Squared Error Test: 5.21105610796891
Score Test: 0.3493888673890111
MAPE Test:62.02098104273877


In [12]:
from sklearn import metrics
y_pred = X_test["PREVIOUS"]
y_pred_train = X_train["PREVIOUS"]
print("Root Mean Squared Error Train: " + str(np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))))
print("Root Mean Squared Error Test: " + str(np.sqrt(metrics.mean_squared_error(y_test,y_pred))))
print("MAPE Test:" + str(mape(y_pred,y_test)))

Root Mean Squared Error Train: 5.828398087088399
Root Mean Squared Error Test: 5.952607232919866
MAPE Test:61.1653405605082
