# Regression
Features = rates from selected diseases (FeatureSelection) + suicide rate 

Target = suicide rate of the following year

Import libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from skopt import gp_minimize
from scipy import stats
import time
import glob
import os
root = "../../"

In [10]:
dict_uf_cod = {11: 'RO',
12: 'AC',
13: 'AM',
14: 'RR',
15: 'PA',
16: 'AP',
17: 'TO',
21: 'MA',
22: 'PI',
23: 'CE',
24: 'RN',
25: 'PB',
26: 'PE',
27: 'AL',
28: 'SE',
29: 'BA',
31: 'MG',
32: 'ES',
33: 'RJ',
35: 'SP',
41: 'PR',
42: 'SC',
43: 'RS',
50: 'MS',
51: 'MT',
52: 'GO',
53: 'DF'}

Prepare dataset

In [11]:
disease = ""
path = root + "CSV/TabNet/Internacoes_Rate/"
all_files = glob.glob(path + "*.csv")
suicide = pd.read_csv(root +'CSV/Suicide/suicide_rates_08_18.csv', index_col=0)

years = [str(x).zfill(2) for x in range(8,19)]
columns = ["RATE_" + year for year in years]
columns.append("MUNCOD")

disease_list = []

for file in all_files:
    file_name = os.path.basename(file)
    disease = file_name.split(".csv")[0]
    disease_df = pd.read_csv(file, sep=',', index_col=0)
    if(set(disease_df.columns) == set(columns)):
        disease_list.append(disease)
print(len(disease_list), " diseases")

313  diseases


In [12]:
final_df = pd.DataFrame()

for i in range(len(years)-1):
    col_year_suicide = "RATE_" + years[i+1]
    col_year_prev = "RATE_" + years[i]
    year_df = suicide[[col_year_prev, col_year_suicide, "MUNCOD"]]
    year_df = year_df.rename(columns={col_year_suicide: "RATE"})
    year_df = year_df.rename(columns={col_year_prev: "PREVIOUS"})
    for disease in disease_list:
        col_year_disease = "RATE_" + years[i]
        disease_df = pd.read_csv(path + disease + ".csv", sep=',', index_col=0)
        disease_df = disease_df[[col_year_disease, "MUNCOD"]]
        disease_df = disease_df.rename(columns={col_year_disease: disease})

        year_df = pd.merge(disease_df, year_df, left_on="MUNCOD", right_on="MUNCOD", how='right')
        
    final_df = pd.concat([final_df, year_df])

final_df.head()

Unnamed: 0,DIFTERIA,MUNCOD,NEOPLASIA_MALIGNA_DO_ESTÔMAGO,FRATURA_DO_CRÂNIO_E_DOS_OSSOS_DA_FACE,DOENÇA_DE_HODGKIN,OUTRAS_DOENÇAS_CEREBROVASCULARES,FEBRES_RECORRENTES,ESTADO_INFEC_ASSINT_VÍRUS_DA_IMUNODEF_HUMANA_[HIV],PNEUMOCONIOSE,NEOPLASIA_MALIGNA_DA_PRÓSTATA,...,ASMA,CERATITE_E_OUTROS_TRANSTORNOS_ESCLERÓTICA_E_CÓRNEA,INFECÇÕES_PELO_VÍRUS_DO_HERPES,NEOPLASIA_MALIGNA_DO_LÁBIO_CAVIDADE_ORAL_E_FARINGE,TUBERCULOSE_PULMONAR,AUSÊNCIA_ATRESIA_E_ESTENOSE_DO_INTESTINO_DELGADO,OUTRAS_DOENÇAS_DO_APARELHO_DIGESTIVO,OUTRAS_TUBERCULOSES_RESPIRATÓRIAS,PREVIOUS,RATE
0,0.0,110001,0.0,16.275379,0.0,0.0,0.0,,,0.0,...,752.736298,0.0,0.0,0.0,0.0,,28.481914,0.0,20.344224,8.212203
1,0.0,110009,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,66.394101,0.0,0.0,0.0,0.0,,52.416396,,0.0,6.922331
2,0.0,110020,4.483288,4.219565,1.318614,2.637228,0.0,,0.0,3.692119,...,30.591847,5.274456,3.428397,2.373505,0.527446,0.0,11.603804,10.021467,4.483288,5.485478
3,0.0,110120,0.0,0.0,,0.0,,,,0.0,...,9.387027,0.0,,0.0,0.0,,9.387027,0.0,0.0,0.0
4,0.0,110143,0.0,12.53447,,0.0,,,,0.0,...,12.53447,0.0,,0.0,,,25.06894,,0.0,0.0


Get UF from MUNCOD

In [13]:
final_df['UF'] = final_df['MUNCOD'] / 10000
final_df['UF'] = final_df['UF'].astype(int)
final_df['UF'].replace(dict_uf_cod, inplace=True)
dummy = pd.get_dummies(final_df['UF'])
final_df = pd.concat([final_df, dummy], axis=1)
final_df = final_df.drop(['MUNCOD', 'UF'], axis=1)
final_df.head()

Unnamed: 0,DIFTERIA,NEOPLASIA_MALIGNA_DO_ESTÔMAGO,FRATURA_DO_CRÂNIO_E_DOS_OSSOS_DA_FACE,DOENÇA_DE_HODGKIN,OUTRAS_DOENÇAS_CEREBROVASCULARES,FEBRES_RECORRENTES,ESTADO_INFEC_ASSINT_VÍRUS_DA_IMUNODEF_HUMANA_[HIV],PNEUMOCONIOSE,NEOPLASIA_MALIGNA_DA_PRÓSTATA,OUTRAS_NEOPL_MALIG_ÓRG_RESPIRAT_E_INTRATORÁCICOS,...,PR,RJ,RN,RO,RR,RS,SC,SE,SP,TO
0,0.0,0.0,16.275379,0.0,0.0,0.0,,,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
2,0.0,4.483288,4.219565,1.318614,2.637228,0.0,,0.0,3.692119,0.263723,...,0,0,0,1,0,0,0,0,0,0
3,0.0,0.0,0.0,,0.0,,,,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,0.0,0.0,12.53447,,0.0,,,,0.0,,...,0,0,0,1,0,0,0,0,0,0


Define X and y

In [16]:
percentage_valid_values = 0.8
num_feat = 20
corr_min_value = 0.15

print("Shape before null values drop:", final_df.shape)
N = int(final_df.shape[1]*percentage_valid_values)
print("Minimum non-null values: " + str(N) + "/" + str(final_df.shape[1]))
final_zeros_df = final_df.dropna(thresh=N) # At least N non null items

final_zeros_df = final_zeros_df[(np.abs(stats.zscore(final_zeros_df["RATE"])) < 3)] # Remove outliers

X = final_zeros_df.drop(columns="RATE")
X = X.fillna(0)
y = final_zeros_df["RATE"]

X.shape

Shape before null values drop: (53810, 342)
Minimum non-null values: 273/342


(11734, 341)

Define some functions

In [26]:
def calcUpMetrics(y_pred,y_test,X_test):
    up_df = pd.DataFrame({"Pred": y_pred, "Real": y_test, "Previous": X_test["PREVIOUS"]})
    up_df["UP"] = up_df["Previous"] < up_df["Real"]
    up_df["UP_PRED"] = up_df["Previous"] < up_df["Pred"]
    up_df["UP"] = up_df["UP"].astype(int)
    up_df["UP_PRED"] = up_df["UP_PRED"].astype(int)
    accuracy = metrics.accuracy_score(up_df["UP"], up_df["UP_PRED"])
    recall = metrics.recall_score(up_df["UP"], up_df["UP_PRED"])
    precision = metrics.precision_score(up_df["UP"], up_df["UP_PRED"])
    return accuracy, recall, precision

In [37]:
def get_average_feat_importance(feature_importance_list):
    final_feat_df = pd.DataFrame()
    for i, feat_df in enumerate(feature_importance_list):
        feat_df = feat_df.rename(columns={"Importance": i})
        if final_feat_df.empty:
            final_feat_df = feat_df
        else:
            final_feat_df = pd.merge(final_feat_df, feat_df, on="Feature")
    final_feat_df["Avg_importance"] = final_feat_df.sum(axis=1)/(final_feat_df.shape[1] -1)
    final_feat_df = final_feat_df[["Feature", "Avg_importance"]]
    final_feat_df = final_feat_df.sort_values(by=['Avg_importance'], ascending=False)
    return final_feat_df

In [19]:
def cor_feature_selector(X,y,corr_min_value):
    cor_list = []
    for i in list(X.columns):
        cor = np.corrcoef(X[i], y)[0,1]
        cor_list.append([i, cor])
    cor_feature = [x[0] for x in cor_list if abs(x[1]) > corr_min_value]
    print(len(cor_feature), "selected features (correlation)")
    return cor_feature

# Models

## 1. Tree Models

### 1.1 Random Forest

In [31]:
from sklearn.model_selection import KFold
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"rmse_train": [], "rmse_test": [], "rmse_baseline_train": [], "rmse_baseline_test": [], "up_down_train": [], "up_down_test": []})
feature_importance_list = []
count = 1
for train_index, test_index in kf.split(X):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train,y_train,corr_min_value)
    X_train = X_train[selected_cor_features]
    X_test = X_test[selected_cor_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        n_estimators = params[0]
        max_depth = params[1]
        min_samples_leaf = params[2]
        max_features = params[3]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=n_estimators, 
                                      max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=max_features)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))
    space = [
        (5,1000), #n_estimators
        (3,30), #max_depth
        (2,200), #min_samples_leaf
        (0.25,1.00) #max_features
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    print("# Fitting the model")
    regressor = RandomForestRegressor(n_jobs=-1, random_state=42,n_estimators=resultado_gp.x[0], 
                                  max_depth=resultado_gp.x[1],min_samples_leaf=resultado_gp.x[2],max_features=resultado_gp.x[3])
    regressor.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = regressor.predict(X_test) 
    y_pred_train = regressor.predict(X_train)
    
    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test)
                                 
    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Train - (accuracy, recall, precision):", up_down_train)
    print("Up/Down Test - (accuracy, recall, precision):", up_down_test)
    scores = scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test},ignore_index=True)
    
    print("# Get Feature Importance")
    importance = regressor.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: correlation
20 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.576262524035043
RMSE Test: 3.923516559799
RMSE Baseline Train: 5.2921351954763844
RMSE Baseline Test: 5.28396408634204
Up/Down Train - (accuracy, recall, precision): (0.700862895493768, 0.8396965247185512, 0.6144340974212035)
Up/Down Test - (accuracy, recall, precision): (0.6791648913506604, 0.7992530345471522, 0.6140602582496413)
# Get Feature Importance


# Feature Selection: correlation
18 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 2.8737863564293087
RMSE Test: 3.8588940770578075
RMSE Baseline Train: 5.2719074649413455
RMSE Baseline Test: 5.364227050771937
Up/Down Train - (accuracy, recall, precision): (0.7702141259188239, 0.8963267278878685, 0.682177671510024)
Up/Down Test - (accuracy, recall, precision): (0.6804431188751597, 0.7

### 1.2 AdaBoost

In [30]:
from sklearn.model_selection import KFold
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"rmse_train": [], "rmse_test": [], "rmse_baseline_train": [], "rmse_baseline_test": [], "up_down_train": [], "up_down_test": []})
feature_importance_list = []
count = 1
for train_index, test_index in kf.split(X):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train,y_train,corr_min_value)
    X_train = X_train[selected_cor_features]
    X_test = X_test[selected_cor_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        n_estimators = params[0]
        learning_rate = params[1]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = AdaBoostRegressor(random_state=42,n_estimators=n_estimators, learning_rate=learning_rate)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))
    space = [
        (30,200), #n_estimators
        (0.01, 1) #learning_rate
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    print("# Fitting the model")
    regressor = AdaBoostRegressor(random_state=42,n_estimators=resultado_gp.x[0], learning_rate=resultado_gp.x[1])
    regressor.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = regressor.predict(X_test) 
    y_pred_train = regressor.predict(X_train)
    
    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test)
                                 
    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Train - (accuracy, recall, precision):", up_down_train)
    print("Up/Down Test - (accuracy, recall, precision):", up_down_test)
    scores = scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test},ignore_index=True)
    
    print("# Get Feature Importance")
    importance = regressor.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: correlation
20 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.9248790145147616
RMSE Test: 3.9779997325255096
RMSE Baseline Train: 5.2921351954763844
RMSE Baseline Test: 5.28396408634204
Up/Down Train - (accuracy, recall, precision): (0.6596356663470757, 0.8002936857562408, 0.5788635156664896)
Up/Down Test - (accuracy, recall, precision): (0.6697912228376651, 0.7871148459383753, 0.6064748201438849)
# Get Feature Importance


# Feature Selection: correlation
18 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.930814182098174
RMSE Test: 3.956192085216573
RMSE Baseline Train: 5.2719074649413455
RMSE Baseline Test: 5.364227050771937
Up/Down Train - (accuracy, recall, precision): (0.658783423884095, 0.7979700338327694, 0.5824660433938966)
Up/Down Test - (accuracy, recall, precision): (0.654026416702173, 0.7801766437684003, 0.5748373101952278)
# Get Feature Importance


# Feature Selection: correlation
22 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.9105766284507655
RMSE Test: 4.025466594777486
RMSE Baseline Train: 5.271751057605574
RMSE Baseline Test: 5.364841803049234
Up/Down Train - (accuracy, recall, precision): (0.6552679237242995, 0.8185968284478616, 0.5786345108695652)
Up/Down Test - (accuracy, recall, precision): (0.6489135066041756, 0.8221105527638191, 0.5583617747440273)
# Get Feature Importance


# Feature Selection: correlation
21 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.935032238395969
RMSE Test: 3.937236584318569
RMSE Baseline Train: 5.3037791577667734
RMSE Baseline Test: 5.237061707262408
Up/Down Train - (accuracy, recall, precision): (0.6545222115691914, 0.8177604293730178, 0.573187414500684)
Up/Down Test - (accuracy, recall, precision): (0.6518960374946741, 0.8119092627599244, 0.5815842924847664)
# Get Feature Importance


# Feature Selection: correlation
21 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.9283415035191154
RMSE Test: 3.9654921523604543
RMSE Baseline Train: 5.312804137538967
RMSE Baseline Test: 5.200297529203352
Up/Down Train - (accuracy, recall, precision): (0.6570089475926715, 0.8155925657735941, 0.5790916880891174)
Up/Down Test - (accuracy, recall, precision): (0.6423699914748509, 0.8254437869822485, 0.5583722481654436)
# Get Feature Importance


# Average Metrics
rmse_train             3.925929
rmse_test              3.972477
rmse_baseline_train    5.290475
rmse_baseline_test     5.290078
dtype: float64
# Average Feature Importance
                                              Feature  Avg_importance
0                                            PREVIOUS        0.595156
1                                                  RS        0.076896
2       TRANSTORNOS_DE_CONDUÇÃO_E_ARRITMIAS_CARDÍACAS        0.066733
3   NEOPLASIA_MALIGNA_DE_TRAQUÉIA_BRÔNQUIOS_E_PULMÕES        0.042590
4   BRONQUITE_ENF

### 1.3 Gradient Boosting

In [36]:
from sklearn.model_selection import KFold
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"rmse_train": [], "rmse_test": [], "rmse_baseline_train": [], "rmse_baseline_test": [], "up_down_train": [], "up_down_test": []})
feature_importance_list = []
count = 1
for train_index, test_index in kf.split(X):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train,y_train,corr_min_value)
    X_train = X_train[selected_cor_features]
    X_test = X_test[selected_cor_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        n_estimators = params[0]
        learning_rate = params[1]
        max_depth = params[2]
        min_samples_split = params[3]
        min_samples_leaf = params[4]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = GradientBoostingRegressor(random_state=42,n_estimators=n_estimators, learning_rate=learning_rate,
                                          max_depth=max_depth,min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))
    space = [
        (30,200),  #n_estimators
        (0.01, 1), #learning_rate
        (1,32),    #max_depth
        (0.1, 1),  #min_samples_split 
        (0.1, 0.5) #min_samples_leafs 
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    print("# Fitting the model")
    regressor = GradientBoostingRegressor(random_state=42,n_estimators=resultado_gp.x[0], learning_rate=resultado_gp.x[1],
                                         max_depth=resultado_gp.x[2], min_samples_split=resultado_gp.x[3], min_samples_leaf=resultado_gp.x[4])
    regressor.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = regressor.predict(X_test) 
    y_pred_train = regressor.predict(X_train)
    
    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test)
                                 
    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Train - (accuracy, recall, precision):", up_down_train)
    print("Up/Down Test - (accuracy, recall, precision):", up_down_test)
    scores = scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test},ignore_index=True)
    
    print("# Get Feature Importance")
    importance = regressor.feature_importances_
    feature_importance_list.append(pd.DataFrame({"Feature": X_train.columns, "Importance": importance}).sort_values(by="Importance", ascending=False))
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
print("# Average Feature Importance")
final_feat_df = get_average_feat_importance(feature_importance_list)
print(final_feat_df.head(20))
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: correlation
20 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.8388525397963744
RMSE Test: 3.9589824343820785
RMSE Baseline Train: 5.2921351954763844
RMSE Baseline Test: 5.28396408634204
Up/Down Train - (accuracy, recall, precision): (0.6742303185256205, 0.8012726382770435, 0.5931159420289855)
Up/Down Test - (accuracy, recall, precision): (0.6817213463996591, 0.7936507936507936, 0.6177325581395349)
# Get Feature Importance


# Feature Selection: correlation
18 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.881875315522406
RMSE Test: 3.9068397870751723
RMSE Baseline Train: 5.2719074649413455
RMSE Baseline Test: 5.364227050771937
Up/Down Train - (accuracy, recall, precision): (0.6734846063705124, 0.7945867568873852, 0.597492276939851)
Up/Down Test - (accuracy, recall, precision): (0.671921602045164,

## 2. SVR

In [39]:
mm = MinMaxScaler()
sc = StandardScaler()
X_mm = mm.fit_transform(X)
X_sc = sc.fit_transform(X)

### 2.1 MinMax

In [40]:
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"rmse_train": [], "rmse_test": [], "rmse_baseline_train": [], "rmse_baseline_test": [], "up_down_train": [], "up_down_test": []})
count = 1

X = pd.DataFrame(X_mm, index=X.index, columns=X.columns)
for train_index, test_index in kf.split(X):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train,y_train,corr_min_value)
    X_train = X_train[selected_cor_features]
    X_test = X_test[selected_cor_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        gamma = params[0]
        C = params[1]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = SVR(gamma=gamma, C=C)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))
    space = [
        (0.001,0.9), #gamma
        (1,10000), #C
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    print("# Fitting the model")
    regressor = SVR(gamma=resultado_gp.x[0], C=resultado_gp.x[1])
    regressor.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = regressor.predict(X_test) 
    y_pred_train = regressor.predict(X_train)
    
    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test)
                                 
    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Train - (accuracy, recall, precision):", up_down_train)
    print("Up/Down Test - (accuracy, recall, precision):", up_down_test)
    scores = scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test},ignore_index=True)
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: correlation
20 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.918899295685411
RMSE Test: 4.003903536483257
RMSE Baseline Train: 7.076445293537384
RMSE Baseline Test: 7.202308707512127
Up/Down Train - (accuracy, recall, precision): (0.8788750399488654, 1.0, 0.8788750399488654)
Up/Down Test - (accuracy, recall, precision): (0.8900724328930549, 1.0, 0.8900724328930549)


# Feature Selection: correlation
18 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.9409342025818517
RMSE Test: 3.9296992665347963
RMSE Baseline Train: 7.099346455464094
RMSE Baseline Test: 7.111597614553929
Up/Down Train - (accuracy, recall, precision): (0.8811121764141898, 1.0, 0.8811121764141898)
Up/Down Test - (accuracy, recall, precision): (0.8811248402215595, 1.0, 0.8811248402215595)


# Feature Selection: correlation
22 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.91521633366929
RMSE Test: 3.982284165133578
RMSE Baseline Train: 7.101044438443785
RMSE Baseline Test: 7.104814059263473
Up/Down Train - (accuracy, recall, precision): (0.8828166613401512, 1.0, 0.8828166613401512)
Up/Down Test - (accuracy, recall, precision): (0.8743076267575628, 1.0, 0.8743076267575628)


# Feature Selection: correlation
21 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.937095055199476
RMSE Test: 3.946688258327955
RMSE Baseline Train: 7.099399761518281
RMSE Baseline Test: 7.11138477634814
Up/Down Train - (accuracy, recall, precision): (0.881325237029935, 1.0, 0.881325237029935)
Up/Down Test - (accuracy, recall, precision): (0.8802726885385599, 1.0, 0.8802726885385599)


# Feature Selection: correlation
21 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calcu

### 2.2 StandardScaler

In [43]:
from sklearn.model_selection import KFold
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"rmse_train": [], "rmse_test": [], "rmse_baseline_train": [], "rmse_baseline_test": [], "up_down_train": [], "up_down_test": []})
count = 1

X = pd.DataFrame(X_sc, index=X.index, columns=X.columns)
for train_index, test_index in kf.split(X):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train,y_train,corr_min_value)
    X_train = X_train[selected_cor_features]
    X_test = X_test[selected_cor_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        gamma = params[0]
        C = params[1]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = SVR(gamma=gamma, C=C)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))
    space = [
        (0.001,0.9), #gamma
        (1,10000), #C
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    print("# Fitting the model")
    regressor = SVR(gamma=resultado_gp.x[0], C=resultado_gp.x[1])
    regressor.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = regressor.predict(X_test) 
    y_pred_train = regressor.predict(X_train)
    
    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test)
                                 
    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Train - (accuracy, recall, precision):", up_down_train)
    print("Up/Down Test - (accuracy, recall, precision):", up_down_test)
    scores = scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test},ignore_index=True)
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: correlation
20 selected features (correlation)
# Bayesian Optimization




# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.9612596082465816
RMSE Test: 4.002651905256515
RMSE Baseline Train: 7.075759400095559
RMSE Baseline Test: 7.199461182761993
Up/Down Train - (accuracy, recall, precision): (0.9630339831682113, 1.0, 0.9630339831682113)
Up/Down Test - (accuracy, recall, precision): (0.95994887089902, 1.0, 0.95994887089902)


# Feature Selection: correlation
18 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 3.7533657341924838
RMSE Test: 3.911208331177055
RMSE Baseline Train: 7.099368922374057
RMSE Baseline Test: 7.105892757727923
Up/Down Train - (accuracy, recall, precision): (0.962607861936721, 0.9998893193137798, 0.9627024722932651)
Up/Down Test - (accuracy, recall, precision): (0.962079250106519, 1.0, 0.962079250106519)


# Feature Selection: correlation
22 selected features (correlation)
# Bayesian Optimization
# Fitting the model
# Making predi

In [55]:
def calcUpMetrics2(y_pred, y_test, X_test):
    up_df = pd.DataFrame({"Pred": y_pred, "Real": y_test, "Previous": X_test["PREVIOUS"]})
    up_df["UP"] = up_df["Previous"] < up_df["Real"]
    up_df["UP_PRED"] = up_df["Previous"] < up_df["Pred"]
    up_df["UP"] = up_df["UP"].astype(int)
    up_df["UP_PRED"] = up_df["UP_PRED"].astype(int)
    return up_df

up_down_train = calcUpMetrics2(y_pred_train,y_train,X_train)
up_down_test = calcUpMetrics2(y_pred,y_test,X_test)

In [77]:
# metrics.confusion_matrix(up_df["UP"], up_df["UP_PRED"])
# up_down_train[(up_down_train["UP"] == 0) & (up_down_train["Real"] != 0.0)].head(50)
up_down_test[up_down_test["UP"] == 1]

Unnamed: 0,Pred,Real,Previous,UP,UP_PRED
1,3.090559,6.922331,-1.229330,1,1
6,1.804035,0.000000,-1.229330,1,1
7,5.504720,6.536930,0.025231,1,1
13,3.600306,7.731441,-0.255336,1,1
15,3.674976,5.063291,-0.578576,1,1
...,...,...,...,...,...
5149,6.347618,11.035977,0.055122,1,1
5154,4.579530,1.463250,-0.176965,1,1
5160,5.375281,3.406807,0.139369,1,1
5170,4.846229,6.827570,-0.486279,1,1


## 3. ElasticNet

In [42]:
start = time.time()
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = pd.DataFrame({"rmse_train": [], "rmse_test": [], "rmse_baseline_train": [], "rmse_baseline_test": [], "up_down_train": [], "up_down_test": []})
count = 1

X = pd.DataFrame(X_mm, index=X.index, columns=X.columns)
for train_index, test_index in kf.split(X):
    print("=============== RUN {} ===============".format(count))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    print("# Feature Selection: correlation")
    selected_cor_features = cor_feature_selector(X_train,y_train,corr_min_value)
    X_train = X_train[selected_cor_features]
    X_test = X_test[selected_cor_features]
    
    print("# Bayesian Optimization")
    def train_model(params):
        alpha = params[0]
        l1_ratio = params[1]

        xf_train_params,xf_val,yf_train_params,yf_val = train_test_split(X_train,y_train,test_size=0.33,random_state=42)

        model = ElasticNet(random_state=42,alpha=alpha, l1_ratio=l1_ratio)

        model.fit(xf_train_params,yf_train_params)
        yf_pred = model.predict(xf_val)

        return np.sqrt(metrics.mean_squared_error(yf_val, yf_pred))
    space = [
        (1,20), #alpha
        (0,1), #l1_ratio
    ]
    resultado_gp = gp_minimize(train_model, space, random_state=42, verbose=0, n_calls=30, n_random_starts=10)
    
    print("# Fitting the model")
    regressor = ElasticNet(random_state=42, alpha=resultado_gp.x[0], l1_ratio=resultado_gp.x[1])
    regressor.fit(X_train,y_train)
    
    print("# Making predictions")
    y_pred = regressor.predict(X_test) 
    y_pred_train = regressor.predict(X_train)
    
    print("# Calculating metrics")
    rmse_train = np.sqrt(metrics.mean_squared_error(y_pred_train,y_train))
    rmse_test = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
    rmse_baseline_train = np.sqrt(metrics.mean_squared_error(X_train["PREVIOUS"],y_train))
    rmse_baseline_test = np.sqrt(metrics.mean_squared_error(X_test["PREVIOUS"],y_test))
    up_down_train = calcUpMetrics(y_pred_train,y_train,X_train)
    up_down_test = calcUpMetrics(y_pred,y_test,X_test)
                                 
    print("RMSE Train:", rmse_train)
    print("RMSE Test:", rmse_test)
    print("RMSE Baseline Train:", rmse_baseline_train)
    print("RMSE Baseline Test:", rmse_baseline_test)
    print("Up/Down Train - (accuracy, recall, precision):", up_down_train)
    print("Up/Down Test - (accuracy, recall, precision):", up_down_test)
    scores = scores.append({'rmse_train':rmse_train, 'rmse_test':rmse_test, "rmse_baseline_train": rmse_baseline_train, "rmse_baseline_test": rmse_baseline_test, "up_down_train": up_down_train, "up_down_test": up_down_test},ignore_index=True)
    
    print("\n")
    count += 1
print("=============== SUMMARY ===============")
print("# Average Metrics")
print(scores.mean())
end = time.time()

print("\nTotal run time:", end-start)

# Feature Selection: correlation
20 selected features (correlation)
# Bayesian Optimization


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)


# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 4.273827544887062
RMSE Test: 4.285061986573005
RMSE Baseline Train: 7.076445293537384
RMSE Baseline Test: 7.202308707512127
Up/Down Train - (accuracy, recall, precision): (0.8788750399488654, 1.0, 0.8788750399488654)
Up/Down Test - (accuracy, recall, precision): (0.8900724328930549, 1.0, 0.8900724328930549)


# Feature Selection: correlation


  positive)
  positive)


18 selected features (correlation)
# Bayesian Optimization


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)


# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 4.280743541093666
RMSE Test: 4.264264804507611
RMSE Baseline Train: 7.099346455464094
RMSE Baseline Test: 7.111597614553929
Up/Down Train - (accuracy, recall, precision): (0.8811121764141898, 1.0, 0.8811121764141898)
Up/Down Test - (accuracy, recall, precision): (0.8811248402215595, 1.0, 0.8811248402215595)


# Feature Selection: correlation
22 selected features (correlation)


  positive)
  positive)


# Bayesian Optimization


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)


# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 4.266773673327274
RMSE Test: 4.308602565516161
RMSE Baseline Train: 7.101044438443785
RMSE Baseline Test: 7.104814059263473
Up/Down Train - (accuracy, recall, precision): (0.8828166613401512, 1.0, 0.8828166613401512)
Up/Down Test - (accuracy, recall, precision): (0.8743076267575628, 1.0, 0.8743076267575628)


# Feature Selection: correlation


  positive)
  positive)


21 selected features (correlation)
# Bayesian Optimization


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)


# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 4.280448724379263
RMSE Test: 4.258570862120364
RMSE Baseline Train: 7.099399761518281
RMSE Baseline Test: 7.11138477634814
Up/Down Train - (accuracy, recall, precision): (0.881325237029935, 1.0, 0.881325237029935)
Up/Down Test - (accuracy, recall, precision): (0.8802726885385599, 1.0, 0.8802726885385599)


# Feature Selection: correlation


  positive)


21 selected features (correlation)
# Bayesian Optimization


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)


# Fitting the model
# Making predictions
# Calculating metrics
RMSE Train: 4.277376772443649
RMSE Test: 4.2664581158016155
RMSE Baseline Train: 7.132640598046738
RMSE Baseline Test: 6.977013666590294
Up/Down Train - (accuracy, recall, precision): (0.8814443971026843, 1.0, 0.8814443971026843)
Up/Down Test - (accuracy, recall, precision): (0.8797953964194374, 1.0, 0.8797953964194374)


# Average Metrics
rmse_train             4.275834
rmse_test              4.276592
rmse_baseline_train    7.101775
rmse_baseline_test     7.101424
dtype: float64

Total run time: 24.039926052093506


  positive)
