In [1]:
# NOTE(review): 're-sf' is not a recognisable IPython magic name; this may be a mangled
# `reset -sf` (clear the namespace) — confirm the intent before relying on this cell.
get_ipython().magic('re-sf')

# Models

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import neighbors
from sklearn import feature_selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LassoCV, Lasso
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
import time
import dalex as dx
from sklearn.ensemble import RandomForestRegressor
import torch
import torch.nn as nn
from torch.autograd import Variable 
from captum.attr import (
    IntegratedGradients,
    GradientShap,
    DeepLift,
    DeepLiftShap,
    LayerConductance,
    NeuronConductance,
    NoiseTunnel,
)
import lime.lime_tabular
import pickle
import matplotlib.pyplot as plt
import sys
from torch.utils.data import DataLoader
from sklearn.ensemble import GradientBoostingRegressor
from LSTM import LSTM
from captum.attr import NeuronConductance

First, I import the pre-cleaned data and then split the dataset into training and testing sets (60% and 40% respectively). All features have to be scaled, and I prepare the folds for 5-fold cross-validation. The steps I am going to follow for each model are the following:
- Recursive feature elimination, I am going to remove the non important features
- Grid search, in order to try all the hyperparameters and choose the best ones
- Create model with the hyperparameters selected previously and fit it
- Make the predictions
- Shapley values to understand the model and the predictions

In [2]:
# Load the two exports of the sales data. Judging by how they are used further down,
# 'info_data.csv' keeps the original values (e.g. 'ES', 'US') and 'info_datav2.csv' the
# encoded ones — TODO confirm.
data_info_original = pd.read_csv('info_data.csv')
# DAY_OF_WEEK is dropped from both frames before any modelling.
data_info_original = data_info_original.drop(columns = ['DAY_OF_WEEK'])
# Sort by year / month / invoice date so that the later non-shuffled split is chronological.
data_info_original = data_info_original.sort_values(['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA'])
data_info = pd.read_csv('info_datav2.csv')
data_info = data_info.drop(columns = ['DAY_OF_WEEK'])
data_info = data_info.sort_values(['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA'])
# Rows of the second export whose invoice year is 2020.
data_gaussian = data_info[data_info.ANO_FACTURA == 2020]

In [3]:
# Sanity checks for product 'K400381-001': each line displays the rows whose attribute differs
# from the value in the filter, so an empty output means the attribute is constant for the product.
display(data_info_original[(data_info_original['PRODUCTO_ID'] == 'K400381-001') & (data_info_original['OUTSOLE_SUELA_SUBTIPO'] != 'HIGH')]['OUTSOLE_SUELA_SUBTIPO'])
display(data_info_original[(data_info_original['PRODUCTO_ID'] == 'K400381-001') & (data_info_original['PLANTILLA_EXTRAIBLE'] != 'NO')]['PLANTILLA_EXTRAIBLE'])
display(data_info_original[(data_info_original['PRODUCTO_ID'] == 'K400381-001') & (data_info_original['CONSUMER_COLOR'] != 'Black')]['CONSUMER_COLOR']) # empty output => colour is always 'Black'
display(data_info_original[(data_info_original['PRODUCTO_ID'] == 'K400381-001') & (data_info_original['CREMALLERA'] != 'SI')]['CREMALLERA']) # empty output => CREMALLERA is always 'SI'
#display(data_info_original[(data_info_original['PRODUCTO_ID'] == '16002-194')]['PRODUCTO_ID'])

In [4]:
print(data_info.shape)
print(data_info_original.shape)

In [5]:
def groupData(df, col_groups, ref_col):
    """Sum ``ref_col`` over every distinct combination of ``col_groups``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing ``col_groups`` and ``ref_col``.
    col_groups : list of str (or a single column name)
        Columns that identify a group.
    ref_col : str
        Numeric column that is summed inside each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, with a fresh 0..n-1 index. The columns are the group
        columns plus ``ref_col``, laid out in the same relative order as in ``df``.

    Notes
    -----
    Columns are matched by name, so the function works for any number of group
    columns and for any position of ``ref_col`` (the previous implementation
    assumed exactly 26 group keys with the aggregate in the 4th column).
    """
    # groupby sorts by the group keys; reset_index turns the keys back into columns.
    grouped = df.groupby(col_groups)[ref_col].sum().reset_index()
    # Restore the column layout of the input frame so callers see the familiar order.
    ordered = [c for c in df.columns if c in grouped.columns]
    return grouped[ordered]

def getISOCountry(iso_country):
    """Return the encoded country id that corresponds to an original country code.

    The first row of the global ``data_info_original`` whose 'NUMERO_DEUDOR_PAIS_ID'
    equals ``iso_country`` is located, and the value of the same column is read from
    the row of the global ``data_info`` that carries the same index label.

    Assumes both frames share index labels row by row (they are read from two CSV
    exports of the same data) — TODO confirm.

    Raises
    ------
    IndexError
        If no row of ``data_info_original`` matches ``iso_country``.
    """
    indexes = data_info_original[data_info_original['NUMERO_DEUDOR_PAIS_ID'] == iso_country].index
    if len(indexes) == 0:
        raise IndexError('no row with NUMERO_DEUDOR_PAIS_ID == %r' % (iso_country,))
    # The labels survive sort_values(), positions do not: look the row up by label (.loc),
    # not by position (.iloc).
    iso_code = data_info.loc[[indexes[0]], 'NUMERO_DEUDOR_PAIS_ID']
    return iso_code.values[0]

def getEncode(dto, dt, val, col):
    """Translate an original value of column ``col`` into its encoded counterpart.

    Parameters
    ----------
    dto : pandas.DataFrame
        Frame holding the original values.
    dt : pandas.DataFrame
        Frame holding the encoded values; it must share index labels with ``dto``.
    val : object
        Original value to look up in ``dto[col]``.
    col : str
        Column name present in both frames.

    Returns
    -------
    The value of ``dt[col]`` on the first row where ``dto[col] == val``.

    Raises
    ------
    IndexError
        If ``val`` does not occur in ``dto[col]``.
    """
    # Plain equality test: the former unary minus (`-dto[col] == val`) failed on string
    # columns and matched the wrong rows on numeric ones.
    indexes = dto[dto[col] == val].index
    if len(indexes) == 0:
        raise IndexError('value %r not found in column %r' % (val, col))
    # Index labels, not positions: use .loc so the lookup is valid after sorting/filtering.
    code = dt.loc[[indexes[0]], col]
    return code.values[0]

def getValuesFilter(ds, value, columns, target):
    """Return the rows of ``ds`` whose ``target`` column equals ``value``.

    ``value == '*'`` is a wildcard: the frame is returned untouched (same object) and
    ``columns`` is ignored. Otherwise the matching rows are rebuilt into a new frame
    labelled with ``columns`` via ``getIndexFilter`` and ``getValues``.
    """
    if value != '*':
        matching = getIndexFilter(ds, value, target)
        return getValues(ds, matching, columns)
    # wildcard -> no filter
    return ds

def getIndexFilter(dt, value, target):
    """Return the index labels of the rows of ``dt`` where column ``target`` equals ``value``."""
    mask = dt[target] == value
    return dt.loc[mask].index

def getValues(dt, indexes, columns):
    """Build a new frame from the rows of ``dt`` whose index label is in ``indexes``.

    The result gets a fresh default index and is labelled with ``columns``.
    """
    kept_rows = [row for label, row in zip(dt.index, dt.values) if label in indexes]
    return pd.DataFrame(kept_rows, columns = columns)

def saveModelToFile(dat, name_file):
    """Pickle ``dat`` into ``./Models/<name_file>``, replacing any previous content.

    The file is opened in write mode ('wb'): in append mode a second save would add a
    second pickle to the file and ``pickle.load`` would keep returning the first
    (stale) object. The ``with`` block guarantees the handle is flushed and closed.
    The ``./Models`` directory must already exist.
    """
    with open('./Models/'+name_file, 'wb') as dbfile:
        pickle.dump(dat, dbfile)

def loadModelFromFile(name_file):
    """Load and return the object pickled in ``./Models/<name_file>``.

    SECURITY: ``pickle.load`` can execute arbitrary code; only load files that were
    produced by this notebook.
    """
    # Context manager so the file handle is closed even if unpickling fails.
    with open('./Models/'+name_file, 'rb') as dbfile:
        md = pickle.load(dbfile)
    return md

In [6]:
columns = ['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA', 'TEMPORADA_COMERCIAL_ID', 'PRODUCTO_ID', 'TALLA', 'ESFUERZO_VENTA_ID', 'NUMERO_DEUDOR_PAIS_ID', 'JERARQUIA_PROD_ID', 'GRUPO_ARTICULO_PRODUCTO_ID', 'GENERO_PRODUCTO', 'CATEGORIA', 'TIPOLOGIA', 'CONSUMER_COLOR', 'CREMALLERA', 'CORDONES', 'OUTSOLE_SUELA_TIPO', 'OUTSOLE_SUELA_SUBTIPO', 'PLANTILLA_EXTRAIBLE', 'CONTACTO_SN', 'EDAD_SN', 'GENERO_CONTACTO', 'EDAD_COMPRA', 'EDAD_RANGO_COMPRA', 'CIUDAD_CONTACTO', 'IDIOMA_CONTACTO']

In [7]:
data_info.columns

In [8]:
# Collapse both frames to one row per distinct combination of the descriptive columns,
# summing the net sale amount ('IMP_VENTA_NETO_EUR'), then restore chronological order.
data_info_grouped = groupData(data_info, columns, 'IMP_VENTA_NETO_EUR')
data_info_grouped = data_info_grouped.sort_values(['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA'])
data_info_original_grouped = groupData(data_info_original, columns, 'IMP_VENTA_NETO_EUR')
data_info_original_grouped = data_info_original_grouped.sort_values(['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA'])

In [9]:
print(data_info_grouped.shape)
print(data_info_original_grouped.shape)

In [10]:
# Optional per-country filter. With the wildcard '*' getValuesFilter returns its input unchanged,
# so these frames are the grouped frames themselves, re-sorted chronologically.
data_info_filtered_grouped = getValuesFilter(data_info_grouped, '*', data_info_grouped.columns, 'NUMERO_DEUDOR_PAIS_ID') # iso = * -> no filter by country
data_info_filtered_grouped = data_info_filtered_grouped.sort_values(['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA'])
data_info_original_filtered_grouped = getValuesFilter(data_info_original_grouped, '*', data_info_original_grouped.columns, 'NUMERO_DEUDOR_PAIS_ID') # iso = * -> no filter by country
data_info_original_filtered_grouped = data_info_original_filtered_grouped.sort_values(['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA'])

In [11]:
# Default split: first 60% of the (chronologically sorted) rows for training, last 40% for testing.
# shuffle=False keeps the time order, so the model is always evaluated on later data.
traindataset_original, testdataset_original = train_test_split(data_info_original_filtered_grouped, test_size=0.4, shuffle= False) # To use all the data, change to -> data_info
traindataset, testdataset = train_test_split(data_info_filtered_grouped, test_size=0.4, shuffle= False) # To use all the data, change to -> data_info
# Features = every column except the target; the target is kept as a one-column DataFrame.
x_train = traindataset.loc[:, traindataset.columns != 'IMP_VENTA_NETO_EUR']
y_train = traindataset.loc[:, traindataset.columns == 'IMP_VENTA_NETO_EUR']
# EDAD_RANGO_COMPRA is excluded from the feature set in both splits.
x_train = x_train.drop(columns=['EDAD_RANGO_COMPRA'])
x_test = testdataset.loc[:, testdataset.columns != 'IMP_VENTA_NETO_EUR']
y_test = testdataset.loc[:, testdataset.columns == 'IMP_VENTA_NETO_EUR']
x_test = x_test.drop(columns = 'EDAD_RANGO_COMPRA')

Experiment 1: 2018 and 2019

In [39]:
# Experiment 1 (alternative to the default split): train on invoices from 2018, test on 2019.
# NOTE(review): this overwrites x_train / y_train / x_test / y_test but NOT `testdataset`, which
# later cells still use (e.g. getIndexFilter(testdataset, ...)); here y_* are Series, whereas the
# default split produces one-column DataFrames.
x_train = data_info_filtered_grouped[data_info_filtered_grouped['ANO_FACTURA'] == 2018]
y_train = x_train['IMP_VENTA_NETO_EUR']
x_train = x_train.loc[:, x_train.columns != 'IMP_VENTA_NETO_EUR']
x_train = x_train.drop(columns=['EDAD_RANGO_COMPRA'])
x_test = data_info_filtered_grouped[data_info_filtered_grouped['ANO_FACTURA'] == 2019]
y_test = x_test['IMP_VENTA_NETO_EUR']
x_test = x_test.loc[:, x_test.columns != 'IMP_VENTA_NETO_EUR']
x_test = x_test.drop(columns = 'EDAD_RANGO_COMPRA')

Experiment 2: Rolling window

In [274]:
# Experiment 2 (alternative to the default split): within 2020, train on months 1-9 and
# test on months 10-12.
# NOTE(review): like Experiment 1, this replaces x_* / y_* only; `testdataset` keeps the rows of
# the default 60/40 split, and y_* become Series instead of one-column DataFrames.
x_train = data_info_filtered_grouped[(data_info_filtered_grouped['ANO_FACTURA'] == 2020) & (data_info_filtered_grouped['MES_FACTURA'] < 10)]
y_train = x_train['IMP_VENTA_NETO_EUR']
x_train = x_train.loc[:, x_train.columns != 'IMP_VENTA_NETO_EUR']
x_train = x_train.drop(columns=['EDAD_RANGO_COMPRA'])
x_test = data_info_filtered_grouped[(data_info_filtered_grouped['ANO_FACTURA'] == 2020) & (data_info_filtered_grouped['MES_FACTURA'] > 9)]
y_test = x_test['IMP_VENTA_NETO_EUR']
x_test = x_test.loc[:, x_test.columns != 'IMP_VENTA_NETO_EUR']
x_test = x_test.drop(columns = 'EDAD_RANGO_COMPRA')

In [12]:
# Scale every feature to [-1, 1]. The scaler is fitted on the training split ONLY and the same
# fitted transformation is then applied to the test split: re-fitting on the test data would map
# train and test onto different scales (and leak test statistics), making predictions meaningless.
# Test values outside the training range may therefore fall slightly outside [-1, 1].
# NOTE: this cell overwrites x_train / x_test, so run it exactly once after (re)creating the split.
normalizer = MinMaxScaler(feature_range = (-1, 1))
x_train = pd.DataFrame(normalizer.fit_transform(x_train), columns= x_train.columns, index = x_train.index)
x_test = pd.DataFrame(normalizer.transform(x_test), columns= x_test.columns, index = x_test.index)
# One-row result tables (one column per model). They need an explicit row: assigning a scalar to a
# column of a completely empty DataFrame creates an empty column and the value is silently lost.
mseresults = pd.DataFrame(index = [0])
timeexecution = pd.DataFrame(index = [0])

In [13]:
folds = KFold(n_splits = 5, shuffle = False) # if shuffle false, random state doesn't matter

## Linear Model

I have to choose the most important features for the model; I will use the cross-validated R² score to select them.

In [14]:
# Grid-search the number of features kept by recursive feature elimination (1..25) around a
# plain linear regression, scoring each candidate by R² on the 5 cross-validation folds.
hyper_params = [{'n_features_to_select': list(range(1, 26))}]
lm = LinearRegression()
rfe = RFE(lm)
# return_train_score=True so train and validation curves can be plotted together afterwards.
model_cv = GridSearchCV(estimator = rfe, param_grid = hyper_params, scoring= 'r2', cv = folds, verbose = 1, return_train_score=True)
model_cv.fit(x_train, y_train)
cv_results = pd.DataFrame(model_cv.cv_results_)

In [15]:
display(cv_results)
print(model_cv.best_params_)

In [16]:
#plt.figure(figsize=(16,6))
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='lower right')
plt.savefig('../Output/testscoretrain.png')
plt.show()

display(cv_results[['param_n_features_to_select', 'mean_train_score', 'mean_test_score']])

In [14]:
# Fit RFE with the chosen number of features. The value is hard-coded so the cell can run
# without repeating the grid search; the commented expression is where it would come from.
n_features_optimal = 25 # model_cv.best_params_['n_features_to_select']
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select= n_features_optimal)
# NOTE(review): `rfe` is reused later for the XGBoost selector, so cell order matters.
rfe = rfe.fit(x_train, y_train)

I keep only the features selected by RFE.

In [15]:
def removeFeatures(dt_train, dt_test, rfe_model):
    """Drop from both frames the columns that a fitted selector did not keep.

    ``rfe_model.support_`` is read as a boolean mask aligned with ``dt_train.columns``;
    every column flagged False is removed from ``dt_train`` and ``dt_test``.

    Returns
    -------
    (DataFrame, DataFrame)
        The reduced training and test frames (new objects, inputs are untouched).
    """
    support = pd.DataFrame(rfe_model.support_, index=dt_train.columns, columns=['Rank'])
    rejected = [name for name, keep in zip(support.index, support['Rank']) if keep == False]
    return dt_train.drop(columns= rejected), dt_test.drop(columns= rejected)

In [16]:
x_train_LM, x_test_LM = removeFeatures(x_train, x_test, rfe)

Finally, I create the model with the best parametrization possible

In [17]:
len(y_test)

In [26]:
# Fit the final linear model on the RFE-selected features and time the fit.
lm_model = LinearRegression()
start_time = time.time()
lm_model.fit(x_train_LM, y_train)
timeexecution['lm'] = (time.time() - start_time)
y_pred = lm_model.predict(x_test_LM)
# `results` collects the true target and each model's predictions side by side.
# It is indexed by y_test (not by `testdataset`) so it stays consistent when one of the
# experiment cells has redefined the test split; np.ravel flattens (n, 1) targets/predictions
# so each becomes a plain column.
results = pd.DataFrame(index= y_test.index)
results['Import'] = np.ravel(y_test)
results['lm'] = np.ravel(y_pred)

In [17]:
saveModelToFile(lm_model, 'LinearModel')

#De moment no ho utilitzaré
F-test, p-values:

In [18]:
f_val, p_val = feature_selection.f_regression(x_train_LM, y_train) #Repassar
display(list(zip(x_train_LM.columns, p_val)))
print('')
display(list(zip(x_train_LM.columns, f_val)))

In [281]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2: ', metrics.r2_score(y_test, y_pred))
mseresults['lm'] = metrics.mean_squared_error(y_test, y_pred)

In [20]:
# Horizontal bar chart of the fitted linear coefficients, one bar per selected feature.
# np.ravel handles both coef_ layouts ((1, n) when y was 2-D, (n,) when y was 1-D) and the
# number of bars follows the model instead of a hard-coded 25.
coefficients = np.ravel(lm_model.coef_)
positions = range(len(coefficients))
plt.barh(positions, coefficients)
plt.yticks(positions, x_train_LM.columns, fontsize = 8)
plt.title('Coefficients')
plt.savefig('../Output/coefflinear.png')
plt.show()

In [19]:
variable_groups = {
    'Date': ['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA', 'TEMPORADA_COMERCIAL_ID'],
    'Product': ['PRODUCTO_ID', 'TALLA', 'GRUPO_ARTICULO_PRODUCTO_ID', 'GENERO_PRODUCTO', 'CATEGORIA', 'TIPOLOGIA', 'CONSUMER_COLOR', 'CREMALLERA', 'CORDONES', 'OUTSOLE_SUELA_TIPO', 'OUTSOLE_SUELA_SUBTIPO', 'PLANTILLA_EXTRAIBLE'],
    'Age': ['EDAD_SN', 'EDAD_COMPRA'],
    'ContanctInfo': ['NUMERO_DEUDOR_PAIS_ID', 'CIUDAD_CONTACTO', 'IDIOMA_CONTACTO', 'GENERO_CONTACTO', 'CONTACTO_SN'],
    'SalePerson': ['ESFUERZO_VENTA_ID']
}

In [19]:
linear_explainer = dx.Explainer(lm_model, x_test_LM, y_test)
explanation = linear_explainer.model_parts()
explanation.plot()
groupedexpl = linear_explainer.model_parts(variable_groups=variable_groups)
groupedexpl.plot()

In [18]:
# Index labels of the Spanish and US rows of the test split, in the encoded frame
# (`testdataset`) and in the original-valued frame (`testdataset_original`).
# NOTE(review): 14 is presumably the encoded id of 'ES' — it is hard-coded here while the US id
# is looked up with getISOCountry; confirm the value against the encoding.
indexspaindata = getIndexFilter(testdataset, 14, 'NUMERO_DEUDOR_PAIS_ID')
indexspaindata_original = getIndexFilter(testdataset_original, 'ES', 'NUMERO_DEUDOR_PAIS_ID')
usa_iso = getISOCountry('US')
indexusadata = getIndexFilter(testdataset, usa_iso, 'NUMERO_DEUDOR_PAIS_ID')
indexusadata_original = getIndexFilter(testdataset_original, 'US', 'NUMERO_DEUDOR_PAIS_ID')

In [22]:
indexspaindata[0]

Spain data

In [23]:
display(testdataset_original.loc[indexspaindata_original[0]])
display(testdataset_original.loc[indexspaindata_original[20]])
display(testdataset_original.loc[indexspaindata_original[200]])
display(testdataset_original.loc[indexspaindata_original[500]])

In [24]:
bd_henry_es_0 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexspaindata[0]]]), type = 'shap', B= 100)
bd_henry_es_0.plot()
bd_henry_es_1 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexspaindata[20]]]), type = 'shap', B= 100)
bd_henry_es_1.plot()
bd_henry_es_2 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexspaindata[200]]]), type = 'shap', B= 100)
bd_henry_es_2.plot()
bd_henry_es_3 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexspaindata[500]]]), type = 'shap', B= 100)
bd_henry_es_3.plot()

US data

In [25]:
display(testdataset_original.loc[indexusadata_original[0]])
display(testdataset_original.loc[indexusadata_original[20]])
display(testdataset_original.loc[indexusadata_original[200]])
display(testdataset_original.loc[indexusadata_original[500]])

In [26]:
bd_henry_us_1 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexusadata[0]]]), type = 'shap', B= 100)
bd_henry_us_1.plot()
bd_henry_us_2 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexusadata[20]]]), type = 'shap', B= 100)
bd_henry_us_2.plot()
bd_henry_us_3 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexusadata[200]]]), type = 'shap', B= 100)
bd_henry_us_3.plot()
bd_henry_us_4 = linear_explainer.predict_parts(np.ravel(x_test_LM.loc[[indexusadata[500]]]), type = 'shap', B= 100)
bd_henry_us_4.plot()

## XGBoost Model

For this model, I am going to work with the parameters max_depth, learning_rate and subsample from the XGBoost model, and n_features_to_select from RFE.

In [24]:
dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=list(x_train.columns))
dtest = xgb.DMatrix(x_test, feature_names=list(x_test.columns))

In [17]:
# Grid-search only the number of features kept by RFE (16..25); the XGBoost hyper-parameters
# themselves are fixed below. The commented line is the wider grid over the estimator parameters.
#hyper_params = {'estimator__max_depth':[1, 10, 15], 'estimator__n_estimators': [150, 300, 500], 'estimator__learning_rate':[0.1, 0.05, 0.01] }#, 'n_features_to_select': list(range(1, 26))}
hyper_params = {'n_features_to_select': list(range(16, 26))}
xgbm = xgb.XGBRegressor(learning_rate =0.01, n_estimators=215, max_depth=10, min_child_weight=0.8, subsample=1, nthread=4)
rfe_xgboost = RFE(xgbm)
# Same protocol as for the linear model: R² on the 5 folds, train scores kept for plotting.
model_cv_xgb = GridSearchCV(estimator = rfe_xgboost, param_grid = hyper_params, scoring= 'r2', cv = folds, verbose = 1, return_train_score=True)
model_cv_xgb.fit(x_train, y_train)
cv_results_xgb = pd.DataFrame(model_cv_xgb.cv_results_)

In [18]:
display(cv_results_xgb)
print(model_cv_xgb.best_params_)

In [19]:
#plt.figure(figsize=(16,6))
plt.plot(cv_results_xgb["param_n_features_to_select"], cv_results_xgb["mean_test_score"])
plt.plot(cv_results_xgb["param_n_features_to_select"], cv_results_xgb["mean_train_score"])
plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='lower right')
plt.savefig('../Output/testscoretrain_xgb.png')
plt.show()

display(cv_results_xgb[['param_n_features_to_select', 'mean_train_score', 'mean_test_score']])

Once I have the best parametrization, I run the model with it. For the moment the values below are an example, because I have not executed the previous grid search (it takes too long).

In [282]:
# Fit RFE around XGBoost with a hard-coded number of features; the commented expression shows
# where the value would come from once the XGBoost grid search has been run.
n_features_optimal = 21 # model_cv_xgb.best_params_['n_features_to_select']
xgbm = xgb.XGBRegressor(learning_rate =0.01, n_estimators=215, max_depth=10, min_child_weight=0.8, subsample=1, nthread=4)
# NOTE(review): this rebinds `rfe`, which the linear-model section also uses.
rfe = RFE(xgbm, n_features_to_select= n_features_optimal)
rfe = rfe.fit(x_train, y_train)

In [283]:
x_train_XGBM, x_test_XGBM = removeFeatures(x_train, x_test, rfe)

In [20]:
display('Featues chosen: ', set(x_train_XGBM.columns))
display('Features discarted: ',set(x_train.columns) - set(x_train_XGBM.columns))

In [284]:
xgb_model = xgb.XGBRegressor(learning_rate =0.01, n_estimators=215, max_depth=10, min_child_weight=0.8, subsample=1, nthread=4)
xgb_model.fit(x_train_XGBM, y_train)

In [105]:
saveModelToFile(xgb_model, 'XGBoostModel')

In [211]:
## Execute it to avoid all the process
xgb_model = loadModelFromFile('XGBoostModel')

In [285]:
y_pred = xgb_model.predict(x_test_XGBM)

In [286]:
#results = y_test.copy()
results['xgboost'] = y_pred

In [214]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))
mseresults['xgboost'] = metrics.mean_squared_error(y_test, y_pred)

First, I create an explainer for the model; its inputs are the model and the test data (`x_test_XGBM` and `y_test`). Then I can obtain the variable importance and the reverse cumulative distribution of residuals.

In [29]:
variable_groups = {
    'Date': ['ANO_FACTURA', 'MES_FACTURA', 'FECHA_FACTURA', 'TEMPORADA_COMERCIAL_ID'],
    'Product': ['PRODUCTO_ID', 'TALLA', 'GENERO_PRODUCTO', 'CATEGORIA', 'TIPOLOGIA', 'CONSUMER_COLOR', 'CREMALLERA', 'CORDONES', 'OUTSOLE_SUELA_TIPO', 'OUTSOLE_SUELA_SUBTIPO', 'PLANTILLA_EXTRAIBLE'],
    'Age': ['EDAD_COMPRA'],
    'ContanctInfo': ['NUMERO_DEUDOR_PAIS_ID', 'CIUDAD_CONTACTO', 'IDIOMA_CONTACTO'],
    'SalePerson': ['ESFUERZO_VENTA_ID']
}

In [31]:
# dalex explainer for the XGBoost model, built on the test features and test target.
xgboost_explainer = dx.Explainer(xgb_model, x_test_XGBM, y_test)
# Variable importance computed per individual feature.
explanation = xgboost_explainer.model_parts()
explanation.plot()
# Regression performance summary of the model and its plot.
res = xgboost_explainer.model_performance(model_type='regression')
res.plot()
# Variable importance again, this time for the feature groups defined in `variable_groups`.
groupedexpl = xgboost_explainer.model_parts(variable_groups=variable_groups)
groupedexpl.plot()

In [30]:
print(xgboost_explainer.model_performance().result)

I can also group the features into new, broader variables. In this case, I have created groups of features that share a similar meaning.
- Date: ANO_FACTURA, MES_FACTURA, FECHA_FACTURA, TEMPORADA_COMERCIAL_ID
- Product: PRODUCTO_ID, TALLA, GRUPO_ARTICULO_PRODUCTO_ID, GENERO_PRODUCTO, CATEGORIA, TIPOLOGIA, CONSUMER_COLOR, CREMALLERA, CORDONES, OUTSOLE_SUELA_TIPO, OUTSOLE_SUELA_SUBTIPO, PLANTILLA_EXTRAIBLE
- Age: EDAD_SN, EDAD_COMPRA
- ContanctInfo: NUMERO_DEUDOR_PAIS_ID, CIUDAD_CONTACTO, IDIOMA_CONTACTO, GENERO_CONTACTO, CONTACTO_SN
- SalePerson: ESFUERZO_VENTA_ID

In [32]:
xgb_sh_es_0 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexspaindata[0]]]), type = 'shap')
xgb_sh_es_0.plot()
xgb_sh_es_1 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexspaindata[20]]]), type = 'shap')
xgb_sh_es_1.plot()
xgb_sh_es_2 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexspaindata[200]]]), type = 'shap')
xgb_sh_es_2.plot()
xgb_sh_es_3 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexspaindata[500]]]), type = 'shap')
xgb_sh_es_3.plot()

In [34]:
xgb_sh_us_0 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexusadata[0]]]), type = 'shap')
xgb_sh_us_0.plot()
xgb_sh_us_1 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexusadata[20]]]), type = 'shap')
xgb_sh_us_1.plot()
xgb_sh_us_2 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexusadata[200]]]), type = 'shap')
xgb_sh_us_2.plot()
xgb_sh_us_3 = xgboost_explainer.predict_parts(np.ravel(x_test_XGBM.loc[[indexusadata[500]]]), type = 'shap')
xgb_sh_us_3.plot()

## NN

### MLP Regressor

In [61]:
# Multi-layer perceptron with two hidden layers of 300 logistic units, trained with Adam.
# early_stopping=True holds out validation_fraction (10%) of the training data to stop training;
# shuffle=False keeps the sample order between iterations.
nn_reg = MLPRegressor(hidden_layer_sizes=(300, 300),  activation='logistic', solver='adam', alpha=0.01, batch_size='auto', learning_rate='constant', learning_rate_init=0.01, max_iter=1000, shuffle=False, tol=0.0001, verbose=True, early_stopping= True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# np.ravel flattens the target to the 1-D shape passed to fit.
nn_reg.fit(x_train, np.ravel(y_train))
# Persist the fitted network so the next cell can reload it instead of retraining.
saveModelToFile(nn_reg, 'MLPRegressorModel')

In [230]:
nn_reg = loadModelFromFile('MLPRegressorModel')

In [231]:
y_pred = nn_reg.predict(x_test)
nn_reg.score(x_test, y_test)

In [36]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))

In [232]:
nn_reg_pred = lambda x: nn_reg.predict(x).astype(float)

In [233]:
results['nn'] = y_pred

In [23]:
indexspaindata = getIndexFilter(testdataset, 14, 'NUMERO_DEUDOR_PAIS_ID')
indexspaindata_original = getIndexFilter(testdataset_original, 'ES', 'NUMERO_DEUDOR_PAIS_ID')

In [30]:
explainer = lime.lime_tabular.LimeTabularExplainer(x_train.values, feature_names=x_train.columns, verbose= False, mode='regression')
expsp1 = explainer.explain_instance(np.ravel(x_test.loc[[indexspaindata[0]]]), nn_reg_pred, num_features=10)
expsp2 = explainer.explain_instance(np.ravel(x_test.loc[[indexspaindata[20]]]), nn_reg_pred, num_features=10)
expsp3 = explainer.explain_instance(np.ravel(x_test.loc[[indexspaindata[200]]]), nn_reg_pred, num_features=10)
expsp4 = explainer.explain_instance(np.ravel(x_test.loc[[indexspaindata[500]]]), nn_reg_pred, num_features=10)

# expsp1.show_in_notebook(show_table=True)
#expsp1.as_pyplot_figure()

# expsp2.show_in_notebook(show_table=True)
#expsp2.as_pyplot_figure()

# expsp3.show_in_notebook(show_table=True)
#expsp3.as_pyplot_figure()

# expsp4.show_in_notebook(show_table=True)
#expsp4.as_pyplot_figure()

In [37]:
explainer = lime.lime_tabular.LimeTabularExplainer(x_train.values, feature_names=x_train.columns, verbose= False, mode='regression')
expus1 = explainer.explain_instance(np.ravel(x_test.loc[[indexusadata[0]]]), nn_reg_pred, num_features=10)
expus2 = explainer.explain_instance(np.ravel(x_test.loc[[indexusadata[20]]]), nn_reg_pred, num_features=10)
expus3 = explainer.explain_instance(np.ravel(x_test.loc[[indexusadata[200]]]), nn_reg_pred, num_features=10)
expus4 = explainer.explain_instance(np.ravel(x_test.loc[[indexusadata[500]]]), nn_reg_pred, num_features=10)

#expus1.show_in_notebook(show_table=True)
#expus1.as_pyplot_figure()

#expus2.show_in_notebook(show_table=True)
#expus2.as_pyplot_figure()

#expus3.show_in_notebook(show_table=True)
#expus3.as_pyplot_figure()

#expus4.show_in_notebook(show_table=True)
#expus4.as_pyplot_figure()

In [102]:
# (feature, weight) pairs of the first LIME explanation for Spain. `exp` was never defined in
# this notebook (leftover kernel state), so the explanation object is referenced by its real name.
expsp1.as_list()

### LSTM

In [287]:
#torch.manual_seed(39931191)
class LSTM(nn.Module):
    """Stacked LSTM followed by a two-layer dense head producing one non-negative value.

    NOTE: a name ``LSTM`` is also imported at the top of the notebook
    (``from LSTM import LSTM``); this definition shadows it.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Size of the LSTM hidden state; the first dense layer halves it.
    num_layers : int
        Number of stacked LSTM layers.
    """
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.dropout = nn.Dropout(p= 0.05) # Dropout
        # batch_first=True -> inputs are (batch, seq_len, input_size)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True) #lstm

        self.fc_1 =  nn.Linear(hidden_size, hidden_size//2) #fully connected 1
        self.fc = nn.Linear(hidden_size//2, 1) #fully connected last layer
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return a (batch, 1) tensor of predictions for ``x`` of shape (batch, seq_len, input_size)."""
        batch = x.size(0)
        # Zero initial states allocated with the dtype and device of the input, so the model
        # also works on GPU / in double precision (a bare torch.zeros is always CPU float32).
        h_0 = x.new_zeros(self.num_layers, batch, self.hidden_size) #hidden state
        c_0 = x.new_zeros(self.num_layers, batch, self.hidden_size) #internal state
        # Propagate input through LSTM; only the final hidden states are used.
        _, (hn, cn) = self.lstm(x, (h_0, c_0))
        # Keep the final hidden state of the last (top) LSTM layer -> (batch, hidden_size).
        hn_fs = hn.view(self.num_layers, batch, self.hidden_size)[-1]
        out = self.dropout(hn_fs) # Dropout (active only in train mode)
        out = self.fc_1(out) # Dense
        out = self.relu(out) # Activation
        out = self.fc(out) # Dense
        # Final ReLU clamps predictions to be >= 0.
        out = self.relu(out)
        return out

In [288]:
# Training configuration.
num_epochs =  10 # number of passes over the training data
learning_rate = 0.01 # Adam step size

input_size = x_train.shape[1] # number of features, taken from the data instead of hard-coded
hidden_size = 175 #number of features in hidden state
num_layers = 2 #number of stacked lstm layers

# Float32 tensors of the scaled features.
x_train_tensors = torch.Tensor(x_train.values)
x_test_tensors = torch.Tensor(x_test.values)
# Targets are forced to shape (n, 1) so they match the model output; a 1-D target (y is a Series
# after the experiment cells) would silently broadcast to (n, n) inside MSELoss.
y_train_tensors = torch.Tensor(y_train.values).reshape(-1, 1)
y_test_tensors = torch.Tensor(y_test.values).reshape(-1, 1)

# Add a sequence dimension of length 1: (n, features) -> (n, 1, features).
x_train_tensors_final = torch.reshape(x_train_tensors,   (x_train_tensors.shape[0], 1, x_train_tensors.shape[1]))
x_test_tensors_final = torch.reshape(x_test_tensors,  (x_test_tensors.shape[0], 1, x_test_tensors.shape[1]))

# Features and targets are served by separate loaders that are zipped in the training loop;
# shuffle must stay False on all of them so that the batches remain aligned.
x_train_loader = DataLoader(x_train_tensors, batch_size= 200, shuffle= False, num_workers = 2)
x_test_loader = DataLoader(x_test_tensors, batch_size= 200, shuffle= False, num_workers = 2)
y_train_loader = DataLoader(y_train_tensors, batch_size= 200, shuffle= False, num_workers = 2)
y_test_loader = DataLoader(y_test_tensors, batch_size= 200, shuffle= False, num_workers = 2)

In [33]:
# REMOVING VARIABLES NO IMPORTANT: CIUDAD_CONTACTO, GENERO_CONTACTO, TIPOLOGIA, FECHA_FACTURA
# Alternative input preparation: same tensors/loaders as the previous cell, but without four
# features considered unimportant (CIUDAD_CONTACTO, GENERO_CONTACTO, TIPOLOGIA, FECHA_FACTURA).
# NOTE(review): running this cell overwrites the tensors and loaders of the previous cell but does
# not change `input_size`; the model must be created with the reduced feature count for this
# variant to work — confirm which of the two cells is meant to be run.
x_train_lstm = x_train.drop(columns=['CIUDAD_CONTACTO', 'GENERO_CONTACTO', 'TIPOLOGIA', 'FECHA_FACTURA'])
x_test_lstm = x_test.drop(columns=['CIUDAD_CONTACTO', 'GENERO_CONTACTO', 'TIPOLOGIA', 'FECHA_FACTURA'])

x_train_tensors = Variable(torch.Tensor(x_train_lstm.values))
x_test_tensors = Variable(torch.Tensor(x_test_lstm.values))
y_train_tensors = Variable(torch.Tensor(y_train.values))
y_test_tensors = Variable(torch.Tensor(y_test.values))

# Add a sequence dimension of length 1: (n, features) -> (n, 1, features).
x_train_tensors_final = torch.reshape(x_train_tensors,   (x_train_tensors.shape[0], 1, x_train_tensors.shape[1]))
x_test_tensors_final = torch.reshape(x_test_tensors,  (x_test_tensors.shape[0], 1, x_test_tensors.shape[1]))

# Separate, non-shuffled loaders for features and targets (zipped together during training).
x_train_loader = DataLoader(x_train_tensors, batch_size= 200, shuffle= False, num_workers = 2)
x_test_loader = DataLoader(x_test_tensors, batch_size= 200, shuffle= False, num_workers = 2)
y_train_loader = DataLoader(y_train_tensors, batch_size= 200, shuffle= False, num_workers = 2)
y_test_loader = DataLoader(y_test_tensors, batch_size= 200, shuffle= False, num_workers = 2)

In [289]:
lstm_model = LSTM(input_size, hidden_size, num_layers) #our lstm class

In [290]:
criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=learning_rate, weight_decay=1e-5)

In [291]:
# Mini-batch training loop with a simple "no improvement" counter.
min_val = sys.maxsize   # best (lowest) last-batch loss seen so far
count_epoch = 0         # consecutive epochs without a >0.1% improvement
lstm_model.train()      # make sure dropout is active, e.g. after an earlier eval() or a reload
for epoch in range(num_epochs):
    # The feature and target loaders are both non-shuffled, so zipping them yields aligned batches.
    for xb, yb in zip(x_train_loader, y_train_loader):
        xtr = torch.reshape(xb, (xb.shape[0], 1, xb.shape[1])) # (batch, features) -> (batch, 1, features)
        optimizer.zero_grad()           # clear gradients accumulated by the previous step
        outputs = lstm_model(xtr)       # forward pass (via __call__ so module hooks run)
        loss = criterion(outputs, yb)   # MSE between predictions and targets
        loss.backward()                 # back-propagate: compute gradients of the loss
        optimizer.step()                # update the weights from the gradients
    # `loss` here is the loss of the LAST batch of the epoch, not an epoch average.
    print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))
    if (min_val - (min_val * 0.001)) > loss.item():
        min_val = loss.item()
        count_epoch = 0
    else:
        count_epoch = count_epoch + 1
    if count_epoch == 10:
        print('Model does not improve')
        # Early stopping is intentionally disabled; uncomment to stop training here.
        #break

In [107]:
saveModelToFile(lstm_model, 'LSTMModel')

In [216]:
## Execute it if you have the file and you don't want to execute again LSTM
lstm_model = loadModelFromFile('LSTMModel')

In [292]:
# Predict on the test set. eval() switches dropout off so predictions are deterministic, and
# no_grad() avoids building the autograd graph for the whole test set.
# The model is left in eval mode; call lstm_model.train() before training it again.
lstm_model.eval()
with torch.no_grad():
    train_predict = lstm_model(x_test_tensors_final) #forward pass (test data, despite the name)
data_predict = train_predict.data.numpy() #numpy conversion

In [293]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, data_predict))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, data_predict))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, data_predict)))
print('R2:', metrics.r2_score(y_test, data_predict))

In [294]:
results['lstm'] = data_predict

In [30]:
def predictplot(dso, dsp, dso_size, range_plot):
    """Scatter the first ``range_plot`` actual vs. predicted values and save the figure.

    Parameters
    ----------
    dso : array-like
        Actual (observed) target values.
    dsp : array-like
        Predicted target values, aligned with ``dso``.
    dso_size : int or float
        x position of the dashed red vertical marker line.
    range_plot : int
        Number of leading observations to plot.

    The figure is written to '../Output/timeseriespredictionLSTM.png' and then shown.
    """
    positions = range(0, range_plot)
    plt.axvline(x=dso_size, c='r', linestyle='--') # vertical marker line
    # Plot the data passed in (previously the globals y_test / data_predict were used and the
    # arguments were ignored); np.ravel flattens (n, 1) inputs to match `positions`.
    plt.scatter(positions, np.ravel(dso[:range_plot]), label='Actual Data') #actual plot
    plt.scatter(positions, np.ravel(dsp[:range_plot]), label='Predicted Data') #predicted plot
    plt.title('Time-Series Prediction')
    plt.legend()
    plt.savefig('../Output/timeseriespredictionLSTM.png')
    plt.show()

In [None]:
predictplot(y_test, data_predict, 5, 10)

In [42]:
indexspaindata = getIndexFilter(testdataset, 14, 'NUMERO_DEUDOR_PAIS_ID')
indexspaindata_original = getIndexFilter(testdataset_original, 'ES', 'NUMERO_DEUDOR_PAIS_ID')
usa_iso = getISOCountry('US')
indexusadata = getIndexFilter(testdataset, usa_iso, 'NUMERO_DEUDOR_PAIS_ID')
indexusadata_original = getIndexFilter(testdataset_original, 'US', 'NUMERO_DEUDOR_PAIS_ID')

In [36]:
print(testdataset_original.loc[[indexspaindata_original[0]]])
f = testdataset_original.sort_values(['PRODUCTO_ID'])
f = f.groupby(['PRODUCTO_ID', 'CONSUMER_COLOR']).IMP_VENTA_NETO_EUR.sum()
display(f.head(20))
data_info_original['CONSUMER_COLOR'].value_counts()

In [189]:
f = testdataset_original[testdataset_original.PRODUCTO_ID.str.contains('16002')].head(20)

In [67]:
display(testdataset.loc[[indexspaindata[0]]].values)

In [85]:
x_test.loc[[indexspaindata[20]]].values

In [108]:
print(testdataset_original.loc[[indexspaindata_original[0]]].values)
print(testdataset_original.loc[[indexspaindata_original[1]]].values)

In [43]:
# Integrated Gradients explainer for the LSTM and the single test rows it will explain:
# four rows for Spain (sp1..sp4) and four for the US (us1..us4).
ig = IntegratedGradients(lstm_model)
sp1 = torch.Tensor(x_test.loc[[indexspaindata[0]]].values)
sp2 = torch.Tensor(x_test.loc[[indexspaindata[20]]].values)
sp3 = torch.Tensor(x_test.loc[[indexspaindata[200]]].values)
sp4 = torch.Tensor(x_test.loc[[indexspaindata[500]]].values)

# US rows come from the US index (they were previously built from the Spanish index by mistake,
# which made the "US" attributions exact copies of the Spanish ones).
us1 = torch.Tensor(x_test.loc[[indexusadata[0]]].values)
us2 = torch.Tensor(x_test.loc[[indexusadata[20]]].values)
us3 = torch.Tensor(x_test.loc[[indexusadata[200]]].values)
us4 = torch.Tensor(x_test.loc[[indexusadata[500]]].values)

# Add the sequence dimension expected by the LSTM: (1, features) -> (1, 1, features).
x_test_tensors_finalsp1 = torch.reshape(sp1,  (sp1.shape[0], 1, sp1.shape[1]))
x_test_tensors_finalsp2 = torch.reshape(sp2,  (sp2.shape[0], 1, sp2.shape[1]))
x_test_tensors_finalsp3 = torch.reshape(sp3,  (sp3.shape[0], 1, sp3.shape[1]))
x_test_tensors_finalsp4 = torch.reshape(sp4,  (sp4.shape[0], 1, sp4.shape[1]))

x_test_tensors_finalus1 = torch.reshape(us1,  (us1.shape[0], 1, us1.shape[1]))
x_test_tensors_finalus2 = torch.reshape(us2,  (us2.shape[0], 1, us2.shape[1]))
x_test_tensors_finalus3 = torch.reshape(us3,  (us3.shape[0], 1, us3.shape[1]))
x_test_tensors_finalus4 = torch.reshape(us4,  (us4.shape[0], 1, us4.shape[1]))

In [44]:
# Integrated-Gradients attributions for each selected row. Every input is reshaped to
# (1, 1, n_features); -1 lets torch infer the feature count instead of hard-coding 25, so the
# cell keeps working if the feature set changes.
x_test_tensors_final_igsp1 = torch.reshape(x_test_tensors_finalsp1[0],  (1, 1, -1))
attributionsp1 = ig.attribute(x_test_tensors_final_igsp1)
x_test_tensors_final_igsp2 = torch.reshape(x_test_tensors_finalsp2[0],  (1, 1, -1))
attributionsp2 = ig.attribute(x_test_tensors_final_igsp2)
x_test_tensors_final_igsp3 = torch.reshape(x_test_tensors_finalsp3[0],  (1, 1, -1))
attributionsp3 = ig.attribute(x_test_tensors_final_igsp3)
x_test_tensors_final_igsp4 = torch.reshape(x_test_tensors_finalsp4[0],  (1, 1, -1))
attributionsp4 = ig.attribute(x_test_tensors_final_igsp4)

x_test_tensors_final_igus1 = torch.reshape(x_test_tensors_finalus1[0],  (1, 1, -1))
attributionus1 = ig.attribute(x_test_tensors_final_igus1)
x_test_tensors_final_igus2 = torch.reshape(x_test_tensors_finalus2[0],  (1, 1, -1))
attributionus2 = ig.attribute(x_test_tensors_final_igus2)
x_test_tensors_final_igus3 = torch.reshape(x_test_tensors_finalus3[0],  (1, 1, -1))
attributionus3 = ig.attribute(x_test_tensors_final_igus3)
x_test_tensors_final_igus4 = torch.reshape(x_test_tensors_finalus4[0],  (1, 1, -1))
attributionus4 = ig.attribute(x_test_tensors_final_igus4)

In [40]:
def visualize_importance(title, features, importance, xtitle):
    """Draw a horizontal bar chart with one bar per feature and show it.

    Args:
        title: Text placed above the chart.
        features: Feature names; must expose ``.size`` (the bars are placed
            at positions ``0 .. features.size - 1``) and is used as tick labels.
        importance: Bar lengths, one per feature.
        xtitle: Label written on the y axis (the axis that lists the features).
    """
    bar_positions = range(0, features.size)
    plt.barh(bar_positions, importance, align = 'center')
    plt.yticks(bar_positions, features, wrap = True, fontsize = 7)
    plt.ylabel(xtitle)
    plt.title(title)
    plt.show()

In [41]:
# One importance chart per Spanish sample, in sample order.
for sample_attribution in (attributionsp1, attributionsp2, attributionsp3, attributionsp4):
    visualize_importance('Variable Importance', x_test.columns, sample_attribution.detach().numpy()[0][0], 'Feature')

In [42]:
# One importance chart per US sample, in sample order.
for sample_attribution in (attributionus1, attributionus2, attributionus3, attributionus4):
    visualize_importance('Variable Importance', x_test.columns, sample_attribution.detach().numpy()[0][0], 'Feature')

In [96]:
def visualize_importances_LC(feature_names, importances, title="Average Feature Importances", plot=True, axis_title="Features"):
    """Print one importance value per name and optionally draw a vertical bar chart.

    Args:
        feature_names: Indexable, sized collection of labels (one per bar).
        importances: Indexable collection of numeric values; element ``i``
            belongs to ``feature_names[i]``.
        title: Printed as the first output line and used as the chart title.
        plot: When True a new 12x6 figure with the bar chart is created.
        axis_title: Label of the x axis.

    Returns:
        None. The figure is created but not explicitly shown.
    """
    n_items = len(feature_names)
    print(title)
    for i in range(n_items):
        print(feature_names[i], ": ", '%.3f'%(importances[i]))
    x_pos = (np.arange(n_items))
    if plot:
        plt.figure(figsize=(12,6))
        # Slice to the number of names instead of a hard-coded 6 so the bar
        # heights always match x_pos, whatever the number of entries.
        plt.bar(x_pos, importances[:n_items], align='center')
        plt.xticks(x_pos, feature_names, wrap=True, fontsize = 10)
        plt.xlabel(axis_title)
        plt.title(title)

In [46]:
# Layer conductance of fc_1 for each Spanish sample, converted straight to NumPy.
layer_1 = LayerConductance(lstm_model, lstm_model.fc_1)
layer_1_valssp1 = layer_1.attribute(x_test_tensors_final_igsp1).detach().numpy()
layer_1_valssp2 = layer_1.attribute(x_test_tensors_final_igsp2).detach().numpy()
layer_1_valssp3 = layer_1.attribute(x_test_tensors_final_igsp3).detach().numpy()
layer_1_valssp4 = layer_1.attribute(x_test_tensors_final_igsp4).detach().numpy()

In [90]:
# Layer conductance of fc_1 for each US sample, converted straight to NumPy.
layer_1 = LayerConductance(lstm_model, lstm_model.fc_1)
layer_1_valsus1 = layer_1.attribute(x_test_tensors_final_igus1).detach().numpy()
layer_1_valsus2 = layer_1.attribute(x_test_tensors_final_igus2).detach().numpy()
layer_1_valsus3 = layer_1.attribute(x_test_tensors_final_igus3).detach().numpy()
layer_1_valsus4 = layer_1.attribute(x_test_tensors_final_igus4).detach().numpy()

In [97]:
# Neuron importances of fc_1 for the four Spanish samples.
# Every chart shows np.mean over axis 0, so all of them carry the "Average"
# title; the second one used to be mislabelled "Cumulative Neuron Importances".
for layer_vals in (layer_1_valssp1, layer_1_valssp2, layer_1_valssp3, layer_1_valssp4):
    visualize_importances_LC(range(6), np.mean(layer_vals, axis = 0), title="Average Neuron Importances", axis_title= "Neurons")

In [95]:
# Neuron importances of fc_1 for the four US samples.
# Every chart shows np.mean over axis 0, so all of them carry the "Average"
# title; the second one used to be mislabelled "Cumulative Neuron Importances".
for layer_vals in (layer_1_valsus1, layer_1_valsus2, layer_1_valsus3, layer_1_valsus4):
    visualize_importances_LC(range(6), np.mean(layer_vals, axis = 0), title="Average Neuron Importances", axis_title= "Neurons")

In [121]:
def visualize_importances_NC(feature_names, importances, title="Average Feature Importances", plot=True, axis_title="Features"):
    """Print one importance value per name and optionally draw a horizontal bar chart.

    Args:
        feature_names: Indexable, sized collection of labels (one per bar).
        importances: Indexable collection of numeric values; element ``i``
            belongs to ``feature_names[i]``.
        title: Printed as the first output line and used as the chart title.
        plot: When True a new 12x6 figure with the bar chart is created.
        axis_title: Label of the y axis (the axis that lists the names).

    Returns:
        None. The figure is created but not explicitly shown.
    """
    print(title)
    for idx, _ in enumerate(range(len(feature_names))):
        print(feature_names[idx], ": ", '%.3f' % importances[idx])
    x_pos = np.arange(len(feature_names))
    if not plot:
        return
    plt.figure(figsize=(12, 6))
    plt.barh(x_pos, importances, align='center')
    plt.yticks(x_pos, feature_names, wrap=True, fontsize=7)
    plt.ylabel(axis_title)
    plt.title(title)

In [124]:
# Per-feature conductance through neuron 5 of fc_1, for every Spanish and US sample.
neuron_cond = NeuronConductance(lstm_model, lstm_model.fc_1)
(
    neuron_cond_vals_5sp1,
    neuron_cond_vals_5sp2,
    neuron_cond_vals_5sp3,
    neuron_cond_vals_5sp4,
    neuron_cond_vals_5us1,
    neuron_cond_vals_5us2,
    neuron_cond_vals_5us3,
    neuron_cond_vals_5us4,
) = [
    neuron_cond.attribute(sample, neuron_selector=5)
    for sample in (
        x_test_tensors_final_igsp1,
        x_test_tensors_final_igsp2,
        x_test_tensors_final_igsp3,
        x_test_tensors_final_igsp4,
        x_test_tensors_final_igus1,
        x_test_tensors_final_igus2,
        x_test_tensors_final_igus3,
        x_test_tensors_final_igus4,
    )
]

In [125]:
# Feature importances for neuron 5, one chart per Spanish sample.
for neuron_vals in (neuron_cond_vals_5sp1, neuron_cond_vals_5sp2, neuron_cond_vals_5sp3, neuron_cond_vals_5sp4):
    visualize_importances_NC(x_train.columns, neuron_vals.mean(dim=0).detach().numpy()[0], title="Average Feature Importances for Neuron 5")

In [126]:
# Feature importances for neuron 5, one chart per US sample.
for neuron_vals in (neuron_cond_vals_5us1, neuron_cond_vals_5us2, neuron_cond_vals_5us3, neuron_cond_vals_5us4):
    visualize_importances_NC(x_train.columns, neuron_vals.mean(dim=0).detach().numpy()[0], title="Average Feature Importances for Neuron 5")

## Committee of experts

In [295]:
# Meta-learner inputs: the XGBoost and LSTM prediction columns of `results`.
newXtrain = results.loc[:, ['xgboost', 'lstm']]

In [296]:
# XGBoost regressor used as the second-level (meta) model of the committee.
meta_learner = xgb.XGBRegressor(
    learning_rate=0.01,
    n_estimators=320,
    max_depth=20,
    min_child_weight=0.8,
    subsample=1,
    nthread=4,
)

In [297]:
# Fit the meta-learner on the base-model prediction columns with y_test as target.
# NOTE(review): the target is y_test, so any score computed on these same rows
# is an in-sample score — confirm whether a held-out split was intended.
meta_learner.fit(newXtrain, y_test)

In [298]:
# Committee predictions for the rows in newXtrain.
# Fixed: meta_learner.predict(newXtrain) scored the meta-learner on exactly the
# rows it was fitted on (newXtrain / y_test), so every metric derived from
# y_pred was an in-sample, optimistic estimate. Out-of-fold predictions from
# 5-fold cross-validation give each row a prediction from a model that never
# saw that row's target.
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(meta_learner, newXtrain, y_test, cv=5)
results['CommExps'] = y_pred

In [299]:
# Display the results table, which now also holds the 'CommExps' column.
results

In [300]:
# Error metrics of the committee predictions against y_test; the MSE is
# computed once and reused for the RMSE line.
commexps_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', commexps_mse)
print('Root Mean Squared Error:', np.sqrt(commexps_mse))
print('R2:', metrics.r2_score(y_test, y_pred))

In [301]:
from sklearn.metrics import mean_absolute_percentage_error

# MAPE of the committee column against the real 'Import' column.
mean_absolute_percentage_error(y_true=results['Import'], y_pred=results['CommExps'])

## Prediction of intervals

As before, the base model is learnt from the training data. Then, a second model (the error model) is trained on a validation set to predict the squared difference between the predictions and the real values.

In [27]:
def takeVal(dt, ind, drange):
    """Collect the most recent distinct days found when walking ``ind`` backwards.

    Each label in ``ind`` is looked up in ``dt``; the first three values of the
    row are formatted as ``"<col2>/<col1>/<col0>"``. The walk starts at the
    last label and records every position at which a not-yet-seen day appears.

    Args:
        dt: DataFrame indexed by the labels in ``ind``.
        ind: Ordered, indexable collection of row labels.
        drange: Number of new days to find after the starting day.

    Returns:
        ``(dayslist, counter)``. ``counter[j]`` is the backwards offset at which
        the (j+1)-th new day was first seen; ``dayslist`` holds the day strings
        in discovery order, truncated to ``len(counter)`` entries (``drange``
        when enough days exist). If ``ind`` is empty or runs out before
        ``drange`` new days are found, the days found so far are returned
        instead of wrapping around to negative indexes and raising IndexError.
    """
    def _format_day(label):
        # Read the row once; the original looked it up three times per step.
        row = dt.loc[[label]].values[0]
        return str(row[2]) + '/' + str(row[1]) + '/' + str(row[0])

    dayslist = []
    counter = []
    if len(ind) == 0:
        return dayslist, counter

    last = len(ind) - 1
    dayslist.append(_format_day(ind[last]))
    seen = set(dayslist)
    remaining = drange
    # Bounded walk: never step past the first label.
    for i in range(len(ind)):
        dayformatted = _format_day(ind[last - i])
        if dayformatted not in seen:
            seen.add(dayformatted)
            dayslist.append(dayformatted)
            remaining = remaining - 1
            counter.append(i)
        if remaining == 0:
            break
    # Keep one day string per recorded offset (equals dayslist[:drange] on a full run).
    return dayslist[:len(counter)], counter
def sumPredictions(predictions, indexes):
    """Sum consecutive values into groups closed at the given positions.

    Values are accumulated in order; whenever the current position is listed
    in ``indexes`` the running total (including the value at that position) is
    rounded to two decimals, stored, and the total restarts from zero. Values
    after the last listed position are not returned.

    Args:
        predictions: Object exposing ``.values`` (e.g. a pandas Series) of numbers.
        indexes: Positions (0-based) at which a group is closed.

    Returns:
        List of rounded group totals, one per position in ``indexes`` that is
        reached.
    """
    # Set lookup instead of scanning the list for every row; the local no
    # longer shadows the builtin ``sum``.
    boundaries = set(indexes)
    running_total = 0
    sumpred = []
    for position, value in enumerate(predictions.values):
        running_total = running_total + float(value)
        if position in boundaries:
            sumpred.append(round(running_total, 2))
            running_total = 0
    return sumpred

In [28]:
# Row indexes per debtor country, once for the encoded test set and once for
# the original (un-encoded) one.
# NOTE(review): Spain uses the literal 14 while US/IT go through
# getISOCountry(); presumably 14 == getISOCountry('ES') — confirm.
indexspaindata = getIndexFilter(testdataset, 14, 'NUMERO_DEUDOR_PAIS_ID')
indexspaindata_original = getIndexFilter(testdataset_original, 'ES', 'NUMERO_DEUDOR_PAIS_ID')
usa_iso = getISOCountry('US')
indexusadata = getIndexFilter(testdataset, usa_iso, 'NUMERO_DEUDOR_PAIS_ID')
indexusadata_original = getIndexFilter(testdataset_original, 'US', 'NUMERO_DEUDOR_PAIS_ID')
ita_iso = getISOCountry('IT')
indexitadata = getIndexFilter(testdataset, ita_iso, 'NUMERO_DEUDOR_PAIS_ID')
indexitadata_original = getIndexFilter(testdataset_original, 'IT', 'NUMERO_DEUDOR_PAIS_ID')

In [36]:
# Day labels and day-boundary offsets for 40 days per country.
SpaintakeVal, indexcountersp = takeVal(dt=testdataset_original, ind=indexspaindata_original, drange=40)
UsatakeVal, indexcounterus = takeVal(dt=testdataset_original, ind=indexusadata_original, drange=40)
ItatakeVal, indexcounterit = takeVal(dt=testdataset_original, ind=indexitadata_original, drange=40)

In [37]:
# Real 'Import' values summed per day group, per country.
impsp = sumPredictions(results.loc[results.index.isin(indexspaindata), 'Import'], indexcountersp)
impus = sumPredictions(results.loc[results.index.isin(indexusadata), 'Import'], indexcounterus)
impita = sumPredictions(results.loc[results.index.isin(indexitadata), 'Import'], indexcounterit)

In [38]:
# 'lm' predictions summed per day group, per country.
predsp = sumPredictions(results.loc[results.index.isin(indexspaindata), 'lm'], indexcountersp)
predus = sumPredictions(results.loc[results.index.isin(indexusadata), 'lm'], indexcounterus)
predita = sumPredictions(results.loc[results.index.isin(indexitadata), 'lm'], indexcounterit)

In [39]:
# Inspect the per-day sums of the 'lm' predictions for Spain.
predsp

In [43]:
# RMSE between real and 'lm' per-day sums for Spain, and the band real ± RMSE.
st_dev_lm_sp = round(metrics.mean_squared_error(impsp, predsp) ** 0.5, 2)
# impsp is a plain list: list + scalar only works while the RMSE happens to be
# a NumPy scalar and raises TypeError for a Python float, so convert explicitly.
upper = np.asarray(impsp) + st_dev_lm_sp
lower = np.asarray(impsp) - st_dev_lm_sp

In [44]:
# Show the RMSE of the 'lm' per-day sums for Spain.
st_dev_lm_sp

In [None]:
# Spain, 'lm' model: real sums (black), predictions (green) and the band between lower and upper.
plt.plot(SpaintakeVal, impsp, 'k.')
plt.plot(SpaintakeVal, predsp, 'g.')
plt.fill_between(SpaintakeVal, y1=upper, y2=lower, alpha=0.2)
plt.xticks(SpaintakeVal, rotation=85)
plt.title('Prediction Interval (€) - Spain')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'lm' per-day sums for the US, rounded to 2 decimals.
st_dev_lm_us = round(metrics.mean_squared_error(y_true=impus, y_pred=predus) ** 0.5, 2)

In [None]:
# US, 'lm' model: real sums (black), predictions (green) and the real ± RMSE band.
# impus is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(UsatakeVal, impus, '.', color =  'k')
plt.plot(UsatakeVal, predus, '.', color =  'g')
plt.fill_between(UsatakeVal, (np.asarray(impus) + st_dev_lm_us), (np.asarray(impus) - st_dev_lm_us), alpha=0.2)
plt.xticks(UsatakeVal, rotation=85)
plt.title('Prediction Interval (€) - US')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'lm' per-day sums for Italy, rounded to 2 decimals.
st_dev_lm_ita = round(metrics.mean_squared_error(y_true=impita, y_pred=predita) ** 0.5, 2)

In [None]:
# Italy, 'lm' model: real sums (black), predictions (green) and the real ± RMSE band.
# impita is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(ItatakeVal, impita, '.', color =  'k')
plt.plot(ItatakeVal, predita, '.', color =  'g')
plt.fill_between(ItatakeVal, (np.asarray(impita) + st_dev_lm_ita), (np.asarray(impita) - st_dev_lm_ita), alpha=0.2)
plt.xticks(ItatakeVal, rotation=85)
plt.title('Prediction Interval (€) - Italy')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE of the 'lm' model per country.
for country_label, country_rmse in (('Spain: ', st_dev_lm_sp), ('US: ', st_dev_lm_us), ('Italy: ', st_dev_lm_ita)):
    print(country_label, country_rmse)

In [None]:
# 'xgboost' predictions summed per day group, per country.
predsp_xgboost = sumPredictions(results.loc[results.index.isin(indexspaindata), 'xgboost'], indexcountersp)
predus_xgboost = sumPredictions(results.loc[results.index.isin(indexusadata), 'xgboost'], indexcounterus)
predita_xgboost = sumPredictions(results.loc[results.index.isin(indexitadata), 'xgboost'], indexcounterit)

In [None]:
# RMSE between real and 'xgboost' per-day sums for Spain, rounded to 2 decimals.
st_dev_xgboost_sp = round(metrics.mean_squared_error(y_true=impsp, y_pred=predsp_xgboost) ** 0.5, 2)

In [None]:
# Spain, 'xgboost' model: real sums (black), predictions (green) and the real ± RMSE band.
# The band now uses the same day labels as the points (it used range(len(...)),
# mixing numeric x values into a string-labelled axis), and impsp is converted
# with np.asarray because list ± scalar raises TypeError for a plain float.
plt.plot(SpaintakeVal, impsp, '.', color =  'k')
plt.plot(SpaintakeVal, predsp_xgboost, '.', color =  'g')
plt.fill_between(SpaintakeVal, np.asarray(impsp) + st_dev_xgboost_sp, np.asarray(impsp) - st_dev_xgboost_sp, alpha=0.2)
plt.xticks(SpaintakeVal, rotation=85)
plt.title('Prediction Interval (€) - Spain')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'xgboost' per-day sums for the US, rounded to 2 decimals.
st_dev_xgboost_us = round(metrics.mean_squared_error(y_true=impus, y_pred=predus_xgboost) ** 0.5, 2)

In [None]:
# US, 'xgboost' model: real sums (black), predictions (green) and the real ± RMSE band.
# The band now uses the same day labels as the points (it used range(len(...)),
# mixing numeric x values into a string-labelled axis), and impus is converted
# with np.asarray because list ± scalar raises TypeError for a plain float.
plt.plot(UsatakeVal, impus, '.', color = 'k')
plt.plot(UsatakeVal, predus_xgboost, '.', color = 'g')
plt.fill_between(UsatakeVal, (np.asarray(impus) + st_dev_xgboost_us), (np.asarray(impus) - st_dev_xgboost_us), alpha=0.2)
plt.xticks(UsatakeVal, rotation=85)
plt.title('Prediction Interval (€) - US')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'xgboost' per-day sums for Italy, rounded to 2 decimals.
st_dev_xgboost_ita = round(metrics.mean_squared_error(y_true=impita, y_pred=predita_xgboost) ** 0.5, 2)

In [None]:
# Italy, 'xgboost' model: real sums (black), predictions (green) and the real ± RMSE band.
# The band now uses the same day labels as the points (it used range(len(...)),
# mixing numeric x values into a string-labelled axis), and impita is converted
# with np.asarray because list ± scalar raises TypeError for a plain float.
plt.plot(ItatakeVal, impita, '.', color = 'k')
plt.plot(ItatakeVal, predita_xgboost, '.', color = 'g')
plt.fill_between(ItatakeVal, (np.asarray(impita) + st_dev_xgboost_ita), (np.asarray(impita) - st_dev_xgboost_ita), alpha=0.2)
plt.xticks(ItatakeVal, rotation=85)
plt.title('Prediction Interval (€) - Italy')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE of the 'xgboost' model per country.
for country_label, country_rmse in (('Spain: ', st_dev_xgboost_sp), ('US: ', st_dev_xgboost_us), ('Italy: ', st_dev_xgboost_ita)):
    print(country_label, country_rmse)

In [None]:
# 'nn' predictions summed per day group, per country.
predsp_nn = sumPredictions(results.loc[results.index.isin(indexspaindata), 'nn'], indexcountersp)
predus_nn = sumPredictions(results.loc[results.index.isin(indexusadata), 'nn'], indexcounterus)
predita_nn = sumPredictions(results.loc[results.index.isin(indexitadata), 'nn'], indexcounterit)

In [None]:
# RMSE between real and 'nn' per-day sums for Spain, rounded to 2 decimals.
st_dev_nn_sp = round(metrics.mean_squared_error(y_true=impsp, y_pred=predsp_nn) ** 0.5, 2)

In [None]:
# Spain, 'nn' model: real sums (black), predictions (green) and the real ± RMSE band.
# impsp is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(SpaintakeVal, impsp, '.', color =  'k')
plt.plot(SpaintakeVal, predsp_nn, '.', color =  'g')
plt.fill_between(SpaintakeVal, np.asarray(impsp) + st_dev_nn_sp, np.asarray(impsp) - st_dev_nn_sp, alpha=0.2)
plt.xticks(SpaintakeVal, rotation=85)
plt.title('Prediction Interval (€) - Spain')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'nn' per-day sums for the US, rounded to 2 decimals.
st_dev_nn_us = round(metrics.mean_squared_error(y_true=impus, y_pred=predus_nn) ** 0.5, 2)

In [None]:
# US, 'nn' model: real sums (black), predictions (green) and the real ± RMSE band.
# impus is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(UsatakeVal, impus, '.', color = 'k')
plt.plot(UsatakeVal, predus_nn, '.', color = 'g')
plt.fill_between(UsatakeVal, (np.asarray(impus) + st_dev_nn_us), (np.asarray(impus) - st_dev_nn_us), alpha=0.2)
plt.xticks(UsatakeVal, rotation=85)
plt.title('Prediction Interval (€) - US')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'nn' per-day sums for Italy, rounded to 2 decimals.
st_dev_nn_ita = round(metrics.mean_squared_error(y_true=impita, y_pred=predita_nn) ** 0.5, 2)

In [None]:
# Italy, 'nn' model: real sums (black), predictions (green) and the real ± RMSE band.
# impita is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(ItatakeVal, impita, '.', color = 'k')
plt.plot(ItatakeVal, predita_nn, '.', color = 'g')
plt.fill_between(ItatakeVal, (np.asarray(impita) + st_dev_nn_ita), (np.asarray(impita) - st_dev_nn_ita), alpha=0.2)
plt.xticks(ItatakeVal, rotation=85)
plt.title('Prediction Interval (€) - Italy')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE of the 'nn' model per country.
for country_label, country_rmse in (('Spain: ', st_dev_nn_sp), ('US: ', st_dev_nn_us), ('Italy: ', st_dev_nn_ita)):
    print(country_label, country_rmse)

In [None]:
# 'lstm' predictions summed per day group, per country.
predsp_lstm = sumPredictions(results.loc[results.index.isin(indexspaindata), 'lstm'], indexcountersp)
predus_lstm = sumPredictions(results.loc[results.index.isin(indexusadata), 'lstm'], indexcounterus)
predita_lstm = sumPredictions(results.loc[results.index.isin(indexitadata), 'lstm'], indexcounterit)

In [None]:
# RMSE between real and 'lstm' per-day sums for Spain, rounded to 2 decimals.
st_dev_lstm_sp = round(metrics.mean_squared_error(y_true=impsp, y_pred=predsp_lstm) ** 0.5, 2)

In [None]:
# Spain, 'lstm' model: real sums (black), predictions (green) and the real ± RMSE band.
# impsp is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(SpaintakeVal, impsp, '.', color =  'k')
plt.plot(SpaintakeVal, predsp_lstm, '.', color =  'g')
plt.fill_between(SpaintakeVal, np.asarray(impsp) + st_dev_lstm_sp, np.asarray(impsp) - st_dev_lstm_sp, alpha=0.2)
plt.xticks(SpaintakeVal, rotation=85)
plt.title('Prediction Interval (€) - Spain')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'lstm' per-day sums for the US, rounded to 2 decimals.
st_dev_lstm_us = round(metrics.mean_squared_error(y_true=impus, y_pred=predus_lstm) ** 0.5, 2)

In [None]:
# US, 'lstm' model: real sums (black), predictions (green) and the real ± RMSE band.
# impus is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(UsatakeVal, impus, '.', color = 'k')
plt.plot(UsatakeVal, predus_lstm, '.', color = 'g')
plt.fill_between(UsatakeVal, (np.asarray(impus) + st_dev_lstm_us), (np.asarray(impus) - st_dev_lstm_us), alpha=0.2)
plt.xticks(UsatakeVal, rotation=85)
plt.title('Prediction Interval (€) - US')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'lstm' per-day sums for Italy, rounded to 2 decimals.
st_dev_lstm_ita = round(metrics.mean_squared_error(y_true=impita, y_pred=predita_lstm) ** 0.5, 2)

In [None]:
# Italy, 'lstm' model: real sums (black), predictions (green) and the real ± RMSE band.
# impita is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(ItatakeVal, impita, '.', color = 'k')
plt.plot(ItatakeVal, predita_lstm, '.', color = 'g')
plt.fill_between(ItatakeVal, (np.asarray(impita) + st_dev_lstm_ita), (np.asarray(impita) - st_dev_lstm_ita), alpha=0.2)
plt.xticks(ItatakeVal, rotation=85)
plt.title('Prediction Interval (€) - Italy')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE of the 'lstm' model per country.
for country_label, country_rmse in (('Spain: ', st_dev_lstm_sp), ('US: ', st_dev_lstm_us), ('Italy: ', st_dev_lstm_ita)):
    print(country_label, country_rmse)

In [None]:
# 'CommExps' (committee) predictions summed per day group, per country.
predsp_CommExps = sumPredictions(results.loc[results.index.isin(indexspaindata), 'CommExps'], indexcountersp)
predus_CommExps = sumPredictions(results.loc[results.index.isin(indexusadata), 'CommExps'], indexcounterus)
predita_CommExps = sumPredictions(results.loc[results.index.isin(indexitadata), 'CommExps'], indexcounterit)

In [None]:
# RMSE between real and 'CommExps' per-day sums for Spain, rounded to 2 decimals.
st_dev_CommExps_sp = round(metrics.mean_squared_error(y_true=impsp, y_pred=predsp_CommExps) ** 0.5, 2)

In [None]:
# Spain, committee model: real sums (black), predictions (green) and the real ± RMSE band.
# impsp is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(SpaintakeVal, impsp, '.', color =  'k')
plt.plot(SpaintakeVal, predsp_CommExps, '.', color =  'g')
plt.fill_between(SpaintakeVal, np.asarray(impsp) + st_dev_CommExps_sp, np.asarray(impsp) - st_dev_CommExps_sp, alpha=0.2)
plt.xticks(SpaintakeVal, rotation=85)
plt.title('Prediction Interval (€) - Spain')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'CommExps' per-day sums for the US, rounded to 2 decimals.
st_dev_CommExps_us = round(metrics.mean_squared_error(y_true=impus, y_pred=predus_CommExps) ** 0.5, 2)

In [None]:
# US, committee model: real sums (black), predictions (green) and the real ± RMSE band.
# impus is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(UsatakeVal, impus, '.', color = 'k')
plt.plot(UsatakeVal, predus_CommExps, '.', color = 'g')
plt.fill_between(UsatakeVal, (np.asarray(impus) + st_dev_CommExps_us), (np.asarray(impus) - st_dev_CommExps_us), alpha=0.2)
plt.xticks(UsatakeVal, rotation=85)
plt.title('Prediction Interval (€) - US')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE between real and 'CommExps' per-day sums for Italy, rounded to 2 decimals.
st_dev_CommExps_ita = round(metrics.mean_squared_error(y_true=impita, y_pred=predita_CommExps) ** 0.5, 2)

In [None]:
# Italy, committee model: real sums (black), predictions (green) and the real ± RMSE band.
# impita is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(ItatakeVal, impita, '.', color = 'k')
plt.plot(ItatakeVal, predita_CommExps, '.', color = 'g')
plt.fill_between(ItatakeVal, (np.asarray(impita) + st_dev_CommExps_ita), (np.asarray(impita) - st_dev_CommExps_ita), alpha=0.2)
plt.xticks(ItatakeVal, rotation=85)
plt.title('Prediction Interval (€) - Italy')
plt.legend(['Real Import', 'Prediction', 'Prediction Interval'])
plt.show()

In [None]:
# RMSE of the committee model per country.
for country_label, country_rmse in (('Spain: ', st_dev_CommExps_sp), ('US: ', st_dev_CommExps_us), ('Italy: ', st_dev_CommExps_ita)):
    print(country_label, country_rmse)

In [None]:
# Spain: real sums against every model, with the real ± committee-RMSE band.
# impsp is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(SpaintakeVal, impsp, '.', color =  'k')
plt.plot(SpaintakeVal, predsp, '.', color = 'r')
plt.plot(SpaintakeVal, predsp_xgboost, '.', color =  'b')
plt.plot(SpaintakeVal, predsp_nn, '.', color =  'y')
plt.plot(SpaintakeVal, predsp_lstm, '.', color =  'c')
plt.plot(SpaintakeVal, predsp_CommExps, '.', color =  'g')
plt.fill_between(SpaintakeVal, np.asarray(impsp) + st_dev_CommExps_sp, np.asarray(impsp) - st_dev_CommExps_sp, alpha=0.2)
plt.xticks(SpaintakeVal, rotation=85)
plt.title('Prediction Interval (€) - Spain')
plt.legend(['Real Import', 'MLR', 'XGBoost', 'MLP Regressor', 'LSTM', 'Hybrid approach'])
plt.show()

In [None]:
# US: real sums against every model, with the real ± committee-RMSE band.
# impus is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(UsatakeVal, impus, '.', color =  'k')
plt.plot(UsatakeVal, predus, '.', color = 'r')
plt.plot(UsatakeVal, predus_xgboost, '.', color =  'b')
plt.plot(UsatakeVal, predus_nn, '.', color =  'y')
plt.plot(UsatakeVal, predus_lstm, '.', color =  'c')
plt.plot(UsatakeVal, predus_CommExps, '.', color =  'g')
plt.fill_between(UsatakeVal, np.asarray(impus) + st_dev_CommExps_us, np.asarray(impus) - st_dev_CommExps_us, alpha=0.2)
plt.xticks(UsatakeVal, rotation=85)
plt.title('Prediction Interval (€) - US')
plt.legend(['Real Import', 'MLR', 'XGBoost', 'MLP Regressor', 'LSTM', 'Hybrid approach'])
plt.show()

In [None]:
# Italy: real sums against every model, with the real ± committee-RMSE band.
# impita is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
plt.plot(ItatakeVal, impita, '.', color =  'k')
plt.plot(ItatakeVal, predita, '.', color = 'r')
plt.plot(ItatakeVal, predita_xgboost, '.', color =  'b')
plt.plot(ItatakeVal, predita_nn, '.', color =  'y')
plt.plot(ItatakeVal, predita_lstm, '.', color =  'c')
plt.plot(ItatakeVal, predita_CommExps, '.', color =  'g')
plt.fill_between(ItatakeVal, np.asarray(impita) + st_dev_CommExps_ita, np.asarray(impita) - st_dev_CommExps_ita, alpha=0.2)
plt.xticks(ItatakeVal, rotation=85)
plt.title('Prediction Interval (€) - Italy')
plt.legend(['Real Import', 'MLR', 'XGBoost', 'MLP Regressor', 'LSTM', 'Hybrid approach'])
plt.show()

In [None]:
# Committee-model RMSE for Spain, US and Italy, shown one after another.
display(st_dev_CommExps_sp, st_dev_CommExps_us, st_dev_CommExps_ita)

In [None]:
# Upper and lower edges of the Italian band (real sums ± committee RMSE).
# impita is a list, so it is converted with np.asarray before the offset is
# applied; list ± scalar raises TypeError when the RMSE is a plain float.
display((np.asarray(impita) + st_dev_CommExps_ita))
display((np.asarray(impita) - st_dev_CommExps_ita))

In [None]:
# Spain: percentage by which each committee prediction falls outside the
# band real ± RMSE. Only predictions below the band feed the mean (unchanged).
mean = []
for p, r in zip(predsp_CommExps, impsp):
    if ((r + st_dev_CommExps_sp) < p):
        # Fixed: the "* 100, 2" was missing here, so the value was rounded to
        # an integer fraction instead of a percentage with two decimals.
        print('Deviance: ', round((1 - ((r + st_dev_CommExps_sp) / p)) * 100, 2))
    if ((r - st_dev_CommExps_sp) > p):
        deviance = round((1 - (p / (r-st_dev_CommExps_sp))) * 100, 2)
        print('Deviance: ', deviance)
        mean.append(deviance)
# No prediction below the band means no deviance; avoid ZeroDivisionError.
print('Mean error: ', round(sum(mean)/len(mean), 2) if mean else 0.0)

In [None]:
# US: percentage by which committee predictions fall below the band
# real ± RMSE; predictions above the band are printed as-is.
mean = []
for p, r in zip(predus_CommExps, impus):
    if ((r + st_dev_CommExps_us) < p):
        print(p)
    if ((r - st_dev_CommExps_us) > p):
        deviance = round((1 - (p / (r-st_dev_CommExps_us))) * 100, 2)
        print('Deviance: ', deviance)
        mean.append(deviance)
# No prediction below the band means no deviance; avoid ZeroDivisionError.
print('Mean error: ', round(sum(mean)/len(mean), 2) if mean else 0.0)

In [None]:
# Italy: percentage by which committee predictions fall below the band
# real ± RMSE; predictions above the band are printed as-is.
mean = []
for p, r in zip(predita_CommExps, impita):
    if ((r + st_dev_CommExps_ita) < p):
        print(p)
    if ((r - st_dev_CommExps_ita) > p):
        deviance = round((1 - (p / (r-st_dev_CommExps_ita))) * 100, 2)
        print('Deviance: ', deviance)
        mean.append(deviance)
# No prediction below the band means no deviance; avoid ZeroDivisionError.
print('Mean error: ', round(sum(mean)/len(mean), 2) if mean else 0.0)

In [None]:
# Show the raw committee predictions as a NumPy array.
results['CommExps'].values

In [87]:
from scipy import stats

# Shapiro-Wilk normality test on the committee predictions; H0 is kept when p > 0.05.
shapiro_test = stats.shapiro(results['CommExps'].values)
print('Statistics=%.3f, p=%.3f' % (shapiro_test.statistic, shapiro_test.pvalue))
print(
    'Sample looks Gaussian (fail to reject H0)'
    if shapiro_test.pvalue > 0.05
    else 'Sample does not look Gaussian (reject H0)'
)