In [1]:
upstream = ['calculateNetworkStats', 'createTimeSeries'] # this means: execute raw.py, then clean.py
product = None

In [2]:
# Parameters
upstream = {"calculateNetworkStats": {"nb": "C:\\Users\\yosty\\Desktop\\Desktop_Folder\\14 - git\\timeSeriesDOTS\\ploomber\\dots\\00-data\\calculateNetworkStats.ipynb", "DOTSnetStats": "C:\\Users\\yosty\\Desktop\\Desktop_Folder\\14 - git\\timeSeriesDOTS\\ploomber\\dots\\00-data\\clean\\DOTSnetStats.csv"}, "createTimeSeries": {"nb": "C:\\Users\\yosty\\Desktop\\Desktop_Folder\\14 - git\\timeSeriesDOTS\\ploomber\\dots\\00-data\\createTimeSeries.ipynb", "dotsTimeSeries": "C:\\Users\\yosty\\Desktop\\Desktop_Folder\\14 - git\\timeSeriesDOTS\\ploomber\\dots\\00-data\\clean\\dotsTimeSeries.csv"}}
product = {"nb": "C:\\Users\\yosty\\Desktop\\Desktop_Folder\\14 - git\\timeSeriesDOTS\\ploomber\\dots\\01-timeSeries\\xgboostWindow.ipynb"}


   In [previous work](https://rcyost.github.io/DOTS-network) I've calculated some basic network statistics on IMF Direction of Trade Statistics (DOTS) export data.

   I looked for relevant features using univariate linear regression [in this notebook](https://rcyost.github.io/network-feature-engineering-trade)

   In this notebook I'll use XGBoost.

  Table of Contents:
   1. Load and clean data
   2. For each trade series, XGBoost export series against the exporter's network statistics
      - This could be re-run on importer's statistics
      - Recalculate the network with edges as nodes: [example](https://youtu.be/p5LO97n3llg?t=235)
   3. Sort by mean absolute error.
   4. Collapse network statistics with PCA, repeat 2,3,4 on PCA series


   ### 1. Load and clean data

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


from numpy import absolute
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm

from math import ceil

import pickle

  from pandas import MultiIndex, Int64Index


In [4]:

timeSeries=(pd.read_csv(upstream['createTimeSeries']['dotsTimeSeries'])
    .pivot_table(index='period', columns=['ReferenceArea', 'CounterpartReferenceArea'], values='value')
)
# timeSeries=(pd.read_csv('dotsTimeSeries.csv')
#     .pivot_table(index='period', columns=['ReferenceArea', 'CounterpartReferenceArea'], values='value')
# )


tsPctChange=np.log(timeSeries).pct_change().iloc[1:].dropna(axis=1)
tsPctChange.columns=['-'.join(col) for col in tsPctChange.columns]
tsPctChange[tsPctChange>1.5]=np.nan
tsPctChange[tsPctChange<-1.5]=np.nan
tsPctChange=tsPctChange.dropna(axis=1)
tsPctChange.index=pd.to_datetime(tsPctChange.index)
tsPctChange=tsPctChange[tsPctChange.index > '1985-01-01']

netStats=pd.read_csv(upstream['calculateNetworkStats']['DOTSnetStats']).drop(['Unnamed: 0', 'CONNECTIVITY', 'HAS_BRIDGE', 'TOTAL_NET_VALUE', 'PAGERANK_NUMPY'],axis=1)
# netStats=pd.read_csv('DOTSnetStats.csv').drop(['Unnamed: 0', 'CONNECTIVITY', 'HAS_BRIDGE', 'TOTAL_NET_VALUE', 'PAGERANK_NUMPY'],axis=1)
netStats.set_index(['index', 'PERIOD'], inplace=True)
# get to period index and econ, stats cols
netStatsWide=(netStats
.reset_index()
.melt(id_vars=['index', 'PERIOD'])
.pivot_table(index='PERIOD', columns=['index', 'variable'], values='value')
)
netStatsWide.index = pd.to_datetime(netStatsWide.index)
netStatsWidePctChange=netStatsWide.pct_change().iloc[1:].dropna(axis=1)
netStatsWidePctChange.index=pd.to_datetime(netStatsWidePctChange.index)
netStatsWidePctChange=netStatsWidePctChange[netStatsWidePctChange.index > '1985-01-01']

In [5]:

# lag the net stats to not leak information
netStatsWidePctChange=netStatsWidePctChange.shift(-1).iloc[:-1]
# take off a period of time series so sizes match
tsPctChange=tsPctChange.iloc[:-1]

In [6]:
netStats.corr()

Unnamed: 0,DEGREE,IN_DEGREE,OUT_DEGREE,DEGREE_CENTRALITY,IN_DEGREE_CENTRALITY,OUT_DEGREE_CENTRALITY,AVG_NEIGHBOR_DEGREE,PAGERANK,KATZ,CLOSENESS_CENTRALITY,BETWEENNESS_CENTRALITY,CLUSTCOEF,NUM_NODES,NUM_EDGES,AVERAGECLUSTCOEF,TRIANGLES
DEGREE,1.0,0.962988,0.977184,0.992434,0.963476,0.955832,0.493826,0.406518,-0.132791,0.835492,0.555967,-0.498091,0.613884,0.631775,-0.188338,0.963967
IN_DEGREE,0.962988,1.0,0.883767,0.941548,0.993526,0.847207,0.52092,0.371575,-0.139517,0.887069,0.473481,-0.39551,0.676223,0.69593,-0.207463,0.955192
OUT_DEGREE,0.977184,0.883767,1.0,0.980944,0.889714,0.991741,0.446807,0.412921,-0.120589,0.751422,0.592064,-0.553033,0.532852,0.548381,-0.163477,0.920773
DEGREE_CENTRALITY,0.992434,0.941548,0.980944,1.0,0.955052,0.974949,0.451287,0.453122,-0.149922,0.806,0.621666,-0.556223,0.546003,0.562473,-0.190481,0.929922
IN_DEGREE_CENTRALITY,0.963476,0.993526,0.889714,0.955052,1.0,0.865192,0.479139,0.419466,-0.159216,0.871295,0.534531,-0.442852,0.615455,0.63402,-0.214711,0.93306
OUT_DEGREE_CENTRALITY,0.955832,0.847207,0.991741,0.974949,0.865192,1.0,0.403864,0.451742,-0.134137,0.709634,0.650507,-0.608596,0.461798,0.475728,-0.161105,0.872909
AVG_NEIGHBOR_DEGREE,0.493826,0.52092,0.446807,0.451287,0.479139,0.403864,1.0,-0.122871,-0.043899,0.699419,-0.031501,0.006313,0.775069,0.797217,-0.337722,0.489305
PAGERANK,0.406518,0.371575,0.412921,0.453122,0.419466,0.451742,-0.122871,1.0,-0.040801,0.222944,0.763224,-0.448343,-0.061415,-0.058331,0.012322,0.290918
KATZ,-0.132791,-0.139517,-0.120589,-0.149922,-0.159216,-0.134137,-0.043899,-0.040801,1.0,-0.085656,-0.075148,0.09084,-0.019724,-0.016766,0.017145,-0.097818
CLOSENESS_CENTRALITY,0.835492,0.887069,0.751422,0.806,0.871295,0.709634,0.699419,0.222944,-0.085656,1.0,0.34798,-0.275433,0.817167,0.838388,-0.469232,0.818362


In [7]:
netStatsWidePctChange.head()

index,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,...,"Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep."
variable,AVERAGECLUSTCOEF,CLOSENESS_CENTRALITY,CLUSTCOEF,DEGREE,DEGREE_CENTRALITY,IN_DEGREE,IN_DEGREE_CENTRALITY,KATZ,NUM_EDGES,NUM_NODES,...,CLUSTCOEF,DEGREE,DEGREE_CENTRALITY,IN_DEGREE,IN_DEGREE_CENTRALITY,KATZ,NUM_EDGES,NUM_NODES,PAGERANK,TRIANGLES
PERIOD,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1985-02-01,0.002301,0.0,-0.007292,0.052632,0.052632,0.034483,0.034483,-0.298043,0.009121,0.0,...,-0.002829,0.145455,0.145455,0.189189,0.189189,0.682972,0.009121,0.0,0.083911,0.316947
1985-03-01,0.011363,0.006944,0.023946,-0.083333,-0.083333,0.0,0.0,0.968943,0.001928,0.0,...,0.003962,-0.047619,-0.047619,-0.068182,-0.068182,-0.108051,0.001928,0.0,-0.274875,-0.085462
1985-04-01,-0.000926,0.003484,0.017832,0.036364,0.036364,0.033333,0.033333,0.078214,0.007457,0.0,...,0.001344,0.033333,0.033333,0.04878,0.04878,-0.359584,0.007457,0.0,0.049424,0.105263
1985-05-01,-0.008471,-0.020478,-0.013485,-0.070175,-0.070175,-0.16129,-0.16129,-0.125089,-0.00788,0.0,...,0.005263,-0.064516,-0.064516,-0.093023,-0.093023,-0.336619,-0.00788,0.0,0.207353,-0.168124
1985-06-01,0.005615,0.017361,-0.024007,0.09434,0.09434,0.192308,0.192308,-0.114136,0.010108,0.0,...,0.007962,0.051724,0.051724,0.051282,0.051282,0.441468,0.010108,0.0,0.156956,0.089953


In [8]:
netStatsWidePctChange.corr()

Unnamed: 0_level_0,index,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,...,"Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep.","Yemen, P.D. Rep."
Unnamed: 0_level_1,variable,AVERAGECLUSTCOEF,CLOSENESS_CENTRALITY,CLUSTCOEF,DEGREE,DEGREE_CENTRALITY,IN_DEGREE,IN_DEGREE_CENTRALITY,KATZ,NUM_EDGES,NUM_NODES,...,CLUSTCOEF,DEGREE,DEGREE_CENTRALITY,IN_DEGREE,IN_DEGREE_CENTRALITY,KATZ,NUM_EDGES,NUM_NODES,PAGERANK,TRIANGLES
index,variable,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Afghanistan,AVERAGECLUSTCOEF,1.000000,0.073669,0.413637,0.061910,0.076437,-0.016576,0.002934,0.038123,0.315566,-0.156818,...,0.048834,0.028365,0.028657,-0.010274,-0.009959,-0.041185,0.284493,-0.036407,-0.092208,0.007055
Afghanistan,CLOSENESS_CENTRALITY,0.073669,1.000000,-0.018438,0.508878,0.509873,0.852449,0.857545,0.097872,0.336982,0.041482,...,-0.018475,0.061994,0.063055,0.027022,0.028106,0.018928,0.120589,-0.128162,-0.032935,0.029118
Afghanistan,CLUSTCOEF,0.413637,-0.018438,1.000000,-0.283062,-0.264100,-0.148596,-0.127168,0.032734,0.007868,-0.214551,...,0.066220,0.032527,0.032679,0.060587,0.060744,-0.054971,0.112460,-0.020927,-0.109745,0.018873
Afghanistan,DEGREE,0.061910,0.508878,-0.283062,1.000000,0.996922,0.644623,0.645099,0.010131,0.269504,0.067043,...,0.036883,0.031952,0.032331,0.004377,0.004776,0.005986,0.205111,-0.046906,0.061522,0.000550
Afghanistan,DEGREE_CENTRALITY,0.076437,0.509873,-0.264100,0.996922,1.000000,0.635419,0.642452,0.010168,0.231666,-0.010925,...,0.036926,0.032567,0.032978,0.004590,0.005020,0.005765,0.207166,-0.050575,0.060699,0.000540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Yemen, P.D. Rep.",KATZ,-0.041185,0.018928,-0.054971,0.005986,0.005765,0.011210,0.011045,-0.025253,-0.025512,0.003067,...,-0.198149,-0.097333,-0.096779,-0.040764,-0.040300,1.000000,-0.106359,-0.052708,0.103698,-0.019826
"Yemen, P.D. Rep.",NUM_EDGES,0.284493,0.120589,0.112460,0.205111,0.207166,0.145586,0.148514,0.001583,0.333857,-0.014360,...,0.133953,0.315306,0.315761,0.239801,0.240454,-0.106359,1.000000,-0.086404,0.004700,0.209635
"Yemen, P.D. Rep.",NUM_NODES,-0.036407,-0.128162,-0.020927,-0.046906,-0.050575,-0.064192,-0.068557,-0.011431,-0.038572,0.048066,...,0.057039,-0.104656,-0.113299,-0.036679,-0.045197,-0.052708,-0.086404,1.000000,0.256117,0.044395
"Yemen, P.D. Rep.",PAGERANK,-0.092208,-0.032935,-0.109745,0.061522,0.060699,0.023219,0.022310,-0.006782,0.005955,0.015312,...,-0.086453,0.199118,0.196689,0.246023,0.243752,0.103698,0.004700,0.256117,1.000000,0.262506


In [9]:
tsPctChange.head()

Unnamed: 0_level_0,Argentina-Brazil,Argentina-Chile,Argentina-Japan,Australia-Canada,Australia-China,Australia-France,Australia-Germany,Australia-India,Australia-Italy,Australia-Japan,...,United States-Switzerland,"United States-Taiwan, Province of China",United States-Thailand,United States-Trinidad and Tobago,United States-Turkey,United States-United Arab Emirates,United States-United Kingdom,United States-Uruguay,"United States-Venezuela, Bolivarian Republic",Uruguay-Germany
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-02-01,0.062522,-0.154539,-0.293125,0.14247,0.062065,-0.097768,0.065731,-0.131695,0.00017,-0.004596,...,-0.009254,-0.01109,-0.006611,-0.090844,-0.027302,0.038621,-0.008302,-0.067725,0.009637,0.028158
1985-03-01,-0.157772,-0.028198,0.374053,0.018934,-0.088413,-0.003325,-0.037304,0.080223,-0.137057,-0.023654,...,0.066524,0.014933,0.085342,0.075829,-0.105616,0.006352,0.053685,0.174717,-0.00532,-0.231935
1985-04-01,0.076867,0.130921,0.230518,-0.087285,0.164218,0.056872,-0.091518,0.128781,0.182579,0.051857,...,-0.061728,-0.040338,-0.094513,-0.015796,0.2357,-0.085926,-0.052472,-0.041805,0.017312,0.44725
1985-05-01,-0.079725,-0.085492,-0.072008,0.074769,0.035272,-0.029843,0.165752,0.010152,0.004218,-0.000145,...,0.021356,0.009626,-0.0426,0.032492,-0.043499,-0.008756,-0.004399,0.122381,0.037731,0.028077
1985-06-01,0.037981,0.148035,0.147914,0.027766,0.093546,0.115701,-0.122772,0.011665,-0.078161,0.012099,...,-0.01616,0.010196,-0.013965,-0.026642,-0.039266,0.012857,-0.013124,-0.119213,-0.034192,0.09511


In [10]:
importers=pd.Series(col.split('-')[0] for col in tsPctChange.columns).unique()
exporters=pd.Series(col.split('-')[1] for col in tsPctChange.columns).unique()
allEcons=sorted(set(list(importers) + list(exporters)))
netStats=pd.Series(col[1] for col in netStatsWidePctChange.columns).nunique()

print('The upper-bound on number of tests:', len(allEcons)*netStats)

The upper-bound on number of tests: 1328


  ## 2. Loop and XGBoost

In [11]:

# https://www.kaggle.com/felipefiorini/xgboost-hyper-parameter-tuning
# https://www.kaggle.com/felipefiorini/xgboost-hyper-parameter-tuning/notebook

def hyperParameterTuning(X_train, y_train):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500],
        'objective': ['reg:squarederror']
    }

    xgb_model = xgb.XGBRegressor()

    gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,
                           #scoring = 'neg_mean_absolute_error', #MAE
                           #scoring = 'neg_mean_squared_error',  #MSE
                           cv = 5,
                           n_jobs = -1,
                           verbose = 0)

    gsearch.fit(X_train,y_train)

    return(gsearch.best_params_)

In [12]:

# https://xgboost.readthedocs.io/en/latest/python/examples/index.html
# https://xgboost.readthedocs.io/en/stable/parameter.html
# https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn

# https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning

results={}

econs=pd.Series(col for col in tsPctChange.columns).unique()
tempSeries='Argentina-Brazil'

# create dataset
# network statistics
X=netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == tempSeries.split('-')[0] or col[0] == tempSeries.split('-')[1]]]
X.columns=["-".join(col) for col in X.columns]
X_temp=X

# bilateral trade series
y=tsPctChange[[tempSeries]]

# if there is data for model
if not X_temp.empty and not y.empty:
    results[tempSeries]={}
    results[tempSeries]['y_std']=y.std()
    results[tempSeries]['series']=tempSeries
    X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size=0.1, shuffle=False)
    results[tempSeries]['y_test_std']=y_test.std()

    #bestParams=hyperParameterTuning(X_train, y_train)

    bst = xgb.XGBRegressor(
        objective = 'reg:squarederror',
        colsample_bytree = .05,
        learning_rate = .01,
        max_depth = 3,
        min_child_weight = 5,
        n_estimators = 500,
        subsample = .5,
        nthread=4)

    #results[tempSeries]['bestParams']=bestParams

    bst.fit(X_train, y_train)

    results[tempSeries]['model']=bst

    y_pred = bst.predict(X_test)

    mse=mean_squared_error(y_test, y_pred)
    results[tempSeries]['mse']=mse

    results[tempSeries]['data']=[X_train, X_test, y_train, y_test, y_pred]


    importances=['weight', 'gain', 'cover']
    for importance in importances:
        results[tempSeries][importance]=(bst.get_booster().get_score(importance_type=importance))

In [13]:

results

{'Argentina-Brazil': {'y_std': Argentina-Brazil    0.042695
  dtype: float64,
  'series': 'Argentina-Brazil',
  'y_test_std': Argentina-Brazil    0.025175
  dtype: float64,
  'model': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=0.05,
               enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,
               interaction_constraints='', learning_rate=0.01, max_delta_step=0,
               max_depth=3, min_child_weight=5, missing=nan,
               monotone_constraints='()', n_estimators=500, n_jobs=4, nthread=4,
               num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
               reg_lambda=1, scale_pos_weight=1, subsample=0.5,
               tree_method='exact', validate_parameters=1, verbosity=None),
  'mse': 0.0006461496763945248,
  'data': [            Argentina-AVERAGECLUSTCOEF  Argentina-CLOSENESS_CENTRALITY  \
   PERIOD                                   