## Trees, Ensembles and XGBoost

    -- Introduction
    -- How do they work?
    -- What are Trees useful for?

## Imports and initialization

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
# Enlarge the default figure size (width, height) so the plots below are readable.
matplotlib.rcParams.update({'figure.figsize': [20, 10]})
# Base URL that the notebook's data file paths are appended to.
colab_path = "https://raw.githubusercontent.com/poornagurram/TimeSeriesAnalysis_ODSC_2019/master/"

## Data Preparation

In [0]:
# Full URL of the gdp_uk.csv file, resolved against the base URL defined above.
gdp_csv_url = colab_path + 'data/gdp_uk.csv'
data = pd.read_csv(gdp_csv_url)

In [0]:
# Plot the raw series: 'value' on the y-axis against 'year' on the x-axis.
data.plot(x='year', y='value')

In [0]:
# Ratio of each observation to the previous one; the first row has no
# predecessor, so its ratio is NaN.
value_ratio = data['value'] / data['value'].shift(1)
# Log growth rate: log(value_t / value_{t-1}).
data['gdp_growth'] = np.log(value_ratio)
# Binary direction label: 1 when the value rose, 0 otherwise
# (a NaN ratio compares False and therefore maps to 0).
data['is_inc'] = np.where(value_ratio > 1, 1, 0)

In [0]:
# Preview the first 10 rows, including the derived 'gdp_growth' and 'is_inc' columns.
data.head(10)

In [0]:
# Add the growth rate observed 1 to 5 rows earlier as separate columns
# (gdp_growth_lag_1 ... gdp_growth_lag_5).
growth_series = data['gdp_growth']
for periods_back in range(1, 6):
    data[f'gdp_growth_lag_{periods_back}'] = growth_series.shift(periods_back)

In [0]:
# Drop, in place, every row that has a missing value in any column; the
# shift-based columns above leave NaN in the leading rows.
data.dropna(axis=0, how='any', inplace=True)

In [0]:
# Columns carried into the modelling frame: the year, the five lagged growth
# rates, the continuous growth rate and its binary direction label.
modelling_columns = [
    'year',
    'gdp_growth_lag_1',
    'gdp_growth_lag_2',
    'gdp_growth_lag_3',
    'gdp_growth_lag_4',
    'gdp_growth_lag_5',
    'gdp_growth',
    'is_inc',
]
# Work on a copy so that columns added to df later do not touch `data`.
df = data.loc[:, modelling_columns].copy()

In [0]:
# Preview the first 10 rows of the modelling frame.
df.head(10)

In [0]:
# Model inputs: the five lagged growth-rate columns.
features_columns = [
    'gdp_growth_lag_1',
    'gdp_growth_lag_2',
    'gdp_growth_lag_3',
    'gdp_growth_lag_4',
    'gdp_growth_lag_5',
]
# Name of the label column used when fitting the classifier.
target = 'is_inc'

## Trees & XGBoost

In [0]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [0]:
# XGBoost classifier constructed with max_depth=5.
model = xgb.XGBClassifier(max_depth=5)
# Alternative tree-based classifiers; uncomment one of the lines below to
# override the XGBoost model assigned above.
# model = RandomForestClassifier(n_estimators=20, max_depth=5)
# model = DecisionTreeClassifier(max_depth=5)

## Train

In [0]:
# Chronological split: rows before the split year are used for training,
# rows from the split year onward are held out for testing.
split_year = 1990
year_col = df['year']
train_df = df.loc[year_col < split_year].copy()
test_df = df.loc[year_col >= split_year].copy()

In [0]:
# Fit on the training rows only: feature columns as inputs, `target` column as label.
train_features = train_df[features_columns]
train_labels = train_df[target]
model.fit(train_features, train_labels)

In [0]:
# Importance scores from the fitted model; presumably one value per entry of
# features_columns, in the same order (verify against the library docs).
model.feature_importances_

## Test

In [0]:
# Predict the direction label for every row of df (training rows included)
# and, separately, for the held-out rows only.
all_rows_pred = model.predict(df[features_columns])
held_out_pred = model.predict(test_df[features_columns])
df['is_inc_pred'] = all_rows_pred
test_df['is_inc_pred'] = held_out_pred

In [0]:
from sklearn.metrics import accuracy_score

In [0]:
# Accuracy on the held-out rows (year >= 1990) only.
accuracy_score(y_true=test_df['is_inc'], y_pred=test_df['is_inc_pred'])

In [0]:
# Accuracy over all rows of df. This includes the rows the model was trained
# on, so it is not an out-of-sample estimate.
accuracy_score(y_true=df['is_inc'], y_pred=df['is_inc_pred'])

## Regressor

In [0]:
# Rebind `model` to an XGBoost regressor with default settings; the classifier
# fitted earlier is no longer reachable through this name.
model = xgb.XGBRegressor()

In [0]:
# List the attribute and method names available on the regressor object.
dir(model)

In [0]:
# The regression target is the continuous growth rate rather than the binary label.
growth_train_target = train_df['gdp_growth']
model.fit(train_df[features_columns], growth_train_target)

In [0]:
# Predict the growth rate for every row of df (training rows included) and,
# separately, for the held-out rows only.
all_rows_growth_pred = model.predict(df[features_columns])
held_out_growth_pred = model.predict(test_df[features_columns])
df['gdp_growth_pred'] = all_rows_growth_pred
test_df['gdp_growth_pred'] = held_out_growth_pred

In [0]:
# Predicted vs. actual growth rate over all years (training and test rows).
df.plot(x='year', y=['gdp_growth_pred', 'gdp_growth'])

In [0]:
# Predicted vs. actual growth rate over the held-out years only.
test_df.plot(x='year', y=['gdp_growth_pred', 'gdp_growth'])

## Gotchas with Trees

    -- Do not capture linear relationships well (predictions are piecewise constant, so trees cannot extrapolate a trend)
    -- Trees have no inherent notion of time ordering, so the time-series structure must be supplied explicitly as features (e.g. lagged values)
    -- Work wonderfully for structured data
    -- One-hot encoding of categorical features is mandatory (otherwise an ordering is assumed for integer-coded categories)