# Import 

In [1]:
# STANDARD PACKAGES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
from datetime import datetime as dt
import time
import json
import random
from tqdm import tqdm #https://pypi.org/project/tqdm/#ipython-jupyter-integration
from functools import reduce
import pickle
import itertools
import warnings
import platform
import multiprocessing as mp


# SCRAPE PACKAGES
import requests
from bs4 import BeautifulSoup
# from pytrends.request import TrendReq #pip install pytrends

# MODEL PACKAGES
    #SKLEARN
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from statsmodels.tsa.x13 import x13_arima_analysis as x13

# CUSTOM FUNCTIONS

import os
import sys
currentdir = os.path.dirname(os.path.realpath('analysis_DK'))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from func import (chunks, reindex, global_id, term_list, time_corr_plot, rmse, time_variable_plot, find_highest_corr, test_train_split, test_train_split_Q,
                  bootstrap_all_windows, bootstrap_n_samples, bootstrap_sample, final_model, final_model_boot, ar_1, grid_bestpar, tuning_window, tuning_window_mp, tuning_window_bestpar,
                  model_tuning, seasadj, seasadj_col_list, abs_percentage_change, add_poly_terms, create_interaction, GT_dict)

# Final model

## Baseline -  AR(1) 

### Subset data 

In [52]:
df_analysis = pd.read_csv('data/descriptive/df_descriptive.csv', parse_dates=['date'])

In [53]:
df_analysis = df_analysis[df_analysis.country == 'SE']

In [54]:
df_analysis = df_analysis[~(df_analysis.ID == 'Jämtland')]

In [55]:
# Remove the Norwegian and Danish Google Trends series plus the now-constant
# 'country' column; only the columns that remain are used for the Swedish sample.
gt_no_cols = ['GT_NO_{}'.format(i) for i in (0, 1, 2, 6, 7, 8, 10, 11)]
gt_dk_cols = ['GT_DK_{}'.format(i) for i in [0] + list(range(2, 22))]
df_analysis = df_analysis.drop(columns=gt_no_cols + gt_dk_cols + ['country'])

In [56]:
# Renumber the rows 0..n-1 after the country/region filters above.
# NOTE(review): without drop=True the previous row labels are kept as a new
# 'index' column, and the next cell writes that column to df_analysis.csv —
# confirm this is intended before changing it.
df_analysis.reset_index(inplace = True)

In [57]:
df_analysis.to_csv('data/descriptive/df_analysis.csv', index = False)

In [58]:
df_analysis.groupby(['ID'])['population'].mean().round()

ID
Blekinge             154134.0
Dalarna              278771.0
Gotland               57479.0
Gävleborg            278814.0
Halland              305064.0
Jönköping            341693.0
Kalmar               235971.0
Kronoberg            187031.0
Norrbotten           249896.0
Skåne               1262614.0
Stockholm           2114130.0
Södermanland         275851.0
Uppsala              342115.0
Värmland             274999.0
Västerbotten         261286.0
Västernorrland       243523.0
Västmanland          257907.0
Västra Götalands    1606265.0
Örebro               284760.0
Östergötland         435579.0
Name: population, dtype: float64

### Preprocessing

#### Transform relevant columns to abs change except those with M_ and ID_, date and t

In [47]:
df_analysis['target_actual'] = df_analysis.groupby(['ID'])['target_actual'].diff()

### Lagged variables

In [48]:
df_analysis['target_lag'] = df_analysis.groupby(['ID'])['target_actual'].shift(1)

#### Creating dummies from categorical variables - remember to drop the reference category (done after change is constructed)

In [246]:
df_analysis = pd.get_dummies(df_analysis, prefix=['ID'], prefix_sep='_', columns=['ID']).copy()

#### Adding interaction terms by regions and all variables

In [247]:
# relevant interaction variables
interaction_1 = ['target_lag'] 

interaction_2 = [item for item in df_analysis if item.startswith('ID_')]

In [248]:
# Multiply every lag variable with every region dummy, so that each region
# gets its own lag column named "<lag>*<ID_region>".
for var1, var2 in itertools.product(interaction_1, interaction_2):
    name = "*".join((var1, var2))
    df_analysis[name] = df_analysis[var1] * df_analysis[var2]

#### Drop na

In [249]:
df_analysis.dropna(inplace=True)

In [250]:
df_analysis.sort_index(axis=1, inplace=True)

In [251]:
window = 11
testsize = 1
valsize = 1
rolling_window = True
params = []
n_components = []

#### Select relevant columns

In [253]:
# Baseline AR(1) design: the date, the target and one first-lag column per region.
# .copy() makes df_ar an independent frame rather than a possible view of
# df_analysis, so the later in-place 'date' conversion does not raise
# SettingWithCopyWarning.
df_ar = df_analysis[['date', 'target_actual', 
                     'target_lag*ID_Blekinge', 'target_lag*ID_Dalarna', 'target_lag*ID_Gotland', 'target_lag*ID_Gävleborg', 
                     'target_lag*ID_Halland', 'target_lag*ID_Jönköping', 'target_lag*ID_Kalmar', 
                     'target_lag*ID_Kronoberg', 'target_lag*ID_Norrbotten', 'target_lag*ID_Skåne', 'target_lag*ID_Stockholm', 
                     'target_lag*ID_Södermanland', 'target_lag*ID_Uppsala', 'target_lag*ID_Värmland', 'target_lag*ID_Västerbotten', 
                     'target_lag*ID_Västernorrland', 'target_lag*ID_Västmanland', 'target_lag*ID_Västra Götalands', 'target_lag*ID_Örebro', 
                     'target_lag*ID_Östergötland']].copy()

#### To Quarter data

In [254]:
# Replace each timestamp with the calendar quarter (Period, freq 'Q') it falls in.
df_ar['date'] = df_ar['date'].dt.to_period('Q')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [255]:
df_ar['date'].min()

Period('2007Q3', 'Q-DEC')

In [256]:
df_ar['date'].max()

Period('2019Q3', 'Q-DEC')

#### Subset for period to match DK

In [257]:
# Keep observations from the start of 2008 onwards. 'date' holds quarterly
# Periods at this point, so the string is presumably parsed as a period (2008Q1)
# for the comparison — TODO confirm for the pandas version in use.
# NOTE(review): this AR(1) sample starts earlier than the AR(2)/ML samples
# (which begin in 2008Q2 because of the 4-quarter lag); the run below reports
# 35 windows versus 34 in the later sections, so mean RMSEs are not computed
# over the same set of windows.
df_ar = df_ar[df_ar.date >= '01-01-2008']

### Running baseline model

In [262]:
X_train, X_val, X_test, y_train, y_val, y_test, y_dates = test_train_split_Q(df = df_ar, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

#### Concatting val and train

In [263]:
# Append each window's validation rows to its training rows; the baseline has
# no hyperparameters to tune, so the validation block is used for fitting.
for win in X_train:
    X_train[win] = np.concatenate([X_train[win], X_val[win]], axis=0)
    y_train[win] = np.concatenate([y_train[win], y_val[win]], axis=0)

#### Estimating with OLS

In [264]:
results_ols= tuning_window(X_fit = X_train, y_fit = y_train, X_test = X_test, y_test = y_test, params = params, n_components = n_components, model_str = 'ols')


Tuning params for window:   0%|          | 0/35 [00:00<?, ?it/s][A
Tuning params for window: 100%|██████████| 35/35 [00:00<00:00, 1191.33it/s][A

In [265]:
results_ols[1]['best'][1]

1.2138739638034914

#### Exporting the results

In [266]:
with open('results/final/baseline/results_ar1.pickle', 'wb') as handle:
    pickle.dump(results_ols, handle, protocol= pickle.HIGHEST_PROTOCOL)

### Check results

In [267]:
# Second element of each window's 'best' entry (the value averaged in the next cell).
temp = [window_result['best'][1] for window_result in results_ols.values()]
    

In [268]:
np.mean(temp)

1.3189239646182216

In [269]:
results_ols[1]['y_pred_dict'][results_ols[1]['best'][0]]

array([ 0.14,  0.23, -0.91,  0.26,  0.08,  0.15,  0.18,  0.25, -0.17,
        0.22,  0.14,  0.27,  0.2 ,  0.2 , -0.11,  0.16,  0.04,  0.13,
        0.03,  0.32])

## Baseline - AR(2) with $y_{t-1}$ + $y_{t-4}$ (one-year lag on quarterly data)

### Subset data 

In [270]:
df_analysis = pd.read_csv('data/descriptive/df_descriptive.csv', parse_dates=['date'])

In [271]:
df_analysis = df_analysis[df_analysis.country == 'SE']

In [272]:
df_analysis = df_analysis[~(df_analysis.ID == 'Jämtland')]

In [273]:
# Remove the Norwegian and Danish Google Trends series plus the now-constant
# 'country' column; only the columns that remain are used for the Swedish sample.
gt_no_cols = ['GT_NO_{}'.format(i) for i in (0, 1, 2, 6, 7, 8, 10, 11)]
gt_dk_cols = ['GT_DK_{}'.format(i) for i in [0] + list(range(2, 22))]
df_analysis = df_analysis.drop(columns=gt_no_cols + gt_dk_cols + ['country'])

### Preprocessing

#### Transform relevant columns to abs change except those with M_ and ID_, date and t

In [274]:
df_analysis['target_actual'] = df_analysis.groupby(['ID'])['target_actual'].diff()

### Lagged variables

In [275]:
df_analysis['target_lag'] = df_analysis.groupby(['ID'])['target_actual'].shift(1)

In [276]:
df_analysis['target_4_lag'] = df_analysis.groupby(['ID'])['target_actual'].shift(4)

#### Creating dummies from categorical variables - remember to drop the reference category (done after change is constructed)

In [277]:
df_analysis = pd.get_dummies(df_analysis, prefix=['ID'], prefix_sep='_', columns=['ID']).copy()

#### Adding interaction terms by regions and all variables

In [278]:
# relevant interaction variables
interaction_1 = ['target_lag', 'target_4_lag'] 

interaction_2 = [item for item in df_analysis if item.startswith('ID_')]

In [279]:
# One column per (lag variable, region dummy) pair, named "<lag>*<ID_region>".
for var1, var2 in itertools.product(interaction_1, interaction_2):
    name = "*".join((var1, var2))
    df_analysis[name] = df_analysis[var1] * df_analysis[var2]

#### Drop na

In [280]:
df_analysis.dropna(inplace=True)

In [281]:
df_analysis.sort_index(axis=1, inplace=True)

In [282]:
window = 11
testsize = 1
valsize = 1
rolling_window = True
params = []
n_components = []

#### Select relevant columns

In [283]:
# Baseline AR(2) design: the date, the target, and per region one first-lag
# and one fourth-lag column.
# .copy() makes df_ar an independent frame rather than a possible view of
# df_analysis, so the later in-place 'date' conversion does not raise
# SettingWithCopyWarning.
df_ar = df_analysis[['date', 'target_actual', 
                     'target_lag*ID_Blekinge', 'target_lag*ID_Dalarna', 'target_lag*ID_Gotland', 'target_lag*ID_Gävleborg', 
                     'target_lag*ID_Halland', 'target_lag*ID_Jönköping', 'target_lag*ID_Kalmar', 
                     'target_lag*ID_Kronoberg', 'target_lag*ID_Norrbotten', 'target_lag*ID_Skåne', 'target_lag*ID_Stockholm', 
                     'target_lag*ID_Södermanland', 'target_lag*ID_Uppsala', 'target_lag*ID_Värmland', 'target_lag*ID_Västerbotten', 
                     'target_lag*ID_Västernorrland', 'target_lag*ID_Västmanland', 'target_lag*ID_Västra Götalands', 'target_lag*ID_Örebro', 
                     'target_lag*ID_Östergötland',
                    'target_4_lag*ID_Blekinge', 'target_4_lag*ID_Dalarna', 'target_4_lag*ID_Gotland',
                     'target_4_lag*ID_Gävleborg', 'target_4_lag*ID_Halland', 'target_4_lag*ID_Jönköping', 
                     'target_4_lag*ID_Kalmar', 'target_4_lag*ID_Kronoberg', 'target_4_lag*ID_Norrbotten', 'target_4_lag*ID_Skåne', 
                     'target_4_lag*ID_Stockholm', 'target_4_lag*ID_Södermanland', 'target_4_lag*ID_Uppsala', 'target_4_lag*ID_Värmland',
                     'target_4_lag*ID_Västerbotten', 'target_4_lag*ID_Västernorrland', 'target_4_lag*ID_Västmanland', 
                     'target_4_lag*ID_Västra Götalands', 'target_4_lag*ID_Örebro', 'target_4_lag*ID_Östergötland']].copy()

#### To Quarter data

In [284]:
# Replace each timestamp with the calendar quarter (Period, freq 'Q') it falls in.
df_ar['date'] = df_ar['date'].dt.to_period('Q')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [285]:
df_ar['date'].min()

Period('2008Q2', 'Q-DEC')

In [286]:
df_ar['date'].max()

Period('2019Q3', 'Q-DEC')

#### Subset for period to match DK

In [287]:
df_ar = df_ar[df_ar.date >= '01-01-2008']

### Running baseline model

In [288]:
X_train, X_val, X_test, y_train, y_val, y_test, y_dates = test_train_split_Q(df = df_ar, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

#### Concatting val and train

In [289]:
# Append each window's validation rows to its training rows before the OLS fit.
for win in X_train:
    X_train[win] = np.concatenate([X_train[win], X_val[win]], axis=0)
    y_train[win] = np.concatenate([y_train[win], y_val[win]], axis=0)

#### Estimating with OLS

In [290]:
results_ols= tuning_window(X_fit = X_train, y_fit = y_train, X_test = X_test, y_test = y_test, params = params, n_components = n_components, model_str = 'ols')


Tuning params for window:   0%|          | 0/34 [00:00<?, ?it/s][A
Tuning params for window: 100%|██████████| 34/34 [00:00<00:00, 583.08it/s][A

In [291]:
results_ols[1]['best'][1]

1.0421324292046568

#### Exporting the results

In [292]:
with open('results/final/baseline/results_ar_year_lag.pickle', 'wb') as handle:
    pickle.dump(results_ols, handle, protocol= pickle.HIGHEST_PROTOCOL)

### Check results

In [293]:
# Second element of each window's 'best' entry (the value averaged in the next cell).
temp = [window_result['best'][1] for window_result in results_ols.values()]
    

In [294]:
np.mean(temp)

1.2295557185457329

In [295]:
results_ols[1]['y_pred_dict'][results_ols[1]['best'][0]]

array([-0.8 ,  0.34, -0.49,  0.19, -0.52,  0.43, -0.85,  0.04,  0.4 ,
        0.25,  0.3 , -0.02,  0.54, -2.75,  1.44,  0.26, -0.86,  0.28,
        0.42,  1.65])

## ML - Data and preprocessing 

### Import data frame with adjusted here

### Subset data 

In [35]:
df_analysis = pd.read_csv('data/descriptive/df_descriptive.csv', parse_dates=['date'])

In [36]:
df_analysis = df_analysis[df_analysis.country == 'SE']

In [37]:
df_analysis = df_analysis[~(df_analysis.ID == 'Jämtland')]

In [38]:
# Remove the Norwegian and Danish Google Trends series plus the now-constant
# 'country' column; only the columns that remain are used for the Swedish sample.
gt_no_cols = ['GT_NO_{}'.format(i) for i in (0, 1, 2, 6, 7, 8, 10, 11)]
gt_dk_cols = ['GT_DK_{}'.format(i) for i in [0] + list(range(2, 22))]
df_analysis = df_analysis.drop(columns=gt_no_cols + gt_dk_cols + ['country'])

In [39]:
print(list(df_analysis))

['date', 'target_actual', 'ID', 'jobs', 'sector_information_technology', 'sector_engineering_technology', 'sector_management_staff', 'sector_trade_service', 'sector_industry_craft', 'sector_sales_communication', 'sector_teaching', 'sector_office_finance', 'sector_social_health', 'sector_other', 'population', 'high_edu_share', 'labour_force_share', 'urban_share', 'GT_DK_1', 'GT_SE_0', 'GT_SE_1', 'GT_SE_2', 'GT_SE_5', 'GT_SE_6', 'GT_SE_7', 'GT_SE_8', 'GT_SE_9', 'GT_SE_10', 'GT_SE_11']


In [40]:
df_analysis.date.min()

Timestamp('2007-01-01 00:00:00')

In [41]:
df_analysis.date.max()

Timestamp('2019-07-01 00:00:00')

### Initial preprocessing and feature construction

- Create dummies 
- Create interaction terms

Overall quarterly time trend variable, $t=1,2,\dots,T$, counted within each `ID`:

In [42]:
# Time trend t = 1, 2, ..., T counted separately within each region ('ID'),
# following the current row order of df_analysis.
# cumcount() numbers the rows of every group from 0, hence the +1. This replaces
# the previous filter -> assign -> concat -> merge round-trip, which wrote 't'
# onto a filtered slice (SettingWithCopyWarning) and would have duplicated rows
# if a (date, ID) pair ever occurred twice.
# reset_index(drop=True) keeps the fresh 0..n-1 index that the merge used to produce.
df_analysis = (
    df_analysis
    .assign(t=df_analysis.groupby('ID').cumcount() + 1)
    .reset_index(drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#### Drop sector variables

criteria = 0.2

In [43]:
df_analysis.drop(['sector_management_staff', 'sector_trade_service', 'sector_sales_communication', 'sector_teaching', 'sector_social_health', 'sector_other'], axis = 1, inplace=True)

#### Transform relevant columns to abs change except those with M_ and ID_, date and t

In [44]:
df_analysis['target_actual'] = df_analysis.groupby(['ID'])['target_actual'].diff()

### Lagged variables

In [45]:
df_analysis['target_lag'] = df_analysis.groupby(['ID'])['target_actual'].shift(1)

NB! As this is Quarterly data one year lag is 4 quarters

In [46]:
df_analysis['target_4_lag'] = df_analysis.groupby(['ID'])['target_actual'].shift(4)

#### Create new variables with 3 month (1 quarter) lag of jobrate

In [47]:
# Job-posting variables that also enter the model with a one-period lag.
columns_1q_lag = ['jobs', 'sector_information_technology', 'sector_engineering_technology', 'sector_industry_craft','sector_office_finance']

# Shift within each region so a lag never crosses from one region into the next.
lagged = df_analysis.groupby('ID')[columns_1q_lag].shift(1)
for colname in columns_1q_lag:
    df_analysis[colname + '_1_lag'] = lagged[colname]

#### Dropping some GT variables

In [48]:
# Dropping some GT's
drop_list = ['GT_SE_6', 'GT_SE_8', 'GT_SE_10', 'GT_SE_11', 'GT_DK_1']

In [49]:
df_analysis.drop(drop_list, axis = 1, inplace=True)

#### Create new variables with 1 quarter lag of GT

In [50]:
# Remaining Swedish Google Trends series, each also added with a one-period lag.
columns_1q_lag = ['GT_SE_0', 'GT_SE_1', 'GT_SE_2', 'GT_SE_5', 'GT_SE_7', 'GT_SE_9']
# Shift within each region so a lag never crosses from one region into the next.
lagged = df_analysis.groupby('ID')[columns_1q_lag].shift(1)
for colname in columns_1q_lag:
    df_analysis[colname + '_1_lag'] = lagged[colname]

#### Quarter dummies for seasonal effects

In [51]:
# Seasonal category used for the dummy variables created in the next cell.
# NOTE(review): despite the column name this stores the *month number* of each
# date as a string (the resulting dummies are Q_4, Q_7, Q_10 with Q_1 dropped
# as reference), not the quarter number 1-4.
df_analysis['quarter'] = pd.DatetimeIndex(df_analysis['date']).month.astype(str)

#### Creating dummies from categorical variables - remember to drop the reference category (done after change is constructed)

In [52]:
df_analysis = pd.get_dummies(df_analysis, prefix=['ID','Q'], prefix_sep='_', columns=['ID', 'quarter']).copy()

#### Drop na

In [53]:
df_analysis.dropna(inplace=True)

In [54]:
df_analysis.date.max()

Timestamp('2019-07-01 00:00:00')

In [55]:
df_analysis.date.min()

Timestamp('2008-04-01 00:00:00')

#### Adding interaction terms

Polynomial features - To be deleted later

In [56]:
#df_analysis = add_poly_terms(df = df_analysis, 
#                            poly_columns = ['target_actual', 'GT_0', 'GT_1', 'GT_2', 'GT_3', 'GT_4', 'GT_5', 'GT_6', 'GT_7', 'GT_8', 'GT_9', 'GT_10', 'GT_11', 'GT_12', 'GT_13', 'GT_14', 'GT_15', 'GT_16', 'GT_17', 'GT_18', 'GT_19', 'target_lag', 'jobs', 'sector_information_technology', 'sector_engineering_technology', 'sector_management_staff', 'sector_trade_service', 'sector_industry_craft', 'sector_sales_communication', 'sector_teaching', 'sector_office_finance', 'sector_social_health', 'sector_other'])

In [57]:
#df_analysis.dropna(inplace=True)

Adding interaction terms by regions and all variables

In [58]:
# relevant interaction variables
interaction_1 = ['target_lag', 'target_4_lag'] 
# 'sector_information_technology', 'sector_engineering_technology', 'sector_management_staff', 'sector_trade_service', 'sector_industry_craft', 'sector_sales_communication', 'sector_teaching', 'sector_office_finance', 'sector_social_health', 'sector_other'

# get list of all ID area 
interaction_2 = [item for item in df_analysis if item.startswith('ID_')]

In [59]:
# One column per (lag variable, region dummy) pair, named "<lag>*<ID_region>".
for var1, var2 in itertools.product(interaction_1, interaction_2):
    name = "*".join((var1, var2))
    df_analysis[name] = df_analysis[var1] * df_analysis[var2]

#### Drop variables to not end up in dummytrap

In [60]:
df_analysis = df_analysis.drop(['ID_Blekinge', 'Q_1'], axis = 1)

In [61]:
df_analysis.drop(interaction_1, axis = 1, inplace=True)

In [62]:
df_analysis.sort_index(axis=1, inplace=True)

#### To Quarter data

In [63]:
# Replace each timestamp with the calendar quarter (Period, freq 'Q') it falls in.
df_analysis['date'] = df_analysis['date'].dt.to_period('Q')

In [64]:
df_analysis['date'].min()

Period('2008Q2', 'Q-DEC')

In [65]:
df_analysis['date'].max()

Period('2019Q3', 'Q-DEC')

#### Subset for period to match DK

In [66]:
df_analysis = df_analysis[df_analysis.date >= '01-01-2008']

In [67]:
df_analysis.columns

Index(['GT_SE_0', 'GT_SE_0_1_lag', 'GT_SE_1', 'GT_SE_1_1_lag', 'GT_SE_2',
       'GT_SE_2_1_lag', 'GT_SE_5', 'GT_SE_5_1_lag', 'GT_SE_7', 'GT_SE_7_1_lag',
       'GT_SE_9', 'GT_SE_9_1_lag', 'ID_Dalarna', 'ID_Gotland', 'ID_Gävleborg',
       'ID_Halland', 'ID_Jönköping', 'ID_Kalmar', 'ID_Kronoberg',
       'ID_Norrbotten', 'ID_Skåne', 'ID_Stockholm', 'ID_Södermanland',
       'ID_Uppsala', 'ID_Värmland', 'ID_Västerbotten', 'ID_Västernorrland',
       'ID_Västmanland', 'ID_Västra Götalands', 'ID_Örebro', 'ID_Östergötland',
       'Q_10', 'Q_4', 'Q_7', 'date', 'high_edu_share', 'jobs', 'jobs_1_lag',
       'labour_force_share', 'population', 'sector_engineering_technology',
       'sector_engineering_technology_1_lag', 'sector_industry_craft',
       'sector_industry_craft_1_lag', 'sector_information_technology',
       'sector_information_technology_1_lag', 'sector_office_finance',
       'sector_office_finance_1_lag', 't', 'target_4_lag*ID_Blekinge',
       'target_4_lag*ID_Dalarna', 'ta

#### Setting window size

In [39]:
window = 11
testsize = 1
valsize = 1
rolling_window = True

## Lasso

### Test/train data split

Data must be split non-randomly as it needs to adhere to the underlying time structure. Two distinct approaches:

1. Rolling window (fixed length)
1. Expanding window (initial length that increases with each iteration)

Each model must be run as a loop over the test/train splits. Thus, we will have multiple test/train splits for both rolling window and expanding window.

In [35]:
X_train, X_val, X_test, y_train, y_val, y_test, y_dates = test_train_split_Q(df = df_analysis, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

In [36]:
with open('results/final/y_dates.pickle', 'wb') as handle:
    pickle.dump(y_dates, handle, protocol= pickle.HIGHEST_PROTOCOL)

### Pre-processing

- Standardizing

Standardizing features for each window

In [335]:
# Per window: learn mean/scale from the training rows only, then apply the same
# transformation to the validation and test rows.
for win, train_features in list(X_train.items()):
    sc = StandardScaler().fit(train_features)
    X_train[win] = sc.transform(train_features)
    X_val[win] = sc.transform(X_val[win])
    X_test[win] = sc.transform(X_test[win])

### Training the models

#### Hyperparameter space - random

In [336]:
# Regularisation strengths: 10,000 values spaced evenly on a log scale
# between 1e-8 and 1e8.
alphas = np.logspace(-8, 8, num = 10000)
# 0.60, 0.65, ..., 0.90 — passed on to tuning_window_mp as `n_components`.
# linspace fixes the number of grid points explicitly; with np.arange and a
# float step the length depends on floating-point rounding of the endpoint.
n_components = list(np.linspace(0.60, 0.90, num = 7).round(2))
# One parameter "set" per alpha (plain scalars, not tuples).
params = list(alphas)
print('Number of param sets: '+ str(len(params)))

Number of param sets: 10000


#### Inner loop - training hyperparameter on validation

##### On random space

In [337]:
results_lasso_mp = tuning_window_mp(X_fit = X_train, y_fit = y_train, X_test = X_val, y_test = y_val, params = params, n_components = n_components, model_str = 'lasso')


  0%|          | 0/34 [00:00<?, ?it/s][A
  3%|▎         | 1/34 [01:23<45:43, 83.15s/it][A
  6%|▌         | 2/34 [02:46<44:21, 83.16s/it][A
  9%|▉         | 3/34 [03:58<41:19, 79.98s/it][A
 12%|█▏        | 4/34 [05:18<39:59, 79.99s/it][A
 15%|█▍        | 5/34 [06:45<39:40, 82.08s/it][A
 18%|█▊        | 6/34 [08:09<38:34, 82.65s/it][A
 21%|██        | 7/34 [09:40<38:19, 85.16s/it][A
 24%|██▎       | 8/34 [10:51<35:02, 80.87s/it][A
 26%|██▋       | 9/34 [12:11<33:30, 80.43s/it][A
 29%|██▉       | 10/34 [13:24<31:18, 78.28s/it][A
 32%|███▏      | 11/34 [14:31<28:45, 75.04s/it][A
 35%|███▌      | 12/34 [15:40<26:47, 73.06s/it][A
 38%|███▊      | 13/34 [16:54<25:39, 73.31s/it][A
 41%|████      | 14/34 [18:01<23:51, 71.59s/it][A
 44%|████▍     | 15/34 [19:08<22:13, 70.17s/it][A
 47%|████▋     | 16/34 [20:15<20:45, 69.18s/it][A
 50%|█████     | 17/34 [21:22<19:23, 68.44s/it][A
 53%|█████▎    | 18/34 [22:29<18:10, 68.18s/it][A
 56%|█████▌    | 19/34 [23:36<16:55, 67.71s/it]

In [338]:
with open('results/final/lasso/results_mp.pickle', 'wb') as handle:
    pickle.dump(results_lasso_mp, handle, protocol= pickle.HIGHEST_PROTOCOL)

#### Outer loop - fitting on train / test split

Importing stored results:

In [339]:
with open('results/final/lasso/results_mp.pickle', 'rb') as handle:
    results_lasso_opt = pickle.load(handle)

##### On full sample

Reloading data and concatting:

In [340]:
# Rebuild the untouched (unscaled) windows for the outer loop, since the
# arrays from the inner loop were standardised in place.
# NOTE(review): the last return value is bound to `dates` here, whereas every
# other call in this notebook binds it to `y_dates` — confirm whether `dates`
# is used anywhere or this is a naming slip.
X_train, X_val, X_test, y_train, y_val, y_test, dates = test_train_split_Q(df = df_analysis, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

In [341]:
# Merge validation rows into the training rows of every window
for win in X_train:
    X_train[win] = np.concatenate([X_train[win], X_val[win]], axis=0)
    y_train[win] = np.concatenate([y_train[win], y_val[win]], axis=0)

Standardizing features for each window

In [342]:
# Per window: scaler is fitted on the combined train+validation rows and then
# applied unchanged to the test rows.
for win, train_features in list(X_train.items()):
    sc = StandardScaler().fit(train_features)
    X_train[win] = sc.transform(train_features)
    X_test[win] = sc.transform(X_test[win])

In [343]:
results_final = final_model(inner_results=results_lasso_opt, X_fit = X_train, y_fit = y_train, X_test = X_test, y_test = y_test, model_str = 'lasso')


  0%|          | 0/34 [00:00<?, ?it/s][A
 68%|██████▊   | 23/34 [00:00<00:00, 223.86it/s][A
100%|██████████| 34/34 [00:00<00:00, 215.27it/s][A

#### Exporting final results

In [344]:
with open('results/final/lasso/results_final.pickle', 'wb') as handle:
    pickle.dump(results_final, handle, protocol= pickle.HIGHEST_PROTOCOL)

In [345]:
# Average, over all windows, of the second element of each 'best_rmse' entry.
temp = [window_result['best_rmse'][1] for window_result in results_final.values()]
np.mean(temp)

1.2144491919547746

## Ridge

In [346]:
X_train, X_val, X_test, y_train, y_val, y_test, y_dates = test_train_split_Q(df = df_analysis, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

### Pre-processing

- Standardizing

Standardizing features for each window

In [347]:
# Per window: learn mean/scale from the training rows only, then apply the same
# transformation to the validation and test rows.
for win, train_features in list(X_train.items()):
    sc = StandardScaler().fit(train_features)
    X_train[win] = sc.transform(train_features)
    X_val[win] = sc.transform(X_val[win])
    X_test[win] = sc.transform(X_test[win])

### Training the models

#### Hyperparameter space - random

In [348]:
# Regularisation strengths: 10,000 values spaced evenly on a log scale
# between 1e-8 and 1e8.
alphas = np.logspace(-8, 8, num = 10000)
# 0.60, 0.65, ..., 0.90 — passed on to tuning_window_mp as `n_components`.
# linspace fixes the number of grid points explicitly; with np.arange and a
# float step the length depends on floating-point rounding of the endpoint.
n_components = list(np.linspace(0.60, 0.90, num = 7).round(2))
# One parameter "set" per alpha (plain scalars, not tuples).
params = list(alphas)
print('Number of param sets: '+ str(len(params)))

Number of param sets: 10000


#### Inner loop - training hyperparameter on validation

##### On random space

In [349]:
results_mp = tuning_window_mp(X_fit = X_train, y_fit = y_train, X_test = X_val, y_test = y_val, params = params, n_components = n_components, model_str = 'ridge')


  0%|          | 0/34 [00:00<?, ?it/s][A
  3%|▎         | 1/34 [01:14<40:51, 74.28s/it][A
  6%|▌         | 2/34 [02:25<39:12, 73.50s/it][A
  9%|▉         | 3/34 [03:38<37:50, 73.25s/it][A
 12%|█▏        | 4/34 [04:52<36:44, 73.48s/it][A
 15%|█▍        | 5/34 [06:04<35:19, 73.07s/it][A
 18%|█▊        | 6/34 [07:19<34:23, 73.68s/it][A
 21%|██        | 7/34 [08:47<34:58, 77.72s/it][A
 24%|██▎       | 8/34 [09:59<32:56, 76.01s/it][A
 26%|██▋       | 9/34 [11:11<31:14, 74.99s/it][A
 29%|██▉       | 10/34 [12:29<30:21, 75.91s/it][A
 32%|███▏      | 11/34 [13:39<28:23, 74.05s/it][A
 35%|███▌      | 12/34 [14:51<26:57, 73.53s/it][A
 38%|███▊      | 13/34 [16:04<25:38, 73.27s/it][A
 41%|████      | 14/34 [17:13<23:59, 71.95s/it][A
 44%|████▍     | 15/34 [18:25<22:49, 72.06s/it][A
 47%|████▋     | 16/34 [19:39<21:48, 72.68s/it][A
 50%|█████     | 17/34 [20:50<20:25, 72.08s/it][A
 53%|█████▎    | 18/34 [21:59<18:59, 71.20s/it][A
 56%|█████▌    | 19/34 [23:10<17:48, 71.22s/it]

In [350]:
with open('results/final/ridge/results_mp.pickle', 'wb') as handle:
    pickle.dump(results_mp, handle, protocol= pickle.HIGHEST_PROTOCOL)

#### Outer loop - fitting on train / test split

Importing stored results:

In [351]:
with open('results/final/ridge/results_mp.pickle', 'rb') as handle:
    results_opt = pickle.load(handle)

##### On full sample

Reloading data and concatting:

In [352]:
X_train, X_val, X_test, y_train, y_val, y_test, y_dates = test_train_split_Q(df = df_analysis, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

In [353]:
# Merge validation rows into the training rows of every window
for win in X_train:
    X_train[win] = np.concatenate([X_train[win], X_val[win]], axis=0)
    y_train[win] = np.concatenate([y_train[win], y_val[win]], axis=0)

Standardizing features for each window

In [354]:
# Per window: scaler is fitted on the combined train+validation rows and then
# applied unchanged to the test rows.
for win, train_features in list(X_train.items()):
    sc = StandardScaler().fit(train_features)
    X_train[win] = sc.transform(train_features)
    X_test[win] = sc.transform(X_test[win])

In [355]:
results_final = final_model(inner_results=results_opt, X_fit = X_train, y_fit = y_train, X_test = X_test, y_test = y_test, model_str = 'ridge')


  0%|          | 0/34 [00:00<?, ?it/s][A
 56%|█████▌    | 19/34 [00:00<00:00, 182.34it/s][A
100%|██████████| 34/34 [00:00<00:00, 180.81it/s][A

#### Exporting final results

In [356]:
with open('results/final/ridge/results_final.pickle', 'wb') as handle:
    pickle.dump(results_final, handle, protocol= pickle.HIGHEST_PROTOCOL)

In [357]:
# Average, over all windows, of the second element of each 'best_rmse' entry.
temp = [window_result['best_rmse'][1] for window_result in results_final.values()]
np.mean(temp)

1.2349172324583901

## Elastic net

In [359]:
X_train, X_val, X_test, y_train, y_val, y_test, y_dates = test_train_split_Q(df = df_analysis, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

### Pre-processing

- Standardizing

Standardizing features for each window

In [360]:
# Per window: learn mean/scale from the training rows only, then apply the same
# transformation to the validation and test rows.
for win, train_features in list(X_train.items()):
    sc = StandardScaler().fit(train_features)
    X_train[win] = sc.transform(train_features)
    X_val[win] = sc.transform(X_val[win])
    X_test[win] = sc.transform(X_test[win])

### Training the models

#### Hyperparameter space - random

In [363]:
# Penalty strengths: 10,000 log-spaced values from 1e-8 to 1e8 (previously 2000).
alphas = np.logspace(start=-8, stop=8, num=10000)
n_components = [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90]
# L1 share of the penalty: 0.01, 0.06, ..., 0.96.
# 0.0 cannot be included due to a bug in the code.
l1_ratio = list(np.arange(0.01, 0.99, 0.05))
# Full Cartesian grid of (alpha, l1_ratio) pairs, alpha varying slowest.
params = list(itertools.product(alphas, l1_ratio))
print('Number of param sets: ' + str(len(params)))



Number of param sets: 200000


#### Inner loop - training hyperparameter on validation

##### On random space

In [364]:
results_mp = tuning_window_mp(X_fit = X_train, y_fit = y_train, X_test = X_val, y_test = y_val, params = params, n_components = n_components, model_str = 'elasticnet')



  0%|          | 0/34 [00:00<?, ?it/s][A[A

  3%|▎         | 1/34 [24:49<13:39:07, 1489.33s/it][A[A

  6%|▌         | 2/34 [46:39<12:45:43, 1435.72s/it][A[A

  9%|▉         | 3/34 [1:08:08<11:59:00, 1391.62s/it][A[A

 12%|█▏        | 4/34 [1:29:37<11:20:26, 1360.89s/it][A[A

 15%|█▍        | 5/34 [1:51:19<10:49:08, 1343.07s/it][A[A

 18%|█▊        | 6/34 [2:13:01<10:21:00, 1330.73s/it][A[A

 21%|██        | 7/34 [2:34:39<9:54:22, 1320.83s/it] [A[A

 24%|██▎       | 8/34 [2:56:19<9:29:41, 1314.68s/it][A[A

 26%|██▋       | 9/34 [3:18:05<9:06:41, 1312.05s/it][A[A

 29%|██▉       | 10/34 [3:39:41<8:42:52, 1307.17s/it][A[A

 32%|███▏      | 11/34 [4:01:23<8:20:35, 1305.89s/it][A[A

 35%|███▌      | 12/34 [4:23:06<7:58:24, 1304.76s/it][A[A

 38%|███▊      | 13/34 [4:44:39<7:35:30, 1301.45s/it][A[A

 41%|████      | 14/34 [5:06:17<7:13:27, 1300.35s/it][A[A

 44%|████▍     | 15/34 [5:27:51<6:51:08, 1298.36s/it][A[A

 47%|████▋     | 16/34 [5:49:23<6:28:57, 12

In [365]:
with open('results/final/elastic/results_mp.pickle', 'wb') as handle:
    pickle.dump(results_mp, handle, protocol= pickle.HIGHEST_PROTOCOL)

#### Outer loop - fitting on train / test split

Importing stored results:

In [366]:
with open('results/final/elastic/results_mp.pickle', 'rb') as handle:
    results_opt = pickle.load(handle)

##### On full sample

Reloading data and concatting:

In [367]:
X_train, X_val, X_test, y_train, y_val, y_test, y_dates = test_train_split_Q(df = df_analysis, window = window, testsize=testsize, valsize = valsize,
                                                                  y_col='target_actual', rolling_window = rolling_window, df_output= False, geo_count = 20)

In [368]:
# Append each window's validation block to its training block, for both the
# features and the target. np.concatenate always returns a new array.
for win_key in X_train:
    X_train[win_key] = np.concatenate((X_train[win_key], X_val[win_key]))
    y_train[win_key] = np.concatenate((y_train[win_key], y_val[win_key]))

Standardizing features for each window

In [369]:
# Per window: estimate the scaling on the training block only, then apply the
# same scaling to the training and test blocks.
for win_key, train_block in X_train.items():
    sc = StandardScaler().fit(train_block)
    X_train[win_key] = sc.transform(train_block)
    X_test[win_key] = sc.transform(X_test[win_key])

In [370]:
# Outer loop (elastic net): final fit using the stored inner-loop results,
# fitted on train+validation and evaluated on the test split.
results_final = final_model(
    inner_results=results_opt,
    X_fit=X_train,
    y_fit=y_train,
    X_test=X_test,
    y_test=y_test,
    model_str='elasticnet',
)



  0%|          | 0/34 [00:00<?, ?it/s][A[A

 71%|███████   | 24/34 [00:00<00:00, 236.08it/s][A[A

100%|██████████| 34/34 [00:00<00:00, 236.15it/s][A[A

#### Exporting final results

In [371]:
# Store the final elastic net results on disk.
with open('results/final/elastic/results_final.pickle', mode='wb') as out_file:
    pickle.dump(results_final, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [372]:
# Average, over all entries of results_final, of element [1] of 'best_rmse'.
temp = [entry['best_rmse'][1] for entry in results_final.values()]
np.mean(temp)

1.2187329444035426

## ML - Data and preprocessing - tree-based

### Import data frame with adjusted here

### Subset data 

In [2]:
# Load the descriptive data set; `date` is parsed as a datetime column.
df_analysis = pd.read_csv(
    'data/descriptive/df_descriptive.csv',
    parse_dates=['date'],
)

In [3]:
# Keep only the rows whose country code is 'SE'.
df_analysis = df_analysis.loc[df_analysis['country'] == 'SE']

In [4]:
# Exclude the region with ID 'Jämtland'.
df_analysis = df_analysis.loc[df_analysis['ID'] != 'Jämtland']

In [5]:
# Drop the listed GT_NO_* columns, every GT_DK_0..GT_DK_21 column except
# GT_DK_1, and the `country` column.
df_analysis = df_analysis.drop(
    columns=(
        [f'GT_NO_{i}' for i in (0, 1, 2, 6, 7, 8, 10, 11)]
        + [f'GT_DK_{i}' for i in range(22) if i != 1]
        + ['country']
    )
)

In [6]:
# Show the remaining column names.
print(df_analysis.columns.tolist())

['date', 'target_actual', 'ID', 'jobs', 'sector_information_technology', 'sector_engineering_technology', 'sector_management_staff', 'sector_trade_service', 'sector_industry_craft', 'sector_sales_communication', 'sector_teaching', 'sector_office_finance', 'sector_social_health', 'sector_other', 'population', 'high_edu_share', 'labour_force_share', 'urban_share', 'GT_DK_1', 'GT_SE_0', 'GT_SE_1', 'GT_SE_2', 'GT_SE_5', 'GT_SE_6', 'GT_SE_7', 'GT_SE_8', 'GT_SE_9', 'GT_SE_10', 'GT_SE_11']


In [7]:
# Earliest date in the sample
df_analysis['date'].min()

Timestamp('2007-01-01 00:00:00')

In [8]:
# Latest date in the sample
df_analysis['date'].max()

Timestamp('2019-07-01 00:00:00')

### Initial preprocessing and feature construction

- Create dummies 
- Create interaction terms

Overall quarterly time trend variable, $t = 1, 2, \ldots, T$, within each `ID`:

In [9]:
# Time-trend counter t = 1, 2, ..., T within each `ID`.
# cumcount() numbers the rows of every ID group in their existing order, which
# is what the former per-ID slice + range() produced, but without assigning to
# a slice of the frame (the cause of the SettingWithCopyWarning).
# NOTE(review): the former merge on ['date', 'ID'] assumed that pair is unique
# per row; the same assumption applies here.
df_analysis = (
    df_analysis
    .assign(t=df_analysis.groupby('ID').cumcount() + 1)
    .reset_index(drop=True)  # the former merge also returned a fresh RangeIndex
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#### Drop sector variables

criteria = 0.2

In [10]:
# Remove the sector share columns that are not used further.
df_analysis = df_analysis.drop(
    columns=[
        'sector_management_staff',
        'sector_trade_service',
        'sector_sales_communication',
        'sector_teaching',
        'sector_social_health',
        'sector_other',
    ]
)

#### Transform relevant columns to abs change except those with M_ and ID_, date and t

In [11]:
# Replace the target by its first difference, computed within each ID.
df_analysis['target_actual'] = (
    df_analysis
    .groupby('ID')['target_actual']
    .diff()
)

### Lagged variables

In [12]:
# Target shifted by one row within each ID.
df_analysis['target_lag'] = (
    df_analysis
    .groupby('ID')['target_actual']
    .shift(periods=1)
)

NB! As this is Quarterly data one year lag is 4 quarters

In [13]:
# Target shifted by four rows within each ID.
df_analysis['target_4_lag'] = (
    df_analysis
    .groupby('ID')['target_actual']
    .shift(periods=4)
)

#### Create new variables with 3 month (1 quarter) lag of jobrate

In [14]:
# Job and sector columns that each get a one-row lag (within ID), stored as
# '<column>_1_lag'.
columns_1q_lag = [
    'jobs',
    'sector_information_technology',
    'sector_engineering_technology',
    'sector_industry_craft',
    'sector_office_finance',
]

for base_col in columns_1q_lag:
    df_analysis[f'{base_col}_1_lag'] = df_analysis.groupby('ID')[base_col].shift(1)

#### Dropping some GT variables

In [15]:
# GT columns to be removed from the analysis frame
drop_list = [f'GT_SE_{idx}' for idx in (6, 8, 10, 11)] + ['GT_DK_1']

In [16]:
# Remove the columns named in drop_list.
df_analysis = df_analysis.drop(columns=drop_list)

#### Create new variables with 1 quarter lag of GT

In [17]:
# Remaining GT columns that each get a one-row lag (within ID), stored as
# '<column>_1_lag'.
columns_1q_lag = [f'GT_SE_{idx}' for idx in (0, 1, 2, 5, 7, 9)]

for base_col in columns_1q_lag:
    df_analysis[f'{base_col}_1_lag'] = df_analysis.groupby('ID')[base_col].shift(1)

#### Quarter dummies for season effects (labelled by the month number of the quarter's date)

In [18]:
# Seasonal label: the calendar month number of `date`, stored as a string.
df_analysis['quarter'] = df_analysis['date'].dt.month.astype(str)

#### Creating dummies from categorical variables - remember to drop the reference category (done after change is constructed)

In [19]:
# One-hot encode `ID` (prefix 'ID_') and `quarter` (prefix 'Q_').
# get_dummies already returns a new frame.
df_analysis = pd.get_dummies(
    df_analysis,
    columns=['ID', 'quarter'],
    prefix=['ID', 'Q'],
    prefix_sep='_',
)

#### Drop na

In [20]:
# Discard rows with any missing value (e.g. those created by diff / shift).
df_analysis = df_analysis.dropna()

In [21]:
# Latest date after dropping incomplete rows
df_analysis['date'].max()

Timestamp('2019-07-01 00:00:00')

In [22]:
# Earliest date after dropping incomplete rows
df_analysis['date'].min()

Timestamp('2008-04-01 00:00:00')

#### Adding interaction terms

Polynomial features - To be deleted later

In [23]:
#df_analysis = add_poly_terms(df = df_analysis, 
#                            poly_columns = ['target_actual', 'GT_0', 'GT_1', 'GT_2', 'GT_3', 'GT_4', 'GT_5', 'GT_6', 'GT_7', 'GT_8', 'GT_9', 'GT_10', 'GT_11', 'GT_12', 'GT_13', 'GT_14', 'GT_15', 'GT_16', 'GT_17', 'GT_18', 'GT_19', 'target_lag', 'jobs', 'sector_information_technology', 'sector_engineering_technology', 'sector_management_staff', 'sector_trade_service', 'sector_industry_craft', 'sector_sales_communication', 'sector_teaching', 'sector_office_finance', 'sector_social_health', 'sector_other'])

In [24]:
#df_analysis.dropna(inplace=True)

Adding interaction terms by regions and all variables

In [25]:
# # relevant interaction variables
# interaction_1 = ['target_lag', 'target_4_lag'] 
# # 'sector_information_technology', 'sector_engineering_technology', 'sector_management_staff', 'sector_trade_service', 'sector_industry_craft', 'sector_sales_communication', 'sector_teaching', 'sector_office_finance', 'sector_social_health', 'sector_other'

# # get list of all ID area 
# interaction_2 = [item for item in df_analysis if item.startswith('ID_')]

In [26]:
# for var1 in interaction_1:
#     for var2 in interaction_2:
#         name = var1 + "*" + var2
#         df_analysis[name] = pd.Series(df_analysis[var1] * df_analysis[var2], name=name)

#### Drop variables to not end up in dummytrap

In [27]:
# Drop one dummy per categorical variable as the reference category.
df_analysis = df_analysis.drop(columns=['ID_Blekinge', 'Q_1'])

In [28]:
# df_analysis.drop(interaction_1, axis = 1, inplace=True)

In [29]:
# Order the columns by name.
df_analysis = df_analysis.sort_index(axis=1)

#### To Quarter data

In [30]:
# Convert the `date` timestamps to quarterly periods.
df_analysis['date'] = df_analysis['date'].dt.to_period('Q')

In [31]:
# First quarter in the sample
df_analysis.date.min()

Period('2008Q2', 'Q-DEC')

In [32]:
# Last quarter in the sample
df_analysis.date.max()

Period('2019Q3', 'Q-DEC')

#### Subset for period to match DK

In [33]:
# Keep observations from 2008Q1 onwards. `date` holds quarterly Periods at this
# point, so compare against an explicit Period instead of the string
# '01-01-2008', whose day/month order is ambiguous and which relied on
# implicit string-to-Period parsing.
df_analysis = df_analysis[df_analysis['date'] >= pd.Period('2008Q1', freq='Q')]

#### Setting window size

In [48]:
# Settings handed to test_train_split_Q in the cells below.
window, testsize, valsize = 11, 1, 1
rolling_window = True

## Random forest

In [49]:
# Build the train / validation / test splits from df_analysis.
(X_train, X_val, X_test,
 y_train, y_val, y_test, y_dates) = test_train_split_Q(
    df=df_analysis,
    window=window,
    testsize=testsize,
    valsize=valsize,
    y_col='target_actual',
    rolling_window=rolling_window,
    df_output=False,
    geo_count=20,
)

### Pre-processing

- Standardizing

Standardizing features for each window

In [50]:
# Per window: estimate the scaling on the training block only, then apply the
# same scaling to the training, validation and test blocks.
for win_key, train_block in X_train.items():
    sc = StandardScaler().fit(train_block)
    X_train[win_key] = sc.transform(train_block)
    X_val[win_key] = sc.transform(X_val[win_key])
    X_test[win_key] = sc.transform(X_test[win_key])

### Training the models

#### Hyperparameter space - random

In [52]:
# Hyperparameter grid for the random forest.
# References:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# https://gdcoder.com/random-forest-regression-model-explained-in-depth-part-2-python-code-snippet-using-sklearn/

# Passed to the tuning routine as `n_components`
# (presumably the PCA explained-variance share - TODO confirm in func.py).
n_components = [0.9]

# Number of trees in the forest
n_estimators = [*range(50, 500, 10)]
# Number of features to consider at every split. None = all features, which is
# what 'auto' meant for regressors; 'auto' itself was deprecated in
# scikit-learn 1.1 and removed in 1.3, so it is no longer used here.
max_features = [None, "sqrt"]
# Maximum number of levels in a tree (None = grow until leaves are pure)
max_depth = [*range(3, 11, 1), *range(20, 100, 20), None]
# Minimum number of samples required to split a node
min_samples_split = [2]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 3, 5]

# Full Cartesian grid. The insertion order of `d` fixes the position of each
# hyperparameter inside every tuple of `params`.
d = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

params = list(itertools.product(*d.values()))

print(len(params))


3510


#### Inner loop - training hyperparameter on validation

##### On random space

In [53]:
# Inner loop (random forest): candidates in `params` are fitted on the
# training split and scored on the validation split.
results_mp = tuning_window(
    X_fit=X_train,
    y_fit=y_train,
    X_test=X_val,
    y_test=y_val,
    params=params,
    n_components=n_components,
    model_str='randomforest',
)

Tuning params for window: 100%|███████████████████████████████████████████████████| 34/34 [13:07:53<00:00, 1390.39s/it]


In [54]:
# Persist the inner-loop tuning results so the long search need not be re-run.
with open('results/final/randomforest/results_noint.pickle', mode='wb') as out_file:
    pickle.dump(results_mp, out_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Outer loop - fitting on train / test split

Importing stored results:

In [55]:
# Read the stored inner-loop results back in as `results_opt`.
with open('results/final/randomforest/results_noint.pickle', mode='rb') as in_file:
    results_opt = pickle.load(in_file)

##### On full sample

Reloading the data and concatenating the training and validation sets:

In [56]:
# Rebuild the train / validation / test splits from df_analysis so the
# un-standardised arrays are available again.
(X_train, X_val, X_test,
 y_train, y_val, y_test, y_dates) = test_train_split_Q(
    df=df_analysis,
    window=window,
    testsize=testsize,
    valsize=valsize,
    y_col='target_actual',
    rolling_window=rolling_window,
    df_output=False,
    geo_count=20,
)

In [57]:
# Append each window's validation block to its training block, for both the
# features and the target. np.concatenate always returns a new array.
for win_key in X_train:
    X_train[win_key] = np.concatenate((X_train[win_key], X_val[win_key]))
    y_train[win_key] = np.concatenate((y_train[win_key], y_val[win_key]))

Standardizing features for each window

In [58]:
# Per window: estimate the scaling on the training block only, then apply the
# same scaling to the training and test blocks.
for win_key, train_block in X_train.items():
    sc = StandardScaler().fit(train_block)
    X_train[win_key] = sc.transform(train_block)
    X_test[win_key] = sc.transform(X_test[win_key])

In [59]:
# Outer loop (random forest): final fit using the stored inner-loop results,
# fitted on train+validation and evaluated on the test split.
results_final = final_model(
    inner_results=results_opt,
    X_fit=X_train,
    y_fit=y_train,
    X_test=X_test,
    y_test=y_test,
    model_str='randomforest',
)

100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [00:10<00:00,  3.28it/s]


#### Exporting final results

In [60]:
# Store the final random forest results on disk.
with open('results/final/randomforest/results_final_noint.pickle', mode='wb') as out_file:
    pickle.dump(results_final, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [61]:
# Average, over all entries of results_final, of element [1] of 'best_rmse'.
temp = [entry['best_rmse'][1] for entry in results_final.values()]
np.mean(temp)

1.1698456092407148

## XGboost

In [35]:
# Build the train / validation / test splits from df_analysis.
(X_train, X_val, X_test,
 y_train, y_val, y_test, y_dates) = test_train_split_Q(
    df=df_analysis,
    window=window,
    testsize=testsize,
    valsize=valsize,
    y_col='target_actual',
    rolling_window=rolling_window,
    df_output=False,
    geo_count=20,
)

### Pre-processing

- Standardizing

Standardizing features for each window

In [36]:
# Per window: estimate the scaling on the training block only, then apply the
# same scaling to the training, validation and test blocks.
for win_key, train_block in X_train.items():
    sc = StandardScaler().fit(train_block)
    X_train[win_key] = sc.transform(train_block)
    X_val[win_key] = sc.transform(X_val[win_key])
    X_test[win_key] = sc.transform(X_test[win_key])

### Training the models

#### Hyperparameter space - random

In [37]:
# Hyperparameter grid for XGBoost.
# Grid layout follows (article is about random forests):
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Passed to the tuning routine as `n_components`
n_components = [0.9]

# Subsample ratio of columns when constructing each tree (between 0 and 1).
# First try: [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
colsample_bytree = [0.3, 0.5, 0.7, 0.9, 1]

# Maximum number of levels in a tree: 3..10, then 20, 40, 60, 80.
# Earlier attempts: range(10, 300, 10) and range(50, 500, 20)
max_depth = list(range(3, 11)) + list(range(20, 100, 20))

# Number of boosted trees to fit. First try: range(50, 500, 20)
n_estimators = list(range(50, 500, 10))

gamma = [0]
subsample = [0.5, 0.75, 1]
min_child_weight = [1, 3, 5]

# Full Cartesian grid. The keyword order of `d` fixes the position of each
# hyperparameter inside every tuple of `params`.
d = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    colsample_bytree=colsample_bytree,
    gamma=gamma,
    subsample=subsample,
    min_child_weight=min_child_weight,
)

params = list(itertools.product(*d.values()))

print(len(params))

24300


In [38]:
# Seed the global RNG so the draw is reproducible, then keep 10000 parameter
# sets drawn without replacement from the full grid.
random.seed(1)
params = random.sample(params, 10000)

#### Inner loop - training hyperparameter on validation

##### On random space

In [39]:
# Inner loop (XGBoost): candidates in `params` are fitted on the training
# split and scored on the validation split.
results_mp = tuning_window_mp(
    X_fit=X_train,
    y_fit=y_train,
    X_test=X_val,
    y_test=y_val,
    params=params,
    n_components=n_components,
    model_str='xgboost',
)

100%|█████████████████████████████████████████████████████████████████████████████| 34/34 [17:06:35<00:00, 1811.64s/it]


In [40]:
# Persist the inner-loop tuning results so the long search need not be re-run.
with open('results/final/xgboost/results_noint.pickle', mode='wb') as out_file:
    pickle.dump(results_mp, out_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Outer loop - fitting on train / test split

##### On full sample

Importing the stored results, then reloading the data and concatenating the training and validation sets:

In [41]:
# Read the stored inner-loop results back in as `results_opt`.
with open('results/final/xgboost/results_noint.pickle', mode='rb') as in_file:
    results_opt = pickle.load(in_file)
    

In [42]:
# Rebuild the train / validation / test splits from df_analysis so the
# un-standardised arrays are available again.
(X_train, X_val, X_test,
 y_train, y_val, y_test, y_dates) = test_train_split_Q(
    df=df_analysis,
    window=window,
    testsize=testsize,
    valsize=valsize,
    y_col='target_actual',
    rolling_window=rolling_window,
    df_output=False,
    geo_count=20,
)

In [43]:
# Append each window's validation block to its training block, for both the
# features and the target. np.concatenate always returns a new array.
for win_key in X_train:
    X_train[win_key] = np.concatenate((X_train[win_key], X_val[win_key]))
    y_train[win_key] = np.concatenate((y_train[win_key], y_val[win_key]))

Standardizing features for each window

In [44]:
# Per window: estimate the scaling on the training block only, then apply the
# same scaling to the training and test blocks.
for win_key, train_block in X_train.items():
    sc = StandardScaler().fit(train_block)
    X_train[win_key] = sc.transform(train_block)
    X_test[win_key] = sc.transform(X_test[win_key])

In [45]:
# Outer loop (XGBoost): final fit using the stored inner-loop results,
# fitted on train+validation and evaluated on the test split.
results_final = final_model(
    inner_results=results_opt,
    X_fit=X_train,
    y_fit=y_train,
    X_test=X_test,
    y_test=y_test,
    model_str='xgboost',
)

100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [00:04<00:00,  8.24it/s]


#### Exporting final results

In [46]:
# Store the final XGBoost results on disk.
with open('results/final/xgboost/results_final_noint.pickle', mode='wb') as out_file:
    pickle.dump(results_final, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [47]:
# Average, over all entries of results_final, of element [1] of 'best_rmse'.
temp = [entry['best_rmse'][1] for entry in results_final.values()]
np.mean(temp)

1.1158579122612415