In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Loading 1960+ data

In [13]:
# Read the 1960+ feature table and keep only the columns from position 3
# onward; these are the model inputs used by the 1960+ cells below.
# NOTE(review): earlier revisions of this cell dropped rows 722-725 (the
# COVID-19 affected months, 2020-03 to 2020-06) by hand. The '_no_covid'
# file presumably already excludes them - confirm against the data-prep step.
input_data = pd.read_csv('../Data/Input_features_1960_no_covid.csv').iloc[:, 3:]
input_data

Unnamed: 0,Fedfund_rate,GDP_pch,CPI_pc1,Loan_pch,House_ch
0,3.99,2.657590,1.24095,10.2,-197.0
1,3.97,1.675410,1.41379,10.1,-164.0
2,3.84,0.693230,1.51881,6.7,-511.0
3,3.92,-0.288950,1.93237,6.6,-301.0
4,3.85,0.086070,1.82507,3.3,-227.0
...,...,...,...,...,...
756,5.06,1.310517,4.12884,6.0,40.0
757,5.08,1.690863,3.09200,3.0,-143.0
758,5.12,2.071210,3.29908,2.4,80.0
759,5.33,2.071210,3.70750,6.3,-200.0


In [16]:
# Read the 1960+ employment table and keep the single column at position 2
# as the target series for the 1960+ cells below.
# NOTE(review): earlier revisions trimmed the last row (to align the start
# and end timepoints with the inputs) and dropped rows 722-725 (COVID-19
# months, 2020-03 to 2020-06) by hand. The '_no_covid' file presumably
# already handles both - confirm the features and target align row-for-row.
output_data = pd.read_csv('../Data/Employment_pch_1960_no_covid.csv').iloc[:, 2]
output_data

0      0.35148
1      0.47860
2     -0.55860
3      0.76205
4     -0.50995
        ...   
756    0.19161
757    0.06450
758    0.10867
759    0.08535
760    0.18401
Name: Total_private_pch, Length: 761, dtype: float64

# Naive Forecasting (using 1960+ data)

In [18]:
X = input_data
y = output_data

# Binary target: 1 when the value is positive, otherwise 0.
y_labels = y.gt(0).astype(int)

# Naive forecast: carry each month's class forward to the following month.
# The first month has no predecessor, so it defaults to class 0.
y_pred = y_labels.shift(1).fillna(0)

# Shuffle-split features, labels and row positions together so the held-out
# positions can be used to look up the matching naive predictions.
row_positions = np.arange(len(y_labels))
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y_labels, row_positions, test_size=0.3, random_state=8, shuffle=True
)

# Baseline accuracy: share of held-out months whose class equals the class
# of the month before.
n_correct = (y_pred[indices_test] == y_test).sum()
accuracy = n_correct / len(y_test)
print(f'Classification Accuracy - Naive Forecasting (Baseline, 1960+): {accuracy}')

Classification Accuracy - Naive Forecasting (Baseline, 1960+): 0.8951965065502183


# Linear Regression (using 1960+ data)

In [20]:
X = input_data
y = output_data

# Hold out 30% of the rows at random (fixed seed) for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8, shuffle=True
)

# Fit a linear regression on the training rows; fit() returns the estimator
# itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)

# score() evaluates the fitted model on the held-out rows (reported as R2).
r2_score = model.score(X_test, y_test)
print(f'R2 Score - Linear Regression (Baseline, 1960+): {r2_score}')

R2 Score - Linear Regression (Baseline, 1960+): 0.3825790000281102


# Loading 2006+ data

In [21]:
# Read the cleaned 2006+ feature and target tables in one step.
inputs, outputs = map(
    pd.read_csv,
    ('../Data/all_inputs_cleaned.csv', '../Data/all_outputs_cleaned.csv'),
)

In [22]:
# Preview the columns from position 2 onward - the same slice the 2006+
# cells below use as the feature matrix X.
inputs.iloc[:, 2:]

Unnamed: 0,CPI,InterestRate,GDP,ValAddConst,ValAddInfo,Borrowing,CommercialLoan,ConsumerLoan,Deficit,ITBPrice,ITBVol,VGTPrice,VGTVol,S&P500Price,S&P500Vol
0,0.005007,0.031315,0.012731,0.016023,0.000000,-0.004089,0.178,0.054,-2.393523,-0.141342,0.058676,-0.058480,-0.357298,-0.015674,0.010319
1,0.002989,0.010122,0.012731,0.016023,0.000000,0.016496,0.201,0.078,-1.361045,-0.087839,2.328675,-0.036031,0.028369,0.002547,-0.155359
2,0.002484,0.050100,0.010429,0.000000,0.014294,0.007533,0.223,0.045,-1.478174,-0.021042,0.796079,0.032067,-0.368216,0.013990,0.089268
3,0.005451,0.001908,0.010429,0.000000,0.014294,-0.008966,0.177,0.056,-2.616416,0.046412,2.381466,0.066667,0.267074,0.022189,-0.029398
4,0.004436,0.000000,0.010429,0.000000,0.014294,-0.005700,0.222,0.074,0.951423,0.046867,-0.436061,0.044646,0.280520,0.028076,0.159019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,0.000530,0.038710,0.012081,0.012858,0.015515,0.166242,0.013,0.073,0.440654,0.052962,0.067876,0.038191,-0.165449,0.009060,0.254976
200,0.003678,0.047619,0.011676,0.015295,0.011737,0.062205,-0.029,0.058,-1.465993,0.066633,-0.030136,0.092126,0.350401,0.034360,-0.010638
201,0.001241,0.003953,0.011676,0.015295,0.011737,0.001615,-0.026,0.060,-2.364214,0.079389,0.066784,0.040382,-0.028132,0.047030,-0.146846
202,0.001803,0.007874,0.011676,0.015295,0.011737,-0.016332,-0.021,0.030,-0.052340,-0.008281,0.420013,-0.034826,-0.024273,0.005205,0.156903


In [23]:
# Preview the column at position 4 (Total_Private) - the target the 2006+
# cells below select by default.
outputs.iloc[:, 4]

0      0.000744
1      0.001250
2      0.001117
3      0.000567
4      0.000331
         ...   
199    0.001916
200    0.000645
201    0.001087
202    0.001325
203    0.001966
Name: Total_Private, Length: 204, dtype: float64

# Naive Forecasting (using 2006+ data)

In [24]:
# Feature columns from position 2 onward; the Month column (position 1) is
# skipped, so the time ordering itself is not used as a feature.
X = inputs.iloc[:, 2:]
y = outputs.iloc[:, 4]  # toggle between 2,3,4 to predict Construction, Information and Total Private

# Binary target: 1 when the value is positive, otherwise 0.
y_labels = (y > 0).astype(int)

# Naive forecast: predict next month's class to be the same as this month's.
# shift(1) leaves the first month without a prediction (NaN). Dropping that
# row would remove label 0 from y_pred, so the lookup below would raise a
# KeyError (or yield NaN on older pandas) whenever row 0 lands in the test
# split. Default it to class 0 instead, as the 1960+ baseline does, so every
# row keeps a prediction regardless of the random seed.
y_pred = y_labels.shift(1).fillna(0).astype(int)

# Split the data, keeping only the held-out labels and their row positions;
# the naive baseline needs no training set.
_, _, _, y_test, _, indices_test = train_test_split(
    X, y_labels, np.arange(len(y_labels)), test_size=0.3, random_state=0
)

# Baseline accuracy: share of held-out months whose class equals the class
# of the month before. .loc makes the label-based lookup explicit.
n_correct = (y_pred.loc[indices_test] == y_test).sum()
accuracy = n_correct / len(y_test)
print(f'Classification Accuracy - Naive Forecasting (Baseline, 2006+): {accuracy}')

Classification Accuracy - Naive Forecasting (Baseline, 2006+): 0.967741935483871


# Linear Regression (using 2006+ data)

In [25]:
# Feature columns from position 2 onward; the Month column (position 1) is
# skipped, so the time ordering itself is not used as a feature.
X = inputs.iloc[:, 2:]
y = outputs.iloc[:, 4]  # toggle between 2,3,4 to predict Construction, Information and Total Private

# Hold out 30% of the rows (fixed seed) for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit a linear regression on the training rows; fit() returns the estimator
# itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)

# score() evaluates the fitted model on the held-out rows (reported as R2).
r2_score = model.score(X_test, y_test)
print(f'R2 Score - Linear Regression (Baseline, 2006+): {r2_score}')

R2 Score - Linear Regression (Baseline, 2006+): 0.20705216039330654
