In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Loading 1960+ data

In [13]:
# Load the 1960+ input-feature table and keep only the columns from position 3
# onward, i.e. discard the three leading columns of the CSV.
# NOTE(review): the '_no_covid' file presumably already excludes the COVID-19
# affected months (2020-03 to 2020-06), which is why no rows are dropped
# manually here — confirm against the script that produced the file.
input_data = pd.read_csv('../Data/Input_features_1960_no_covid.csv').iloc[:, 3:]
input_data

Unnamed: 0,Fedfund_rate,GDP_pch,CPI_pc1,Loan_pch,House_ch
0,3.99,2.657590,1.24095,10.2,-197.0
1,3.97,1.675410,1.41379,10.1,-164.0
2,3.84,0.693230,1.51881,6.7,-511.0
3,3.92,-0.288950,1.93237,6.6,-301.0
4,3.85,0.086070,1.82507,3.3,-227.0
...,...,...,...,...,...
756,5.06,1.310517,4.12884,6.0,40.0
757,5.08,1.690863,3.09200,3.0,-143.0
758,5.12,2.071210,3.29908,2.4,80.0
759,5.33,2.071210,3.70750,6.3,-200.0


In [14]:
# Load the 1960+ employment percent-change table (the prediction targets) and
# display it unmodified.
# NOTE(review): the '_no_covid' file presumably already excludes the COVID-19
# affected months (2020-03 to 2020-06) — confirm against the script that
# produced the file. No rows or columns are removed in this cell.
output_data = pd.read_csv(filepath_or_buffer='../Data/Employment_pch_1960_no_covid.csv')
output_data

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Month,Total_private_pch,Construction_pch
0,0,0,1960-01-01,0.35148,-1.20994
1,1,1,1960-02-01,0.47860,1.05925
2,2,2,1960-03-01,-0.55860,-5.63380
3,3,3,1960-04-01,0.76205,4.26935
4,4,4,1960-05-01,-0.50995,0.56591
...,...,...,...,...,...
757,761,761,2023-06-01,0.06450,0.36588
758,762,762,2023-07-01,0.10867,0.15085
759,763,763,2023-08-01,0.08535,0.37655
760,764,764,2023-09-01,0.18401,0.16256


# Naive Forecasting (using 1960+ data)

In [None]:
# Naive (persistence) baseline for the 1960+ data: predict that next month's
# class (employment change > 0 or not) is the same as this month's class.

# Select columns by name instead of by position. The load cell has already
# stripped the leading columns from input_data, so slicing it again with
# iloc[:, 2:] would discard real features, and position 2 of output_data is not
# one of the two target columns shown in the displayed frame.
feature_cols = ['Fedfund_rate', 'GDP_pch', 'CPI_pc1', 'Loan_pch', 'House_ch']
X = input_data[feature_cols]
# NOTE(review): Total_private_pch is assumed to be the intended target (the
# 2006+ cells also default to Total Private); use 'Construction_pch' for the
# construction series.
y = output_data['Total_private_pch']

# train_test_split rejects inputs of different lengths. The displayed frames
# suggest the target table has extra trailing row(s), so both are trimmed to
# the common length.
# NOTE(review): assumes both files start at the same month so that row i of
# each refers to the same month — TODO confirm.
n_samples = min(len(X), len(y))
X = X.iloc[:n_samples].reset_index(drop=True)
y = y.iloc[:n_samples].reset_index(drop=True)

# Binary label: 1 when the month-over-month change is positive, else 0.
y_labels = (y > 0).astype(int)

# Predict next month's class to be the same as this month's class. The first
# month has no predecessor, so its prediction defaults to class 0.
y_pred = y_labels.shift(1).fillna(0).astype(int)

# Split the data into training and testing sets along with the corresponding
# row positions, using the same test_size / random_state as the regression cell.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y_labels, np.arange(n_samples),
    test_size=0.3, random_state=8, shuffle=True,
)

# Accuracy of the baseline on the test rows. indices_test holds row positions,
# so index positionally and compare raw values.
accuracy = np.mean(y_pred.iloc[indices_test].to_numpy() == y_test.to_numpy())
print(f'Classification Accuracy - Naive Forecasting (Baseline, 1960+): {accuracy}')

# Linear Regression (using 1960+ data)

In [None]:
# Linear-regression baseline for the 1960+ data: regress the employment
# percent change on the macro-economic input features and report R2 on a
# held-out 30% split.

# Select columns by name instead of by position. The load cell has already
# stripped the leading columns from input_data, so slicing it again with
# iloc[:, 2:] would discard real features, and position 2 of output_data is not
# one of the two target columns shown in the displayed frame.
feature_cols = ['Fedfund_rate', 'GDP_pch', 'CPI_pc1', 'Loan_pch', 'House_ch']
X = input_data[feature_cols]
# NOTE(review): Total_private_pch is assumed to be the intended target (the
# 2006+ cells also default to Total Private); use 'Construction_pch' for the
# construction series.
y = output_data['Total_private_pch']

# train_test_split rejects inputs of different lengths. The displayed frames
# suggest the target table has extra trailing row(s), so both are trimmed to
# the common length.
# NOTE(review): assumes both files start at the same month so that row i of
# each refers to the same month — TODO confirm.
n_samples = min(len(X), len(y))
X = X.iloc[:n_samples].reset_index(drop=True)
y = y.iloc[:n_samples].reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8, shuffle=True)

model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

# LinearRegression.score returns the coefficient of determination (R2).
r2_score = model.score(X_test, y_test)
print(f'R2 Score - Linear Regression (Baseline, 1960+): {r2_score}')

# Loading 2006+ data

In [None]:
# Read the cleaned 2006+ feature table and target table from disk.
inputs = pd.read_csv(filepath_or_buffer='../Data/all_inputs_cleaned.csv')
outputs = pd.read_csv(filepath_or_buffer='../Data/all_outputs_cleaned.csv')

In [None]:
# Preview the 2006+ inputs from column position 2 onward — the same slice the
# modelling cells below use as the feature matrix X.
inputs.iloc[:, 2:]

In [None]:
# Preview column position 4 of the 2006+ outputs — the column the modelling
# cells below use as the target y.
outputs.iloc[:, 4]

# Naive Forecasting (using 2006+ data)

In [None]:
# Naive (persistence) baseline for the 2006+ data: predict that next month's
# class (employment change > 0 or not) is the same as this month's class.

X = inputs.iloc[:, 2:]  # all feature columns; the time-series ordering itself is not used (column 1 - Month is ignored)
y = outputs.iloc[:, 4]  # toggle between 2,3,4 to predict Construction, Information and Total Private

# Binary label: 1 when the month-over-month change is positive, else 0.
y_labels = (y > 0).astype(int)

# Predict next month's class to be the same as this month's class. The first
# month has no predecessor; default it to class 0 (as the 1960+ baseline does)
# instead of dropping it. Dropping it removes label 0 from y_pred, so the
# lookup below would raise KeyError whenever row 0 is drawn into the test split.
y_pred = y_labels.shift(1).fillna(0).astype(int)

# Split the data into training and testing sets along with the corresponding
# row positions; only the test labels and test positions are needed here.
_, _, _, y_test, _, indices_test = train_test_split(
    X, y_labels, np.arange(len(y_labels)), test_size=0.3, random_state=0
)

# Accuracy of the baseline on the test rows. indices_test holds row positions,
# so index positionally and compare raw values.
accuracy = np.sum(y_pred.iloc[indices_test].to_numpy() == y_test.to_numpy()) / np.size(y_test)
print(f'Classification Accuracy - Naive Forecasting (Baseline, 2006+): {accuracy}')

# Linear Regression (using 2006+ data)

In [None]:
# Linear-regression baseline for the 2006+ data, scored with R2 on a held-out
# 30% split.
# X: every column from position 2 onward (all features; column 1 - Month - and
#    the time-series ordering are ignored).
# y: column position 4; toggle between 2,3,4 to predict Construction,
#    Information and Total Private.
X, y = inputs.iloc[:, 2:], outputs.iloc[:, 4]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

# fit() returns the fitted estimator itself, so construction and training can
# be chained.
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)

# LinearRegression.score returns the coefficient of determination (R2).
r2_score = model.score(X_test, y_test)
print(f'R2 Score - Linear Regression (Baseline, 2006+): {r2_score}')