In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Loading 1960+ data

In [None]:
# Read the 1960+ feature table and keep only the columns from position 3 onward.
# History: rows 722-725 (COVID-19 affected months, 2020-03 to 2020-06) used to be
# dropped here by hand, followed by an index reset; that step is disabled now that
# the '_no_covid' file is loaded -- presumably it already excludes those months
# (TODO confirm).
input_data = pd.read_csv('../Data/Input_features_1960_no_covid.csv').iloc[:, 3:]
input_data

In [None]:
# Read the 1960+ employment table and keep the column at position 2 as the target.
# History: earlier, now-disabled steps trimmed the last row (so inputs and outputs
# started and ended at the same timepoints) and dropped rows 722-725 (COVID-19
# affected months, 2020-03 to 2020-06) before resetting the index.
output_data = pd.read_csv('../Data/Employment_pch_1960_no_covid.csv').iloc[:, 2]
output_data

# Naive Forecasting (using 1960+ data)

In [None]:
# Naive baseline: the predicted class for each month is the class observed in
# the previous month.
X = input_data
y = output_data

# Binary target: 1 where the value is strictly positive, 0 otherwise.
y_labels = (y > 0).astype(int)

# Lag the labels by one row; the first row has no predecessor and is filled with 0.
y_pred = y_labels.shift(1).fillna(0)

# Shuffle-split features, labels and row indices together so the held-out
# labels can be matched against the lagged predictions by row index.
row_positions = np.arange(len(y_labels))
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y_labels, row_positions,
    test_size=0.3, random_state=8, shuffle=True,
)

# Fraction of held-out rows whose lagged label equals the true label.
is_correct = y_pred[indices_test] == y_test
accuracy = is_correct.sum() / len(y_test)
print(f'Classification Accuracy - Naive Forecasting (Baseline, 1960+): {accuracy}')

# Linear Regression (using 1960+ data)

In [None]:
# Linear-regression baseline on the 1960+ data, scored on a shuffled 30% hold-out.
X, y = input_data, output_data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8, shuffle=True
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)

# Score the fitted model on the held-out rows (reported below as the R2 score).
r2_score = model.score(X_test, y_test)
print(f'R2 Score - Linear Regression (Baseline, 1960+): {r2_score}')

# Loading 2006+ data

In [None]:
# Read the cleaned 2006+ feature and target tables (inputs first, then outputs).
inputs, outputs = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/all_inputs_cleaned.csv', '../Data/all_outputs_cleaned.csv')
)

In [None]:
# Preview the columns of `inputs` from position 2 onward -- the same slice the
# 2006+ modelling cells assign to X.
inputs.iloc[:, 2:]

In [None]:
# Preview the column of `outputs` at position 4 -- the same column the
# 2006+ modelling cells assign to y.
outputs.iloc[:, 4]

# Naive Forecasting (using 2006+ data)

In [None]:
# Naive baseline on the 2006+ data: the predicted class for each month is the
# class observed in the previous month.
X = inputs.iloc[:, 2:]  # using all features, except the fact that it is a time series (ignoring column 1 - Month)
y = outputs.iloc[:, 4]  # toggle between 2,3,4 to predict Construction, Information and Total Private

# Binary target: 1 where the value is strictly positive, 0 otherwise.
y_labels = (y > 0).astype(int)

# predict next month's class to be the same as this month's class.
# The first row has no previous month; fill it with 0 instead of dropping it.
# Dropping it removes row label 0 from y_pred, and the lookup below then raises
# KeyError whenever the shuffled split places row 0 in the test set. Filling
# also matches how the 1960+ naive baseline handles the first row.
y_pred = y_labels.shift(1).fillna(0)

# split the data into training and testing sets along with corresponding indices;
# only the held-out labels and their row indices are needed for this baseline
_, _, _, y_test, _, indices_test = train_test_split(X, y_labels, np.arange(len(y_labels)), test_size=0.3, random_state=0)

# check accuracy of baseline for classification: fraction of held-out rows
# whose lagged label equals the true label
accuracy = np.sum(y_pred[indices_test] == y_test) / np.size(y_test)
print(f'Classification Accuracy - Naive Forecasting (Baseline, 2006+): {accuracy}')

# Linear Regression (using 2006+ data)

In [None]:
# Linear-regression baseline on the 2006+ data, scored on a 30% hold-out.
X = inputs.iloc[:, 2:]  # every feature column; the first two columns (including Month) are left out
y = outputs.iloc[:, 4]  # toggle between 2,3,4 to predict Construction, Information and Total Private

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)

# Score the fitted model on the held-out rows (reported below as the R2 score).
r2_score = model.score(X_test, y_test)
print(f'R2 Score - Linear Regression (Baseline, 2006+): {r2_score}')