In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [2]:
# Los Angeles inputs: LDA features for the train/test split plus the NLP
# feature table. Every CSV is indexed by its 'date' column.
train_lda, test_lda, nlp_data = (
    pd.read_csv(csv_path, index_col=['date'])
    for csv_path in (
        '../data/la_training_lda_final.csv',
        '../data/la_testing_lda_final.csv',
        '../data/Los_Angeles_feat.csv',
    )
)

In [3]:
# Attach the NLP features to each LDA split by matching on the date index.
# A left join keeps exactly the rows (dates) of the LDA frame.
X_train = train_lda.join(nlp_data, how='left')
X_test = test_lda.join(nlp_data, how='left')

In [4]:
# Sanity check: (rows, columns) of the joined train and test feature frames.
X_train.shape, X_test.shape

((292, 19), (73, 19))

In [5]:
# Ground-truth table for Los Angeles. The following cells read its
# 'OCCURRED_ON_DATE' column and sum every column after the first one.
gt = pd.read_csv('../data/LA22_groundtruth.csv')

In [6]:
# Row total over every column after the first one (presumably the first column
# is the timestamp and the rest are per-category counts -- verify against the CSV).
# 'Total' and 'date' are excluded by name so that re-running this cell on the
# already-augmented frame does not fold the previous total back into the sum.
count_cols = [col for col in gt.columns[1:] if col not in ('Total', 'date')]
gt['Total'] = gt[count_cols].sum(axis=1)
# Keep the first 10 characters of the timestamp string as the join key
# (presumably 'YYYY-MM-DD' -- TODO confirm the timestamp format).
gt['date'] = gt['OCCURRED_ON_DATE'].str[:10]

In [7]:
# Index the ground truth by the 10-character date string so it can be aligned
# with the feature frames below.
# NOTE: not idempotent -- set_index moves 'date' out of the columns, so running
# this cell a second time on the same frame raises KeyError.
gt = gt.set_index('date')

In [8]:
# Targets: the 'Total' column looked up for exactly the dates present in each
# feature frame (dates missing from the ground truth come back as NaN).
Y_train = gt['Total'].reindex(X_train.index)
Y_test = gt['Total'].reindex(X_test.index)

In [9]:
# Generate the train/validation split for the n-th fold of k-fold cross validation.
def getdata_cv(Xtrain, Ytrain, k=5, n=0):
    """Return the n-th of k contiguous (unshuffled) cross-validation splits.

    The rows are cut into k consecutive folds of ``rows // k`` rows each; the
    last fold (``n == k - 1``) additionally absorbs any remainder rows. Fold
    ``n`` is held out and all remaining rows form the training part.

    Parameters
    ----------
    Xtrain : array-like
        Feature rows; the first axis indexes samples.
    Ytrain : array-like
        Targets with the same number of rows as ``Xtrain``.
    k : int, default 5
        Number of folds.
    n : int, default 0
        Zero-based index of the fold to hold out; must satisfy ``0 <= n < k``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch)``.

    Raises
    ------
    ValueError
        If ``k < 1``, ``n`` is outside ``[0, k)``, the inputs differ in row
        count, or there are fewer rows than folds.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    # Without this check an out-of-range n would silently yield an empty or
    # wrong held-out fold instead of failing.
    if not 0 <= n < k:
        raise ValueError(f"n must satisfy 0 <= n < k (k={k}), got {n}")

    # Accept DataFrames / Series / lists as well as ndarrays.
    Xtrain = np.asarray(Xtrain)
    Ytrain = np.asarray(Ytrain)
    n_rows = Xtrain.shape[0]
    if Ytrain.shape[0] != n_rows:
        raise ValueError(
            f"Xtrain and Ytrain disagree on row count: {n_rows} vs {Ytrain.shape[0]}"
        )

    batch_size = int(n_rows // k)
    if batch_size == 0:
        raise ValueError(f"cannot split {n_rows} rows into {k} folds")

    start_idx = n * batch_size
    # The final fold runs to the end so remainder rows are never dropped.
    end_idx = n_rows if n == k - 1 else start_idx + batch_size

    Xtest_batch = Xtrain[start_idx:end_idx]
    Ytest_batch = Ytrain[start_idx:end_idx]
    Xtrain_batch = np.concatenate((Xtrain[:start_idx], Xtrain[end_idx:]))
    Ytrain_batch = np.concatenate((Ytrain[:start_idx], Ytrain[end_idx:]))
    return Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch


In [10]:
# Candidate regularisation strengths (Lasso `alpha`) tried in the grid search.
alpha = [0.1, 0.3, 0.5, 1, 10]
# Relative error tolerances for the accuracy metric: a prediction counts as
# correct when it lies within +/- this fraction of the ground-truth value.
percentage = [0.05, 0.1, 0.2]

In [11]:
def accuracy(pred, gt, percentage=0.05):
    """Fraction of predictions that fall within a relative band around the truth.

    A prediction is counted as correct when it lies in the closed interval
    ``[gt - |gt| * percentage, gt + |gt| * percentage]``.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    gt : array-like
        Ground-truth values, element-wise comparable with ``pred``.
    percentage : float, default 0.05
        Allowed relative deviation (0.05 means +/- 5 %).

    Returns
    -------
    float
        Mean of the element-wise "within band" indicator.
    """
    # Scale the tolerance by |gt|: with a signed gt the band would be inverted
    # (upper < lower) for negative targets and no prediction could ever match.
    # For non-negative targets this is identical to gt * percentage.
    tolerance = np.abs(gt) * percentage
    gt_upper = gt + tolerance
    gt_lower = gt - tolerance
    return np.mean((pred <= gt_upper) & (pred >= gt_lower))

In [12]:
def train_lasso(Xtrain, Ytrain, alpha, percentage, k=5):
    """Cross-validate a Lasso model and return its fold-averaged scores.

    For each of the ``k`` folds produced by ``getdata_cv`` a fresh
    ``Lasso(alpha=alpha)`` is fitted on the training part and scored on the
    held-out part.

    Parameters
    ----------
    Xtrain, Ytrain
        Full training features and targets, passed through to ``getdata_cv``.
    alpha : float
        Lasso regularisation strength.
    percentage : float
        Relative tolerance forwarded to ``accuracy``.
    k : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(mean MSE over folds, mean accuracy over folds)``.
    """
    total_mse = 0
    total_acc = 0
    for fold in range(k):
        fit_X, fit_Y, held_X, held_Y = getdata_cv(Xtrain, Ytrain, k=k, n=fold)
        lasso = Lasso(alpha=alpha)
        lasso.fit(fit_X, fit_Y)
        held_pred = lasso.predict(held_X)
        total_mse = total_mse + mean_squared_error(held_pred, held_Y)
        total_acc = total_acc + accuracy(held_pred, held_Y, percentage)
    return total_mse / k, total_acc / k

In [13]:
# Grid search: cross-validate every (alpha, percentage) combination.
# Rows of both result tables follow `alpha`, columns follow `percentage`.
# The MSE does not depend on `percentage`, so each row of result_loss repeats.
grid_shape = (len(alpha), len(percentage))
result_loss = np.zeros(grid_shape)
result_acc = np.zeros(grid_shape)
for i, alpha_value in enumerate(alpha):
    for j, tolerance in enumerate(percentage):
        loss, acc = train_lasso(X_train.to_numpy(), Y_train.to_numpy(), alpha_value, tolerance)
        result_loss[i, j] = loss
        result_acc[i, j] = acc


In [14]:
# Cross-validated mean squared error: one row per alpha, one column per percentage.
result_loss

array([[44130.47845274, 44130.47845274, 44130.47845274],
       [43705.12911651, 43705.12911651, 43705.12911651],
       [43591.147594  , 43591.147594  , 43591.147594  ],
       [43411.30348354, 43411.30348354, 43411.30348354],
       [43349.891062  , 43349.891062  , 43349.891062  ]])

In [15]:
# Cross-validated accuracy: one row per alpha, one column per percentage.
result_acc

array([[0.37321839, 0.62022989, 0.86287356],
       [0.37333333, 0.62367816, 0.86954023],
       [0.36643678, 0.62022989, 0.87298851],
       [0.36988506, 0.62022989, 0.87298851],
       [0.36988506, 0.62022989, 0.87298851]])

## Evaluate the model on the test set (Los Angeles)

In [17]:
# Hold-out evaluation: refit on the full training set with one fixed
# regularisation strength and score on the test set.
# The value is kept under its own name so the `alpha` grid list is not
# overwritten by a scalar, which would break a re-run of the grid-search cell.
# (0.3 was presumably picked from the cross-validation tables above.)
best_alpha = 0.3

model = Lasso(alpha=best_alpha).fit(X_train, Y_train)
pred = model.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred).
mse_loss = mean_squared_error(Y_test, pred)
print(mse_loss)
# One accuracy figure per tolerance, in the order listed in `percentage`.
for tolerance in percentage:
    acc = accuracy(pred, Y_test, tolerance)
    print(acc)

32567.119867643407
0.547945205479452
0.684931506849315
0.8767123287671232


## Boston dataset

In [18]:
# Boston inputs: LDA features for the train/test split plus the NLP feature
# table. Every CSV is indexed by its 'date' column.
train_lda, test_lda, nlp_data = (
    pd.read_csv(csv_path, index_col=['date'])
    for csv_path in (
        '../data/b_training_lda_final.csv',
        '../data/b_testing_lda_final.csv',
        '../data/Boston_feat.csv',
    )
)

In [19]:
# Attach the NLP features to each LDA split by matching on the date index.
# A left join keeps exactly the rows (dates) of the LDA frame.
X_train = train_lda.join(nlp_data, how='left')
X_test = test_lda.join(nlp_data, how='left')

In [28]:
# Boston ground truth: add a row total over every column after the first and a
# 10-character date key taken from the timestamp string, then index by that key.
gt = pd.read_csv('../data/Boston22_groundtruth.csv')
gt = gt.assign(
    Total=gt.iloc[:, 1:].sum(axis=1),
    date=gt['OCCURRED_ON_DATE'].str[:10],
).set_index('date')
# Targets: the total looked up for exactly the dates in each feature frame.
Y_train = gt['Total'].reindex(X_train.index)
Y_test = gt['Total'].reindex(X_test.index)

In [30]:
# Candidate regularisation strengths (Lasso `alpha`) tried in the grid search.
alpha = [0.1, 0.3, 0.5, 1, 10]
# Relative error tolerances for the accuracy metric (fraction of the ground truth).
percentage = [0.05, 0.1, 0.2]

In [31]:

# Grid search on the Boston data: cross-validate every (alpha, percentage) pair.
# Rows of both result tables follow `alpha`, columns follow `percentage`.
# The MSE does not depend on `percentage`, so each row of result_loss repeats.
grid_shape = (len(alpha), len(percentage))
result_loss = np.zeros(grid_shape)
result_acc = np.zeros(grid_shape)
for i, alpha_value in enumerate(alpha):
    for j, tolerance in enumerate(percentage):
        loss, acc = train_lasso(X_train.to_numpy(), Y_train.to_numpy(), alpha_value, tolerance)
        result_loss[i, j] = loss
        result_acc[i, j] = acc

In [32]:
# Cross-validated mean squared error: one row per alpha, one column per percentage.
result_loss

array([[2805.1539822 , 2805.1539822 , 2805.1539822 ],
       [2788.73622457, 2788.73622457, 2788.73622457],
       [2771.45642894, 2771.45642894, 2771.45642894],
       [2765.81905737, 2765.81905737, 2765.81905737],
       [2759.95864149, 2759.95864149, 2759.95864149]])

In [33]:
# Cross-validated accuracy: one row per alpha, one column per percentage.
result_acc

array([[0.30137931, 0.56528736, 0.86655172],
       [0.27724138, 0.57563218, 0.86310345],
       [0.28758621, 0.58597701, 0.85965517],
       [0.29103448, 0.58597701, 0.85965517],
       [0.28413793, 0.58942529, 0.85632184]])

In [36]:
# Hold-out evaluation on Boston: refit on the full training set with one fixed
# regularisation strength and score on the test set.
# The value is kept under its own name so the `alpha` grid list is not
# overwritten by a scalar, which would break a re-run of the grid-search cell.
# (0.1 was presumably picked from the cross-validation tables above.)
best_alpha = 0.1

model = Lasso(alpha=best_alpha).fit(X_train, Y_train)
pred = model.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred).
mse_loss = mean_squared_error(Y_test, pred)
print(mse_loss)
# One accuracy figure per tolerance, in the order listed in `percentage`.
for tolerance in percentage:
    acc = accuracy(pred, Y_test, tolerance)
    print(acc)

2340.1657161434105
0.2328767123287671
0.589041095890411
0.9041095890410958
