In [66]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [8]:
# Load the training table, the testing table and the Los_Angeles_feat table from
# ../data. Each file's 'date' column becomes the row index so the tables can be
# aligned by date later on.
# NOTE(review): judging by the file names these are presumably LDA topic features
# and additional NLP features — confirm against the data dictionary.
train_lda = pd.read_csv('../data/la_training_lda_final.csv', index_col=['date'])
test_lda = pd.read_csv('../data/la_testing_lda_final.csv', index_col=['date'])
nlp_data = pd.read_csv('../data/Los_Angeles_feat.csv', index_col=['date'])

In [60]:
# Attach the nlp_data columns to each split by matching on the shared 'date' index.
# DataFrame.join defaults to a left join, so every train/test row is kept and any
# date that is absent from nlp_data gets NaN in the added columns.
X_train = train_lda.join(nlp_data)
X_test = test_lda.join(nlp_data)

In [61]:
# Sanity check: (rows, columns) of the joined train and test feature tables.
X_train.shape, X_test.shape

((292, 19), (73, 19))

In [77]:
# Raw ground-truth table; later cells derive a per-row 'Total' and a 'date' key from it.
# NOTE(review): this cell's execution count (77) is higher than those of the cells
# that consume gt — confirm the saved outputs reproduce under Restart & Run All.
gt = pd.read_csv('../data/LA22_groundtruth.csv')

In [29]:
# Per-row total: sum of every numeric column after the first one.
# - numeric_only=True makes the exclusion of non-numeric columns explicit instead
#   of relying on pandas silently dropping them (deprecated, and a TypeError on
#   recent pandas versions).
# - An existing 'Total' column is left out of the sum so that re-running this cell
#   does not add the previous total to itself.
gt['Total'] = (
    gt.iloc[:, 1:]
    .drop(columns=['Total'], errors='ignore')
    .sum(axis=1, numeric_only=True)
)
# Date key: the first 10 characters of OCCURRED_ON_DATE.
# NOTE(review): presumably a 'YYYY-MM-DD' prefix matching the feature tables' index — confirm.
gt['date'] = gt['OCCURRED_ON_DATE'].str[:10]

  gt['Total']= gt.iloc[:, 1:].sum(axis=1)


In [52]:
# Key the ground truth by 'date' so it can be aligned with the feature tables.
# The guard makes the cell safe to re-run once the index is already in place;
# a frame that has neither the 'date' index nor a 'date' column still raises KeyError.
if gt.index.name != 'date':
    gt = gt.set_index('date')

In [62]:
# Regression targets: the 'Total' column looked up at the dates of each feature table.
# Dates that do not occur in gt come back as NaN.
Y_train = gt['Total'].reindex(X_train.index)
Y_test = gt['Total'].reindex(X_test.index)

In [64]:
# generate data for n-th iteration for k-fold cross validation
def getdata_cv(Xtrain, Ytrain, k=5, n=0):
    """Return the n-th train/validation split of an unshuffled k-fold partition.

    The rows are cut into k contiguous folds of ``Xtrain.shape[0] // k`` rows
    each, in their existing order; the last fold (``n == k - 1``) additionally
    takes the leftover rows when the row count is not divisible by k.

    Parameters
    ----------
    Xtrain : numpy.ndarray
        2-D feature array, one row per sample (it is indexed as ``Xtrain[a:b, :]``).
    Ytrain : numpy.ndarray
        Targets, sliced along axis 0 in step with ``Xtrain``.
    k : int, default 5
        Number of folds; must be at least 2.
    n : int, default 0
        Zero-based index of the fold to hold out; must satisfy ``0 <= n < k``.

    Returns
    -------
    tuple
        ``(Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch)`` where the
        ``*test_batch`` arrays are the held-out fold and the ``*train_batch``
        arrays are all remaining rows, in their original order.

    Raises
    ------
    ValueError
        If ``k < 2``, ``n`` is outside ``[0, k)``, ``Xtrain`` and ``Ytrain``
        have different lengths, or there are fewer rows than folds.
    """
    n_samples = Xtrain.shape[0]
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")
    if not 0 <= n < k:
        raise ValueError(f"n must satisfy 0 <= n < k (k={k}), got {n}")
    if len(Ytrain) != n_samples:
        raise ValueError(
            f"Xtrain has {n_samples} rows but Ytrain has {len(Ytrain)} entries"
        )

    batch_size = n_samples // k
    if batch_size == 0:
        # Without this check the held-out fold would be empty.
        raise ValueError(f"cannot build {k} folds from {n_samples} samples")

    start_idx = n * batch_size
    # The last fold runs to the end so the remainder rows are not lost.
    end_idx = n_samples if n == k - 1 else start_idx + batch_size

    Xtest_batch = Xtrain[start_idx:end_idx, :]
    Ytest_batch = Ytrain[start_idx:end_idx]
    # Everything before and after the held-out block; either side may be empty
    # (first / last fold), which np.concatenate handles.
    Xtrain_batch = np.concatenate((Xtrain[:start_idx, :], Xtrain[end_idx:, :]))
    Ytrain_batch = np.concatenate((Ytrain[:start_idx], Ytrain[end_idx:]))
    return Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch


In [83]:
# Search grid.
# alpha: Lasso regularisation strengths to try.
# percentage: relative tolerances handed to accuracy() — a prediction counts as
# correct when it lies within +/- percentage of the ground truth. This is an
# evaluation setting, not a Lasso hyper-parameter.
alpha = [0.1, 0.3, 0.5, 1, 10]
percentage = [0.05, 0.1, 0.2]

In [73]:
def accuracy(pred, gt, percentage=0.05):
    """Fraction of predictions that fall within a relative band around the truth.

    A prediction counts as correct when it lies in
    ``[gt - |gt| * percentage, gt + |gt| * percentage]`` (both bounds inclusive).
    Using ``|gt|`` keeps the band well-formed for negative ground-truth values;
    for non-negative values the bounds are the same as ``gt -/+ gt * percentage``.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    gt : array-like
        Ground-truth values, compared with ``pred`` position by position.
    percentage : float, default 0.05
        Half-width of the accepted band as a fraction of ``|gt|``.

    Returns
    -------
    numpy.float64
        Share of positions whose prediction is inside the band. NaN entries
        never count as correct; an empty input yields NaN.
    """
    # Work on plain arrays so lists are accepted and pandas objects are compared
    # by position rather than by index label.
    pred = np.asarray(pred)
    gt = np.asarray(gt)
    tolerance = np.abs(gt) * percentage
    within_band = (pred <= gt + tolerance) & (pred >= gt - tolerance)
    return np.mean(within_band)

In [74]:
def train_lasso(Xtrain, Ytrain, alpha, percentage, k=5):
    """Cross-validate a Lasso model and return its mean loss and mean accuracy.

    For each of the k folds produced by ``getdata_cv`` a fresh ``Lasso(alpha=alpha)``
    is fitted on the training part and scored on the held-out part.

    Parameters
    ----------
    Xtrain : numpy.ndarray
        2-D feature array passed through to ``getdata_cv``.
    Ytrain : numpy.ndarray
        Targets passed through to ``getdata_cv``.
    alpha : float
        Regularisation strength given to ``Lasso``.
    percentage : float
        Relative tolerance given to ``accuracy``; it does not influence the
        fitted model or the returned loss.
    k : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(mean_mse, mean_accuracy)`` averaged over the k folds.
    """
    loss_sum = 0
    acc_sum = 0
    for fold in range(k):
        Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch = getdata_cv(
            Xtrain, Ytrain, k=k, n=fold
        )
        model = Lasso(alpha=alpha).fit(Xtrain_batch, Ytrain_batch)
        pred = model.predict(Xtest_batch)
        # scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the
        # value is the same as with the arguments swapped.
        loss_sum += mean_squared_error(Ytest_batch, pred)
        acc_sum += accuracy(pred, Ytest_batch, percentage)
    return loss_sum / k, acc_sum / k

In [84]:
# Grid search: cross-validate every (alpha, percentage) pair and store the mean
# loss / accuracy at [alpha index, percentage index].
result_loss, result_acc = (np.zeros((len(alpha), len(percentage))) for _ in range(2))
# np.ndindex walks the grid row by row: all percentages for the first alpha, then the next alpha.
for i, j in np.ndindex(*result_loss.shape):
    loss, acc = train_lasso(X_train.to_numpy(), Y_train.to_numpy(), alpha[i], percentage[j])
    result_loss[i, j], result_acc[i, j] = loss, acc


In [85]:
# Mean cross-validated MSE: one row per alpha, one column per percentage.
# The columns are identical because percentage only enters the accuracy metric,
# not the fitted model or the loss.
result_loss

array([[176587.71953979, 176587.71953979, 176587.71953979],
       [175902.57532253, 175902.57532253, 175902.57532253],
       [174976.18705472, 174976.18705472, 174976.18705472],
       [174364.59037601, 174364.59037601, 174364.59037601],
       [173535.56509318, 173535.56509318, 173535.56509318]])

In [86]:
# Mean cross-validated accuracy: one row per alpha, one column per percentage.
result_acc

array([[0.37321839, 0.61011494, 0.86287356],
       [0.38344828, 0.61678161, 0.86965517],
       [0.37678161, 0.62022989, 0.86954023],
       [0.36643678, 0.62022989, 0.87298851],
       [0.36988506, 0.62022989, 0.87298851]])