In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [2]:
import os
# Record and show the kernel's working directory (the directory the notebook
# was launched from).
cwd = os.getcwd()
print(cwd)

/Users/jessica/Desktop/CSE8803IUC-main


In [3]:
# Build the data paths from the working directory captured in `cwd` instead of
# a machine-specific absolute path, so the notebook runs on any checkout.
# Assumes the notebook is started from the project root, i.e. the directory
# that contains the `data/` folder.
DATA_DIR = os.path.join(cwd, 'data')

# All three tables are indexed by their 'date' column so they can be aligned
# on it later.
train_lda = pd.read_csv(os.path.join(DATA_DIR, 'la_training_lda_final.csv'), index_col=['date'])
test_lda = pd.read_csv(os.path.join(DATA_DIR, 'la_testing_lda_final.csv'), index_col=['date'])
nlp_data = pd.read_csv(os.path.join(DATA_DIR, 'Los_Angeles_feat.csv'), index_col=['date'])

In [4]:
# Attach the NLP feature columns to each LDA split. All frames are indexed by
# 'date', so DataFrame.join (a left join by default) matches rows on that index
# and keeps exactly the dates of the LDA split.
X_train, X_test = (lda_split.join(nlp_data) for lda_split in (train_lda, test_lda))

In [5]:
# Sanity check: (rows, columns) of the joined train and test feature frames.
X_train.shape, X_test.shape

((292, 19), (73, 19))

In [6]:
# Load the ground-truth table from the project's data folder. The path is built
# from `cwd` rather than hard-coded to one user's home directory; this assumes
# the notebook is started from the project root that contains `data/`.
gt = pd.read_csv(os.path.join(cwd, 'data', 'LA22_groundtruth.csv'))

In [7]:
# Row-wise total over every column except the first one, which is skipped by
# position (presumably the OCCURRED_ON_DATE timestamp -- verify against the CSV).
# The derived 'Total' and 'date' columns are dropped before summing so that
# re-running this cell does not fold them back into the total.
count_columns = gt.iloc[:, 1:].drop(columns=['Total', 'date'], errors='ignore')
gt['Total'] = count_columns.sum(axis=1)

# Keep the first 10 characters of the timestamp string as the join key
# (assumes a leading 'YYYY-MM-DD' date -- TODO confirm the format).
gt['date'] = gt['OCCURRED_ON_DATE'].str[:10]

In [8]:
# Index the ground truth by the 'date' string so it can be aligned with the
# feature frames. The guard keeps the cell re-runnable: once 'date' has been
# moved into the index it is no longer a column, and calling set_index again
# would raise a KeyError.
if 'date' in gt.columns:
    gt = gt.set_index('date')

In [9]:
# Select the 'Total' target for the dates of each feature split, in the same
# row order as the features. Dates absent from the ground truth come back as NaN.
daily_total = gt['Total']
Y_train, Y_test = daily_total.reindex(X_train.index), daily_total.reindex(X_test.index)

In [10]:
# generate data for n-th iteration for k-fold cross validation
def getdata_cv(Xtrain, Ytrain, k=5, n=0):
    """Return the n-th train/validation split of a k-fold cross validation.

    The samples are cut into ``k`` contiguous folds in their existing order
    (no shuffling). Each of the first ``k - 1`` folds holds
    ``len(Xtrain) // k`` samples; the last fold also absorbs the remainder.

    Args:
        Xtrain: Array-like of samples, indexed by sample along the first axis.
        Ytrain: Array-like of targets with one entry per sample in ``Xtrain``.
        k: Number of folds.
        n: Zero-based index of the fold to hold out.

    Returns:
        Tuple ``(Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch)`` of
        NumPy arrays. The ``*test*`` entries are the held-out fold and the
        ``*train*`` entries are all remaining samples.

    Raises:
        ValueError: If ``Xtrain`` and ``Ytrain`` differ in length, if ``k`` is
            not between 1 and the number of samples, or if ``n`` is not in
            ``range(k)``.
    """
    Xtrain = np.asarray(Xtrain)
    Ytrain = np.asarray(Ytrain)
    n_samples = Xtrain.shape[0]

    if Ytrain.shape[0] != n_samples:
        raise ValueError(
            "Xtrain and Ytrain must have the same number of samples, got %d and %d"
            % (n_samples, Ytrain.shape[0])
        )
    # Without these checks an out-of-range fold index or an oversized k would
    # silently produce empty (or overlapping) folds instead of failing.
    if not 1 <= k <= n_samples:
        raise ValueError("k must be between 1 and the number of samples (%d), got %r" % (n_samples, k))
    if not 0 <= n < k:
        raise ValueError("n must be in range(%r), got %r" % (k, n))

    batch_size = n_samples // k
    start = n * batch_size
    # The last fold runs to the end so the remainder samples are not dropped.
    stop = n_samples if n == k - 1 else start + batch_size

    Xtest_batch = Xtrain[start:stop]
    Ytest_batch = Ytrain[start:stop]
    # Everything before and after the held-out fold forms the training part.
    Xtrain_batch = np.concatenate((Xtrain[:start], Xtrain[stop:]))
    Ytrain_batch = np.concatenate((Ytrain[:start], Ytrain[stop:]))
    return Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch


In [11]:
# Relative tolerance levels handed to accuracy(): a prediction counts as
# correct when it lies within +/-5%, +/-10% or +/-20% of the ground-truth value.
percentage = [0.05, 0.1, 0.2]

In [12]:
def accuracy(pred, gt, percentage=0.05):
    """Return the fraction of predictions within a relative tolerance of the truth.

    A prediction is a hit when it lies in the closed interval spanned by
    ``gt - gt * percentage`` and ``gt + gt * percentage``. The two bounds are
    ordered explicitly, so negative ground-truth values get a valid interval
    instead of an inverted (always empty) one.

    Args:
        pred: Array-like of predicted values.
        gt: Array-like of ground-truth values, compared with ``pred``
            element by element (by position, not by index label).
        percentage: Relative half-width of the tolerance band, e.g. ``0.05``
            for +/-5%.

    Returns:
        The mean of the per-element hit indicators, a float in ``[0, 1]``.
    """
    pred = np.asarray(pred)
    gt = np.asarray(gt)
    margin = gt * percentage
    bound_plus = gt + margin
    bound_minus = gt - margin
    # For gt < 0, "gt + margin" is the smaller bound, so sort them element-wise.
    lower = np.minimum(bound_plus, bound_minus)
    upper = np.maximum(bound_plus, bound_minus)
    return np.mean((pred >= lower) & (pred <= upper))

In [13]:
from sklearn.naive_bayes import GaussianNB
#clf = GaussianNB()
#clf.fit(X, Y)
def train_NB(Xtrain, Ytrain, percentage, k=5):
    """Cross-validate a Gaussian Naive Bayes model over ``k`` contiguous folds.

    For each fold produced by ``getdata_cv`` a fresh ``GaussianNB`` is fitted
    on the training part and evaluated on the held-out part. The per-fold MSE
    and the mean MSE over all folds are printed.

    Args:
        Xtrain: Feature array, one row per sample.
        Ytrain: Target array, one entry per sample.
        percentage: Relative tolerance forwarded to ``accuracy``.
        k: Number of cross-validation folds.

    Returns:
        Tuple ``(mean_mse, mean_accuracy)`` averaged over the ``k`` folds.
    """
    loss_sum = 0
    acc_sum = 0
    for i in range(k):
        Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch = getdata_cv(Xtrain, Ytrain, k=k, n=i)
        # NOTE(review): GaussianNB is a classifier, so each distinct target
        # value is treated as a class label and predictions can only be
        # values seen during fitting -- confirm this is intended for a count target.
        model = GaussianNB().fit(Xtrain_batch, Ytrain_batch)
        pred = model.predict(Xtest_batch)
        # scikit-learn's signature is (y_true, y_pred); compute the MSE once
        # and reuse it for both the log line and the running sum.
        fold_mse = mean_squared_error(Ytest_batch, pred)
        print(fold_mse)
        loss_sum += fold_mse
        acc_sum += accuracy(pred, Ytest_batch, percentage)
    # Average over the actual number of folds rather than a hard-coded 5, so
    # the printed mean matches the returned value for any k.
    mean_mse = loss_sum / k
    print("mean of MSE is", mean_mse)
    return mean_mse, acc_sum / k

In [14]:
# Fit Gaussian Naive Bayes on the full training split, then score it on the
# held-out test split: MSE first, followed by the accuracy at each tolerance.
model = GaussianNB()
model.fit(X_train, Y_train)
pred = model.predict(X_test)

mse_loss = mean_squared_error(pred, Y_test)
print(mse_loss)

for j, tolerance in enumerate(percentage):
    acc = accuracy(pred, Y_test, tolerance)
    print(acc)

34938.24657534246
0.5205479452054794
0.7808219178082192
0.8767123287671232


In [15]:
# Run the 5-fold Naive Bayes cross-validation once per tolerance level and
# store the mean MSE and mean accuracy of each run.
n_levels = len(percentage)
result_loss, result_acc = np.zeros(n_levels), np.zeros(n_levels)
for j, tolerance in enumerate(percentage):
    loss, acc = train_NB(X_train.to_numpy(), Y_train.to_numpy(), tolerance)
    result_loss[j], result_acc[j] = loss, acc


64199.03448275862
78012.20689655172
75515.5172413793
57053.24137931035
40180.26666666667
mean of MSE is 62992.05333333333
64199.03448275862
78012.20689655172
75515.5172413793
57053.24137931035
40180.26666666667
mean of MSE is 62992.05333333333
64199.03448275862
78012.20689655172
75515.5172413793
57053.24137931035
40180.26666666667
mean of MSE is 62992.05333333333


In [16]:
# Mean cross-validated MSE of Naive Bayes, one entry per tolerance level. The
# MSE does not depend on the tolerance, so all entries are expected to be equal.
result_loss

array([62992.05333333, 62992.05333333, 62992.05333333])

In [17]:
# Mean cross-validated accuracy of Naive Bayes at each tolerance in `percentage`.
result_acc

array([0.29425287, 0.55781609, 0.83885057])

In [18]:
from sklearn.neighbors import KNeighborsClassifier
def train_KNN(Xtrain, Ytrain, percentage, k=5):
    """Cross-validate a 5-nearest-neighbours classifier over ``k`` contiguous folds.

    For each fold produced by ``getdata_cv`` a fresh
    ``KNeighborsClassifier(n_neighbors=5)`` is fitted on the training part and
    evaluated on the held-out part. The per-fold MSE and the mean MSE over all
    folds are printed.

    Args:
        Xtrain: Feature array, one row per sample.
        Ytrain: Target array, one entry per sample.
        percentage: Relative tolerance forwarded to ``accuracy``.
        k: Number of cross-validation folds (unrelated to ``n_neighbors``).

    Returns:
        Tuple ``(mean_mse, mean_accuracy)`` averaged over the ``k`` folds.
    """
    loss_sum = 0
    acc_sum = 0
    for i in range(k):
        Xtrain_batch, Ytrain_batch, Xtest_batch, Ytest_batch = getdata_cv(Xtrain, Ytrain, k=k, n=i)
        # NOTE(review): KNeighborsClassifier treats each distinct target value
        # as a class label, so predictions can only be values seen during
        # fitting -- confirm this is intended for a count target.
        model = KNeighborsClassifier(n_neighbors=5).fit(Xtrain_batch, Ytrain_batch)
        pred = model.predict(Xtest_batch)
        # scikit-learn's signature is (y_true, y_pred); compute the MSE once
        # and reuse it for both the running sum and the log line.
        fold_mse = mean_squared_error(Ytest_batch, pred)
        loss_sum += fold_mse
        acc_sum += accuracy(pred, Ytest_batch, percentage)
        print(fold_mse)
    # Average over the actual number of folds rather than a hard-coded 5, so
    # the printed mean matches the returned value for any k.
    mean_mse = loss_sum / k
    print("mean of MSE is", mean_mse)
    return mean_mse, acc_sum / k

In [19]:
# Run the 5-fold KNN cross-validation once per tolerance level and store the
# mean MSE and mean accuracy of each run (this overwrites the earlier arrays).
n_levels = len(percentage)
result_loss, result_acc = np.zeros(n_levels), np.zeros(n_levels)
for j, tolerance in enumerate(percentage):
    loss, acc = train_KNN(X_train.to_numpy(), Y_train.to_numpy(), tolerance)
    result_loss[j], result_acc[j] = loss, acc

89042.4827586207
79570.06896551725
91517.6551724138
67925.10344827586
43908.73333333333
mean of MSE is 74392.80873563219
89042.4827586207
79570.06896551725
91517.6551724138
67925.10344827586
43908.73333333333
mean of MSE is 74392.80873563219
89042.4827586207
79570.06896551725
91517.6551724138
67925.10344827586
43908.73333333333
mean of MSE is 74392.80873563219


In [20]:
# Mean cross-validated MSE of KNN, one entry per tolerance level. The MSE does
# not depend on the tolerance, so all entries are expected to be equal.
result_loss

array([74392.80873563, 74392.80873563, 74392.80873563])

In [21]:
# Mean cross-validated accuracy of KNN at each tolerance in `percentage`.
result_acc

array([0.25666667, 0.54402299, 0.82482759])