<a href="https://colab.research.google.com/github/shyDaniel/PM2.5_prediction_regression/blob/master/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PREDICTING PM2.5 WITH LINEAR REGRESSION, TECHINIQUES INCLUDING FEATURE ENGINEERING, CROSS-VALIDATION, NORMALIZATION, AND ADAGRAD GRADIENT OPTIMIZATION.**

Kaggle: https://www.kaggle.com/c/ml2020spring-hw1/overview

Hanyu Song 03/19/2020

In [148]:
import pandas as pd
import numpy as np
from google.colab import drive
# import train and test from google drive
! gdown --id '1JnF9biNzFqx5_9RKzCKPKgHPggtDHhue'
test = pd.read_csv('./test.csv', encoding = 'big5')
! gdown --id '1VR_MKDGwhexEThy4VEZudoN0zKgNRpys'
train = pd.read_csv('./train.csv', encoding = 'big5')

Downloading...
From: https://drive.google.com/uc?id=1JnF9biNzFqx5_9RKzCKPKgHPggtDHhue
To: /content/test.csv
100% 197k/197k [00:00<00:00, 74.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1VR_MKDGwhexEThy4VEZudoN0zKgNRpys
To: /content/train.csv
100% 466k/466k [00:00<00:00, 61.7MB/s]


## **FEATURE ENGINEERING FOR TRAINING DATASET**



In [0]:
# delete unnecessary columns, fill in NR values, change to numpy form
train = train.iloc[:, 3:]
train[train == 'NR'] = 0
train_data = train.to_numpy()

In [0]:
# now split train data into 12 (months) blocks in which each block contains 18 (features )*480 (24 hours * 20 days) info
monthly_train = {}
for month in range(12):
  temp = np.empty((18, 480))
  for day in range(20):
    temp[:, day*24 : (day + 1)*24] = train_data[18 * (20*month + day) : 18 * (20 * month + day + 1), :]
  monthly_train[month] = temp;

In [0]:
# we will use previous 9 hours to data to predict the next, which is the 10th hour of pm2.5 level. 
# Therefore, out of the 480 hours we have for a given month, we have 471 sets of data (480 - 9) that can be used for training.
# y shape: (12*471) * 1
# x shape: (12*471) * (9*18)
# weight shape: (9*18) * 1
# y = x*w

x = np.empty((12*471, 9*18), dtype = float)
y = np.empty((12*471, 1), dtype = float)

for month in range(12):
  for day in range(20):
    for hour in range(24):
      if day == 0 and hour < 9:
          continue
      y[month * 471 + day * 24 + hour - 9, 0] = monthly_train[month][9, day * 24 + hour] #PM2.5 is on row 9

for month in range(12):
  for day in range(20):
    for hour in range(24):
      if day == 19 and hour > 14: 
        continue
      x[month * 471 + day * 24 + hour, :] = monthly_train[month][:, day * 24 + hour : day * 24 + hour + 9].reshape(1, -1)

## **NORMALIZATION**

In [0]:
# for each of the 18 features, compute their mean and std 
# then use newdata = (data - mean) /std to update x
mean_x = np.mean(x, axis = 0) #18 * 9 
std_x = np.std(x, axis = 0) #18 * 9 
normed_x = np.empty(x.shape, dtype = float)
for i in range(len(x)): #12 * 471
    for j in range(len(x[0])): #18 * 9 
        if std_x[j] != 0:
            normed_x[i][j] = (x[i][j] - mean_x[j]) / std_x[j]

## **CROSS-VALIDATION**

In [0]:
# This is an illustration of how we would split dataset into 4:1 train and validate
# x_train_set = x[: math.floor(len(normed_x) * 0.8), :]
# y_train_set = y[: math.floor(len(y) * 0.8), :]
# x_validation = x[math.floor(len(normed_x) * 0.8): , :]
# y_validation = y[math.floor(len(y) * 0.8): , :]

In [132]:
import math
import random as rand

dim = 9 * 18 + 1 # 9 * 18 features with 1 more constant
learning_rate = 100
iter_time = 10000
adagrad = np.zeros([dim, 1])
eps = 0.0000000001
sum_loss = 0
k_fold = 5

# Do 5-fold cross-validation on the normed_x and y
for i in range(k_fold):
  w = np.zeros((dim, 1))
  # create training and validating data for each iteration
  x_train_set = np.concatenate((normed_x[: math.floor(len(normed_x) * (0.2*i)), :], normed_x[math.floor(len(normed_x) * 0.2*(i + 1)) :, :]))
  x_validate_set = normed_x[math.floor(len(normed_x) * (0.2*i)) : math.floor(len(normed_x) * 0.2* (i + 1))]
  y_train_set = np.concatenate((y[:math.floor(len(y) * (0.2*i)), :], y[math.floor(len(y) * 0.2*(i + 1)):, :]))
  y_validate_set = y[math.floor(len(y) * (0.2*i)) : math.floor(len(y) * 0.2* (i + 1))]
 
  # x train plus one row of constant (to test the weight for the constant term)
  temp_x = np.concatenate((np.ones((x_train_set.shape[0], 1)), x_train_set), axis = 1)
  x_validate_set = np.concatenate((np.ones((x_validate_set.shape[0], 1)), x_validate_set), axis = 1)
  gradient = np.zeros((dim, 1))

  for t in range(iter_time):
    loss = np.sqrt(np.sum(np.power(np.dot(temp_x, w) - y_train_set, 2))/471/12)#rmse
    if(t%1000==0):
        print("  " + str(t) + ":" + str(loss))
    # gradient descent
    gradient = 2 * np.dot(temp_x.transpose(), np.dot(temp_x, w) - y_train_set) #dim*1

    # if (gradient){
    #     break;
    # }

    #adagrad gradient optimization
    adagrad += gradient ** 2
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
  
  final_w += w
  print("for the ", i,"th time, the parameter is roughly ", w[rand.randrange(1,18),0])
  valid_loss = np.sqrt(np.sum(np.power(np.dot(x_validate_set, w) - y_validate_set, 2))/471/12)
  print("the loss for the ", i,"th time validate is ", valid_loss)
  sum_loss += valid_loss

print('average loss for the 5-fold validation is: ', sum_loss/k_fold)

  0:23.191031803732383
  1000:5.137467846585117
  2000:5.013471278970329
  3000:4.983252584388589
  4000:4.973801776031307
  5000:4.970552490353995
  6000:4.969319033100226
  7000:4.968777493908696
  8000:4.968488356334668
  9000:4.9682996659653735
for the  0 th time, the parameter is roughly  1.1562716953101948
the loss for the  0 th time validate is  2.8254917029891926
  0:22.645491701640278
  1000:5.10400672452956
  2000:5.003220935378245
  3000:4.975611900716189
  4000:4.9665335977422895
  5000:4.963003843471492
  6000:4.961296626266485
  7000:4.9602632884047395
  8000:4.959522157689649
  9000:4.958932469077052
for the  1 th time, the parameter is roughly  0.031266346959048716
the loss for the  1 th time validate is  2.86800810924602
  0:25.17146085577053
  1000:5.060297149531589
  2000:4.968960727766671
  3000:4.943072167072916
  4000:4.934112601919547
  5000:4.93073239873075
  6000:4.929324609804323
  7000:4.928642959031936
  8000:4.9282417241696805
  9000:4.927957891395802
for t

## **TRAINING**

In [146]:
dim = 9 * 18 + 1 # 9 * 18 features with 1 more constant
learning_rate = 100
iter_time = 100000
adagrad = np.zeros([dim, 1])
eps = 0.0000000001
w = np.zeros([dim, 1])
normed_x = np.concatenate((np.ones([12 * 471, 1]), normed_x), axis = 1)

for t in range(iter_time):
    loss = np.sqrt(np.sum(np.power(np.dot(normed_x, w) - y, 2))/471/12)#rmse
    if(t%1000==0):
        print(str(t) + ":" + str(loss))
    gradient = 2 * np.dot(normed_x.transpose(), np.dot(normed_x, w) - y) #dim*1
    adagrad += gradient ** 2
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
np.save('model_weights.npy', w)

0:27.071214829194115
1000:7.0909686439472175
2000:6.02412917022559
3000:5.786921459274559
4000:5.719646593920436
5000:5.697327465527851
6000:5.6890026614252855
7000:5.685550743411822
8000:5.683950654084062
9000:5.683112196533551
10000:5.682612389290917
11000:5.68227557001515
12000:5.682023846319957
13000:5.681820475817497
14000:5.681647109858766
15000:5.6814940870334425
16000:5.6813560499652676
17000:5.68122986139529
18000:5.681113569134227
19000:5.6810058734465585
20000:5.680905845208106
21000:5.6808127734060685
22000:5.680726081059967
23000:5.680645278038106
24000:5.680569934064305
25000:5.680499662904217
26000:5.680434112806312
27000:5.680372960479074
28000:5.680315907092202
29000:5.680262675455349
30000:5.680213007897699
31000:5.68016666457814
32000:5.680123422071689
33000:5.680083072142897
34000:5.6800454206538085
35000:5.680010286574953
36000:5.679977501079685
37000:5.679946906709083
38000:5.6799183565985905
39000:5.679891713759908
40000:5.679866850413115
41000:5.679843647364887


## **FEATURE ENGINEERING FOR TESTING DATASET**

In [0]:
test = test.iloc[:, 2:]
test[test == 'NR'] = 0
test = test.to_numpy()

# noticed that test data lacked the last row of data, which is CH4, manually fill them with 1.7
add = np.ones((1,9))
for i in range(add.shape[1]):
  add[0,i] = 1.7
test = np.concatenate((test, add))
test_x = np.empty([240, 18*9], dtype = float)
for i in range(240):
    test_x[i, :] = test[18 * i: 18* (i + 1), :].reshape(1, -1)

## **NORMALIZATION FOR TESTING DATASET**

In [0]:
mean_test = np.mean(test_x, axis = 0)
std_test = np.std(test_x, axis = 0)
normed_test_x = np.empty(test_x.shape, dtype = float)

for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_test[j] != 0:
            normed_test_x[i][j] = (test_x[i][j] - mean_test[j]) / std_test[j]
normed_test_x = np.concatenate((np.ones([240, 1]), normed_test_x), axis = 1)

## **PREDICTION**

In [152]:
test_w = np.load('model_weights.npy')
ans_y = np.dot(normed_test_x, test_w)
ans_y

array([[ 1.17980716e+01],
       [ 2.40567956e+01],
       [ 3.91182932e+01],
       [ 2.74206837e+01],
       [ 3.32827692e+01],
       [ 1.20404011e+01],
       [ 1.73540086e+00],
       [ 1.11373897e+01],
       [ 3.29420529e+01],
       [-2.07246583e+01],
       [ 2.29510790e+01],
       [ 2.15953050e+01],
       [ 2.42138181e+01],
       [-7.92653083e+00],
       [ 1.92617675e+01],
       [ 1.45647040e+01],
       [ 1.35981073e+01],
       [ 5.82723395e+01],
       [-4.81168199e+00],
       [ 1.68981752e+01],
       [ 1.93255854e+01],
       [ 8.82347710e+00],
       [ 2.97535402e+01],
       [ 2.55270178e+01],
       [ 2.71937558e+01],
       [ 2.05830600e+01],
       [ 7.15740994e+00],
       [ 8.42083843e+00],
       [ 1.21472043e+01],
       [-1.28291766e+01],
       [ 1.65022314e+01],
       [ 6.76819920e+00],
       [ 2.23917099e+01],
       [ 2.22148320e+01],
       [ 1.94947409e+01],
       [ 2.65029575e+01],
       [ 1.88337417e+01],
       [-2.15983959e+00],
       [-8.0

In [0]:
df = pd.DataFrame(np.empty((240, 2)), index = np.arange(240) + 1, columns = ['id', 'value'])
for i in range(240):
  df.iloc[i, 0] = 'id_' + str(i)
df.iloc[:, 1] = ans_y
df.to_csv('submission.csv', index = False)