In [1]:
import pandas as pd
import numpy as np

# Read the training CSV, keeping only file columns 2..26 (the item-name
# column followed by the hourly reading columns).
df = pd.read_csv(
    "train_updated.csv",
    encoding="utf-8",
    usecols=range(2, 27),
)
# train_data is the complete table with every measured item; each 'NR'
# placeholder is swapped for 0.0 so the readings can be cast to float later.
train_data = df.replace(to_replace='NR', value=0.0)

In [2]:
# Display the cleaned training table to inspect its layout.
# NOTE(review): this renders the whole frame; train_data.head() would keep
# the saved output short.
train_data

Unnamed: 0,測項,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,AMB_TEMP,14,14,14,13,12,12,12,12,15,...,22,22,21,19,17,16,15,15,15,15
1,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,0.47,0.78,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,0.13,0.26,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,2.2,6.6,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
5,NO2,16,9.2,8.2,6.9,6.8,3.8,6.9,7.8,15,...,11,11,22,28,19,12,8.1,7,6.9,6
6,NOx,17,9.8,8.7,8.6,8.5,5.3,8.8,9.9,22,...,14,13,25,30,21,13,9.7,8.6,8.7,7.5
7,O3,16,30,27,23,24,28,24,22,21,...,65,64,51,34,33,34,37,38,38,36
8,PM10,56,50,48,35,25,12,4,2,11,...,52,51,66,85,85,63,46,36,42,42
9,PM2.5,26,39,36,35,31,28,25,20,19,...,36,45,42,49,45,44,41,30,24,13


In [3]:
# pm2_5 is the PM2.5-only subset of the full table: keep the rows whose item
# name is 'PM2.5', then drop the leading item-name column so only the hourly
# readings remain.
pm2_5 = train_data.loc[train_data['測項'].eq('PM2.5')].iloc[:, 1:]

In [4]:
# Display the PM2.5-only table (one row per selected record, one column per hour).
pm2_5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
9,26,39,36,35,31,28,25,20,19,30,...,36,45,42,49,45,44,41,30,24,13
27,21,23,30,30,22,18,13,13,11,22,...,53,43,43,45,46,32,16,19,22,26
45,19,25,27,20,16,14,15,8,4,9,...,32,36,34,45,40,41,23,29,23,37
63,27,27,14,20,22,24,26,33,48,50,...,62,55,56,67,78,83,90,75,85,82
81,80,80,76,81,75,66,70,65,66,57,...,64,73,57,57,53,70,70,60,68,66
99,66,55,46,46,46,42,35,35,40,45,...,62,59,61,50,60,62,62,50,45,43
117,41,32,35,31,33,25,36,28,33,29,...,62,61,47,54,57,59,61,43,39,38
135,48,53,60,60,52,47,45,50,43,40,...,35,36,36,32,33,33,34,21,15,13
153,18,15,9,9,14,15,20,16,23,16,...,20,20,20,18,18,22,15,11,1,1
171,6,7,7,0,0,1,1,7,7,8,...,23,20,18,15,16,10,6,11,22,28


In [5]:
# Rearrange the PM2.5 table into sliding-window samples: each window uses
# WINDOW consecutive hourly columns as features and the following hour as the
# target value.
WINDOW = 9                             # hours of history per sample
n_windows = pm2_5.shape[1] - WINDOW    # 15 windows for a 24-column table

tempxlist = []
tempylist = []
for i in range(n_windows):
    # Copy the slice so relabelling its columns never touches pm2_5 itself.
    tempx = pm2_5.iloc[:, i:i + WINDOW].copy()
    # Relabel to 0..WINDOW-1 so windows taken at different offsets line up
    # under the same column labels when they are concatenated.
    tempx.columns = np.arange(WINDOW)
    # The target is a single column (a Series). A Series has no `.columns`,
    # so no relabelling is needed; only its values are used downstream.
    tempy = pm2_5.iloc[:, i + WINDOW]
    tempxlist.append(tempx)
    tempylist.append(tempy)

In [6]:
# The stacked windows give 3600 samples in total: the first 3200 rows form the
# training split and the remaining 400 are kept for validation.

xdata = pd.concat(tempxlist, axis=0)
xall = np.array(xdata, dtype=float)
x = xall[:3200]

In [7]:
# Sanity check: (rows, features) of the training inputs before the bias column is added.
x.shape

(3200, 9)

In [8]:
# Stack the per-window targets in the same order as the features, convert to
# a float array, and take the first 3200 values as the training targets.
# (A stray `ydata.head()` was dropped: it sat mid-cell, so its value was
# never displayed and it had no effect.)
ydata = pd.concat(tempylist)
yall = np.array(ydata, float)
y = yall[:3200]

In [9]:
# Sanity check: the training targets form a 1-D array with one value per training row.
y.shape

(3200,)

In [10]:
# Number of training rows; the bias column built next uses this length.
x.shape[0]

3200

In [11]:
# Column of ones with one entry per training row; once prepended to x, the
# first weight acts as the bias (intercept) term.
bdata = np.ones((len(x), 1))

In [12]:
# Prepend the bias column to the training features.
# The features are taken from xall rather than from x itself, so re-running
# this cell always yields the same array instead of stacking one more column
# of ones onto an already-augmented x.
x = np.concatenate((bdata, xall[:bdata.shape[0], :]), axis=1)
x

array([[ 1., 26., 39., ..., 25., 20., 19.],
       [ 1., 21., 23., ..., 13., 13., 11.],
       [ 1., 19., 25., ..., 15.,  8.,  4.],
       ...,
       [ 1., 24., 23., ..., 19., 23., 21.],
       [ 1., 44., 40., ..., 23., 21., 14.],
       [ 1., 24., 19., ..., 47., 52., 40.]])

In [13]:
# First training sample: the leading 1 is the bias input, followed by the nine hourly readings.
x[0]

array([ 1., 26., 39., 36., 35., 31., 28., 25., 20., 19.])

In [14]:
# One parameter per column of x (bias column included), all starting at 1.
_l = x.shape[1]
weights = np.ones(_l)

In [15]:
# Display the current parameter vector.
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [97]:
# Training settings for the Adagrad loop.
s_grad = np.zeros(_l)   # running sum of squared gradients, one slot per weight
iteration = 1000        # number of full-batch update steps
learn_rate = 1          # base step size before the per-weight scaling

In [98]:
# Display the squared-gradient accumulator.
s_grad

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [99]:
# Full-batch linear regression trained with Adagrad.
# The parameters and the squared-gradient accumulator are re-initialised here
# so that re-running this cell reproduces the same fit, rather than continuing
# from whatever state an earlier run left in `weights` / `s_grad`.
weights = np.ones(_l)
s_grad = np.zeros(_l)
for i in range(iteration):
    y_prime = np.dot(x, weights)                 # current predictions
    loss = y - y_prime                           # residuals: target - prediction
    grad = np.dot(x.transpose(), loss) * (-2)    # gradient of the summed squared error
    s_grad += grad ** 2
    ada = np.sqrt(s_grad)
    # Per-weight step scaled by the root of the accumulated squared gradient.
    # Where the accumulator is still zero (that gradient has been exactly zero
    # so far) take no step, instead of evaluating 0/0 and poisoning the
    # weights with NaN.
    step = np.divide(learn_rate * grad, ada, out=np.zeros_like(grad), where=ada > 0)
    weights = weights - step

In [100]:
# Fitted parameters after training: weights[0] multiplies the all-ones bias
# column, weights[1:] multiply the nine hourly PM2.5 inputs.
weights

array([ 2.68622732, -0.14495618,  0.27277639,  0.04154967, -0.41709287,
        0.36816551,  0.2392043 , -0.66814801,  0.27622683,  0.94384464])

In [101]:
# Validation inputs: every row of xall after the first 3200 (the training split).
x_val = xall[3200:]
x_val.shape

(400, 9)

In [102]:
# Validation targets: the values of yall that follow the 3200 training targets.
y_val = yall[3200:]
y_val.shape

(400,)

In [103]:
# Prepare the validation inputs: prepend a column of ones so the bias weight
# applies, exactly as was done for the training inputs.
# The rows are rebuilt from xall rather than from x_val itself, so re-running
# this cell cannot stack a second bias column onto an already-augmented x_val.
val_features = xall[3200:, :]
x_val = np.concatenate((np.ones((val_features.shape[0], 1)), val_features), axis=1)
x_val

array([[ 1., 41., 36., ..., 10., 13.,  8.],
       [ 1., 28., 31., ...,  4.,  6., 11.],
       [ 1., 60., 57., ..., 23., 28., 20.],
       ...,
       [ 1., 31., 31., ..., 39., 36., 24.],
       [ 1., 21., 21., ..., 11.,  0.,  7.],
       [ 1.,  1.,  7., ..., 28., 17., 24.]])

In [104]:
# Sanity check: validation inputs now carry the extra bias column.
x_val.shape

(400, 10)

In [105]:
# Display the parameter vector that is about to be applied to the validation set.
weights

array([ 2.68622732, -0.14495618,  0.27277639,  0.04154967, -0.41709287,
        0.36816551,  0.2392043 , -0.66814801,  0.27622683,  0.94384464])

In [106]:
# Score the fitted weights on the held-out rows.
y_prime = np.dot(x_val, weights)
loss_val = y_val - y_prime            # signed residuals: target - prediction
# Root-mean-square error: the actual size of the validation error. Inspect
# rmse_val to judge fit quality.
rmse_val = np.sqrt(np.mean(loss_val ** 2))
# Mean of the signed residuals. Positive and negative errors cancel here, so
# this measures systematic over/under-prediction (bias), not a loss.
np.average(loss_val)

-2.085980970718801

In [107]:
# Prediction phase: load the test inputs.
# NOTE(review): the file is read with pandas' default header handling, so its
# first record is consumed as the column labels (the displayed frame is headed
# "id_0, AMB_TEMP, 15, 14, ..."). Code that selects from this frame has to
# account for that.

testdata = pd.read_csv(filepath_or_buffer="given/test_X.csv")

In [108]:
# Display the raw test table as loaded.
testdata

Unnamed: 0,id_0,AMB_TEMP,15,14,14.1,13,13.1,13.2,13.3,13.4,12
0,id_0,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
1,id_0,CO,0.36,0.35,0.34,0.33,0.33,0.34,0.34,0.37,0.42
2,id_0,NMHC,0.11,0.09,0.09,0.1,0.1,0.1,0.1,0.11,0.12
3,id_0,NO,0.6,0.4,0.3,0.3,0.3,0.7,0.8,0.8,0.9
4,id_0,NO2,9.3,7.1,6.1,5.7,5.5,5.3,5.5,7.1,7.5
5,id_0,NOx,9.9,7.5,6.4,5.9,5.8,6,6.2,7.8,8.4
6,id_0,O3,36,44,45,44,44,44,43,40,38
7,id_0,PM10,51,51,31,40,34,51,42,36,30
8,id_0,PM2.5,27,13,24,29,41,30,29,27,28
9,id_0,RAINFALL,NR,NR,NR,NR,NR,NR,NR,NR,NR


In [109]:
# Keep only the PM2.5 rows of the test table and drop the two leading
# identifier columns (sample id, item name), leaving the hourly readings.
# Columns are addressed by position: the item-name column only carries the
# label 'AMB_TEMP' because the file's first record was taken as the header,
# so selecting by that label would break as soon as the file is read differently.
pm2_5_test = testdata[testdata.iloc[:, 1] == 'PM2.5'].iloc[:, 2:]

In [110]:
# Display the PM2.5-only test rows (readings only, identifier columns removed).
pm2_5_test

Unnamed: 0,15,14,14.1,13,13.1,13.2,13.3,13.4,12
8,27,13,24,29,41,30,29,27,28
26,46,47,57,78,84,76,59,61,61
44,10,10,25,34,40,39,36,25,22
62,71,58,51,41,41,46,43,34,29
80,13,23,18,10,5,5,13,9,12
98,25,32,40,40,31,27,13,24,29
116,37,42,65,70,71,58,51,41,41
134,17,22,17,9,17,23,29,17,18
152,78,84,76,59,61,61,60,49,49
170,13,13,18,20,14,9,16,22,35


In [111]:
# Convert the PM2.5 test rows into a float matrix for the dot product below.
x_test = pm2_5_test.values.astype(float)
x_test

array([[27., 13., 24., ..., 29., 27., 28.],
       [46., 47., 57., ..., 59., 61., 61.],
       [10., 10., 25., ..., 36., 25., 22.],
       ...,
       [16., 25., 12., ..., 11.,  3.,  0.],
       [29., 27., 23., ..., 35., 35., 26.],
       [12.,  8.,  7., ...,  0.,  0.,  0.]])

In [112]:
# Prepend a column of ones (the bias input) so the test matrix has the same
# column layout as the training matrix the weights were fitted on.
x_test_b = np.concatenate([np.ones((len(x_test), 1)), x_test], axis=1)

In [113]:
# Predicted value for every test row: bias-augmented inputs times the fitted weights.
y_star = x_test_b.dot(weights)

In [114]:
# Display the predictions, one per test row.
y_star

array([ 28.00039938,  62.78287746,  18.49452608,  27.36462088,
        11.81572146,  35.95610067,  38.24466707,  17.24484132,
        49.58633767,  32.48226722,  39.22867723,  57.19203142,
        41.42474742,  42.20673324,  28.80451788,  17.94480434,
        51.91444439,  24.57001779,  19.02488678,  21.09480872,
        14.57449231,  28.74665307,  39.98550723,  11.03079939,
        59.54381152,  43.62901915,  27.75784752,  10.45140076,
        26.64744822,  36.7339824 ,  19.66740221,  14.0732773 ,
        17.76135038,   8.21168193,  18.75263395,  38.20966888,
        19.58010132,  29.79158501,   6.30298027,  59.81345865,
        46.09651896,  18.18125789,  52.46643589,  19.24415995,
        38.92626957,  50.32288092,  62.33148933,  41.15117058,
        49.35550359,  44.48790036,  31.1363945 ,  26.1192273 ,
        45.80786714,  26.70171194,  37.94838551,  37.83939452,
        25.14927762,  24.87964231,  32.82590635,  31.73579824,
        16.24500064,  23.56666157,  33.09610443,  54.92

In [115]:
# Load the sample submission as a template; its value column is overwritten
# with the predictions before the result is written out.
y_pre = pd.read_csv(filepath_or_buffer='given/sampleSubmission.csv')

In [116]:
# Overwrite the template's 'value' column with the predictions.
# Bracket assignment is used deliberately: attribute-style assignment
# (y_pre.value = ...) only updates a column that already exists and otherwise
# just sets a plain Python attribute, silently leaving the frame unchanged.
y_pre['value'] = y_star
y_pre

Unnamed: 0,id,value
0,id_0,28.000399
1,id_1,62.782877
2,id_2,18.494526
3,id_3,27.364621
4,id_4,11.815721
5,id_5,35.956101
6,id_6,38.244667
7,id_7,17.244841
8,id_8,49.586338
9,id_9,32.482267


In [117]:
# Write the submission with the same columns as the template that was loaded.
# index=False stops pandas from emitting the row index as an extra unnamed
# leading column.
y_pre.to_csv('submission.csv', index=False)