In [1]:
import pandas as pd
import numpy as np

# Load columns 2..26 of the raw training CSV.
df = pd.read_csv(
    "train_updated.csv",
    encoding="utf-8",
    usecols=range(2, 27),
)
# train_data: the complete table (all observations), with every 'NR' entry replaced
# by 0.0 so the values can be cast to float later on.
train_data = df.replace(to_replace=['NR'], value=[0.0])

In [3]:
# pm2_5: subset of the complete table holding only the rows whose 測項
# (measurement item) is 'PM2.5'; the first column is dropped afterwards.
is_pm2_5 = train_data['測項'] == 'PM2.5'
pm2_5 = train_data[is_pm2_5].iloc[:, 1:]

In [5]:
# Rearrange pm2_5 into sliding windows: every sample uses `window` consecutive
# columns as features and the column immediately after them as the target.
window = 9
# One window per start position that still leaves a target column to its right.
n_windows = pm2_5.shape[1] - window
tempxlist = []
tempylist = []
for i in range(n_windows):
    tempx = pm2_5.iloc[:, i:i + window].copy()
    # Give every window the same column labels so pd.concat stacks them row-wise
    # instead of aligning them on their original column names.
    tempx.columns = np.arange(window)
    # The target is a single column, i.e. a Series, which has no column labels to reset.
    tempy = pm2_5.iloc[:, i + window]
    tempxlist.append(tempx)
    tempylist.append(tempy)

In [6]:
# Stack all feature windows vertically, then convert the result to a float array.
xdata = pd.concat(tempxlist, axis=0)
x = np.array(xdata, dtype=float)
x

array([[26., 39., 36., ..., 25., 20., 19.],
       [21., 23., 30., ..., 13., 13., 11.],
       [19., 25., 27., ..., 15.,  8.,  4.],
       ...,
       [31., 31., 36., ..., 39., 36., 24.],
       [21., 21., 19., ..., 11.,  0.,  7.],
       [ 1.,  7., 24., ..., 28., 17., 24.]])

In [7]:
# sanity check: (number of samples, number of feature columns)
x.shape

(3600, 9)

In [8]:
# Stack the targets of all windows (appended in the same order as the feature
# windows), then convert them to a float vector.
ydata = pd.concat(tempylist, axis=0)
y = np.array(ydata, dtype=float)
y

array([30., 22.,  9., ..., 23.,  7., 29.])

In [9]:
# sanity check: one target value per sample
y.shape

(3600,)

In [10]:
# number of rows (samples) in x
x.shape[0]

3600

In [11]:
# Column of ones with one row per sample. It is a constant input feature, not an
# initial value of the bias: the weight that multiplies it plays the role of the bias.
bdata = np.ones((x.shape[0],1))  # bias feature column (all ones)

In [12]:
# Prepend the column of ones to the feature matrix.
# The matrix is rebuilt from xdata instead of from x itself, so re-running this
# cell always yields the same result rather than adding one more ones column each time.
x = np.concatenate((bdata, np.array(xdata, float)), axis=1)
x

array([[ 1., 26., 39., ..., 25., 20., 19.],
       [ 1., 21., 23., ..., 13., 13., 11.],
       [ 1., 19., 25., ..., 15.,  8.,  4.],
       ...,
       [ 1., 31., 31., ..., 39., 36., 24.],
       [ 1., 21., 21., ..., 11.,  0.,  7.],
       [ 1.,  1.,  7., ..., 28., 17., 24.]])

In [14]:
# One weight per column of x (the ones column included), all starting at zero.
_l = x.shape[1]
weights = np.zeros(_l)

In [24]:
# Settings for the training loop.
# learn_rate: base step size; the loop divides each step by sqrt(s_grad).
learn_rate = 1
# iteration: number of update steps the loop performs.
iteration = 10000
# s_grad: running sum of squared gradients, one entry per weight.
# NOTE(review): weights is initialised in a different cell, so re-running this cell
# plus the training loop continues from the current weights with a fresh s_grad.
s_grad = np.zeros(_l)

In [26]:
# Gradient descent on the summed squared error, with each weight's step divided by
# the square root of its accumulated squared gradients (Adagrad-style scaling).
for _step in range(iteration):
    y_prime = np.dot(x, weights)               # predictions for the current weights
    loss = y - y_prime                         # residuals
    grad = np.dot(x.transpose(), loss) * (-2)  # gradient of sum(loss**2) w.r.t. weights
    s_grad += grad**2                          # accumulate squared gradients
    ada = np.sqrt(s_grad)
    # ada is zero only where every gradient so far (this one included) was zero.
    # Take a zero step there instead of evaluating 0/0 = NaN, which would
    # permanently poison that weight.
    step = np.divide(grad, ada, out=np.zeros_like(grad), where=ada > 0)
    weights = weights - learn_rate * step

In [23]:
# Fitted weights: the first entry multiplies the ones column (bias), the rest the features.
# NOTE(review): this cell's execution count (23) is lower than the training cell's (26),
# so the values printed below may come from an earlier run; re-run after training.
weights

array([ 2.14416897e+00,  2.49247404e-04, -3.70737877e-02,  2.04450118e-01,
       -2.25701620e-01, -2.98680931e-02,  4.65538636e-01, -5.57069337e-01,
        2.57164804e-02,  1.07483375e+00])

In [27]:
# The weights trained above are taken as the fitted bias and feature weights.
# Next step: predict on the test set.

# NOTE: read_csv treats the first line of test_X.csv as the header row here, which is
# why the resulting column labels are data values (id_0, AMB_TEMP, 15, ...).
testdata = pd.read_csv("test_X.csv")

In [28]:
# inspect the raw test table
testdata

Unnamed: 0,id_0,AMB_TEMP,15,14,14.1,13,13.1,13.2,13.3,13.4,12
0,id_0,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
1,id_0,CO,0.36,0.35,0.34,0.33,0.33,0.34,0.34,0.37,0.42
2,id_0,NMHC,0.11,0.09,0.09,0.1,0.1,0.1,0.1,0.11,0.12
3,id_0,NO,0.6,0.4,0.3,0.3,0.3,0.7,0.8,0.8,0.9
4,id_0,NO2,9.3,7.1,6.1,5.7,5.5,5.3,5.5,7.1,7.5
5,id_0,NOx,9.9,7.5,6.4,5.9,5.8,6,6.2,7.8,8.4
6,id_0,O3,36,44,45,44,44,44,43,40,38
7,id_0,PM10,51,51,31,40,34,51,42,36,30
8,id_0,PM2.5,27,13,24,29,41,30,29,27,28
9,id_0,RAINFALL,NR,NR,NR,NR,NR,NR,NR,NR,NR


In [29]:
# Keep only the PM2.5 rows, and only the value columns (everything after the first two).
# The measurement name sits in the second column. It is selected by position because
# its label ('AMB_TEMP') is merely the first data row that read_csv used as a header.
measurement = testdata.iloc[:, 1]
pm2_5_test = testdata[measurement == 'PM2.5'].iloc[:, 2:]

In [31]:
# Test features as a float matrix, one row per PM2.5 row of the test table.
x_test = np.array(pm2_5_test, dtype=float)
x_test

array([[27., 13., 24., ..., 29., 27., 28.],
       [46., 47., 57., ..., 59., 61., 61.],
       [10., 10., 25., ..., 36., 25., 22.],
       ...,
       [16., 25., 12., ..., 11.,  3.,  0.],
       [29., 27., 23., ..., 35., 35., 26.],
       [12.,  8.,  7., ...,  0.,  0.,  0.]])

In [32]:
# Prepend a column of ones (not zeros) so the first weight acts as the bias,
# matching the column layout of the training matrix x.
x_test_b = np.concatenate((np.ones((x_test.shape[0],1)),x_test),axis=1)

In [33]:
# Predictions: each test row (ones column included) dotted with the fitted weights.
y_star = x_test_b @ weights

In [34]:
# inspect the predicted values
y_star

array([ 27.41440924,  61.55574843,  20.4980879 ,  29.534512  ,
        10.79764676,  36.20722448,  39.71643873,  16.57174599,
        48.23626866,  33.98383059,  39.25397608,  58.4832285 ,
        37.23346974,  41.66913818,  27.55245239,  13.18884588,
        52.30030581,  23.63327146,  19.76623547,  20.51351383,
        15.66271563,  28.49276635,  38.03473423,  11.0807695 ,
        58.24255325,  44.53166848,  28.87675413,   9.80658048,
        29.69979803,  36.3508985 ,  19.43428986,  12.71482776,
        17.01176121,   8.62672283,  18.82223583,  38.66456477,
        20.08404389,  28.39277523,   6.94421262,  59.6890592 ,
        46.63325543,  17.28590255,  49.70849742,  18.98558787,
        44.15952936,  51.19901953,  61.64641674,  40.29587749,
        51.04031554,  42.08148093,  30.75654682,  25.437231  ,
        41.07788212,  26.36300133,  38.24279478,  39.90050018,
        25.70022586,  22.49550167,  37.45517401,  31.50167434,
        15.99340048,  23.13493177,  31.72291016,  55.05

In [35]:
# Load the sample submission as a template for the output file; its value column
# is overwritten with the predictions in the next cell.
y_pre = pd.read_csv('sampleSubmission.csv')

In [36]:
# Overwrite the template's value column with the predictions.
# Bracket assignment is used on purpose: attribute assignment (y_pre.value = ...)
# only updates a column that already exists and would otherwise silently create
# a plain Python attribute instead of a column.
y_pre['value'] = y_star
y_pre

Unnamed: 0,id,value
0,id_0,27.414409
1,id_1,61.555748
2,id_2,20.498088
3,id_3,29.534512
4,id_4,10.797647
5,id_5,36.207224
6,id_6,39.716439
7,id_7,16.571746
8,id_8,48.236269
9,id_9,33.983831


In [37]:
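# Export is left disabled; uncomment the line below to write the submission file.
# index=False keeps the row index from being written as an extra column
# (the frame displayed above has the columns id and value).
#y_pre.to_csv('submission.csv', index=False)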