In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn
import torch.optim
import torch.nn.functional as F
import random
np.set_printoptions(threshold=1e3, suppress=True, precision=2, linewidth=80)

In [2]:
def reset_seed(s):
    torch.manual_seed(s)
    random.seed(s)
    np.random.seed(s)

In [3]:
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer
    
    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x

In [4]:
class Dataset:
    def __init__(self):
        self.X = None
        self.X_t = None
        self.y_hat = None
        self.df = None
        self.w = None
    def normalize(self):
        mean_x = np.nanmean(self.X, axis=1)
        std_x = np.nanstd(self.X, axis=1)
        std_x[std_x == 0] = 1
        self.X = (self.X - mean_x[:, np.newaxis]) / std_x[:, np.newaxis]
        self.X_t = self.X.T

trainset = Dataset()
trainset.full_df0 = pd.read_csv('train_datas_0.csv', dtype='float', na_values='-',
                                skiprows=[61-2, 86-2])
trainset.full_df1 = pd.read_csv('train_datas_1.csv', dtype='float', na_values='-', 
                                skiprows=np.arange(2162-2, 2208-2))
#.apply(pd.to_numeric, errors='coerce')

testset = Dataset()
testset.full_df = pd.read_csv('test_datas.csv', dtype='float')

In [5]:
exclude_columns = ['O3', 'WD', 'PM10']

In [12]:
def preprocess_training_data(sel_cols=['PM2.5']):
    trainset.df = pd.concat([trainset.full_df0, trainset.full_df1])
    
    # add hour and day
#     n0 = len(trainset.full_df0)
#     n1 = len(trainset.full_df1)
#     n = n0 + n1
#     trainset.df['hour'] = np.tile(np.arange(24), (n+23)//24)[:n]
#     trainset.df['day'] = np.tile(np.repeat(np.arange(365), 24), (n+364)//365)[:n]
#     print(trainset.df)

    # correct dataframe
    trainset.df.dropna(how='all', inplace=True)
    trainset.df.drop(trainset.df.index[(trainset.df == 0).all(axis=1)], inplace=True)
    for cnam in trainset.df.columns:
        # fill NaN with mean
        trainset.df[cnam].fillna(trainset.df[cnam].mean(), inplace=True)

        # replace outlier with mean
        cond = abs(trainset.df[cnam] - trainset.df[cnam].mean()) > 5 * trainset.df[cnam].std()
        trainset.df[cnam][cond] = trainset.df[cnam].mean()

    trainset.df.reindex()

    # extract feature
    c = len(sel_cols)
    d = c * 9 * 2
    n = len(trainset.df) - 9
    X = torch.zeros((n, d))
    y_hat = torch.zeros((n, 1))
    
    for i in range(n):
        X[i, 0:c*9] = torch.tensor(trainset.df.iloc[i:i+9][sel_cols].values.flatten())
        X[i, c*9:c*9*2] = X[i, 0:c*9] ** 2
        y_hat[i] = trainset.df.iloc[i+9]['PM2.5']

    trainset.X = X
    trainset.y_hat = y_hat
    #trainset.normalize()

preprocess_training_data()#trainset.df.columns.difference(exclude_columns))

In [13]:
def train(X, y_hat):
    n, d = X.shape
    print('n,d=',n,d)

    #reset_seed(1383267)
#     linear_module = torch.nn.Linear(d, 1, bias=True)
#     loss_func = torch.nn.MSELoss()
#     optim = torch.optim.SGD(linear_module.parameters(), lr=1e-7)#, betas=(0.99, 0.999))

#     net = Net(n_feature=d, n_hidden=1, n_output=1) #you can use different n_hidden & lr for test
#     print(net)
    net = torch.nn.Linear(d, 1, bias=True)
    optim = torch.optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999))
    loss_func = torch.nn.MSELoss()
#     plt.ion()

    num_iter = 1000

    print('iter,\tloss,\tw')

    for i in range(num_iter):
        y = net(X)
        loss = loss_func(y, y_hat) #+ 1e-5 * torch.norm(net.weight)
        optim.zero_grad()
        loss.backward()
        optim.step()

        if i % 1000 == 0:
            print('{},\t{:.2f},\t{}'.format(i, loss.item(), net.weight.view(d).detach().numpy()))
#             print('{},\t{:.2f}'.format(i, loss.item()))

    _X = np.concatenate((X.view((n,d)).detach().numpy(), np.ones((n,1))), axis=1)
    _y_hat = y_hat.view(n).detach().numpy()
    true_w, residuals, rank, s = np.linalg.lstsq(_X, _y_hat)

    print()
    print('true w and bias\t', true_w)
    print('true loss\t', np.linalg.norm(_X @ true_w - _y_hat) / n)
    print('estimated w\t', net.weight.view(d).detach().numpy())
    print('estimated bias\t', net.bias.view(1).detach().numpy())
    
    testset.net = net
    testset.true_w = true_w

#train(trainset.X, trainset.y_hat)

In [14]:
def validate():
    X = trainset.X
    y_hat = trainset.y_hat
    n, d = X.shape

    validate_sz = 2000
    losses = []
    loss_func = torch.nn.MSELoss()
    for i in range(0, n, validate_sz):
        idx_validate = pd.Series([False] * n)
        idx_validate[i:i+validate_sz] = True

        train(X[~idx_validate], y_hat[~idx_validate])
        y = testset.net(X[idx_validate])
        l = loss_func(y, y_hat[idx_validate])
        losses.append(l.item())
        
        break
    
    vn = validate_sz
    _X = np.concatenate((X[idx_validate].numpy(), np.ones((vn, 1))), axis=1)
    _y_hat = y_hat[idx_validate].numpy()
    true_loss = np.linalg.norm(_X @ testset.true_w - _y_hat) // vn

    print('validate loss with estimated', np.mean(losses))
    print('validate loss with true', true_loss)
    print()

validate()

n,d= 15403 18
iter,	loss,	w
0,	30832.97,	[-0.06  0.07 -0.11  0.2   0.   -0.08  0.22 -0.08  0.21 -0.08  0.12  0.17 -0.08
 -0.17  0.    0.15 -0.11  0.13]





true w and bias	 [ 0.07  0.    0.03 -0.01  0.02  0.07 -0.01  0.21  0.6  -0.    0.   -0.    0.
 -0.   -0.    0.   -0.    0.    0.67]
true loss	 0.04351952073365476
estimated w	 [ 0.06  0.07 -0.15  0.31  0.17  0.    0.2   0.06  0.22 -0.   -0.    0.   -0.
 -0.   -0.   -0.   -0.    0.01]
estimated bias	 [0.02]
validate loss with estimated 98.5306396484375
validate loss with true 22.0



In [15]:
def preprocess_testing_data(sel_cols=['PM2.5']):
    testset.df = testset.full_df.copy()

    c = len(sel_cols)
    d = c * 9 * 2
    n = (len(testset.df) + 8) // 9
    X = torch.zeros((n, d))

    # extract feature
    for i in range(n):
        X[i, 0:c*9] = torch.tensor(testset.df.iloc[i*9:(i+1)*9][sel_cols].values.flatten())
        X[i, c*9:c*9*2] = X[i, 0:c*9] ** 2

    testset.X = X
    #testset.normalize()

preprocess_testing_data()#testset.df.columns.difference(exclude_columns))

In [21]:
def test():
    n, d = testset.X.shape

#     y = testset.net(testset.X)
#     y = torch.round(y).view(n).detach().numpy()
    
    _X = np.concatenate((testset.X.numpy(), np.ones((n, 1))), axis=1)
    y = _X @ testset.true_w
    y = np.rint(y)

    testset.y = y
    print(y)

test()

[13. 17. 20. 35. 24. 40. 55. 63. 65. 56. 43. 36. 57. 50. 40. 47. 28. 30. 21.
 32. 31. 33. 27. 21. 24. 15.  6.  9. 21. 20. 22. 18. 18. 18. 23. 27. 16. 19.
  8.  6. 11. 22. 19. 25. 20. 27. 27. 34. 33. 30. 45. 31. 35. 37. 22. 26. 14.
 13.  6.  4.  4.  2.  5.  8.  9.  7.  4.  8. 21. 21. 26. 29. 24. 23. 52. 26.
 19. 14. 17. 16. 19. 14.  5. 11. 19. 17. 12.  8.  9. 10. 13. 12. 14. 19. 26.
 26. 39. 34. 30. 25. 47. 32. 24. 11.  3. 29. 27. 18. 15.  6.  3. 10. 16. 15.
 13. 19. 23. 22. 23. 16. 18. 23. 25. 30. 22. 27. 22. 23. 39. 54. 37. 13. 14.
  9. 23. 18. 16. 18. 28. 19. 27. 29. 26. 11. 12. 16. 21. 24. 26. 14.  4.  6.
  9. 20. 22. 25. 10. 15. 25. 21. 23. 19. 12. 10.  7. 13. 27. 31. 30. 27. 37.
 18.  4. 14. 15. 19. 22. 20. 29. 27. 22. 27. 26. 38. 28. 26. 21. 27. 31. 33.
 23. 24. 23. 17. 21. 24. 20. 25. 23. 25. 30. 24. 22. 22. 18. 12. 10.  5.  3.
 16. 15. 17. 15. 17. 20. 24. 15. 17. 20. 26. 24. 27. 22. 15. 20. 21. 24. 16.
  4. 20. 14. 17. 26. 24. 24. 20. 28. 38. 26. 28. 32. 19.  8. 14. 15. 27. 33.

In [22]:
pred_df = pd.DataFrame({
    'id': ['id_' + str(i) for i in range(500)],
    'value': testset.y
})
pred_df.to_csv('submission.csv', index=False)