In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn
import torch.optim
import torch.nn.functional as F
import random
np.set_printoptions(threshold=1e3, suppress=True, precision=2, linewidth=80)

In [2]:
def reset_seed(s):
    torch.manual_seed(s)
    random.seed(s)
    np.random.seed(s)

In [3]:
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer
    
    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x

In [4]:
class Dataset:
    def __init__(self):
        self.X = None
        self.X_t = None
        self.y_hat = None
        self.df = None
        self.w = None
    def normalize(self):
        mean_x = torch.mean(self.X, axis=0)
        std_x = torch.mean(self.X, axis=0)
        std_x[std_x == 0] = 1
        self.X = (self.X - mean_x[None, :]) / std_x[None, :]

trainset = Dataset()
trainset.full_df0 = pd.read_csv('train_datas_0.csv', dtype='float', na_values='-',
                                skiprows=[61-2, 86-2])
trainset.full_df1 = pd.read_csv('train_datas_1.csv', dtype='float', na_values='-', 
                                skiprows=np.arange(2162-2, 2208-2))
#.apply(pd.to_numeric, errors='coerce')

testset = Dataset()
testset.full_df = pd.read_csv('test_datas.csv', dtype='float')

In [5]:
exclude_columns = []

In [6]:
def preprocess_training_data(sel_cols=['PM2.5']):
    trainset.df = pd.concat([trainset.full_df0, trainset.full_df1])
    
    # add hour and day
#     n0 = len(trainset.full_df0)
#     n1 = len(trainset.full_df1)
#     n = n0 + n1
#     trainset.df['hour'] = np.tile(np.arange(24), (n+23)//24)[:n]
#     trainset.df['day'] = np.tile(np.repeat(np.arange(365), 24), (n+364)//365)[:n]
#     print(trainset.df)

    print('correcting dataframe')
    trainset.df.dropna(how='all', inplace=True)
    trainset.df.drop(trainset.df.index[(trainset.df == 0).all(axis=1)], inplace=True)
    for cnam in trainset.df.columns:
        # fill NaN with mean
        trainset.df[cnam].fillna(trainset.df[cnam].mean(), inplace=True)

        # replace outlier with mean
        cond = abs(trainset.df[cnam] - trainset.df[cnam].mean()) > 5 * trainset.df[cnam].std()
        trainset.df[cnam][cond] = trainset.df[cnam].mean()

    trainset.df.reindex()

    print('extracting feature')
    b = 9 # how many days before are used as feature
    c = len(sel_cols)
    d = c * b #+ b*c*b*c
    n = len(trainset.df) - b
    X = torch.zeros((n, d))
    y_hat = torch.zeros((n, 1))
    
    for i in range(n):
        if i % 1000 == 0:
            print('i',i)
        X[i, 0:c*b] = torch.tensor(trainset.df.iloc[i:i+b][sel_cols].values.flatten())
#         for j in range(c*b):
#             for k in range(c*b):
#                 X[i, c*b*(j+1) + k] = X[i, j] * X[i, k]
        y_hat[i] = trainset.df.iloc[i+b]['PM2.5']

    trainset.X = X
    trainset.y_hat = y_hat
    #trainset.normalize()

preprocess_training_data(trainset.full_df0.columns.difference(exclude_columns))

correcting dataframe
extracting feature
i 0
i 1000
i 2000
i 3000
i 4000
i 5000
i 6000
i 7000
i 8000
i 9000
i 10000
i 11000
i 12000
i 13000
i 14000
i 15000
i 16000
i 17000


In [11]:
def train(X, y_hat, model_path=None):
    n, d = X.shape
    print('n,d=',n,d)
    
    if model_path:
        print('loading old model')
        net = torch.load(model_path)
        net.eval()
    else:
        net = torch.nn.Sequential(
                torch.nn.Linear(d, 15),
                torch.nn.ReLU(),
                torch.nn.Linear(15, 1))
#         net = torch.nn.Linear(d, 1, bias=True)
#         net = Net(n_feature=d, n_hidden=1, n_output=1) #you can use different n_hidden & lr for test
#         print(net)

#     reset_seed(1383267)
#     linear_module = torch.nn.Linear(d, 1, bias=True)
#     loss_func = torch.nn.MSELoss()
#     optim = torch.optim.SGD(linear_module.parameters(), lr=1e-7)#, betas=(0.99, 0.999))
    optim = torch.optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999))
    loss_func = torch.nn.MSELoss()
#     plt.ion()

    num_iter = 10000

    print('iter,\tloss,\tw')

    for i in range(num_iter+1):
        y = net(X)
        loss = loss_func(y, y_hat) #+ 1e-5 * torch.sum(net.weight ** 2)
        optim.zero_grad()
        loss.backward()
        optim.step()

        if i % 1000 == 0:
#             print('{},\t{:.2f},\t{}'.format(i, loss.item(), net.weight.view(d).detach().numpy()))
            print('{},\t{:.2f}'.format(i, loss.item()))

    _X = np.concatenate((X.view((n,d)).detach().numpy(), np.ones((n,1))), axis=1)
    _y_hat = y_hat.view(n).detach().numpy()
    true_w, residuals, rank, s = np.linalg.lstsq(_X, _y_hat)

#     print()
#     print('true w and bias\t', true_w)
#     print('true loss\t', np.mean((_X @ true_w - _y_hat) ** 2))
#     print('estimated w\t', net.weight.view(d).detach().numpy())
#     print('estimated bias\t', net.bias.view(1).detach().numpy())
    
    testset.net = net
    testset.true_w = true_w

#train(trainset.X, trainset.y_hat)

In [12]:
def validate():
    X = trainset.X
    y_hat = trainset.y_hat
    n, d = X.shape

    validate_sz = 2000
    losses = []
    loss_func = torch.nn.MSELoss()
    for i in range(0, n, validate_sz):
        idx_validate = pd.Series([False] * n)
        idx_validate[i:i+validate_sz] = True

        train(X[~idx_validate], y_hat[~idx_validate], model_path=None)
        y = testset.net(X[idx_validate])
        l = loss_func(y, y_hat[idx_validate])
        losses.append(l.item())
        
        break
    
    vn = validate_sz
    _X = np.concatenate((X[idx_validate].numpy(), np.ones((vn, 1))), axis=1)
    _y_hat = y_hat[idx_validate].view(vn).numpy()
    true_loss = np.mean((_X @ testset.true_w - _y_hat) ** 2)

    print('validate loss with estimated', np.mean(losses))
    print('validate loss with true', true_loss)
    print()

validate()

n,d= 15403 135
iter,	loss,	w
0,	562.92
1000,	26.62
2000,	26.53
3000,	25.56
4000,	24.87
5000,	24.17
6000,	23.78
7000,	23.48
8000,	23.44
9000,	23.23
10000,	23.17




validate loss with estimated 76.8954086303711
validate loss with true 72.56097581491089



In [231]:
torch.save(testset.net, 'model-ord1-all-hidlayer.pt')

In [238]:
def preprocess_testing_data(sel_cols=['PM2.5']):
    testset.df = testset.full_df.copy()

    for cnam in testset.df.columns:
        # fill NaN with mean
        testset.df[cnam].fillna(testset.df[cnam].mean(), inplace=True)

        # replace outlier with mean
        cond = abs(testset.df[cnam] - testset.df[cnam].mean()) > 5 * testset.df[cnam].std()
        testset.df[cnam][cond] = testset.df[cnam].mean()

    # extract feature
    c = len(sel_cols)
    d = c * 9  + 9 * c * 9 * c
    n = (len(testset.df) + 8) // 9
    X = torch.zeros((n, d))

    for i in range(n):
        X[i, 0:c*9] = torch.tensor(testset.df.iloc[i*9:(i+1)*9][sel_cols].values.flatten())
        for j in range(c*9):
            for k in range(c*9):
                X[i, c*9*(j+1) + k] = X[i, j] * X[i, k]
    testset.X = X
    #testset.normalize()

preprocess_testing_data()#testset.full_df.columns.difference(exclude_columns))

In [245]:
def test():
    #train(trainset.X, trainset.y_hat)

    n, d = testset.X.shape

#     y = testset.net(testset.X)
#     y = y.view(n).detach().numpy()
    #y = torch.round(y).view(n).detach().numpy().astype(int)
    
    _X = np.concatenate((testset.X.numpy(), np.ones((n, 1))), axis=1)
    y = _X @ testset.true_w
    y = np.rint(y).astype(int)

    testset.y = y
    print(y)

test()

[14 17 20 34 24 40 62 64 65 57 41 35 58 55 40 46 28 30 21 32 30 33 26 20 24 15
  6 10 21 20 22 19 17 18 23 27 17 19  8  6 11 22 20 24 20 27 27 34 33 30 47 28
 28 38 22 25 23 13  6  4  4  2  5  8 10  7  4  8 22 21 26 29 23 23 46 26 19 13
 17 16 19 14  5 11 18 15 12  8  9 10 13 12 14 19 25 26 39 34 30 25 47 32 24 11
  3 29 27 18 15  6  3 10 15 15 13 19 22 22 23 16 18 23 25 30 22 26 22 23 39 53
 37 14 14  9 24 18 16 18 29 19 28 30 27 10 12 16 21 24 26 14  4  7  9 19 22 25
 10 15 25 21 23 19 12 10  7 12 27 31 28 27 36 19  4 14 15 20 22 20 29 27 21 27
 27 38 29 26 20 27 32 33 22 24 23 17 22 24 20 25 23 25 23 24 22 23 19 12 10  5
  3 16 15 18 16 16 20 24 15 17 20 26 24 27 22 15 20 21 23 17  2 20 14 17 26 24
 24 20 28 37 26 28 32 16  8 14 16 24 33 26 47 42 30 27 40 25 25 31 34 25 27 17
 18 23  5 29 33 22 12 15 15 21 18 24 22 38 34 34 25 22 26 37 32 29 33 35 39 36
 25 37 21 20 20 16 18  9 11 14 11 14 13  9  9  8  9 13 12  9  7  9  7  5  8  4
  9 21 22 14 13 24 23 24 13 17 17 19 11  5  6  7  7 

In [247]:
pred_df = pd.DataFrame({
    'id': ['id_' + str(i) for i in range(500)],
    'value': testset.y
})
pred_df.to_csv('submission.csv', index=False)