## Statistical Learning and Deep Learning HW4

Load dataset.

In [1]:
#load packages
import pickle
from sklearn import preprocessing
%matplotlib inline

# Load data
with open('msd_full.pickle', 'rb') as fh1:
    msd_data = pickle.load(fh1)

doscaling = 1
if doscaling == 1:
    xscaler = preprocessing.StandardScaler().fit(msd_data['X_train'])
    # standardize feature values
    X_train = xscaler.transform(msd_data['X_train'])
    X_test = xscaler.transform(msd_data['X_test'])
else:
    X_train = msd_data['X_train']
    X_test = msd_data['X_test']

Y_train = msd_data['Y_train']
Y_test = msd_data['Y_test'].astype('float32')
X_test = X_test.astype('float32')

y_mean = Y_train.mean()
Y_train_keep = Y_train.copy()
Y_test_keep = Y_test.copy()
Y_train = Y_train - y_mean
Y_test = Y_test - y_mean


# validation is the last 10% of training, subtraining is the first 90% of training
nvalid = int(X_train.shape[0] * 0.1)
nsubtrain = X_train.shape[0] - nvalid

X_subtrain = X_train[0:nsubtrain, :].astype('float32')
X_valid = X_train[nsubtrain:, :].astype('float32')
Y_subtrain = Y_train[0:nsubtrain].astype('float32')
Y_valid = Y_train[nsubtrain:].astype('float32')

Y_subtrain_keep = Y_train_keep[0:nsubtrain].astype('float32')
Y_valid_keep = Y_train_keep[nsubtrain:].astype('float32')

print("X_train shape = ", X_train.shape)
print("X_subtrain shape = ", X_subtrain.shape)
print("X_valid shape = ", X_valid.shape)
print()
print("Y_train shape = ", Y_train.shape)
print("Y_subtrain shape = ", Y_subtrain.shape)
print("Y_valid shape = ", Y_valid.shape)
print()
print("X_test shape = ", X_test.shape)
print("Y_test shape = ", Y_test.shape)

X_train shape =  (463715, 90)
X_subtrain shape =  (417344, 90)
X_valid shape =  (46371, 90)

Y_train shape =  (463715,)
Y_subtrain shape =  (417344,)
Y_valid shape =  (46371,)

X_test shape =  (51630, 90)
Y_test shape =  (51630,)


### Q1. Oridinary Least Square (OLS)

In [10]:
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

ols = sm.OLS(Y_train, X_train)
ols_result = ols.fit()

In [11]:
print(f'The first 5 parameters: {ols_result.params[:5]}')

The first 5 parameters: [ 5.30975265 -2.88088114 -1.53234348  0.05737583 -0.33952889]


In [12]:
# predict
Y_predict = ols_result.predict(X_test)
print(f'The predicted Y is {Y_predict}')

The predicted Y is [-5.81070695  0.03250657  5.13960445 ... -1.39829429 -0.26047668
  0.05193056]


In [13]:
# RMSE
print(f'RMSE = {mean_squared_error(Y_test, Y_predict)}')

RMSE = 90.44315624585404


### Q2. MLP with Four Hidden Layers

In [37]:
import torch
from torch.utils import data
print("torch version:", torch.__version__) 

torch version: 1.7.1rc2


In [38]:
# define dataset
class Dataset(data.Dataset):
    
  def __init__(self, Xnp, Ynp):
        self.labels = Ynp
        self.nobs = Xnp.shape[0]        
        self.Xnp = Xnp
        self.Ynp = Ynp
        
  def __len__(self):
        return self.nobs
    
  def __getitem__(self, index):     
        X = self.Xnp[index]
        y = self.Ynp[index]
        return X, y

In [40]:
# create dataloader
subtrain_set = Dataset(X_subtrain, Y_subtrain)    
valid_set = Dataset(X_valid, Y_valid)
test_set = Dataset(X_test, Y_test)
print('subtrain length', len(subtrain_set))
print('valid length', len(valid_set))
print('test length', len(test_set))

batch_size = 1000
subtrain_loader = data.DataLoader(subtrain_set, batch_size=batch_size)
valid_loader = data.DataLoader(valid_set, batch_size=batch_size)
test_loader = data.DataLoader(test_set, batch_size=batch_size)

subtrain length 417344
valid length 46371
test length 51630


In [51]:
# create MLP model
d_hidden = 45
d_input = subtrain_set.Xnp.shape[1]
d_output = 1
CUDA_VISIBLE_DEVICES = 4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'using {device}')

MLP = torch.nn.Sequential(
    torch.nn.Linear(d_input, d_hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(d_hidden, d_hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(d_hidden, d_hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(d_hidden, d_hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(d_hidden, d_output)
)

using cuda


In [52]:
# optimizer
lr = 0.00001
momentum = 0
weight_decay = 0
optimizer = torch.optim.SGD(MLP.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)

# loss
loss_func = torch.nn.MSELoss(reduction='sum')

In [54]:
# train
max_epoch = 100
max_step = 5000
valid_interval = 100
step_count = 0
best_step_count = 0

for epoch in range(max_epoch):
    for batch, (inputs, targets) in enumerate(subtrain_loader):
        targets = targets.reshape((-1, 1))
        inputs, targets = inputs.to(device), targets.to(device)
        MLP.to(device)
        MLP.train()
        optimizer.zero_grad()
        outputs = MLP(inputs)
        loss = loss_func(outputs, targets)
        loss.backward()
        optimizer.step()
        step_count += 1
    print(f'Epoch {epoch}, Step {step_count}: Loss = {loss.item()}')
        

Epoch 0, Step 418: Loss = 33725.8671875
Epoch 1, Step 836: Loss = 33955.67578125
Epoch 2, Step 1254: Loss = 33308.0703125
Epoch 3, Step 1672: Loss = 32549.05859375
Epoch 4, Step 2090: Loss = 32314.919921875
Epoch 5, Step 2508: Loss = 32432.322265625
Epoch 6, Step 2926: Loss = 32580.04296875
Epoch 7, Step 3344: Loss = 32655.34765625
Epoch 8, Step 3762: Loss = 32587.314453125
Epoch 9, Step 4180: Loss = 32320.896484375
Epoch 10, Step 4598: Loss = 32263.677734375
Epoch 11, Step 5016: Loss = 31648.2421875
Epoch 12, Step 5434: Loss = 31666.12890625
Epoch 13, Step 5852: Loss = 31723.912109375
Epoch 14, Step 6270: Loss = 31790.080078125
Epoch 15, Step 6688: Loss = 31371.58984375
Epoch 16, Step 7106: Loss = 31610.505859375
Epoch 17, Step 7524: Loss = 31531.337890625
Epoch 18, Step 7942: Loss = 31178.9375
Epoch 19, Step 8360: Loss = 31465.39453125
Epoch 20, Step 8778: Loss = 30891.0546875
Epoch 21, Step 9196: Loss = 30806.0625
Epoch 22, Step 9614: Loss = 30433.31640625
Epoch 23, Step 10032: Loss