In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from torch.utils.data import TensorDataset, DataLoader
from torchsummary import summary

%matplotlib notebook

In [53]:
# Function to train net
def train(net, train_data, parameters, verbose):
    """
    Train NN on provided data sets.

    Args:
        net: neural network
        train_data: dataset containing training inputs and targets
        parameters: dict of hyperparameters for training with Adam
            - num_epochs: number of epochs [default (1)]
            - lr: learning rate [default (0.001)]
            - betas: coefficients for running averages [default (0.9, 0.999)]
            - eps: epsilon for numerical stability [default (1e-8)]
            - weight_decay: L2 regularization [default (0.0)]
            - step_size: period of learning rate decay [default (50)]
            - gamma: multiplicative factor of learning rate decay [default (0.1)]
            - clip_norm: option to clip gradient norm
            - max_norm: max norm of the gradients [default (5)]
        verbose: dict of printing actions
            - verbose: print real-time training results [default (True)]
            - print_every: interval to print real-time training results [default (1)]
    """

    X_train, y_train = train_data
    num_train = X_train.size(0)
    print('Training examples: ', num_train)
    print('')

    # Loss function and optimization method
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(),
                           lr=parameters.get('lr', 1e-3),
                           betas=parameters.get('betas', (0.9, 0.999)),
                           eps=parameters.get('eps', 1e-8),
                           weight_decay=parameters.get('weight_decay', 0.0),
                          )
    scheduler = optim.lr_scheduler.StepLR(optimizer, 
                                          step_size=parameters.get('step_size', 50),
                                          gamma=parameters.get('gamma', 0.1),
                                         )

    num_epochs = parameters.get('num_epochs', 1)
    train_loss = np.zeros(num_epochs)
    print('Training %s ...' % net.__class__.__name__)

    net.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = net(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        if parameters.get('clip_norm', False):
            nn.utils.clip_grad_norm_(net.parameters(), parameters.get('max_norm', 5))
        optimizer.step()
    
        if verbose.get('verbose', True):
            if (epoch + 1) % verbose.get('print_every', 1) == 0:
                print('Epoch: %d/%d,    lr: %e,    Loss: %.6f' %
                      (epoch + 1, num_epochs, scheduler.get_last_lr()[0], loss / num_train))
        
        train_loss[epoch] = loss / num_train
        scheduler.step()
        
    plotloss(net.__class__.__name__, criterion, train_loss.flatten())
    plt.show()

    print('')

    return net

In [95]:
# Function to train net
def train_RLRP(net, train_data, parameters, verbose):
    """
    Train NN on provided data sets using ReduceLROnPlateau learning rate scheduler.

    Args:
        net: neural network
        train_data: dataset containing training inputs and targets
        parameters: dict of hyperparameters for training with Adam
            - num_epochs: number of epochs [default (1)]
            - lr: learning rate [default (0.001)]
            - betas: coefficients for running averages [default (0.9, 0.999)]
            - eps: epsilon for numerical stability [default (1e-8)]
            - weight_decay: L2 regularization [default (0.0)]
            - factor: multiplicative factor of learning rate decay [default (0.1)] 
            - patience: epochs below threshold before reducing learning rate [default (1000)]
            - threshold: threshold for measuring the new optimum [default (1e-4)]
            - clip_norm: option to clip gradient norm
            - max_norm: max norm of the gradients [default (5)]
        verbose: dict of printing actions
            - verbose: print real-time training results [default (True)]
            - print_every: interval to print real-time training results [default (1)]
    """

    X_train, y_train = train_data
    num_train = X_train.size(0)
    print('Training examples: ', num_train)
    print('')

    # Loss function and optimization method
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(),
                           lr=parameters.get('lr', 1e-3),
                           betas=parameters.get('betas', (0.9, 0.999)),
                           eps=parameters.get('eps', 1e-8),
                           weight_decay=parameters.get('weight_decay', 0.0),
                          )
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                     mode='min',
                                                     factor=parameters.get('factor', 0.1),
                                                     patience=parameters.get('patience', 1000),
                                                     verbose=True,
                                                     threshold=parameters.get('threshold', 1e-4),
                                                     cooldown=0,
                                                    )

    num_epochs = parameters.get('num_epochs', 1)
    train_loss = np.zeros(num_epochs)
    print('Training %s ...' % net.__class__.__name__)

    net.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = net(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        if parameters.get('clip_norm', False):
            nn.utils.clip_grad_norm_(net.parameters(), parameters.get('max_norm', 5))
        optimizer.step()
    
        if verbose.get('verbose', True):
            if (epoch + 1) % verbose.get('print_every', 1) == 0:
                print('Epoch: %d/%d,    Loss: %.6f' %
                      (epoch + 1, num_epochs, loss / num_train))
        
        train_loss[epoch] = loss / num_train
        scheduler.step(loss)
        
    plotloss(net.__class__.__name__, criterion, train_loss.flatten())
    plt.show()

    print('')

    return net

In [34]:
# Function to train net with mini-batches
def batch_train(net, train_data, batch_size, parameters, verbose):
    """
    Train NN on provided data sets.

    Args:
        net: neural network
        train_data: PyTorch Dataset containing training inputs and targets
        batch_size: mini-batch size
        parameters: dict of hyperparameters for training with Adam
            - num_epochs: number of epochs [default (1)]
            - lr: learning rate [default (0.001)]
            - betas: default (0.9, 0.999)
            - eps: epsilon [default (1e-8)]
            - weight_decay: L2 regularization [default (0.0)]
            - step_size: period of learning rate decay [default (50)]
            - gamma: multiplicative factor of learning rate decay [default (0.1)]
            - max_norm: max norm of the gradients [default (5)]
        verbose: dict of printing actions
            - verbose: print real-time training results [default (True)]
            - print_every: interval to print real-time training results [default (1)]
    """

    trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
    num_train = len(trainloader)
    print('Training examples: ', num_train)
    print('')

    # Loss function and optimization method
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(),
                           lr=parameters.get('lr', 1e-3),
                           betas=parameters.get('betas', (0.9, 0.999)),
                           eps=parameters.get('eps', 1e-8),
                           weight_decay=parameters.get('weight_decay', 0.0),
                          )
    scheduler = optim.lr_scheduler.StepLR(optimizer, 
                                          step_size=parameters.get('step_size', 50),
                                          gamma=parameters.get('gamma', 0.1),
                                         )

    num_epochs = parameters.get('num_epochs', 1)
    train_loss = np.zeros(num_epochs)
    print('Training %s ...' % net.__class__.__name__)

    net.train()
    for epoch in range(num_epochs):
        
        running_loss = 0.0
        for i, (X_batch, y_batch) in enumerate(trainloader):
            
            optimizer.zero_grad()
            outputs = net(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), parameters.get('max_norm', 5))
            optimizer.step()
            running_loss += loss.item()
            train_loss[epoch] += loss.item()

        if verbose.get('verbose', True):
            if (epoch + 1) % verbose.get('print_every', 1) == 0:
                print('Epoch: %d/%d,    lr: %e,    Loss: %.6f' %
                      (epoch + 1, num_epochs, scheduler.get_last_lr()[0], 
                       running_loss / num_train))
        
        train_loss[epoch] /= num_train
        scheduler.step()
        
    plotloss(net.__class__.__name__, criterion, train_loss.flatten())
    plt.show()

    print('')

    return net

In [109]:
# Plot training loss and, optionally, validation loss
def plotloss(title, criterion, train_loss, val_loss=None):
    num_epochs = len(train_loss)
    plt.figure(figsize=(6, 4))
    plt.plot(np.arange(1, num_epochs + 1), train_loss, label='Train')
    if val_loss is not None:
        plt.plot(np.arange(1, num_epochs + 1), val_loss, label='Validation')
    plt.title(title)
    plt.xlabel('Epochs')
    # plt.xticks(np.arange(1, num_epochs + 1))  # 5 tick marks
    plt.ylabel(criterion.__class__.__name__)
    plt.yscale('log')
    plt.legend()
    plt.tight_layout()

# Each zeolite = line

In [8]:
each_zeolite = pd.read_csv('../data/chemistry-channel-info/each-zeolite-info.csv', 
                                dtype={'zeolite': str})
each_zeolite = each_zeolite[each_zeolite.num_channels != 0]  # remove zeolites with no channels

In [75]:
X_train, X_test, y_train, y_test = train_test_split(each_zeolite[each_zeolite.columns[1:6]].values,
                                                    each_zeolite.kH_C18.values, test_size=0.3,
                                                    random_state=12)

In [11]:
class FourLayerNet(nn.Module):
    def __init__(self):
        super(FourLayerNet, self).__init__()
        self.regressor = nn.Sequential(
            nn.Linear(5, 100),
            nn.LeakyReLU(),
            nn.Linear(100, 100),
            nn.LeakyReLU(),
            nn.Linear(100, 100),
            nn.LeakyReLU(),
            nn.Linear(100, 1),
        )

    def forward(self, x):
        x = self.regressor(x).view(-1)
        return x

In [12]:
summary(FourLayerNet(), (1, 5))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 100]             600
         LeakyReLU-2               [-1, 1, 100]               0
            Linear-3               [-1, 1, 100]          10,100
         LeakyReLU-4               [-1, 1, 100]               0
            Linear-5               [-1, 1, 100]          10,100
         LeakyReLU-6               [-1, 1, 100]               0
            Linear-7                 [-1, 1, 1]             101
Total params: 20,901
Trainable params: 20,901
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.08
Estimated Total Size (MB): 0.08
----------------------------------------------------------------


In [108]:
# 10 examples
fourlayernet = FourLayerNet()
fourlayernet = train(fourlayernet, 
                     train_data=(
                         torch.Tensor(X_train).float()[:10], 
                         torch.Tensor(y_train).float()[:10],
                     ), 
                     parameters={
                         'num_epochs': 20000,
                         'lr': 5e-4,
                         'weight_decay': 0.0,
                         'step_size': 4000,
                         'gamma': 0.1,
                         'clip_norm': True,
                         'max_norm': 1,
                     },
                     verbose={
                         'print_every': 2000,
                     },
                    )

Training examples:  10

Training FourLayerNet ...
Epoch: 2000/20000,    lr: 5.000000e-04,    Loss: 5316.959961
Epoch: 4000/20000,    lr: 5.000000e-04,    Loss: 1579.348877
Epoch: 6000/20000,    lr: 5.000000e-05,    Loss: 265.486572
Epoch: 8000/20000,    lr: 5.000000e-05,    Loss: 106.744919
Epoch: 10000/20000,    lr: 5.000000e-06,    Loss: 60.852184
Epoch: 12000/20000,    lr: 5.000000e-06,    Loss: 42.891243
Epoch: 14000/20000,    lr: 5.000000e-07,    Loss: 39.204468
Epoch: 16000/20000,    lr: 5.000000e-07,    Loss: 36.775467
Epoch: 18000/20000,    lr: 5.000000e-08,    Loss: 36.460419
Epoch: 20000/20000,    lr: 5.000000e-08,    Loss: 36.185249


<IPython.core.display.Javascript object>




In [51]:
# 50 examples
fourlayernet = FourLayerNet()
fourlayernet = train(fourlayernet, 
                     train_data=(
                         torch.Tensor(X_train).float()[:50], 
                         torch.Tensor(y_train).float()[:50],
                     ), 
                     parameters={
                         'num_epochs': 100000,
                         'lr': 1e-3,
                         'weight_decay': 0.0,
                         'step_size': 40000,
                         'gamma': 0.1,
                         'clip_norm': True,
                         'max_norm': 1,
                     },
                     verbose={
                         'print_every': 10000,
                     },
                    )

Training examples:  50

Training FourLayerNet ...
Epoch: 10000/100000,    lr: 1.000000e-03,    Loss: 58101.921875
Epoch: 20000/100000,    lr: 1.000000e-03,    Loss: 20241.818359
Epoch: 30000/100000,    lr: 1.000000e-03,    Loss: 2890.696777
Epoch: 40000/100000,    lr: 1.000000e-04,    Loss: 705.537354
Epoch: 50000/100000,    lr: 1.000000e-04,    Loss: 75.517471
Epoch: 60000/100000,    lr: 1.000000e-04,    Loss: 13.833266
Epoch: 70000/100000,    lr: 1.000000e-04,    Loss: 16.789000
Epoch: 80000/100000,    lr: 1.000000e-05,    Loss: 57.105881
Epoch: 90000/100000,    lr: 1.000000e-05,    Loss: 3.570388
Epoch: 100000/100000,    lr: 1.000000e-05,    Loss: 3.112560


<IPython.core.display.Javascript object>




In [66]:
# 50 examples [changed lr scheduler -- now ReduceLROnPlateau, previously StepLR]
fourlayernet = FourLayerNet()
fourlayernet = train_RLRP(fourlayernet, 
                          train_data=(
                              torch.Tensor(X_train).float()[:50], 
                              torch.Tensor(y_train).float()[:50],
                          ), 
                          parameters={
                              'num_epochs': 100000,
                              'lr': 1e-3,
                              'weight_decay': 0.0,
                              'factor': 0.25,
                              'patience': 5000,
                              'threshold': 1e-2,
                              'clip_norm': True,
                              'max_norm': 1,
                          },
                          verbose={
                              'print_every': 10000,
                          },
                         )

Training examples:  50

Training FourLayerNet ...
Epoch: 10000/100000,    Loss: 53768.933594
Epoch: 20000/100000,    Loss: 22622.490234
Epoch: 30000/100000,    Loss: 8356.929688
Epoch: 40000/100000,    Loss: 1220.954712
Epoch 44423: reducing learning rate of group 0 to 2.5000e-04.
Epoch: 50000/100000,    Loss: 64.653763
Epoch 50424: reducing learning rate of group 0 to 6.2500e-05.
Epoch: 60000/100000,    Loss: 15.234971
Epoch: 70000/100000,    Loss: 13.381964
Epoch: 80000/100000,    Loss: 12.800307
Epoch 82981: reducing learning rate of group 0 to 1.5625e-05.
Epoch: 90000/100000,    Loss: 9.776922
Epoch: 100000/100000,    Loss: 8.796189


<IPython.core.display.Javascript object>




In [88]:
# 100 examples [changed lr scheduler -- now ReduceLROnPlateau, previously StepLR]
fourlayernet = FourLayerNet()
fourlayernet = train_RLRP(fourlayernet, 
                          train_data=(
                              torch.Tensor(X_train).float()[:100], 
                              torch.Tensor(y_train).float()[:100],
                          ), 
                          parameters={
                              'num_epochs': 100000,
                              'lr': 1e-3,
                              'weight_decay': 0.0,
                              'factor': 0.1,
                              'patience': 5000,
                              'threshold': 1e-4,
                              'clip_norm': True,
                              'max_norm': 1,
                          },
                          verbose={
                              'print_every': 10000,
                          },
                         )

Training examples:  100

Training FourLayerNet ...
Epoch: 10000/100000,    Loss: 809035.187500
Epoch: 20000/100000,    Loss: 507278.875000
Epoch: 30000/100000,    Loss: 278435.843750
Epoch: 40000/100000,    Loss: 126077.773438
Epoch: 50000/100000,    Loss: 120046.867188
Epoch: 60000/100000,    Loss: 44760.281250
Epoch: 70000/100000,    Loss: 9560.480469
Epoch: 80000/100000,    Loss: 1292.342529
Epoch 86451: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 90000/100000,    Loss: 108.661591
Epoch: 100000/100000,    Loss: 82.841766


<IPython.core.display.Javascript object>




In [54]:
# Full set
fourlayernet = FourLayerNet()
fourlayernet = train(fourlayernet, 
                     train_data=(
                         torch.Tensor(X_train).float(), 
                         torch.Tensor(y_train).float(),
                     ), 
                     parameters={
                         'num_epochs': 10000,
                         'lr': 1e-2,
                         'weight_decay': 0.0,
                         'step_size': 4000,
                         'gamma': 0.1,
                         'clip_norm': False,
                         'max_norm': 1,
                     },
                     verbose={
                         'print_every': 1000,
                     },
                    )

Training examples:  156553

Training FourLayerNet ...
Epoch: 1000/10000,    lr: 1.000000e-02,    Loss: 13447170048.000000
Epoch: 2000/10000,    lr: 1.000000e-02,    Loss: 13246898176.000000
Epoch: 3000/10000,    lr: 1.000000e-02,    Loss: 12965279744.000000
Epoch: 4000/10000,    lr: 1.000000e-02,    Loss: 12596765696.000000
Epoch: 5000/10000,    lr: 1.000000e-03,    Loss: 12366592000.000000
Epoch: 6000/10000,    lr: 1.000000e-03,    Loss: 12304583680.000000
Epoch: 7000/10000,    lr: 1.000000e-03,    Loss: 12253996032.000000
Epoch: 8000/10000,    lr: 1.000000e-03,    Loss: 12200573952.000000
Epoch: 9000/10000,    lr: 1.000000e-04,    Loss: 12182346752.000000
Epoch: 10000/10000,    lr: 1.000000e-04,    Loss: 12174338048.000000


<IPython.core.display.Javascript object>




In [97]:
class SixLayerNet(nn.Module):
    def __init__(self):
        super(SixLayerNet, self).__init__()
        self.regressor = nn.Sequential(
            nn.Linear(5, 100),
            nn.LeakyReLU(),
            nn.Linear(100, 200),
            nn.LeakyReLU(),
            nn.Linear(200, 200),
            nn.LeakyReLU(),
            nn.Linear(200, 200),
            nn.LeakyReLU(),
            nn.Linear(200, 100),
            nn.LeakyReLU(),
            nn.Linear(100, 1),
        )

    def forward(self, x):
        x = self.regressor(x).view(-1)
        return x

In [110]:
summary(SixLayerNet(), (1, 5))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 100]             600
         LeakyReLU-2               [-1, 1, 100]               0
            Linear-3               [-1, 1, 200]          20,200
         LeakyReLU-4               [-1, 1, 200]               0
            Linear-5               [-1, 1, 200]          40,200
         LeakyReLU-6               [-1, 1, 200]               0
            Linear-7               [-1, 1, 200]          40,200
         LeakyReLU-8               [-1, 1, 200]               0
            Linear-9               [-1, 1, 100]          20,100
        LeakyReLU-10               [-1, 1, 100]               0
           Linear-11                 [-1, 1, 1]             101
Total params: 121,401
Trainable params: 121,401
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/

In [100]:
sixlayernet = SixLayerNet()
sixlayernet = train(sixlayernet, 
                    train_data=(
                        torch.Tensor(X_train).float()[:100], 
                        torch.Tensor(y_train).float()[:100],
                    ),
                    parameters={
                        'num_epochs': 10000,
                        'lr': 1e-4,
                        'weight_decay': 0.0,
                        'step_size': 7000,
                        'gamma': 0.1,
                        'max_norm': 1,
                    },
                    verbose={
                        'print_every': 1000,
                    },
                   )

Training examples:  100

Training SixLayerNet ...
Epoch: 1000/10000,    lr: 1.000000e-04,    Loss: 31884824.000000
Epoch: 2000/10000,    lr: 1.000000e-04,    Loss: 28663472.000000
Epoch: 3000/10000,    lr: 1.000000e-04,    Loss: 26428340.000000
Epoch: 4000/10000,    lr: 1.000000e-04,    Loss: 22836718.000000
Epoch: 5000/10000,    lr: 1.000000e-04,    Loss: 16944094.000000
Epoch: 6000/10000,    lr: 1.000000e-04,    Loss: 10846093.000000
Epoch: 7000/10000,    lr: 1.000000e-05,    Loss: 1637521.625000
Epoch: 8000/10000,    lr: 1.000000e-05,    Loss: 912203.625000
Epoch: 9000/10000,    lr: 1.000000e-05,    Loss: 643998.562500
Epoch: 10000/10000,    lr: 1.000000e-05,    Loss: 567641.250000


<IPython.core.display.Javascript object>




In [248]:
plt.figure(figsize=(8, 9))
sns.set()
for i, layer in enumerate([0, 2, 4, 6, 8, 10]):
    plt.subplot(3, 2, i + 1)
    sns.distplot(sixlayernet.regressor[layer].weight.detach().numpy(), 
                 bins=50, hist=True, kde=True)
    plt.title('Layer #%i' % layer)
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [30]:
class SmallNet(nn.Module):
    def __init__(self):
        super(SmallNet, self).__init__()
        self.regressor = nn.Sequential(
            nn.Linear(5, 50),
            nn.LeakyReLU(),
            nn.Linear(50, 50),
            nn.LeakyReLU(),
            nn.Linear(50, 1),
        )

    def forward(self, x):
        x = self.regressor(x).view(-1)
        return x

In [None]:
summary(FourLayerNet(), (1, 5))

In [45]:
smallnet = SmallNet()
smallnet = train(smallnet, 
                 train_data=(
                     torch.Tensor(X_train).float()[:10], 
                     torch.Tensor(y_train).float()[:10]
                 ), 
                 parameters={
                     'num_epochs': 10000,
                     'lr': 1e-5,
                     'weight_decay': 0.0,
                     'step_size': 1000,
                     'gamma': 1,
                     'max_norm': 1,
                 },
                 verbose={
                     'verbose': True,
                     'print_every': 1000,
                 },
                )

Training examples:  10

Training SmallNet ...
Epoch: 1000/10000,    lr: 1.000000e-05,    Loss: 2661890.500000
Epoch: 2000/10000,    lr: 1.000000e-05,    Loss: 2660977.750000
Epoch: 3000/10000,    lr: 1.000000e-05,    Loss: 2659708.500000
Epoch: 4000/10000,    lr: 1.000000e-05,    Loss: 2658112.000000
Epoch: 5000/10000,    lr: 1.000000e-05,    Loss: 2656128.500000
Epoch: 6000/10000,    lr: 1.000000e-05,    Loss: 2653728.500000
Epoch: 7000/10000,    lr: 1.000000e-05,    Loss: 2650859.500000
Epoch: 8000/10000,    lr: 1.000000e-05,    Loss: 2647478.000000
Epoch: 9000/10000,    lr: 1.000000e-05,    Loss: 2643546.250000
Epoch: 10000/10000,    lr: 1.000000e-05,    Loss: 2639030.750000


<IPython.core.display.Javascript object>




In [27]:
fourlayernet = FourLayerNet()
fourlayernet = batch_train(fourlayernet, 
                           train_data=(
                               TensorDataset(
                                   torch.Tensor(X_train).float(), 
                                   torch.Tensor(y_train).float(),
                               )
                           ),
                           batch_size=300,
                           parameters={
                               'num_epochs': 100,
                               'lr': 1e-4,
                               'weight_decay': 0.0,
                               'step_size': 200,
                               'gamma': 0.5,
                               'max_norm': 5,
                           },
                           verbose={
                               'print_every': 10,
                           },
                          )

Training examples:  884

Training FourLayerNet ...
Epoch: 10/100,    lr: 1.000000e-04,    Loss: 1251214743261269.750000
Epoch: 20/100,    lr: 1.000000e-04,    Loss: 1251283724627433.750000
Epoch: 30/100,    lr: 1.000000e-04,    Loss: 1251280688458464.250000
Epoch: 40/100,    lr: 1.000000e-04,    Loss: 1251278916986009.500000
Epoch: 50/100,    lr: 1.000000e-04,    Loss: 1251276932857487.500000
Epoch: 60/100,    lr: 1.000000e-04,    Loss: 1251275152042135.500000
Epoch: 70/100,    lr: 1.000000e-04,    Loss: 1251272172626947.500000
Epoch: 80/100,    lr: 1.000000e-04,    Loss: 1251271025117072.500000
Epoch: 90/100,    lr: 1.000000e-04,    Loss: 1251269501395098.750000
Epoch: 100/100,    lr: 1.000000e-04,    Loss: 1251269128726739.750000


<IPython.core.display.Javascript object>




In [None]:
gbr = GradientBoostingRegressor(loss='ls', n_estimators=5000, learning_rate=0.1, 
                                verbose=1, random_state=12)

In [None]:
gbr.fit(X_train, y_train)
print('Test MSE: %e' % metrics.mean_squared_error(y_test, gbr.predict(X_test)))

In [None]:
feature_importance = gbr.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(8, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(each_zeolite.columns[1:6])[sorted_idx])
plt.title('Feature Importance (MDI)')
plt.tight_layout()
plt.show()

### 2.2) Extract extra info from IZA zeolites

In [215]:
# View psd-histo text file
GON_0_path = '../data/zeolite-geometry/IZA/psd-histo/GON-0.psd_histo'
f = open(GON_0_path, 'r')
print(f.read())
f.close()

Pore size distribution histogram
Bin size (A): 0.1
Number of bins: 1000
From: 0
To: 100
Total samples: 500000
Accessible samples: 36748
Fraction of sample points in node spheres: 0.073496
Fraction of sample points outside node spheres: 0

Bin Count Cumulative_dist Derivative_dist
0 0 1 0
0.1 0 1 0
0.2 0 1 0
0.3 0 1 0
0.4 0 1 0
0.5 0 1 0
0.6 0 1 0
0.7 0 1 0
0.8 0 1 0
0.9 0 1 0
1 0 1 0
1.1 0 1 0
1.2 0 1 0
1.3 0 1 0
1.4 0 1 0
1.5 0 1 0
1.6 0 1 0
1.7 0 1 0
1.8 0 1 0
1.9 0 1 0
2 0 1 0
2.1 0 1 0
2.2 0 1 0
2.3 0 1 0
2.4 0 1 0
2.5 0 1 0
2.6 0 1 0
2.7 0 1 0
2.8 0 1 0
2.9 0 1 0
3 0 1 0
3.1 0 1 0
3.2 0 1 0
3.3 0 1 0
3.4 0 1 0
3.5 0 1 0
3.6 0 1 0
3.7 0 1 0
3.8 0 1 0
3.9 0 1 0
4 0 1 0
4.1 0 1 0
4.2 0 1 0
4.3 0 1 0
4.4 0 1 0
4.5 0 1 0
4.6 0 1 0
4.7 0 1 0
4.8 0 1 0
4.9 0 1 0
5 0 1 0
5.1 0 1 0
5.2 0 1 0
5.3 96 1 0.0130619
5.4 0 0.997388 0.0130619
5.5 407 0.997388 0.0553772
5.6 58 0.986312 0.0632687
5.7 1537 0.984734 0.217019
5.8 0 0.942908 0.209127
5.9 0 0.942908 0
6 1530 0.942908 0.208175
6.1 33120 0

In [216]:
data = np.loadtxt(GON_0_path, skiprows=11)
data.shape

(1000, 4)

In [217]:
# number of total samples
f = open(GON_0_path, 'r')
for i, line in enumerate(f):
    if i == 5:
        num_tot = int(line.split(' ')[-1])
        break
num_tot

500000

In [218]:
# number of accessible samples
f = open(GON_0_path, 'r')
for i, line in enumerate(f):
    if i == 6:
        num_acc = int(line.split(' ')[-1])
        break
num_acc

36748

In [226]:
# mean
mean = np.dot(data[:, 0].T, data[:, 1]) / num_acc
mean

6.069582018069011

In [235]:
# median
cum_sum = np.cumsum(data[:, 1])
med_idx = (num_acc + 1) / 2

if np.ceil(med_idx) != np.floor(med_idx):
    upper_idx = np.where(cum_sum > np.ceil(med_idx))[0][0]
    lower_idx = np.where(cum_sum > np.floor(med_idx))[0][0]
    if data[upper_idx, 0] != data[lower_idx, 0]:
        median = (data[upper_idx, 0] + data[lower_idx, 0]) / 2
    else:
        median = data[np.where(cum_sum > med_idx)[0][0], 0]
else:
    median = data[np.where(cum_sum > med_idx)[0][0], 0]
median

6.1

In [236]:
# 1st quartile
cum_sum = np.cumsum(data[:, 1])
first_idx = (num_acc + 1) / 4

if np.ceil(first_idx) != np.floor(first_idx):
    upper_idx = np.where(cum_sum > np.ceil(first_idx))[0][0]
    lower_idx = np.where(cum_sum > np.floor(first_idx))[0][0]
    if data[upper_idx, 0] != data[lower_idx, 0]:
        quart_1 = (data[upper_idx, 0] + data[lower_idx, 0]) / 2
    else:
        quart_1 = data[np.where(cum_sum > first_idx)[0][0], 0]
else:
    quart_1 = data[np.where(cum_sum > first_idx)[0][0], 0]
quart_1

6.1

In [237]:
# 3rd quartile
cum_sum = np.cumsum(data[:, 1])
third_idx = (num_acc + 1) * 3 / 4

if np.ceil(third_idx) != np.floor(third_idx):
    upper_idx = np.where(cum_sum > np.ceil(third_idx))[0][0]
    lower_idx = np.where(cum_sum > np.floor(third_idx))[0][0]
    if data[upper_idx, 0] != data[lower_idx, 0]:
        quart_3 = (data[upper_idx, 0] + data[lower_idx, 0]) / 2
    else:
        quart_3 = data[np.where(cum_sum > third_idx)[0][0], 0]
else:
    quart_3 = data[np.where(cum_sum > third_idx)[0][0], 0]
quart_3

6.1

In [339]:
plt.figure(figsize=(8, 3))
sns.set()
headers = ['', 'Histogram', 'Cumulative distribution', 'Derivative distribution']
for col in [1, 2, 3]:
    plt.subplot(1, 3, col)
    sns.lineplot(x=data[:, 0], y=data[:, col])
    plt.title(headers[col])
    plt.xlabel('Pore size [Å]')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [341]:
plt.figure(figsize=(8, 3))
sns.set()
plt.subplot(1, 2, 1)
sns.lineplot(x=data[:, 0], y=data[:, 1])
plt.xlim(4, 8)
plt.subplot(1, 2, 2)
sns.lineplot(x=data[:, 0], y=cum_sum, label='cumulative sum')
plt.plot(np.linspace(0, 99.9, 1000), 1000*[med_idx], label='median')
plt.xlim(4, 8)
plt.legend()
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [245]:
# Testing method for extracting PSD information from IZA text files
GON_0_path = '../data/zeolite-geometry/IZA/psd-histo/GON-0.psd_histo'
with open('../data/zeolite-geometry/psd-histo_TEST.csv', mode='w') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['zeolite', 'mean', '1st_quartile', 'median', '3rd_quartile'])
    _zeo = GON_0_path.split('/')[-1]
    zeo = _zeo.split('.')[0]
    f = open(GON_0_path, 'r')
    for i, line in enumerate(f):
        if i == 6:
            num_acc = int(line.split(' ')[-1])
            break
    data = np.loadtxt(GON_0_path, skiprows=11)
    mean = np.dot(data[:, 0].T, data[:, 1]) / num_acc
    cum_sum = np.cumsum(data[:, 1])
    quarts = np.zeros(3)
    for i, quart in enumerate([1, 2, 3]):
        quart_idx = (num_acc + 1) * quart / 4
        if np.ceil(quart_idx) != np.floor(quart_idx):
            upper_idx = np.where(cum_sum > np.ceil(quart_idx))[0][0]
            lower_idx = np.where(cum_sum > np.floor(quart_idx))[0][0]
            if data[upper_idx, 0] != data[lower_idx, 0]:
                quarts[i] = (data[upper_idx, 0] + data[lower_idx, 0]) / 2
            else:
                quarts[i] = data[np.where(cum_sum > quart_idx)[0][0], 0]
        else:
            quarts[i] = data[np.where(cum_sum > quart_idx)[0][0], 0]
    writer.writerow([zeo, mean, quarts[0], quarts[1], quarts[2]])
    csv_file.close()

### Read CSV file for PSD info

In [334]:
df_psd = pd.read_csv('../data/zeolite-geometry/IZA-psd.csv')

In [335]:
# Merge psd with channel info
psd_chan = df_psd.merge(each_zeolite, left_on='zeolite', right_on='zeolite')

In [252]:
class SixLayerNet_V2(nn.Module):
    def __init__(self):
        super(SixLayerNet_V2, self).__init__()
        self.regressor = nn.Sequential(
            nn.Linear(9, 100),
            nn.LeakyReLU(),
            nn.Linear(100, 200),
            nn.LeakyReLU(),
            nn.Linear(200, 200),
            nn.LeakyReLU(),
            nn.Linear(200, 200),
            nn.LeakyReLU(),
            nn.Linear(200, 100),
            nn.LeakyReLU(),
            nn.Linear(100, 1),
        )

    def forward(self, x):
        x = self.regressor(x).view(-1)
        return x

In [255]:
X_train, X_test, y_train, y_test = train_test_split(psd_chan[psd_chan.columns[1:10]].values,
                                                    psd_chan.kH_C18.values, test_size=0.2,
                                                    random_state=12)

In [296]:
sixlayernet_V2 = SixLayerNet_V2()
sixlayernet_V2 = train(sixlayernet_V2, 
                       train_data=(
                           torch.Tensor(X_train).float()[:10], 
                           torch.Tensor(y_train).float()[:10],
                       ),
                       parameters={
                           'num_epochs': 10000,
                           'lr': 1e-4,
                           'weight_decay': 0.0,
                           'step_size': 4000,
                           'gamma': 0.1,
                           'max_norm': 1,
                       },
                       verbose={
                           'print_every': 1000,
                       },
                      )

Training examples:  10

Training SixLayerNet_V2 ...
Epoch: 1000/10000,    lr: 1.000000e-04,    Loss: 792201856.000000
Epoch: 2000/10000,    lr: 1.000000e-04,    Loss: 4996531.000000
Epoch: 3000/10000,    lr: 1.000000e-04,    Loss: 49270.648438
Epoch: 4000/10000,    lr: 1.000000e-05,    Loss: 18430.683594
Epoch: 5000/10000,    lr: 1.000000e-05,    Loss: 15781.818359
Epoch: 6000/10000,    lr: 1.000000e-05,    Loss: 15677.965820
Epoch: 7000/10000,    lr: 1.000000e-05,    Loss: 15145.934570
Epoch: 8000/10000,    lr: 1.000000e-06,    Loss: 15055.648438
Epoch: 9000/10000,    lr: 1.000000e-06,    Loss: 14714.213867
Epoch: 10000/10000,    lr: 1.000000e-06,    Loss: 14587.109375


<IPython.core.display.Javascript object>




In [279]:
gbr = GradientBoostingRegressor(loss='ls', n_estimators=3000, learning_rate=0.1, 
                                verbose=1, random_state=12)

In [280]:
gbr.fit(X_train, y_train)
print('Test MSE: %e' % metrics.mean_squared_error(y_test, gbr.predict(X_test)))

      Iter       Train Loss   Remaining Time 
         1 109071888973.1241            4.14s
         2 93132598189.8615            3.93s
         3 80221772655.4188            3.64s
         4 70333834730.4651            3.34s
         5 61698564559.8884            3.16s
         6 55308782678.8529            3.08s
         7 50133040083.1798            3.04s
         8 45939930980.3551            2.99s
         9 42532327472.5121            2.93s
        10 39772157904.1984            2.89s
        20 14619226643.3497            2.50s
        30  6988150291.0242            2.36s
        40  4223014777.8722            2.28s
        50  2708566900.6766            2.33s
        60  1978989498.2620            2.39s
        70  1529232029.6577            2.35s
        80  1148358194.6162            2.31s
        90   864550530.9993            2.30s
       100   667263035.9156            2.27s
       200    81670441.8180            2.02s
       300    15217552.8227            1.89s
       4

In [317]:
feature_importance = gbr.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(8, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(psd_chan.columns[1:11])[sorted_idx])
plt.title('Feature Importance (MDI)')
plt.tight_layout()
plt.savefig('/Users/samhoover/Downloads/fig_4.png', dpi=300)
plt.show()

<IPython.core.display.Javascript object>

In [290]:
# SVR with rbf kernel with standard scaling
svr_scaled = make_pipeline(StandardScaler(), 
                               SVR(kernel='poly', degree=5))
svr_scaled.fit(X_train, y_train)
print('MSE: %e' % metrics.mean_squared_error(y_test, svr_scaled.predict(X_test)))

MSE: 4.689875e+10


# Going to drop a lot of 0 kH_C18 examples from dataset
I think all the 0 kH_C18 examples are skewing predictions

In [342]:
# original number of entries
num_ex = len(each_zeolite)
num_ex

331574

In [343]:
# total number of 0 kH_C18 entries
num_0 = len(each_zeolite[each_zeolite.kH_C18 == 0])
num_0

231053

In [344]:
# fraction of examples that contain 0 kH_C18
num_0 / num_ex

0.6968369051855694

In [345]:
plt.figure(figsize=(8, 6))
plt.hist(each_zeolite.kH_C18, bins=50, alpha=0.5, edgecolor='black')
plt.xlabel('kH_C18 [mol/kg/MPa]')
plt.ylabel('Frequency')
plt.yscale('log')
plt.show()

<IPython.core.display.Javascript object>

In [None]:
# number of nonzero kH_C18 examples
df_dropped = each_zeolite[each_zeolite.kH_C18 != 0]
len(df_dropped)

In [None]:
df_dropped

In [None]:
plt.figure(figsize=(8, 6))
plt.hist(df_dropped.kH_C18, bins=50, alpha=0.5, edgecolor='black')
plt.xlabel('kH_C18 [mol/kg/MPa]')
plt.ylabel('Frequency')
plt.yscale('log')
plt.show()

In [None]:
# split dataset into train, valid, test
X_train, X_test, y_train, y_test = train_test_split(df_dropped[df_dropped.columns[1:6]].values,
                                                    df_dropped.kH_C18.values, test_size=0.2)

In [None]:
train_data = (torch.Tensor(X_train).float()[:10], torch.Tensor(y_train).float()[:10])

In [None]:
sixlayernet = train(sixlayernet, 
                    train_data=train_data, 
                    parameters={'num_epochs': 5000,
                                'lr': 1e-3,
                               'weight_decay': 0.0
                               },
                    verbose={'print_every': 100},
                   )

In [None]:
X_reduced = PCA(n_components=2, random_state=0).fit_transform(X_train)
X_reduced.shape

In [None]:
fig = plt.figure(figsize=(8, 6))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_train,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
plt.title("First two PCA directions")
plt.xlabel("1st eigenvector")
plt.ylabel("2nd eigenvector")
plt.show()

In [None]:
X_train = torch.Tensor(X_train).float()
y_train = torch.Tensor(y_train).float()
X_test = torch.Tensor(X_test).float()
y_test = torch.Tensor(y_test).float()

In [None]:
net = train(net, train_data=(X_train, y_train), 
            parameters={'num_epochs': 100,
                        'lr': 0.001,
                        'weight_decay': 0.0
                       },
            verbose={'print_every': 10},
            options={'plot_loss': True},
           )

In [None]:
predictions = net(X_test)

In [None]:
plt.figure(figsize=(8, 6))
plt.hist(predictions.detach().numpy(), alpha=0.3, label='predictions')
plt.hist(y_test, alpha=0.3, label='targets')
plt.yscale('log')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
plt.hist(predictions.detach().numpy(), alpha=0.3, label='predictions')
plt.yscale('log')
plt.legend()
plt.show()