# ROSES Unit 7 Machine Learning
## Dr. Zachary Ross

In [None]:
import torch
import numpy as np
import pylab as plt
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 18})

The simplest of supervised learning algorithms are linear models. We'll start by looking at linear classifiers, which are suitable for datasets where the classes are fully separable by a plane, such as in this example:

In [None]:
# Build and plot a toy dataset of three 2-D Gaussian blobs, one per class.
n_samp_class = 100   # number of samples drawn for each class
c = 3                # NOTE(review): unused below; presumably the number of classes
d = 2                # NOTE(review): unused below; presumably the number of feature dimensions
center = 3           # magnitude of the offsets used for the blob means


def _gaussian_blob(mean_x, mean_y, n):
    """Return an (n, 3) array: a column of ones followed by unit-variance x and y samples."""
    return np.column_stack([
        np.ones(n),
        np.random.normal(mean_x, 1, size=n),
        np.random.normal(mean_y, 1, size=n),
    ])


X1 = _gaussian_blob(0, center, n_samp_class)
X2 = _gaussian_blob(-center, -center, n_samp_class)
X3 = _gaussian_blob(center, -center, n_samp_class)
Y1 = np.zeros(X1.shape[0])
Y2 = np.ones(X2.shape[0])
Y3 = np.ones(X3.shape[0])*2

X = np.concatenate([X1, X2, X3])
Y = np.concatenate([Y1, Y2, Y3])

# NOTE(review): idx is the identity ordering, so this reindexing is a no-op;
# presumably a shuffle was intended here -- confirm before changing.
idx = np.arange(Y.shape[0])
X = X[idx,:]
# The np.int alias was removed from NumPy; the builtin int gives the same dtype.
Y = Y[idx].astype(int)

# One scatter call per class; column 0 is the constant column, so plot columns 1 and 2.
for class_id, colour in enumerate(['r', 'k', 'b']):
    members = Y == class_id
    plt.scatter(X[members, 1], X[members, 2], c=colour)
plt.show()

This condition is called linear separability. An example of a dataset that is not linearly separable is:

In [None]:
# Build and plot K concentric rings of points; class j occupies radii in [j+0.2, j+0.6).
N = 100  # points per class
D = 2    # number of coordinates per point
K = 5    # number of classes (rings)
X = np.zeros((N*K,D))
y = np.zeros(N*K)
for j in range(K):
    ix = slice(N*j, N*(j+1))
    r = np.random.uniform(j+0.2, j+0.6, size=N)
    # np.sin / np.cos take radians, so draw the angle over one full turn [0, 2*pi)
    # rather than over 0-360.
    theta = np.random.uniform(0, 2*np.pi, size=N)
    X[ix] = np.c_[r*np.sin(theta), r*np.cos(theta)]
    y[ix] = j

plt.figure(figsize=(8,8))
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, edgecolor='k', cmap=plt.cm.rainbow)
plt.show()

We'll later consider models suitable for these types of datasets. For now, we'll focus on linear classifiers.

First, we load the dataset. It's stored in a Numpy NPZ file.

In [None]:
# Read the feature and label arrays from the NPZ archive. The context manager
# closes the underlying file once both arrays have been read into memory.
with np.load("seismograms.npz") as f:
    X = f['X']  # indexed below as X[trace, time sample, channel]
    Y = f['Y']  # one class label per trace

In machine learning, the inputs, x, are called features, while the values to predict, y, are called targets or labels. Let's inspect the dataset to get a sense for what the different traces look like.

In [None]:
# Plot the three channels of the first few seismograms, coloured by class.
dt = 0.01  # sample spacing in seconds
t = np.arange(X.shape[1]) * dt

# Colour and legend text for each class label.
trace_styles = {
    0: ('k', 'P-wave'),
    1: ('r', 'S-wave'),
    2: ('b', 'Noise'),
}

# Never ask for more traces than the dataset holds.
for i in range(min(6, X.shape[0])):
    # An unexpected label gets its own neutral style instead of silently
    # inheriting the colour and legend of the previous trace.
    color, label = trace_styles.get(int(Y[i]), ('gray', 'Unknown'))

    fig, ax = plt.subplots(3, 1, sharex=True, sharey=True, figsize=(8,6))
    for j in range(3):
        ax[j].plot(t, X[i,:,j], c=color, lw=1, label=label)
    ax[2].set_xlabel("Time (sec)")
    plt.legend()
    plt.tight_layout()
    plt.show()

This dataset is a mixture of 3 signal types: P-waves, S-waves, and noise seismograms. Since we have ground truth labels for all waveforms, we can train a classifier to predict the correct class. The types of models we will consider here (shallow networks) often require you to pre-process the data to engineer good features.

In [None]:
# We are going to use spectral amplitudes for this tutorial, rather than the full time series
X = np.fft.rfft(X, axis=1)
X = np.abs(X)
freq = np.fft.rfftfreq(400, d=0.01)

# We'll preprocess the data by taking the logarithm
X = np.log10(X)

# And then it's common to normalize the data
X -= np.mean(X, axis=0)[None,:,:]
X /= np.std(X, axis=0)[None,:,:]

# Here we are going to restrict ourselves to 20 Hz and less
idx = np.where(freq<=20.0)[0]
X = X[:,idx,1]

num_features = X.shape[1]
print(X.shape)

Now, we need to get this into a form that Pytorch can easily work with. We'll define a Dataset that stores our features and labels.

In [None]:
class WfDataset(torch.utils.data.Dataset):
    """In-memory dataset that pairs each feature array with one class label.

    Args:
        features: NumPy array whose first axis indexes the samples; stored as float32.
        labels: NumPy array with one label per sample; stored as int64.

    Raises:
        ValueError: if `features` and `labels` do not hold the same number of samples.
    """

    def __init__(self, features, labels):
        # Catch a mismatch here rather than as an IndexError in the middle of training.
        if len(features) != len(labels):
            raise ValueError(
                "features and labels must have the same number of samples, "
                "got {} and {}".format(len(features), len(labels))
            )
        self.features = torch.from_numpy(features).float()
        self.labels = torch.from_numpy(labels).long()

    def __getitem__(self, index):
        """Return the `(features, label)` pair stored at `index`."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.features)

Now, we build a waveform dataset and get the number of data samples (seismograms)

In [None]:
# Wrap the feature array and the labels so PyTorch can index them sample by sample
dataset = WfDataset(features=X, labels=Y)
n_samples = len(dataset)
print(n_samples, "seismograms in the dataset")

In supervised learning, a portion of the dataset is typically set aside for independent cross-validation. Here we will use 25% of the seismograms for our validation set

In [None]:
# Size of the held-out set: a quarter of the seismograms, rounded down
n_val = int(0.25 * n_samples)
indices = list(range(n_samples))

# Draw the validation indices without replacement; every index that was not drawn
# is used for training, so the two sets never overlap
validation_idx = np.random.choice(indices, size=n_val, replace=False)
held_out = set(validation_idx)
train_idx = [i for i in indices if i not in held_out]

Pytorch is designed to work easily with DataLoader objects, which automatically produce batches of data that can be iteratively processed during training. Let's create a DataLoader for both the training and validation datasets.

In [None]:
from torch.utils.data.sampler import SubsetRandomSampler

# Each sampler visits only its own subset of indices, in a fresh random order on every pass
train_sampler = SubsetRandomSampler(train_idx)
validation_sampler = SubsetRandomSampler(validation_idx)

# Both loaders wrap the same dataset; the sampler decides which seismograms each one sees.
# `shuffle` stays at its default (False) because ordering is delegated to the sampler.
train_loader = torch.utils.data.DataLoader(dataset, batch_size=32, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(dataset, batch_size=1024, sampler=validation_sampler)

The first model that we will consider is a linear softmax classifier. The model has the form $y = wx + b$, where $w$ is called the weight matrix (dims: 3 x `num_features`), and $b$ is the bias vector (dims: 3x1). Given the input features of a seismogram, it will output a vector $y$ (dims: 3x1) of class scores; applying the softmax function to these scores gives the probabilities of the signal belonging to each respective class.

In [None]:
# In pytorch a model is a class deriving from torch.nn.Module.
class LinearSoftmaxClassifier(torch.nn.Module):
    """Single linear layer that maps a feature vector to one score per class.

    Args:
        num_features: length of each input feature vector.
        num_classes: number of output scores.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()

        # The whole model is one affine map holding the weight matrix w and the bias b
        self.layer1 = torch.nn.Linear(
            in_features=num_features,
            out_features=num_classes,
        )

    def forward(self, x):
        """Evaluate y = wx + b for a batch of feature vectors `x`.

        No softmax is applied here; the raw class scores are returned.
        """
        return self.layer1(x)

Next we instantiate our model

In [None]:
# One output score for each of the three classes (P-wave, S-wave, noise)
model = LinearSoftmaxClassifier(num_features, num_classes=3)

Right now, $w$ and $b$ in our model hold only their random initial values. The goal is to learn optimal parameter values that minimize our prediction error against the ground truth. The process by which these parameters are learned is called 'training'.

Neural networks (which includes our linear softmax classifier) are typically trained with gradient descent algorithms, which take the form:

$\theta^{i+1} = \theta^{i} - \eta \nabla_\theta L$,

where $\theta^i$ represents the full set of parameters at iteration $i$, $\eta$ is the learning rate (step size), and $L$ is an objective function called the loss. To perform gradient descent, we first pass some seismograms through the model and make class predictions for each. Then, $L$ is calculated by measuring the prediction error against the ground truth. Since we want to minimize this prediction error, we calculate the gradient $\nabla_\theta L$ with respect to $\theta$, which is used in the above equation to determine the parameter updates.

In [None]:
# Set up a gradient descent optimizer. lr is the learning rate. We provide the optimizer with a list of all model parameters
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

We also need to define our loss function, $L$. For this problem, we'll use the cross-entropy loss, which is equivalent to the negative log-likelihood of the true classes of a batch of seismograms under our model; minimizing it therefore maximizes the likelihood of predicting the correct classes.

In [None]:
# Cross-entropy between the model's raw class scores and the integer class labels.
# The softmax is applied inside the loss, so the model outputs are passed in unnormalized.
loss = torch.nn.CrossEntropyLoss()

Now we are ready to train our network. We'll perform gradient descent by iterating over the dataset and presenting batches of seismograms to the network

In [None]:
def _run_epoch(loader, model, loss_fn, optimizer=None):
    """Make one pass over `loader`, updating the parameters only if `optimizer` is given.

    Returns:
        `(accuracy, mean_batch_loss)` for the pass, or `(nan, nan)` when the
        loader yields no samples.
    """
    n_correct = 0
    n_seen = 0
    batch_losses = []

    # This iterates over mini-batches of seismograms until the whole loader has been seen
    for x, y_true in loader:

        if optimizer is not None:
            # Need to reset the gradients for each iteration
            optimizer.zero_grad()

        # Forward pass for class predictions
        y_pred = model(x)

        # Calculate prediction error
        L = loss_fn(y_pred, y_true)

        if optimizer is not None:
            # Call autograd, which calculates the gradients of L w.r.t the parameters
            L.backward()

            # Perform parameter update
            optimizer.step()

        # The predicted class is the one with the largest score
        y_pred_labels = torch.argmax(y_pred, dim=1)
        n_correct += (y_pred_labels == y_true).sum().item()
        n_seen += y_true.shape[0]
        batch_losses.append(L.item())

    if n_seen == 0:
        # An empty loader has no meaningful accuracy or loss; avoid dividing by zero
        return float("nan"), float("nan")

    # The loss is the plain average of the per-batch losses (not weighted by batch size)
    return n_correct / n_seen, np.mean(batch_losses)


def train(train_loader, val_loader, model, lr, num_epochs=100, weight_decay=0):
    """Train `model` with SGD on a cross-entropy loss and record the history.

    Args:
        train_loader: iterable of `(x, y_true)` batches used for the parameter updates.
        val_loader: iterable of `(x, y_true)` batches used only for evaluation.
        model: torch module mapping a batch `x` to one score per class.
        lr: learning rate (step size) for SGD.
        num_epochs: number of full passes over `train_loader`.
        weight_decay: weight decay passed to the SGD optimizer.

    Returns:
        `(training_acc, validation_acc, training_loss, validation_loss)`, four
        lists with one entry per epoch.
    """
    training_acc = []
    validation_acc = []

    training_loss = []
    validation_loss = []

    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = torch.nn.CrossEntropyLoss()

    # Remember the caller's train/eval mode so it can be restored afterwards
    was_training = model.training
    try:
        # Each epoch is defined by one full cycle through the dataset
        for epoch in range(num_epochs):

            model.train()
            acc, mean_loss = _run_epoch(train_loader, model, loss_fn, optimizer)
            training_acc.append(acc)
            training_loss.append(mean_loss)

            # Now do this again for the validation dataset. No parameters are updated,
            # so switch to evaluation mode and skip building the autograd graph.
            model.eval()
            with torch.no_grad():
                acc, mean_loss = _run_epoch(val_loader, model, loss_fn)
            validation_acc.append(acc)
            validation_loss.append(mean_loss)

            if epoch % 10 == 0:
                print("Epoch", epoch, "finished. Validation accuracy:", validation_acc[-1])
    finally:
        model.train(was_training)

    return training_acc, validation_acc, training_loss, validation_loss

In [None]:
# Start from a fresh linear classifier and train it with a small learning rate
model = LinearSoftmaxClassifier(num_features, num_classes=3)
out = train(
    train_loader,
    val_loader,
    model,
    lr=1e-4,
    num_epochs=100,
)

Now, it's time to look at the training history

In [None]:
def plot_results(out):
    """Plot the loss and accuracy histories side by side.

    Args:
        out: 4-tuple `(training_acc, validation_acc, training_loss, validation_loss)`
            as returned by `train`, each holding one value per epoch.
    """
    training_acc, validation_acc, training_loss, validation_loss = out

    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, sharex=True, figsize=(16,6))

    panels = (
        (ax_loss, training_loss, validation_loss, "Loss"),
        (ax_acc, training_acc, validation_acc, "Accuracy"),
    )
    for axis, train_curve, val_curve, ylabel in panels:
        # Build each x-axis from the curve being drawn, not from another list's length
        axis.plot(np.arange(len(train_curve)), train_curve, c='k', label="Training")
        axis.plot(np.arange(len(val_curve)), val_curve, c='b', label="Validation")
        # The histories hold one value per epoch, so that is the unit of the x-axis
        axis.set_xlabel("Epoch")
        axis.set_ylabel(ylabel)
        axis.set_xlim((0, None))
        axis.legend(loc='upper right')

    # Accuracy is a fraction, so pin its axis to the full [0, 1] range
    ax_acc.set_ylim((0, 1))
    plt.tight_layout()
    plt.show()


plot_results(out)

We can see that the training accuracy of the model increases iteratively during the training process and eventually converges to its optimal value. The validation accuracy is close to the training accuracy at all epochs, which tells us that we are not overfitting on the data. Overall, the accuracy of our model is about 53%, which is much better than randomly guessing. 

During training, the learning rate has a significant influence on the convergence properties. The loss of a linear model is convex, so gradient descent is guaranteed to converge for a sufficiently small learning rate, but it may take more or fewer epochs depending on the learning rate. Try adjusting the learning rate to 1e-2, 1e-1, and 1e-3 and repeating this process.

In [None]:
# Change this value to see how the step size affects convergence
lr = 1e-3

model = LinearSoftmaxClassifier(num_features, num_classes=3)
out = train(train_loader, val_loader, model, lr=lr, num_epochs=100)
plot_results(out)

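However, this doesn't really improve the model's performance much. The main reason is that the classes are not linearly separable in this feature space, so no linear classifier can tell them apart well. The goal is then to learn a transformation of the data into a space where the classes are linearly separable.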

One powerful class of models to do this, which extends our linear classifier model, is the artificial neural network. Neural networks are layered systems that sequentially feed the outputs of each layer into the next layer. For example, a simple two-layer neural network looks like this:

$y = W_2\varphi(W_1x + b_1) + b_2$

where $f = W_1x + b_1$ is exactly the linear model described before, and $\varphi(f)$ is some non-linear function called an activation function. Here, $f$ is input to a second linear model with different parameters, after applying the activation function $\varphi(f)$. Common examples of $\varphi(f)$ include the sigmoid function:

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of `x`.

    Written with tanh, which is mathematically identical but cannot overflow,
    whereas exp(x) / (1 + exp(x)) turns into inf / inf = nan for large x.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * x))


f = np.linspace(-5, 5, 1000)
fig = plt.figure()
plt.plot(f, sigmoid(f))
plt.xlabel("f")
plt.ylabel("Activation")
plt.show()

Why do we need $\varphi(f)$ in the first place? It has been proven mathematically that for certain classes of activation functions, a neural network can approximate any non-linear function (provided it has certain properties). Without it, our model can only approximate linear functions, no matter how many layers we have.

Therefore $z = \varphi(W_1x + b_1)$ represents a learned non-linear mapping into a new space, $z$, which is (hopefully) linearly separable. The function $\varphi(f)$ has been chosen carefully such that our model can learn an arbitrary non-linear mapping. A simple example of such a model is below:

In [None]:
class TwoLayerNeuralNetwork(torch.nn.Module):
    """Two linear layers with a ReLU in between: y = W2 * relu(W1 x + b1) + b2.

    Args:
        num_features: length of each input feature vector.
        num_classes: number of output scores.
        num_hidden: width of the hidden layer.
    """

    def __init__(self, num_features=400, num_classes=3, num_hidden=40):
        super().__init__()

        # First affine map: input features -> hidden units
        self.layer1 = torch.nn.Linear(in_features=num_features, out_features=num_hidden)

        # Second affine map: hidden units -> one score per class
        self.layer2 = torch.nn.Linear(in_features=num_hidden, out_features=num_classes)

    def forward(self, x):
        """Return the class scores for a batch of feature vectors `x`."""
        # The ReLU between the layers is what makes the overall mapping non-linear
        hidden = torch.relu(self.layer1(x))

        # A linear classifier applied to the hidden representation
        return self.layer2(hidden)

The new layer, $z$, that we have added to this model (compared with the linear model), is referred to as a hidden layer, since the values are not directly observed as either inputs or outputs. Unlike in the linear model, where the dimensions of $W$ and $b$ are determined by $x$ and $y$, the number of rows of $W_1$ and $b_1$ in a neural network is a user-specified hyperparameter. Let's build a neural network with 40 hidden units, i.e. $dim(W_1)=(40 \times n_{features})$ and $dim(b_1)=(40 \times 1)$, where $n_{features}$ is the number of input features.

We can train this model using the function we previously defined:

In [None]:
# Two-layer network with 40 hidden units, trained with the same loop as the linear classifier
model = TwoLayerNeuralNetwork(
    num_features=num_features,
    num_classes=3,
    num_hidden=40,
)
out = train(train_loader, val_loader, model, lr=5e-4, num_epochs=100)
plot_results(out)

The performance of this simple neural network is slightly better than what we got with the linear classifier. However modern deep neural networks can achieve far better performance and are composed of potentially hundreds of layers.

Besides the performance, neural networks are quite different from the linear classifier that we trained before. In particular, the loss for a neural network is non-convex, which makes training much harder. Try adjusting the learning rate below to 1e-2, 1e-3, and 1e-5 and see how the training results change.

In [None]:
# Same architecture as before; only the learning rate passed to train() is meant to be varied
model = TwoLayerNeuralNetwork(
    num_features=num_features,
    num_classes=3,
    num_hidden=40,
)
out = train(train_loader, val_loader, model, lr=1e-2, num_epochs=100)
plot_results(out)

The number of hidden units has a substantial influence on the performance of the neural network because it controls the amount of detail that can be learned. See what happens to our performance (and the training time) when we increase this to 100:

In [None]:
# Widen the hidden layer from 40 to 100 units
lr = 1e-2

model = TwoLayerNeuralNetwork(
    num_features=num_features,
    num_classes=3,
    num_hidden=100,
)
out = train(train_loader, val_loader, model, lr=lr, num_epochs=100)
plot_results(out)

What about if we add a third layer to the network?

In [None]:
class ThreeLayerNeuralNetwork(torch.nn.Module):
    """Three linear layers with a ReLU after each of the first two.

    Args:
        num_features: length of each input feature vector.
        num_classes: number of output scores.
        num_hidden: width of both hidden layers.
    """

    def __init__(self, num_features=400, num_classes=3, num_hidden=40):
        super().__init__()

        # Input features -> first hidden layer
        self.layer1 = torch.nn.Linear(in_features=num_features, out_features=num_hidden)

        # First hidden layer -> second hidden layer of the same width
        self.layer2 = torch.nn.Linear(in_features=num_hidden, out_features=num_hidden)

        # Second hidden layer -> one score per class
        self.layer3 = torch.nn.Linear(in_features=num_hidden, out_features=num_classes)

    def forward(self, x):
        """Return the class scores for a batch of feature vectors `x`."""
        # Two non-linear stages: an affine map followed by a ReLU, applied twice
        hidden = torch.relu(self.layer1(x))
        hidden = torch.relu(self.layer2(hidden))

        # A linear classifier applied to the final hidden representation
        return self.layer3(hidden)

In [None]:
# Three-layer network with 40 units in each hidden layer
model = ThreeLayerNeuralNetwork(
    num_features=num_features,
    num_classes=3,
    num_hidden=40,
)
out = train(train_loader, val_loader, model, lr=1e-2, num_epochs=100)
plot_results(out)

And that's all for this tutorial!