In [1]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sympy import *
import matplotlib.pyplot as plt
import operator

from IPython.core.display import display

import torch
from torch.autograd import Variable
import torch.utils.data as data_utils
import torch.nn.init as init

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Enable pretty-printed output with Unicode characters
# (init_printing is presumably SymPy's, pulled in by the `from sympy import *` star import above).
init_printing(use_unicode=True)

First up, let's read in the data and see what's in it.

In [2]:
# Read the raw CSV into a DataFrame.
csv_path = "../input/data.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows (rendered as the cell's output).
data.head(n=5)

Let's prepare our training and test sets.

The diagnosis column contains the train labels, so let's extract that and turn the strings into 1 and 0.

The id and "Unnamed: 32" columns won't help us in the prediction, so let's drop them.

In [4]:
# Features: every column except the identifier, the label and the empty
# trailing "Unnamed: 32" column.
x = data.drop(columns=["id", "diagnosis", "Unnamed: 32"])

# Labels: malignant -> 1, benign -> 0.
# `.map` yields an integer column directly; `.replace` on an object column
# depends on silent downcasting, which recent pandas versions deprecate, and
# it would let unexpected label strings pass through unnoticed.
diag = {"M": 1, "B": 0}
y = data["diagnosis"].map(diag)
assert y.notna().all(), "unexpected value in the diagnosis column"

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=85)

Now let's scale the data ready for entry into the network and load it in the format expected by PyTorch.

In [5]:
# Fit the scaler on the training features only and standardise them.
scaler = StandardScaler()
transformed = scaler.fit_transform(x_train)

# Wrap the scaled features and the labels as float tensors.
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
train = data_utils.TensorDataset(torch.from_numpy(transformed).float(),
                                 torch.from_numpy(y_train.values).float())
# Mini-batches of 128 rows, served in a fixed order (shuffle=False).
dataloader = data_utils.DataLoader(train, batch_size=128, shuffle=False)

To create the PyTorch model, we'll first define a function that can create a sequential network of any size. This might come in useful later if we want to search for the optimal hyperparameters.

In [6]:
def create_model(layer_dims):
    """Build a fully connected feed-forward network ending in a sigmoid.

    Each pair of consecutive entries in ``layer_dims`` becomes a ``Linear``
    layer whose weights are initialised with Xavier-normal. Every linear layer
    except the last is followed by a ReLU, and a single Sigmoid is appended at
    the end.

    Args:
        layer_dims: Sequence of layer widths,
            ``[n_inputs, hidden_1, ..., n_outputs]``.

    Returns:
        A ``torch.nn.Sequential`` whose sub-modules are named ``linear<i>``,
        ``relu<i>`` and ``sig<last index>``.
    """
    model = torch.nn.Sequential()
    last_idx = len(layer_dims) - 1
    for idx, dim in enumerate(layer_dims):
        if idx < last_idx:
            module = torch.nn.Linear(dim, layer_dims[idx + 1])
            # In-place initialiser; the un-suffixed `xavier_normal` is a
            # deprecated alias.
            init.xavier_normal_(module.weight)
            model.add_module("linear" + str(idx), module)
        else:
            # The final entry only names the output width; close with a sigmoid.
            model.add_module("sig" + str(idx), torch.nn.Sigmoid())
        if idx < last_idx - 1:
            # Non-linearity after every linear layer except the output layer.
            model.add_module("relu" + str(idx), torch.nn.ReLU())

    return model

In a similar manner to the train set, let's now scale and prepare a test set to let us know how our predictions are going.

In [7]:
# Scale the test features with the scaler that was fitted on the training set.
# Fitting a second scaler on the test data would normalise it with different
# statistics than the ones the network is trained on.
x_test_scaled = scaler.transform(x_test)

test_set = torch.from_numpy(x_test_scaled).float()
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
test_valid = torch.from_numpy(y_test.values).float()

Finally, we can now specify the hyperparameters and iterate through our train data to train the model.

For now we'll use a fairly arbitrary network with 2 hidden layers of 20 and 10 nodes and a learning rate of 0.0007. For optimization, it uses Adam.

In [8]:
## Create model and hyper parameters
# Seed PyTorch so the random weight initialisation, and therefore the whole
# run, is reproducible (85 mirrors the random_state used for the data split).
torch.manual_seed(85)

dim_in = x_train.shape[1]
dim_out = 1

layer_dims = [dim_in, 20, 10, dim_out]

model = create_model(layer_dims)

# reduction="sum" is the current spelling of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction="sum")
learning_rate = 0.0007
n_epochs = 300
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# The network emits shape (batch, 1). Outputs and labels are both flattened to
# 1-D before use, so the loss is computed element-wise instead of being
# broadcast to a (batch, batch) matrix.
test_target = test_valid.float().view(-1)
n_train = len(dataloader.dataset)
n_test = test_target.numel()

## Now run model
# Per-epoch history: mean squared error per sample and accuracy in percent,
# for the full training set and the full test set.
history = { "loss": [], "accuracy": [], "loss_val": [], "accuracy_val": [] }
for epoch in range(n_epochs):
    epoch_loss = 0.0
    epoch_correct = 0

    for minibatch, target in dataloader:
        target = target.float().view(-1)
        y_pred = model(minibatch).view(-1)

        loss = loss_fn(y_pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate over every minibatch so the epoch metrics describe the
        # whole training set rather than only the final (smaller) batch.
        epoch_loss += loss.item()
        epoch_correct += ((y_pred > 0.5).float() == target).sum().item()

    # Evaluate on the held-out data once per epoch, without tracking gradients.
    with torch.no_grad():
        y_val_pred = model(test_set).view(-1)
        loss_val = loss_fn(y_val_pred, test_target).item()
        correct_val = ((y_val_pred > 0.5).float() == test_target).sum().item()

    history["loss"].append(epoch_loss / n_train)
    history["accuracy"].append(100 * epoch_correct / n_train)
    history["loss_val"].append(loss_val / n_test)
    history["accuracy_val"].append(100 * correct_val / n_test)

    print("Loss, accuracy, val loss, val acc at epoch", epoch + 1, history["loss"][-1],
          history["accuracy"][-1], history["loss_val"][-1], history["accuracy_val"][-1])


index, value = max(enumerate(history["accuracy_val"]), key=operator.itemgetter(1))

# Report the epoch 1-based so it lines up with the per-epoch log printed above.
print("Best accuracy was {} at iteration {}".format(value, index + 1))

Not bad at all! Around 98% accuracy with no optimization at this point.

Let's plot the loss and accuracy to see if everything looks good.

In [9]:
# Train vs. test accuracy per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for key, label in (('accuracy', 'train'), ('accuracy_val', 'test')):
    ax.plot(history[key], label=label)
ax.set_title('Model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Train vs. test loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for key, label in (('loss', 'train'), ('loss_val', 'test')):
    ax.plot(history[key], label=label)
ax.set_title('Model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

Obviously a few things to work on but not bad for a first attempt.