In [None]:
# import libraries we will need later
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib import image as mp_image
import seaborn as sns
import os
import shutil
%matplotlib inline

# for assessing model later
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Folder that holds the raw training images of the
# histopathologic-cancer-detection dataset
training_folder_name = '../input/histopathologic-cancer-detection/train'

# Expected (width, height) of every image, in pixels
img_size = (96, 96)

# Class names, kept in one place so later cells can reuse them
# (sub-folder names for the resized images, confusion-matrix labels)
classes = ["cancer", "no_cancer"]
print(classes)

In [None]:
# Import PyTorch libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

print("Libraries imported - ready to use PyTorch", torch.__version__)

In [None]:
from PIL import Image

# function to resize images, in case any are not actually 96 x 96 pixels
def resize_image(src_image, size=(96,96), bg_color="white"):
    """Fit an image onto a ``size`` canvas, padding the rest with ``bg_color``.

    The image is shrunk (aspect ratio preserved) until it fits inside
    ``size`` and is then pasted centred on a new RGB canvas. Images that are
    already smaller than ``size`` are not enlarged (``thumbnail`` only ever
    shrinks); they are simply centred on the canvas.

    Args:
        src_image: PIL image to fit. It is left unmodified.
        size: (width, height) of the returned image, in pixels.
        bg_color: fill colour for the canvas area not covered by the image.

    Returns:
        A new RGB PIL image of exactly ``size``.
    """
    from PIL import Image

    # thumbnail() resizes in place, so work on a copy to avoid silently
    # shrinking the caller's image object.
    fitted = src_image.copy()

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    # under its current name and also exists in older Pillow releases.
    fitted.thumbnail(size, Image.LANCZOS)

    # Create the background canvas at the exact target size
    canvas = Image.new("RGB", size, bg_color)

    # Paste the fitted image into the centre of the canvas
    left = (size[0] - fitted.size[0]) // 2
    top = (size[1] - fitted.size[1]) // 2
    canvas.paste(fitted, (left, top))
    return canvas

In [None]:
# Root folder of the competition data: the raw images live in its 'train'
# sub-folder and the labels in 'train_labels.csv'
training_folder_name = '../input/histopathologic-cancer-detection'
source_folder = os.path.join(training_folder_name, 'train')
labels_csv = os.path.join(training_folder_name, 'train_labels.csv')

# new location for the resized images (one sub-folder per class)
train_folder = '../working/data/edadatafolder'

# no longer needed by this cell; import kept in case other cells rely on it
import glob

# store the size we want
size = (96,96)

# Upper bound on the number of images to convert
total_files = 200000

# Position (in os.listdir order) of a file that reportedly crashes the
# conversion; it is skipped. NOTE(review): not verified here.
BAD_FILE_INDEX = 15037

# Per the competition's data description, label 1 marks a patch that contains
# tumour tissue and label 0 a patch that does not. Map the labels explicitly:
# indexing `classes` with the raw label filed tumour patches under "no_cancer".
label_to_class = {0: "no_cancer", 1: "cancer"}

# Start from a clean output folder so stale files from an earlier run
# cannot leak into the dataset
if os.path.exists(train_folder):
    shutil.rmtree(train_folder)

# Map image id (file name without extension) -> label.
# skiprows=1 skips the header row. read_csv(squeeze=True) was removed in
# pandas 2.0, so the single label column is squeezed to a Series explicitly.
cancer_dict = (
    pd.read_csv(labels_csv, header=None, index_col=0, skiprows=1)
    .squeeze("columns")
    .to_dict()
)

print('Transforming images...')
# os.listdir raises FileNotFoundError if the source folder is missing, which
# is preferable to silently converting nothing and reporting success.
file_names = os.listdir(source_folder)

# Never index past the end of the listing when fewer files are present
n_files = min(total_files, len(file_names))

last_tenth = -1
for i in range(n_files):
    # report progress once per completed tenth of the work
    tenth = (i * 10) // n_files
    if tenth != last_tenth:
        print("working " + str(tenth) + "/10")
        last_tenth = tenth

    if i == BAD_FILE_INDEX:
        continue

    file_name = file_names[i]
    image_id = os.path.splitext(file_name)[0]
    if image_id not in cancer_dict:
        # an image without a label cannot be filed under a class
        print("skipping " + file_name + ": no label found")
        continue

    # images are stored under the sub-folder named after their class
    saveFolder = os.path.join(train_folder, label_to_class[int(cancer_dict[image_id])])
    os.makedirs(saveFolder, exist_ok=True)

    # the context manager closes the source file handle once the image
    # has been resized
    with Image.open(os.path.join(source_folder, file_name)) as image:
        resized_image = resize_image(image, size)
    resized_image.save(os.path.join(saveFolder, file_name))

print('Done')

In [None]:
def load_dataset(data_path, batch_size=3, train_fraction=0.7, seed=None):
    """Build training and testing DataLoaders from a class-per-folder image tree.

    Args:
        data_path: root folder containing one sub-folder of images per class
            (the layout ``torchvision.datasets.ImageFolder`` reads).
        batch_size: number of images per batch for both loaders.
        train_fraction: share of the images used for training; the remainder
            is used for testing. Must lie strictly between 0 and 1.
        seed: optional integer that makes the train/test split and the
            training shuffle order reproducible. ``None`` keeps them random.

    Returns:
        ``(train_loader, test_loader)``.

    Raises:
        ValueError: if ``train_fraction`` is not strictly between 0 and 1.
    """
    import torch
    import torchvision
    import torchvision.transforms as transforms

    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # Steps shared by both pipelines: convert to a tensor, then normalise
    # each of the R, G and B channels
    to_normalized_tensor = [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]

    # Random flips are augmentation and belong to the training data only
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomVerticalFlip(0.3),
        *to_normalized_tensor,
    ])

    # The test data must not be randomly altered, otherwise every evaluation
    # pass scores a different set of images
    eval_transform = transforms.Compose(to_normalized_tensor)

    # Two views of the same folder that differ only in their transform
    train_view = torchvision.datasets.ImageFolder(root=data_path, transform=train_transform)
    eval_view = torchvision.datasets.ImageFolder(root=data_path, transform=eval_transform)

    # Split sizes (70% training / 30% testing by default)
    n_total = len(train_view)
    train_size = int(train_fraction * n_total)

    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    # A single random permutation drives both subsets, so they never overlap
    shuffled = torch.randperm(n_total, generator=generator).tolist()
    train_dataset = torch.utils.data.Subset(train_view, shuffled[:train_size])
    test_dataset = torch.utils.data.Subset(eval_view, shuffled[train_size:])

    # define a loader for the training data (reshuffled every epoch)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=0,
        shuffle=True,
        generator=generator
    )

    # define a loader for the testing data (fixed order)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        num_workers=0,
        shuffle=False
    )

    return train_loader, test_loader




#####################################################################################################




# Folder that received the resized, class-sorted images
train_folder = '../working/data/edadatafolder'

# Build the batch iterators for the training and the testing data
train_loader, test_loader = load_dataset(train_folder)

# Remember the batch size the loaders were created with
batch_size = train_loader.batch_size

print("Data loaders ready to read", train_folder)


In [None]:
# Create a neural net class
class Net(nn.Module):
    """Four-stage convolutional classifier for 96x96 RGB images.

    Every stage is conv(3x3, padding 2) -> batch norm -> ReLU -> 2x2 max
    pool, with the channel count doubling from 32 up to 256. For a 96x96
    input the feature map shrinks 96 -> 49 -> 25 -> 13 -> 7; the 4x4 average
    pool then leaves a single 1x1 position, i.e. 256 features per image,
    which feed one linear layer.

    Args:
        num_classes: number of output classes. The default of 3 is kept for
            backward compatibility; this notebook passes ``len(classes)``.
    """

    def __init__(self, num_classes=3):
        super().__init__()

        # Our images are RGB, so we have input channels = 3 and output = 32.
        # Each layer adds 2 pixels per spatial dimension (3x3 kernel, padding 2).
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=2)

        # the channel count doubles with each further layer
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=2)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=2)

        # one batch-norm layer per convolution, sized to that convolution's
        # output channels
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.bn4 = nn.BatchNorm2d(256)

        # pooling layers reused in forward()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.avg = nn.AvgPool2d(4)

        # 256*1*1 because the pooled feature map is 256 channels x 1 x 1;
        # one output per class
        self.fc = nn.Linear(in_features=256*1*1, out_features=num_classes)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, num_classes).

        Raises:
            ValueError: if the input size does not reduce to a 1x1 feature
                map, so the flattened features do not match the linear layer.
        """
        # conv -> batch norm -> relu -> max pool, four times
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.pool(F.relu(self.bn4(self.conv4(x))))
        x = self.avg(x)

        # Flatten per sample. view(-1, 256) would silently fold any leftover
        # spatial positions into the batch dimension for other input sizes.
        x = torch.flatten(x, 1)
        if x.shape[1] != self.fc.in_features:
            raise ValueError(
                "expected " + str(self.fc.in_features) + " features per image after pooling, got "
                + str(x.shape[1]) + "; the network is sized for 96x96 inputs"
            )

        # Feed to fully-connected layer to predict class
        x = self.fc(x)
        # Return log-probabilities (log_softmax), not probabilities
        return torch.log_softmax(x, dim=1)
    
# Prefer the GPU when one is available (faster training), otherwise use the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the network with one output per class and move it to that device
model = Net(num_classes=len(classes)).to(device)

print(model)

In [None]:
def train(model, device, train_loader, optimizer, epoch, loss_fn=None):
    """Train ``model`` for one epoch and return the mean per-batch loss.

    Args:
        model: network to train; switched to training mode.
        device: device the batches are moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer that updates the model's parameters.
        epoch: epoch number, used only for the progress message.
        loss_fn: loss callable ``loss_fn(output, target)``. When omitted, the
            notebook-level ``loss_criteria`` is used, as before.

    Returns:
        The average of the per-batch losses as a float.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    if loss_fn is None:
        # backward compatible: fall back to the notebook-level criterion
        loss_fn = loss_criteria

    # Set the model to training mode
    model.train()
    train_loss = 0.0
    batch_count = 0
    print("Starting Epoch:", epoch)

    # Process the images in batches
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        # Clear the gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass and loss
        output = model(data)
        loss = loss_fn(output, target)

        # Keep a running total
        train_loss += loss.item()
        batch_count += 1

        # Backpropagate and learn
        loss.backward()
        optimizer.step()

        # loss is deliberately not printed per batch: it slows things down,
        # and the per-epoch average is what matters

    if batch_count == 0:
        # previously surfaced as an UnboundLocalError on batch_idx
        raise ValueError("train_loader produced no batches")

    # return average loss for the epoch
    return train_loss / batch_count

In [None]:
def test(model, device, test_loader):
    # Switch the model to evaluation mode (so we don't backpropagate or drop)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        batch_count = 0
        for data, target in test_loader:
            batch_count += 1
            data, target = data.to(device), target.to(device)
            
            # Get the predicted classes for this batch
            output = model(data)
            
            # Calculate the loss for this batch
            test_loss += loss_criteria(output, target).item()
            
            # Calculate the accuracy for this batch
            _, predicted = torch.max(output.data, 1)
            correct += torch.sum(target==predicted).item()

    # Calculate the average loss for this epoch then return it
    avg_loss = test_loss / batch_count
    return avg_loss

In [None]:
# Adam optimizer; a learning rate of 0.005 gave the best results in my experiments
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Loss criterion that train() and test() pick up
loss_criteria = nn.CrossEntropyLoss()

# Per-epoch history, filled in by the loop below and used for the plot
epoch_nums = []
training_loss = []
validation_loss = []

# 15 epochs because we have the time; results between 5 and 15 did not differ much
epochs = 15
print('Training on', device)
for epoch in range(1, epochs + 1):
    train_loss = train(model, device, train_loader, optimizer, epoch)
    test_loss = test(model, device, test_loader)

    # the loss reported here is the one measured on the test data
    print("Average Loss After Epoch " + str(epoch) + ": " + str(test_loss))

    epoch_nums.append(epoch)
    training_loss.append(train_loss)
    validation_loss.append(test_loss)

# plot how both losses develop over the epochs
plt.figure(figsize=(15,15))
plt.plot(epoch_nums, training_loss)
plt.plot(epoch_nums, validation_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['training', 'validation'], loc='upper right')
plt.show()

In [None]:
# True and predicted class index of every image in the test set
truelabels = []
predictions = []
model.eval()
print("Getting predictions from test set...")

# Inference only: no_grad avoids building the autograd graph for every batch
with torch.no_grad():
    for data, target in test_loader:
        truelabels.extend(target.tolist())
        # the class with the highest score is the prediction
        predictions.extend(model(data.to(device)).argmax(dim=1).cpu().tolist())

# Passing the label list fixes the matrix at one row/column per class, even
# if a class never occurs among the test labels or the predictions
cm = confusion_matrix(truelabels, predictions, labels=list(range(len(classes))))
print(cm)

# create a dataframe of the confusion matrix
df_cm = pd.DataFrame(cm, index = classes, columns = classes)

# accuracy = correctly classified images (the diagonal) / all images
accuracy = 100 * np.trace(cm) / cm.sum()
print("accuracy: "+ str(accuracy) + "%")

# plot the final confusion matrix
plt.figure(figsize = (7,7))
sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues, fmt='g')
plt.xlabel("Predicted Class", fontsize = 20)
plt.ylabel("True Class", fontsize = 20)
plt.show()

In [None]:
import os
import shutil

# Remove the temporary folder of resized images. The check makes the cell
# safe to re-run: rmtree raises FileNotFoundError once the folder is gone.
resized_images_dir = "../working/data/edadatafolder"
if os.path.isdir(resized_images_dir):
    shutil.rmtree(resized_images_dir)