# X-Ray: Anomaly-Detection and Classification

### Computational Methods II Final Project

#### Sarah Yam and Joseph Hostyk

In [70]:
%matplotlib inline 
import os
import matplotlib.pyplot as plt 
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import inspect
from torchsummary import summary
from PIL import Image
from collections import defaultdict, Counter
import random
import copy



### Load and clean our data:

In [94]:
### Get labels:
def getPatientInfo(patientDataFile):
    
    patientDataDict = defaultdict(dict)
    classificationsToPatients = defaultdict(set)
    
    with open(patientDataFile, "r") as patients:
        header = patients.readline().strip().split(",")
        for patient in patients:
            
            # We're only keeping frontal (78k), leaving out lateral (10k)
            
            if "lateral" not in patient:
            
                patient = dict(zip(header, patient.strip().split(",")))
                ID = patient["Patient ID"]
                path = patient["Path"]
                classificationRow = [column for column in patient if patient[column] == "1" and column != "Sum per Row"]

                # Might be 'No Finding' and 'Support Devices'. In which case, we want the last option.
                # Otherwise, there should just be one label. In which case we want the last option.
                try:
                    classification = classificationRow[-1]
                    if len(classificationRow) > 1 and classificationRow != ['No Finding', 'Support Devices']:
                        print(classificationRow)
                    patientDataDict[ID]["Path"] = path
                    patientDataDict[ID]["Classification"] = classification
                    
                    classificationsToPatients[classification].add(ID)

                # The final row of the file is just a tally of the columns.
                except IndexError as e:
                    continue
                    
    return patientDataDict, classificationsToPatients

patientDataFile = "selectedPatients.csv"
allPatientDataDict, classificationsToPatients = getPatientInfo(patientDataFile)
    

In [95]:
allPatientDataDict
# classificationsToPatients


defaultdict(dict,
            {'patient00001': {'Path': 'CheXpert-v1.0-small/train/patient00001/study1/view1_frontal.jpg',
              'Classification': 'Support Devices'},
             'patient00002': {'Path': 'CheXpert-v1.0-small/train/patient00002/study1/view1_frontal.jpg',
              'Classification': 'Fracture'},
             'patient00005': {'Path': 'CheXpert-v1.0-small/train/patient00005/study1/view1_frontal.jpg',
              'Classification': 'Support Devices'},
             'patient00008': {'Path': 'CheXpert-v1.0-small/train/patient00008/study2/view1_frontal.jpg',
              'Classification': 'Support Devices'},
             'patient00009': {'Path': 'CheXpert-v1.0-small/train/patient00009/study1/view1_frontal.jpg',
              'Classification': 'Atelectasis'},
             'patient00011': {'Path': 'CheXpert-v1.0-small/train/patient00011/study3/view1_frontal.jpg',
              'Classification': 'Support Devices'},
             'patient00012': {'Path': 'CheXpert-v1.

### Split our data:

In [96]:
# function to split each disease into 80/20

def splitData(classificationsToPatients):
    
    trainNames = []
    testNames = []
    
    trainPercentage = 0.8
    
    for classification, patientsWithThatClassification in classificationsToPatients.items():
        
        total = len(patientsWithThatClassification)
        trainAmount = int(trainPercentage * total)
        
        print("There are {} in {}.".format(total, classification))
        
        hm = list(patientsWithThatClassification)
        
        random.shuffle(hm)
        
        trainNamesForThisClassification = hm[:trainAmount]
        testNamesForThisClassification = hm[trainAmount:]
        
        
        trainNames += trainNamesForThisClassification
        testNames += testNamesForThisClassification
        

    return trainNames, testNames


In [97]:
trainNames, testNames = splitData(classificationsToPatients)
print("We're left with {} train and {} test.".format(len(trainNames), len(testNames)))

There are 24695 in Support Devices.
There are 2392 in Fracture.
There are 7199 in Atelectasis.
There are 2158 in Lung Lesion.
There are 1899 in Pneumonia.
We're left with 30673 train and 7670 test.


In [100]:
### Write them out:
# with open("test.txt", "w") as out:
#     out.write("\n".join(testNames))

In [None]:
class ImageDataset(Dataset):
    """ dataset."""

    def __init__(self, names, allPatientsDataDict):
        """
        Args:
            names (string): names
        """
        self.names = names
        self.patientInfo = allPatientsDataDict

    def __len__(self):
        return len(self.names)

    def __getitem__(self, index):
        
        individualName = self.names[index]
        
        
        individualDisease = self.patientInfo[individualName]["Classification"]
        individuaPath = self.patientInfo[individualName]["Path"]
        
        
        image = Image.open(individuaPath)
        pixelArray = np.array(image)
        

        patientInfo = {"classification": individualDisease, "image": pixelArray}

        return patientInfo
    

In [57]:
train_dataset = ImageDataset(trainNames, allPatientsDataDict)
test_dataset = ImageDataset(testNames, allPatientsDataDict)



NameError: name 'ImageDataset' is not defined

In [None]:
batch_size = 100
num_epochs = 1876

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

### Our model.

### Training the model.

### Testing the model.

## Below is the code from the Lab.

In [3]:
print(train_dataset.train_data.size())
print(train_dataset.train_labels.size())

torch.Size([60000, 28, 28])
torch.Size([60000])


In [4]:
print(test_dataset.test_data.size())
print(test_dataset.test_labels.size())

torch.Size([10000, 28, 28])
torch.Size([10000])


In [5]:
batch_size = 100
num_epochs = 1876

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

## Building a Dynamic CNN
The above code is great for getting an idea of how to build a model. But in production we would want to be able to iterate through multiple different architectural choices like number of layers, filter sizes, pooling kernels, strides, activation functions, etc. In order to be able to do this we need to build our model using arguments which specify the architecture and loops. Otherwise we'd have to explicitly type out every layer of our model, for every change we wanted to implement.

Now just in case any of your are getting flashbacks to lab 4, not to worry. I've implemented most of the function for you. You're encouraged to walk through it to understand what's happening. **All you have to do is implement the method `calculateFinalOutputSize` that calculates the size of the final output, which is needed to calculate the size of the final FCNN layer.**

In [54]:
class CNNModelDynamic(nn.Module):
    def __init__(self, input_shape, n_classes,
                 in_channels_list, out_channels_list,
                 kernel_size_list, stride_list,
                 padding_list, pool_kernel_list,
                 pool_stride_list,
                 pooling_list, activations_list):
        super(CNNModelDynamic, self).__init__()
        localArgs = locals().items()
        argLens = set()
        ignoredArgs = ['self', "__class__", "input_shape", "n_classes"]
        for argName, arg in localArgs:
            if argName not in ignoredArgs:
                argLens.add(len(arg))
        assert len(argLens) == 1, ("mismatch in lengths of arguments."
                                   "All params for each layer must be specified")
        finalOutputSize = self.calculateFinalOutputSize(input_shape,kernel_size_list, stride_list,
                                         padding_list, pool_kernel_list, pool_stride_list)
        modules = list()
        for layerIdx in range(0, argLens.pop()):
            modules.append(nn.Conv2d(in_channels = in_channels_list[layerIdx],
                                 out_channels = out_channels_list[layerIdx],
                                 kernel_size = kernel_size_list[layerIdx],
                                 stride = stride_list[layerIdx],
                                 padding = padding_list[layerIdx]))
            modules.append(activations_list[layerIdx])
            modules.append(pooling_list[layerIdx](kernel_size = pool_kernel_list[layerIdx],
                                                  stride = pool_stride_list[layerIdx]))
        self.convolutions = nn.Sequential(*modules)
        self.finalLayer = nn.Linear(finalOutputSize**2*out_channels_list[-1], n_classes)
        
    def outputFromConvLayer(self, w, k, p, s):
        return (w-k+2*p)/float(s) + 1
    
    def outputFromPoolLayer(self, w, k, s):
        return (w-k)/float(s) + 1
    
    def calculateFinalOutputSize(self, input_shape, kernel_size_list, stride_list,
                                 padding_list, pool_kernel_list, pool_stride_list):
        """
        Calculates the shape of the final output assuming that every conv layer is followed
        by a pooling layer.
        """
         #### your code here ###
        currentInput = input_shape
        for i in range(len(kernel_size_list)):
            currentInput = self.outputFromConvLayer(currentInput, kernel_size_list[i], padding_list[i], stride_list[i])
            currentInput = self.outputFromPoolLayer(currentInput, pool_kernel_list[i], pool_stride_list[i])
        finalOutputShape = currentInput
        
#         print("Final shape is", int(finalOutputShape))
        
         #### end code here ###
        return(int(finalOutputShape))
        
        
    def forward(self, x):
        out = self.convolutions(x)
        out = out.view(out.size(0), -1)
        out = self.finalLayer(out)
        return(out)

### Dynamic Model Summary
Using the summary function from torchsummary print out the layers, shapes, and number of parameters in the model.

In [55]:
#### your code here ###
cnnDynamicOG = CNNModelDynamic(input_shape = 28, n_classes = 10,
                 in_channels_list = [1, 16], out_channels_list = [16, 32],
                 kernel_size_list = [5, 5], stride_list = [1, 1],
                 padding_list = [0,0], pool_kernel_list = [2, 2],
                 pool_stride_list = [2, 2],
                 pooling_list = [nn.MaxPool2d, nn.MaxPool2d], activations_list = [nn.ReLU(), nn.ReLU()])
summary(cnnDynamicOG, input_size=(1, 28, 28))

Final shape is 4
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 24, 24]             416
              ReLU-2           [-1, 16, 24, 24]               0
         MaxPool2d-3           [-1, 16, 12, 12]               0
            Conv2d-4             [-1, 32, 8, 8]          12,832
              ReLU-5             [-1, 32, 8, 8]               0
         MaxPool2d-6             [-1, 32, 4, 4]               0
            Linear-7                   [-1, 10]           5,130
Total params: 18,378
Trainable params: 18,378
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.19
Params size (MB): 0.07
Estimated Total Size (MB): 0.27
----------------------------------------------------------------


## Training a Dynamic CNN
Now we can train our model. First train the original model implemented above. It shoud train just fine and achieve decent accuracy. 

1. Your first task is to implement early stopping by keeping track of your best accuracy and stopping training if accuracy doesn't improve after 3 checks. I suggest that you check your evaluation accuracy every 50 iterations unless you have a GPU then by all means check whenever you want. Hell, check twice an iteration. Ain't nothing stopping you with a GPU. Once your best model is found, quit out and stop running. This is where you'd usually save a model, but don't worry about doing that.

2. Your second task is to experiment with different configurations. Try at least two architectural changes (depth, convolution layers, channels, strides, **optimization**, average pooling, etc) and record some observations about model performance for your two configurations. I would suggest using Adam as an optimization function.

In [56]:
def doTheThing(model, optimizer):
    numIterations = 0
    bestAccuracy = 0
    patience = 2 # how many times should we be ok with our accuracy not increasing?
    checksWithoutIncrease = 0
    num_epochs = 5 # Change later
    for epoch in range(num_epochs):
        print("Epoch: {}".format(epoch))
        for i, (images, labels) in enumerate(train_loader):
            # Load images
            images = images.requires_grad_()

            # Clear gradients w.r.t. parameters
            optimizer.zero_grad()

            # Forward pass to get output/logits
            outputs = model.forward(images)

            # Calculate Loss: softmax --> cross entropy loss
            loss = criterion(outputs, labels)

            # Getting gradients w.r.t. parameters
            loss.backward()

            # Updating parameters
            optimizer.step()

            numIterations += 1
            if numIterations % 50 == 0:
                correct = 0
                total = 0
                # Iterate through test dataset
                for images, labels in test_loader:
                    # Load images
                    images = images.requires_grad_()

                    # Forward pass only to get logits/output
                    outputs = model.forward(images)

                    # Get predictions from the maximum value
                    _, predicted = torch.max(outputs.data, 1)

                    # Total number of labels
                    total += labels.size(0)

                    # Total correct predictions
                    correct += (predicted == labels).sum()

                accuracy = 100 * correct / total
                # Check if early stopping criteria are met
                #### your code here ###

                if accuracy > bestAccuracy:
                    bestAccuracy = accuracy
                    checksWithoutIncrease = 0
                else:
                    checksWithoutIncrease += 1
                    print("\tGone {} rounds without increasing:".format(checksWithoutIncrease))

                #### end code here ###
                print('\tIteration: {}. Loss: {}. Testing Accuracy: {}'.format(numIterations, loss.item(), accuracy))
        # remember, early stopping is qutting out of all training.
        #### your code here ###
                if checksWithoutIncrease > patience:
                    print("We did not increase accuracy over the past 3 rounds. Quitting!")
                    return bestAccuracy, numIterations * (epoch + 1)
        #### end code here ###

In [57]:
cnnDynamicOG = CNNModelDynamic(input_shape = 28, n_classes = 10,
                 in_channels_list = [1, 16], out_channels_list = [16, 32],
                 kernel_size_list = [5, 5], stride_list = [1, 1],
                 padding_list = [0,0], pool_kernel_list = [2, 2],
                 pool_stride_list = [2, 2],
                 pooling_list = [nn.MaxPool2d, nn.MaxPool2d], activations_list = [nn.ReLU(), nn.ReLU()])
optimizerSGD = torch.optim.SGD(cnnDynamicOG.parameters(), lr=learning_rate)
optimizerAdam = torch.optim.Adam(cnnDynamicOG.parameters())

Final shape is 4


In [73]:
cnnDynamiclargerKernels = CNNModelDynamic(input_shape = 28, n_classes = 10,
                 in_channels_list = [1, 16], out_channels_list = [16, 32],
                 kernel_size_list = [8, 8], stride_list = [1, 1],
                 padding_list = [0,0], pool_kernel_list = [2, 2],
                 pool_stride_list = [2, 2],
                 pooling_list = [nn.MaxPool2d, nn.MaxPool2d], activations_list = [nn.ReLU(), nn.ReLU()])
optimizerSGDlargerKernels = torch.optim.SGD(cnnDynamiclargerKernels.parameters(), lr=learning_rate)

Final shape is 1


In [65]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.01

In [74]:
# bestAccuracyOriginal, totalIterationsOriginal = doTheThing(cnnDynamicOG, optimizerSGD)
# bestAccuracyWithAdam, totalIterationsWithAdam = doTheThing(cnnDynamicOG, optimizerAdam)
bestAccuracyWithLargerKernels, totalIterationsWithLargerKernels = doTheThing(cnnDynamiclargerKernels, optimizerSGDlargerKernels)


Epoch: 0
	Iteration: 50. Loss: 2.261282444000244. Testing Accuracy: 13
	Iteration: 100. Loss: 2.19022274017334. Testing Accuracy: 31
	Iteration: 150. Loss: 2.08729887008667. Testing Accuracy: 49
	Iteration: 200. Loss: 1.8707693815231323. Testing Accuracy: 56
	Iteration: 250. Loss: 1.678735375404358. Testing Accuracy: 64
	Iteration: 300. Loss: 1.2326034307479858. Testing Accuracy: 72
	Iteration: 350. Loss: 0.9652111530303955. Testing Accuracy: 74
	Iteration: 400. Loss: 0.6076298356056213. Testing Accuracy: 77
	Iteration: 450. Loss: 0.7554090619087219. Testing Accuracy: 79
	Iteration: 500. Loss: 0.6046813726425171. Testing Accuracy: 80
	Iteration: 550. Loss: 0.5280731320381165. Testing Accuracy: 81
	Gone 1 rounds without increasing:
	Iteration: 600. Loss: 0.48492884635925293. Testing Accuracy: 81
Epoch: 1
	Iteration: 650. Loss: 0.43832606077194214. Testing Accuracy: 82
	Gone 1 rounds without increasing:
	Iteration: 700. Loss: 0.34086087346076965. Testing Accuracy: 82
	Iteration: 750. Los

In [76]:
print("With the original settings, the best accuracy achieved was {}, after {} iterations.".format(bestAccuracyOriginal, totalIterationsOriginal))
print("Using Adam as the optimizer, the best accuracy achieved was {}, after {} iterations.".format(bestAccuracyWithAdam, totalIterationsWithAdam))
print("With a larger kernel size, the best accuracy achieved was {}, after {} iterations.".format(bestAccuracyWithLargerKernels, totalIterationsWithLargerKernels))



With the original settings, the best accuracy achieved was 85, after 1900 iterations.
Using Adam as the optimizer, the best accuracy achieved was 93, after 550 iterations.
With a larger kernel size, the best accuracy achieved was 86, after 3900 iterations.
