# PAS 520: Machine Learning and AI

- Scott Doyle
- 2021-11-02
- Dept. of Pathology & Anatomical Sciences
- scottdoy@buffalo.edu

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

#from ipywidgets import interact
#import ipywidgets as widgets

In [2]:
import os
import sys
#import torch
#from torch import nn
#from torch import optim
#import torch.nn.functional as F

#from torchvision.datasets import MNIST
#from torchvision.utils import make_grid
#from torchvision.transforms import ToTensor
#from torchvision import datasets, transforms, models, utils
#from torch.utils.data import DataLoader, random_split

from skimage.io import imread

#from plotly.subplots import make_subplots
#import plotly.express as px
#import plotly.graph_objects as go

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [3]:
# create an output dir for the models, if one doesn't exist
save_model_dir = os.path.join('data', 'mnist-models')
os.makedirs(save_model_dir, exist_ok=True)

# Grab the colormap to use from matplotlib
cmap = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Set various font sizes
global_font_size = 20
title_font_size = 30
hoverlabel_font_size = 10

# Breast Cancer: Cytology

In [4]:
# Load feature values
all_features = pd.read_csv('data/breast_cytology_features/wisconsin_breast_cancer_data.csv')

In [5]:
all_features.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
# Remove NaN values
complete_features = all_features.dropna()
complete_features = complete_features.set_index('Row')

KeyError: "None of ['Row'] are in the columns"

In [None]:
complete_features.head()

In [None]:
# Divide dataframe
feature_data = complete_features.copy()
feature_data = feature_data.drop(['proggroup', 'poigroup'], axis=1)

In [None]:
feature_data.head()

In [None]:
sample_names = complete_features.copy()['Row']
prog_group = complete_features.copy()['proggroup']
poi_group = complete_features.copy()['poigroup']

In [None]:
# Scale Features
scaled_features = StandardScaler().fit_transform(feature_data.values)
scaled_features_df = pd.DataFrame(scaled_features, index=feature_data.index, columns=feature_data.columns)

# Create label vectors
prog_labels = LabelEncoder().fit_transform(prog_group)
poi_labels = LabelEncoder().fit_transform(poi_group)

In [None]:
# Set up an interactive scatterplot
def display_unsupervised(f1, f2):
    X1 = scaled_features_df[f1]
    X2 = scaled_features_df[f2]
    
    plt.scatter(X1, X2)
    plt.show()

In [None]:
interact(display_unsupervised, f1=scaled_features_df.columns, f2=scaled_features_df.columns)

In [None]:
# Set up an interactive scatterplot
def display_progression(f1, f2):
    X1 = scaled_features_df[f1]
    X2 = scaled_features_df[f2]
    
    plt.scatter(X1, X2, c=prog_labels)
    plt.show()    

In [None]:
interact(display_progression, f1=scaled_features_df.columns, f2=scaled_features_df.columns)

In [None]:
# Load up the geometry feature data
occ_feat_path = os.path.join('data', 'occ_features_old', 'all_features_old.csv')
occ_df = pd.read_csv(occ_feat_path)

In [None]:
# Set the index to be the row name
occ_df = occ_df.set_index('Row')

In [None]:
occ_poi = occ_df.copy()
#occ_values = occ_poi.drop(['ROI', 'Occult metastasis', 'progression', 'wpoi'], axis=1)
occ_values = occ_poi.drop(['Progression', 'POI'], axis=1)

poi_labels = occ_poi[['POI']].copy()
prog_labels = occ_poi[['Progression']].copy()
#met_labels = occ_poi[['Occult metastasis']].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
poi_labels = label_encoder.fit_transform(poi_labels)
prog_labels = label_encoder.fit_transform(prog_labels)
# met_labels = label_encoder.fit_transform(met_labels)

In [None]:
# Calculate feature correlations
corr_matrix = occ_values.corr()
upper_triangle_locations = np.triu( np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_triangle_locations)

to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
print('{} columns to drop: {}'.format(len(to_drop), to_drop))

In [None]:
occ_slim = occ_values.drop(occ_values[to_drop], axis=1)
occ_slim.head()

In [None]:
# Scale the dataset


input_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
#         ('feat_selection', SelectKBest(f_classif, k=5))
    ])

# This does the same thing, adds a name automatically
# input_pipeline = make_pipeline(StandardScaler(), SelectKBest(chi2, k=2))
transformed_values = input_pipeline.fit_transform(occ_slim)

# Stuff these values back into a dataframe
training_df = pd.DataFrame(transformed_values, index=occ_slim.index, columns=occ_slim.columns)
training_df['POI'] = poi_labels

In [None]:
training_df.head()

In [None]:
# feat_order = np.argsort(input_pipeline['feat_selection'].scores_)

In [None]:
# feat_set = ['mom5_satDist', 'mean_triArea', 'skew_triArea', 'geo_mean_triArea','mom5_triLength', 'mom5_satWave']
# feat_set = ['edge_min_value', 'edge_max_value','edge_interquartile_range','edge_median','edge_variance','edge_skewness','edge_hyperskewness_5th_moment', 'tri_max_value', 'tri_interquartile_range', 'tri_skewness']


In [None]:
fig = px.scatter_matrix(training_df,
                        dimensions=training_df.columns[:5],
                        color="POI")
fig.show()

In [None]:
fig = px.scatter_matrix(training_df,
                        dimensions=training_df.columns[5:10],
                        color="POI")
fig.show()

In [None]:
fig = px.scatter_matrix(training_df,
                        dimensions=training_df.columns[10:15],
                        color="POI")
fig.show()

In [None]:
fig = px.scatter_matrix(training_df,
                        dimensions=training_df.columns[15:20],
                        color="POI")
fig.show()

In [None]:
fig = px.scatter_matrix(training_df,
                        dimensions=training_df.columns[21:25],
                        color="POI")
fig.show()

In [None]:
fig = px.scatter_matrix(training_df,
                        dimensions=training_df.columns[25:],
                        color="POI")
fig.show()

In [None]:
# Create a pipeline to scale the features and then select the best
# fig = px.scatter_matrix(occ_poi_data,
#     dimensions=wpoi_plotcols,
#     color="wpoi")
fig = px.scatter_matrix(training_df,
#                         dimensions=feat_set,
                       color="POI")
fig.show()

In [None]:
# fig = px.scatter(training_df,x="mom5_satDist", y="mom5_satWave", color="poi")
# fig.update_traces()
# fig.show()

# Neural Network Stuff

In [None]:
# Note that by default, the MNIST data is NOT in PyTorch Format
# Passing `transform=ToTensor()` will convert the data appropriately
mnist_data = MNIST('./data', download=True, train=True, transform=ToTensor())

batch_size = 4
training_data = DataLoader(mnist_data, batch_size=batch_size)

# Recall that the data is the first element of the tuple in the "training.pt" file
print(f'The dataset size is: {len(training_data)}')

In [None]:
sample_data = next(iter(training_data))
sample_images = sample_data[0]
sample_labels = sample_data[1]

In [None]:
sample_data

In [None]:
fig = px.imshow(np.array(sample_images[1]), facet_col=0, color_continuous_scale='gray')

fig.update_layout(
    coloraxis_showscale=False,
    xaxis=dict(
        showticklabels=False
    ),
    yaxis=dict(
        showticklabels=False
    )
)

fig.for_each_annotation(lambda a: a.update(text=""))

fig.write_html("img/plot_mnist_sample.html", include_plotlyjs='cdn')
fig.show()

In [None]:
# # Reshape the sample data
# sample_data_reshaped = sample_data.view(num_samples, -1)
# sample_data_reshaped = sample_data_reshaped.numpy()

# Define the red rectangles that will go on the images
x = 10.5
y = 9.45
num_rows = 28
num_cols = 28

sample_image = np.array(sample_images[0])
fig = px.imshow(sample_image, facet_col=0, color_continuous_scale='gray')

# Shape defined programatically
fig.add_shape(
    type='rect',
    x0=x, x1=x+5, y0=y, y1=y+1,
    xref='x', yref='y',
    line_color='red'
)

fig.update_layout(
   coloraxis_showscale=False,
)
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)


fig.show()

In [None]:
def get_unrolled_data(label_to_grab, num_to_grab=100):
    
    class_data = []
    for idx, batch in enumerate(training_data):
        for image, label in zip(batch[0], batch[1]):
            if label == label_to_grab:
                class_data.append(image)

    # Slice the data to only look at the first few samples
    class_data = class_data[:num_to_grab]
    
    # "unroll" the data to produce an image that is 
    # (28*28, num_to_grab) large
    class_data_unrolled = np.hstack([np.array(x).reshape(28*28,1) for x in class_data])
    
    return class_data_unrolled

In [None]:
class_zeros = get_unrolled_data(0, num_to_grab=1000)
class_ones = get_unrolled_data(1, num_to_grab=1000)

In [None]:
fig = px.imshow(class_zeros, color_continuous_scale='gray')

fig.update_layout(
    coloraxis_showscale=False,
    xaxis=dict(
        showticklabels=False
    ),
    yaxis=dict(
        showticklabels=False
    )
)

fig.show()

In [None]:
fig = make_subplots(1,2)

fig.add_trace(px.imshow(np.array(sample_images[1]), facet_col=0).data[0], 1, 1)
fig.add_trace(px.imshow(class_zeros).data[0], 1, 2)

layout = px.imshow(class_zeros, color_continuous_scale='gray').layout
fig.layout.coloraxis = layout.coloraxis

fig.update_layout(
    coloraxis_showscale=False,
    xaxis=dict(
        showticklabels=False
    ),
    yaxis=dict(
        showticklabels=False
    )
)

fig.write_html("img/plot_mnist_zero.html", include_plotlyjs='cdn')
fig.show()

In [None]:
fig = make_subplots(1,2)

# Make this just the unrolled classes

# Doing this from https://stackoverflow.com/questions/64268081/creating-a-subplot-of-images-with-plotly
fig.add_trace(px.imshow(class_zeros).data[0], 1, 1)
fig.add_trace(px.imshow(class_ones).data[0], 1, 2)

fig.update_xaxes(title="All Zeros", col=1)
fig.update_xaxes(title="All Ones", col=2)

layout = px.imshow(class_ones, color_continuous_scale='gray').layout
fig.layout.coloraxis = layout.coloraxis

fig.update_layout(
    coloraxis_showscale=False
)

fig.write_html("img/plot_mnist_comparison.html", include_plotlyjs='cdn')
fig.show()

# Nuclei Classification Images (In Plotly)

In [None]:
img1 = imread('img/he_cell01_resized.png')
img2 = imread('img/he_cell02_resized.png')

fig = make_subplots(1,2)
fig.add_trace(px.imshow(img1).data[0], 1, 1)
fig.add_trace(px.imshow(img2).data[0], 1, 2)

fig.write_html("img/plot_nuclei_comparison.html", include_plotlyjs='cdn')
fig.show()

In [None]:
img1 = imread('img/he_nocell01_resized.png')
img2 = imread('img/he_nocell02_resized.png')

fig = make_subplots(1,2)
fig.add_trace(px.imshow(img1).data[0], 1, 1)
fig.add_trace(px.imshow(img2).data[0], 1, 2)

fig.write_html("img/plot_stroma_comparison.html", include_plotlyjs='cdn')
fig.show()

# Nuclei Classification

In [None]:
# Load the dataset into a Pandas dataframe
img_dir = os.path.join('data', 'breast_cancer_nuclei', 'patches_64')

In [None]:
args = {}

# Training and testing batch size
args["train_batch_size"] = 4#8 # 64
args["test_batch_size"] = 4#8 # 1000

# How long to train for
args["epochs"] = 2 # 100

# Learning rate: "Speed" with which the optimizer adjusts weights
args["lr"] = 0.01

# Momentum: How quickly the weights respond to changing gradients
args["momentum"] = 0.5

# Whether to use CUDA or not
args["no_cuda"] = False

# Seed for reproducible training
args["seed"] = 1

# How often to spit out log / progress updates
args["log_interval"] = 10

# Whether to save the trained model
args["save_model"] = False

# Decide whether to use CUDA
use_cuda = not args["no_cuda"] and torch.cuda.is_available()

# Set the seed
torch.manual_seed(args["seed"])

# Select the device to use based on the `use_cuda` flag
device = torch.device("cuda" if use_cuda else "cpu")

# Keyword arguments for the dataloader
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

In [None]:
data_transform = transforms.Compose(
    [transforms.Resize(64),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

nuclei_trainset = datasets.ImageFolder(root=os.path.join(img_dir, 'train'), transform=data_transform)
nuclei_testset = datasets.ImageFolder(root=os.path.join(img_dir, 'test'), transform=data_transform)

nuclei_trainloader = torch.utils.data.DataLoader(nuclei_trainset, batch_size=args['train_batch_size'],
                                                 shuffle=True, num_workers=2)
nuclei_testloader = torch.utils.data.DataLoader(nuclei_trainset, batch_size=args['test_batch_size'],
                                                 shuffle=False, num_workers=2)

classes = ('nonnuclei', 'nuclei')

In [None]:
def imshow(images):
    img_grid = utils.make_grid(images)
    img = img_grid / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    
    fig, ax = plt.subplots(figsize=(20,10))
    ax.imshow(np.transpose(npimg, (1, 2, 0)))
    ax.axis('off')
    plt.show()

In [None]:
# Get some random training images (one iteration of the dataloader)
dataiter = iter(nuclei_trainloader)
images, labels = dataiter.next()

num_samples = images.shape[0]
images_reshaped = np.moveaxis(np.array(images), 1, -1)

In [None]:
num_samples

In [None]:
# Preate a plotly matrix of the images

fig = px.imshow(images_reshaped, facet_col=0, zmin=-1, zmax=1)

fig.update_layout(coloraxis_showscale=False)
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

fig.for_each_annotation(lambda a: a.update(text=""))

for idx in range(num_samples):
    fig.layout.annotations[idx]['text'] = classes[labels[idx]]

fig.write_html("img/plot_nuclei_classes.html", include_plotlyjs='cdn')

fig.show()

In [None]:
class NucleiNet(nn.Module):
    def __init__(self, disp_size):
        super(NucleiNet, self).__init__()
        
        # Flag whether or not to print out information about the tensor
        self.disp_size = disp_size
        
        # nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv1 = nn.Conv2d(3, 6, 3, 1, 1)
        
        # nn.MaxPool2d(kernel_size, stride)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 3, 1, 1)
        self.pool2 = nn.MaxPool2d(2, 2)
        
        # nn.Linear(in_features, out_features)
        self.fc1 = nn.Linear(16 * 16 * 16, 512)
        self.fc2 = nn.Linear(512, 120)
        self.fc3 = nn.Linear(120, 84)
        self.fc4 = nn.Linear(84, 2)

    def forward(self, x):
        if self.disp_size:
            print('x input size:\t\t\t\t\t{}'.format(x.shape))

        x = self.pool1(F.relu(self.conv1(x)))
        if self.disp_size:
            print('After first block [Conv->Relu->Pool]:\t\t{}'.format(x.shape))
        
        x = self.pool2(F.relu(self.conv2(x)))
        if self.disp_size:
            print('After second block [Conv->Relu->Pool]:\t\t{}'.format(x.shape))

        x = x.view(-1, 16 * 16 * 16)
        if self.disp_size:
            print('After reshape:\t\t\t\t\t{}'.format(x.shape))

        x = F.relu(self.fc1(x))
        if self.disp_size:
            print('After first linear layer:\t\t\t{}'.format(x.shape))

        x = F.relu(self.fc2(x))
        if self.disp_size:
            print('After second linear layer:\t\t\t{}'.format(x.shape))
            
        x = F.relu(self.fc3(x))
        if self.disp_size:
            print('After third linear layer:\t\t\t{}'.format(x.shape))
            
        x = self.fc4(x)
        if self.disp_size:
            print('After fourth linear layer:\t\t\t{}'.format(x.shape))
            print(' ')
        return x

In [None]:
# Create a model and set the "disp_size" to True, so it will print out the size of each layer
nuclei_net = NucleiNet(disp_size=True)

# Run an image batch through just to get some output
_ = nuclei_net(images)

In [None]:
# In PyTorch you can list out the different layers as "children" of the model
list(nuclei_net.children())[0:4]

In [None]:
# You can also pull out specific layers of the model and use them to build a new one
# Here we look at the first four layers, which include the two convolutional and pooling layers
nuclei_features = nn.Sequential(*list(nuclei_net.children())[0:4])

print("First three layers:")
print(nuclei_features)

In [None]:
outputs = nuclei_features(images)
print("size of outputs: {}".format(outputs.shape))

# Which image in the batch do you want to look at?
target_img = 0

# Set up the filter block
num_channels = outputs.shape[0]

# Set up the display of the filter block for this image
rows = int(np.floor(np.sqrt(num_channels)))
if np.mod(np.sqrt(num_channels), 1) != 0:
    # There is a remainder
    cols = rows + 1
else:
    cols = rows


In [None]:
# Plot the original
fig = px.imshow(np.transpose(images[target_img].cpu() / 2 + 0.5, (1,2,0)))
fig.update_xaxes(showticklabels=False, title="Original Image").update_yaxes(showticklabels=False)
fig.write_html("img/plot_nuclei_example.html", include_plotlyjs='cdn')
fig.show()

In [None]:
output_numpy = outputs[target_img,:,:,:].detach().cpu()
fig = px.imshow(output_numpy, facet_col=0, facet_col_wrap=4, color_continuous_scale='gray')

fig.update_layout(coloraxis_showscale=False)
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

fig.for_each_annotation(lambda a: a.update(text=""))

for idx in range(output_numpy.shape[0]):
    fig.layout.annotations[idx]['text'] = f"Filter {idx}"

fig.write_html("img/plot_nuclei_filters.html", include_plotlyjs='cdn')

fig.show()

In [None]:
nuclei_net = NucleiNet(disp_size=False)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Device: {}".format(device))

# move model to the right device
nuclei_net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(nuclei_net.parameters(), lr=0.001, momentum=0.9)

In [None]:
list_loss = []
avg_loss = []
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(nuclei_trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        #inputs, labels = data

        # Move to the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = nuclei_net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 20 == 19:    # print every 20 mini-batches
            print('[%d, %5d] loss: %.5f' %
                  (epoch + 1, i + 1, running_loss / 20))
            list_loss.append(running_loss / 20)
            running_loss = 0.0
    
    # Record average loss for this epoch
    avg_loss.append(np.mean(list_loss))

print('Finished Training')

In [None]:
fig = px.line(y=avg_loss, labels={'x':'Training Epoch', 'y':'Loss Value'})
fig.write_html("img/plot_nuclei_training.html", include_plotlyjs='cdn')
fig.show()

In [None]:
#dataiter = iter(nuclei_testloader)
images, labels = dataiter.next()
images = images.to(device)

outputs = nuclei_net(images)
_, predicted = torch.max(outputs, 1)
images_reshaped = np.moveaxis(np.array(images.cpu()), 1, -1)

In [None]:
# Preate a plotly matrix of the images

fig = px.imshow(images_reshaped, facet_col=0, zmin=-1, zmax=1)

fig.update_layout(coloraxis_showscale=False)
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

fig.for_each_annotation(lambda a: a.update(text=""))

for idx in range(num_samples):
    fig.layout.annotations[idx]['text'] = f'{classes[predicted[idx]]} : {classes[labels[idx]]}'

fig.write_html("img/plot_nuclei_inference.html", include_plotlyjs='cdn')
fig.show()

In [None]:
correct = 0
total = 0
with torch.no_grad():
    for data in nuclei_testloader:
        images, labels = data[0].to(device), data[1].to(device)
        outputs = nuclei_net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the {} testing images: {} %'.format(
    total, 100 * correct / total))

In [None]:
img = imread("img/he_tissue_sample.jpg")
img_gt = imread("img/ground_truth.png")

fig = make_subplots(1,2)
fig.add_trace(px.imshow(img).data[0], 1, 1)
fig.add_trace(px.imshow(img_gt).data[0], 1, 2)

fig.update_layout(coloraxis_showscale=False)
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

fig.update_xaxes(title="Original Image", col=1)
fig.update_xaxes(title="Ground Truth", col=2)


In [None]:
img_seg = imread("img/segmentation.jpg")
img_gt = imread("img/ground_truth.png")

fig = make_subplots(1,2)
fig.add_trace(px.imshow(img_seg).data[0], 1, 1)
fig.add_trace(px.imshow(img_gt).data[0], 1, 2)

fig.update_layout(coloraxis_showscale=False)
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

fig.update_xaxes(title="Segmented Image", col=1)
fig.update_xaxes(title="Ground Truth", col=2)
