In [33]:
import argparse
import os
import pickle
import re

import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.utils.data
import torch.nn.init as init
from torch.autograd import Variable
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils

from utils.dataloader import *
from utils.auc_test import *
from utils import new_transforms

In [34]:
# Command-line parsing is disabled in this notebook: the tile root is hard-coded
# below instead of being assembled from --file_path / --data.
#parser = argparse.ArgumentParser()
#parser.add_argument('--data', type=str, default='Lung', help='Data to train on (Lung/Breast/Kidney)')
#parser.add_argument('--file_path', type=str, default='/beegfs/jmw784/Capstone/', help='Root path where the tiles are')
#opt = parser.parse_args()
# Directory whose immediate subdirectories are the class folders (see
# find_classes). Keep the trailing slash: the mapping loop builds paths with
# plain string concatenation (root_dir + c).
root_dir = '/beegfs/jmw784/Capstone/TestTilesSorted/'
#out_file = opt.file_path + opt.data + "_FileMappingDict.p"

In [35]:
def find_classes(dir):
    """List the class folders found directly under ``dir``.

    Every immediate subdirectory counts as one class; plain files are ignored.

    Returns:
        (classes, class_to_idx): the subdirectory names in sorted order, and a
        dict mapping each name to its position in that order.
    """
    subdirs = sorted(
        entry for entry in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, entry))
    )
    index_of = {name: position for position, name in enumerate(subdirs)}
    return subdirs, index_of

def getCoords(tile_list):

    '''
    Given a list of tile file names with the format
    [test, valid, train]_NAME_x_y.jpeg
    return two lists of the same length: the integer x coords and the integer
    y coords, in the order of tile_list.

    Raises ValueError when a name does not end in "_<int>_<int>.jpeg" and
    IndexError when a name contains no underscore.
    '''

    xcoords = []
    ycoords = []
    for tile in tile_list:
        parts = tile.split("_")
        xcoords.append(int(parts[-2]))
        # Escape the dot and anchor at the end so only a real ".jpeg"
        # extension is stripped; the old pattern ".jpeg" accepted any
        # character in front of "jpeg", anywhere in the string.
        ycoords.append(int(re.sub(r"\.jpeg$", "", parts[-1])))

    return xcoords, ycoords

def fileCleaner(tile_list):

    '''
    Given a list of tile file names, remove the trailing coords ("_X_Y") and
    the ".jpeg" extension, returning a new list in the same order.

    Names that do not end in "_<digits>_<digits>.jpeg" are returned unchanged.
    '''

    # The dot is escaped, the pattern is anchored at the end of the name and
    # at least one digit is required per coordinate. The previous pattern
    # ("_[0-9]*_[0-9]*.jpeg") could also delete text from the middle of a name.
    coord_suffix = re.compile(r"_[0-9]+_[0-9]+\.jpeg$")

    return [coord_suffix.sub("", name) for name in tile_list]

def get2Darray(xcoords, ycoords, tiles_input):

    '''
    Given a list of xcoords, ycoords and files, returns a 2D array of strings
    where tiles_output[y, x] is the file at that pair of coords.

    The array has shape (max(ycoords) + 1, max(xcoords) + 1); positions with no
    tile hold the empty string ''. The string width is 100 characters, widened
    when a longer file name is present so that no name is truncated.

    Raises ValueError when no coordinates are given.
    '''

    if len(xcoords) == 0 or len(ycoords) == 0:
        raise ValueError("get2Darray needs at least one tile coordinate")

    xmax = max(xcoords) + 1
    ymax = max(ycoords) + 1

    # Keep the historical '<U100' dtype unless a name would not fit in it.
    width = max(100, max((len(name) for name in tiles_input), default=0))

    # np.zeros (not np.empty): blank cells must really be '' because later
    # code tests `tile_path == ''`; np.empty leaves the buffer uninitialised.
    tiles_output = np.zeros((ymax, xmax), dtype=np.dtype((str, width)))
    for i in range(len(xcoords)):
        tiles_output[ycoords[i], xcoords[i]] = tiles_input[i]

    return tiles_output

def fastdump(obj, file):
    """Pickle ``obj`` into ``file`` using a Pickler with its ``fast`` flag set.

    ``file`` must be a binary file-like object opened for writing. Per the
    pickle documentation, fast mode turns off the memo, so it should not be
    used with self-referential objects.
    """
    pickler = pickle.Pickler(file)
    pickler.fast = True
    pickler.dump(obj)



In [36]:
# Discover the class folders under root_dir and show the name -> index mapping.
classes, class_to_idx = find_classes(root_dir)
print(class_to_idx)

# Containers for the per-class tile listings (tile_files, original_files) and
# for the overall mapping (main_dict).
tile_files, original_files, main_dict = {}, {}, {}

print(classes)

{'T3': 2, 'T1': 0, 'T2': 1}
['T1', 'T2', 'T3']


In [5]:
# Build, for every slide in every class folder, the 2-D grid of its tile file
# names and store {slide name: [tile grid, class index]} in main_dict.
for c in classes:
    tile_files[c] = os.listdir(root_dir + c)
    original_files[c] = fileCleaner(tile_files[c])

    # Group the tiles by slide in a single pass instead of rescanning the whole
    # listing once per slide (which was quadratic in the number of tiles).
    tiles_by_file = {}
    for tile, file in zip(tile_files[c], original_files[c]):
        tiles_by_file.setdefault(file, []).append(tile)

    for file, tiles in tiles_by_file.items():
        xs, ys = getCoords(tiles)
        tiles_array = get2Darray(xs, ys, tiles)
        loop_dict = {file: [tiles_array, class_to_idx[c]]}
        # Accumulate the result: previously loop_dict was overwritten on every
        # iteration and main_dict stayed empty.
        # NOTE(review): a slide name that occurs in more than one class folder
        # overwrites the earlier entry -- confirm names are unique across classes.
        main_dict.update(loop_dict)
   


In [31]:
# Show the single-slide mapping left behind by the last iteration of the loop above.
loop_dict 

{'test_NORMAL': [array([['', '', '', ..., '', '', ''],
         ['', '', '', ..., '', '', ''],
         ['', '', '', ..., '', '', ''],
         ..., 
         ['', '', '', ..., '', '', 'test_NORMAL_72_31.jpeg'],
         ['', '', '', ..., '', '', 'test_NORMAL_72_32.jpeg'],
         ['', '', '', ..., '', '', 'test_NORMAL_72_33.jpeg']],
        dtype='<U100'), 2]}

In [7]:
# Tile grid (2-D array of tile file names) of the last slide processed by the loop.
tiles_array

array([['', '', '', ..., '', '', ''],
       ['', '', '', ..., '', '', ''],
       ['', '', '', ..., '', '', ''],
       ..., 
       ['', '', '', ..., '', '', 'test_NORMAL_72_31.jpeg'],
       ['', '', '', ..., '', '', 'test_NORMAL_72_32.jpeg'],
       ['', '', '', ..., '', '', 'test_NORMAL_72_33.jpeg']],
      dtype='<U100')

In [16]:
# Dataset locations and model-input settings used by the cells below.
# NOTE: this rebinds root_dir, which an earlier cell pointed at another tile tree.
root_dir = "/beegfs/sb3923/DeepCancer/alldata/AlldataTilesSorted/"
tile_dict_path = '/beegfs/sb3923/DeepCancer/alldata/Alldata_FileMappingDict.p'
num_classes = 9
imgSize = 299

# Per-tile preprocessing: resize to imgSize x imgSize, convert to a tensor,
# then normalise each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose([
    new_transforms.Resize((imgSize, imgSize)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])


NameError: name 'transforms' is not defined

In [176]:
# Rebuild the test dataset from the tile folders and cache it on disk.
# Only needed when things have to be rerun; otherwise load the cached pickles.
test_data = TissueData2(root_dir, 'test', transform = transform, metadata=False)
classes = test_data.classes
print(classes)

# Later cells open "test_data.p" / "classes.p" relative to this directory.
os.chdir('/scratch/jmw784/capstone/deep-cancer')
import pickle
# Context managers make sure each file is flushed and closed; the handles
# passed straight to pickle.dump before were never closed.
with open("test_data.p", "wb") as cache_file:
    pickle.dump(test_data, cache_file)
with open("classes.p", "wb") as cache_file:
    pickle.dump(classes, cache_file)

Loading from: ST_Normal_Kidney


KeyboardInterrupt: 

In [15]:
# Reload the cached dataset and class list instead of rebuilding them.
os.chdir('/scratch/jmw784/capstone/deep-cancer')
import pickle
# NOTE: pickle.load executes code embedded in the file; only load caches that
# this notebook wrote itself.
with open("test_data.p", "rb") as cache_file:
    test_data = pickle.load(cache_file)
with open("classes.p", "rb") as cache_file:
    classes = pickle.load(cache_file)


In [17]:
# Second copy of the cached class list, read with the file closed afterwards
# (the handle passed straight to pickle.load before was never closed).
with open("classes.p", "rb") as classes_file:
    c = pickle.load(classes_file)

In [24]:
#test_loader = torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=False)

In [26]:
# Load the slide -> [tile grid, class index] mapping read by aggregate() and
# by the inspection cell below. With this load commented out, tile_dict was
# defined nowhere in the notebook and a fresh kernel could not run those cells.
# NOTE: pickle.load executes code embedded in the file; only load trusted files.
with open(tile_dict_path, 'rb') as f:
    tile_dict = pickle.load(f)

In [27]:
def get_tile_probability(tile_path):
    """Return the class probabilities the model assigns to one tile.

    Relies on the notebook-level ``root_dir``, ``transform``, ``model`` and
    ``num_classes``.

    @param tile_path: tile path relative to ``root_dir`` ("<class>/<file>"),
                      or '' for a grid position that has no tile
    @return: 1-D ndarray of length ``num_classes``; all NaN for an empty path
    """
    # Some tiles are empty with no path, return nan
    if tile_path == '':
        return np.full(num_classes, np.nan)

    tile_path = root_dir + tile_path

    # convert() returns a new image, so it stays usable after the file closes
    with open(tile_path, 'rb') as f:
        with Image.open(f) as img:
            img = img.convert('RGB')

    # Add the batch dimension the model expects
    img = transform(img).unsqueeze(0)

    var_img = Variable(img, volatile=True).cuda()

    # The first model output holds the class scores, one row per image. Name
    # the class axis explicitly: softmax without `dim` is deprecated and
    # relies on an implicit choice of axis.
    logits = model(var_img)[0]
    output = F.softmax(logits, dim=1).data.squeeze(0)
    output = output.cpu()

    return output.numpy()
    

In [28]:
def get_tile_probability2(tile_path):
    """Return the model's second output (flattened features) for one tile.

    Relies on the notebook-level ``root_dir``, ``transform`` and ``model``.

    @param tile_path: tile path relative to ``root_dir``, or '' for a grid
                      position that has no tile
    @return: numpy.ndarray; for an empty path, 5184 NaN values
    """
    if tile_path == '':
        # Blank grid position: NaN placeholder of the feature length
        return np.full(5184, np.nan)

    full_path = root_dir + tile_path

    with open(full_path, 'rb') as handle:
        with Image.open(handle) as tile:
            rgb_tile = tile.convert('RGB')

    batch = transform(rgb_tile).unsqueeze(0)
    model_input = Variable(batch, volatile=True).cuda()

    # model(...)[1] is the second value returned by the model's forward pass
    features = model(model_input)[1].squeeze(0).cpu()
    return features.data.numpy()

In [29]:
# Scratch cell: run get_tile_probability2 on one row of one slide's tile grid.
# NOTE(review): the stored output of this cell is a KeyError for a slide name,
# presumably raised by the tile_dict lookup below -- confirm that tile_dict was
# loaded for the same dataset as test_data.
file_list = test_data.filenames

# After this loop tile_paths/label refer to the LAST file in file_list only.
for file in file_list:
        tile_paths, label = tile_dict[file]
        #print(tile_paths)
        
        
# Class folder name of that last file
folder = classes[label]

def add_folder(tile_path):
    """Prefix a tile file name with the module-level ``folder``; '' stays ''."""
    if tile_path == '':
        return ''
    else:
        return folder + '/' + tile_path
        
    
    
# Apply add_folder to every cell of the tile grid
add_folder_v = np.vectorize(add_folder)
tile_paths = add_folder_v(tile_paths)


# Index 6 along the first axis of the grid
m=(tile_paths[6])
print(m)


# otypes=[np.ndarray] makes the vectorized call collect one array per tile
prob_v2 = np.vectorize(get_tile_probability2, otypes=[np.ndarray])
lastlayer = prob_v2(m)
       
print(lastlayer)

KeyError: 'test_TCGA-KL-8329-11A-01-TS1.55cef38f-9e3a-47aa-af40-e2b889a8e4a1'

In [32]:
# Slide names taken from test_data.filenames.
file_list

['test_TCGA-KL-8329-11A-01-TS1.55cef38f-9e3a-47aa-af40-e2b889a8e4a1',
 'test_TCGA-BP-4162-11A-01-TS1.a625779e-8a95-436f-a9a5-bd95e73211d9',
 'test_TCGA-UW-A7GN-11Z-00-DX1.ABAA5DC6-D63C-4BBF-AB0C-213B95C79720',
 'test_TCGA-CZ-5460-11A-01-TS1.baccf72f-058d-41bb-8517-5a10483fd4da',
 'test_TCGA-KN-8426-11A-01-TS1.15bd5f79-a766-4bbd-b1c9-7fc9bcae121b',
 'test_TCGA-KO-8410-11A-01-TS1.37cb6597-c082-4174-8d77-97afb473d345',
 'test_TCGA-CJ-5682-11A-01-TS1.a1f1d495-68b5-477e-aee2-56f77b891e60',
 'test_TCGA-BP-5169-11A-01-TS1.a81527fa-b410-47fc-9086-11916d74dfac',
 'test_TCGA-KL-8334-11A-01-TS1.61a3e199-8e61-4941-b908-8ccd0ecfba0c',
 'test_TCGA-BQ-7044-11A-01-TS1.88058788-1360-478c-8039-99c733d08345',
 'test_TCGA-BP-4346-11A-01-TS1.f09ecda2-cab1-4f2b-a616-26f9ce4fd479',
 'test_TCGA-BP-4337-11A-01-TS1.1e7ce1da-d1f3-4df1-ad44-a84354fcd5e6',
 'test_TCGA-KL-8330-11A-01-TS1.c98209ca-656f-4873-ad6e-887374fa0034',
 'test_TCGA-BQ-7053-11A-01-TS1.dbd6f544-85aa-4da3-b9bb-519843ba42d2',
 'test_TCGA-CW-6097-

In [21]:
def _mean_correct_tile_features(probabilities, features, label):
    """Average the feature rows of the tiles whose predicted class is ``label``.

    @param probabilities: (n_tiles, n_classes) array; rows that are entirely
                          NaN mark grid positions without a tile
    @param features: (n_tiles, n_features) array, row-aligned with probabilities
    @param label: the true class index of the file
    @return: 1-D array of length n_features, or None when no real tile was
             predicted as ``label``
    """
    has_tile = ~np.isnan(probabilities).all(axis=1)

    # argmax of an all-NaN row is 0, which would count blank positions as
    # "class 0" tiles; give them the impossible label -1 instead.
    tile_label = np.full(probabilities.shape[0], -1, dtype=int)
    if has_tile.any():
        tile_label[has_tile] = np.argmax(probabilities[has_tile], axis=1)

    correct = has_tile & (tile_label == label)
    if not correct.any():
        return None

    # A boolean row mask keeps the result 2-D even for a single matching tile.
    return np.nanmean(features[correct], axis=0)


def aggregate(file_list, method):

    """
    Given a list of files, return scores for each class according to the
    method, the labels for those files, and the mean last-layer features of
    the correctly classified tiles of each file.

    Files for which no tile is classified as the true label are reported on
    stdout and left out of all three returned arrays.

    Relies on the notebook-level model, tile_dict, classes,
    get_tile_probability and get_tile_probability2.

    @param file_list: A list of file paths to do predictions on
    @param method: 'average' - returns the average probability score across
                               all tiles for that file
                   'max' - predicts each tile to be the class of the maximum
                           score, and returns the proportion of tiles for
                           each class
    @return: a ndarray of class probabilities for all kept files
             a ndarray of the labels
             a ndarray with one mean feature vector per kept file
    @raise ValueError: if method is neither 'average' nor 'max'
    """

    # Fail before any model work; previously `method` was silently ignored.
    if method not in ('average', 'max'):
        raise ValueError("Method not valid: expected 'average' or 'max', got %r" % (method,))

    model.eval()
    predictions = []
    true_labels = []
    last_layer = []

    # The vectorized wrappers do not depend on the file, so build them once.
    prob_v = np.vectorize(get_tile_probability, otypes=[np.ndarray])
    prob_v2 = np.vectorize(get_tile_probability2, otypes=[np.ndarray])

    for file in file_list:
        tile_paths, label = tile_dict[file]

        folder = classes[label]

        def add_folder(tile_path):
            if tile_path == '':
                return ''
            else:
                return folder + '/' + tile_path

        # Add the folder for the class name in front (done once, reused for
        # both model outputs)
        tile_paths = np.vectorize(add_folder)(tile_paths)

        # One row per grid position: class probabilities and feature vectors
        probabilities = np.stack(list(prob_v(tile_paths).flat))
        features = np.stack(list(prob_v2(tile_paths).flat))

        if method == 'average':
            prediction = np.nanmean(probabilities, axis=0)
        else:
            # 'max': every real tile votes for its highest-scoring class
            real_tiles = probabilities[~np.isnan(probabilities).all(axis=1)]
            n_classes = probabilities.shape[1]
            if real_tiles.shape[0] == 0:
                prediction = np.full(n_classes, np.nan)
            else:
                votes = np.bincount(np.argmax(real_tiles, axis=1), minlength=n_classes)
                prediction = votes / votes.sum()

        lastlayerweights = _mean_correct_tile_features(probabilities, features, label)

        if lastlayerweights is None:
            print('no tile is correctly classified')
            continue

        print(lastlayerweights.shape)

        predictions.append(prediction)
        true_labels.append(label)
        last_layer.append(lastlayerweights)

    return np.array(predictions), np.array(true_labels), np.array(last_layer)

In [22]:
class BasicConv2d(nn.Module):
    """Convolution block: Conv2d -> optional 2x2 max-pool -> LeakyReLU ->
    BatchNorm2d -> Dropout(p=0.1).

    ``pool`` switches the max-pooling step on or off; any extra keyword
    arguments are passed straight to ``nn.Conv2d``.
    """

    def __init__(self, in_channels, out_channels, pool, **kwargs):
        super().__init__()

        # Whether forward() halves the spatial size with max-pooling
        self.pool = pool

        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        features = self.conv(x)
        if self.pool:
            features = F.max_pool2d(features, 2)

        # Activation is applied before batch norm in this block
        return self.dropout(self.bn(self.relu(features)))

# Define model
class cancer_CNN(nn.Module):
    """Six BasicConv2d blocks followed by one linear classification layer.

    @param nc: number of input channels
    @param imgSize: side length of the square input images; sets the input
                    width of the linear layer (5184 for imgSize=299)
    @param ngpu: stored on the instance, not otherwise used by this class
    @param n_classes: number of output classes; when omitted, the
                      notebook-level ``num_classes`` is used
    """

    def __init__(self, nc, imgSize, ngpu, n_classes=None):
        super(cancer_CNN, self).__init__()

        # Fall back to the notebook-level setting so existing three-argument
        # calls keep working unchanged.
        if n_classes is None:
            n_classes = num_classes

        self.nc = nc
        self.imgSize = imgSize
        self.ngpu = ngpu
        self.data = 'all'
        self.conv1 = BasicConv2d(nc, 16, False, kernel_size=5, padding=1, stride=2, bias=True)
        self.conv2 = BasicConv2d(16, 32, False, kernel_size=3, bias=True)
        self.conv3 = BasicConv2d(32, 64, True, kernel_size=3, padding=1, bias=True)
        self.conv4 = BasicConv2d(64, 64, True, kernel_size=3, padding=1, bias=True)
        self.conv5 = BasicConv2d(64, 128, True, kernel_size=3, padding=1, bias=True)
        self.conv6 = BasicConv2d(128, 64, True, kernel_size=3, padding=1, bias=True)
        # Derived from imgSize instead of the hard-coded 5184
        self.linear = nn.Linear(self._flat_features(imgSize), n_classes)

    @staticmethod
    def _flat_features(imgSize):
        """Length of the flattened conv6 output for a square input of side imgSize."""
        side = (int(imgSize) + 2 * 1 - 5) // 2 + 1  # conv1: kernel 5, padding 1, stride 2
        side = side - 2                             # conv2: kernel 3, no padding
        for _ in range(4):                          # conv3..conv6 keep the size, then pool by 2
            side //= 2
        if side < 1:
            raise ValueError("imgSize=%r is too small for this network" % (imgSize,))
        return 64 * side * side                     # conv6 has 64 output channels

    def forward(self, x):
        """Return (class scores, flattened conv6 features) for a batch of images."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = x.view(x.size(0), -1)
        # Keep the flattened features: they are returned next to the scores
        llw = x
        x = self.linear(x)
        return x, llw

In [None]:
# Checkpoint to evaluate
model_path = "/scratch/jmw784/capstone/deep-cancer/experiments/train_all_downsample_2/epoch_17.pth"

# Three input channels, notebook image size, one GPU
model = cancer_CNN(3, imgSize, 1)
model.cuda()

# map_location returns each storage unchanged while the checkpoint is loaded;
# load_state_dict then copies the values into the model's parameters.
# NOTE: torch.load unpickles the file; only load trusted checkpoints.
state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
model.load_state_dict(state_dict)

# Per-file class scores, true labels and mean last-layer features
predictions, labels, lastlayer = aggregate(test_data.filenames, method='average')


(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
(5184,)
