# Data Preparation

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import pickle

import numpy as np

import os

In [19]:
# Slide-level metadata table; the displayed columns include `patient_id`,
# `tissue`, `cat`, `notes`, `maj_tissue` and `size`.
STARTING_CSV = '../csv/starting.csv'
df = pd.read_csv(STARTING_CSV)
df

Unnamed: 0,paths,patient_id,tissue,cat,notes,maj_tissue,size
0,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-14DAR-0426,Adipose - Visceral (Omentum),hyperplasia,"2 pieces, ~5% fasca, delineated. Minute focus ...",Adipose,"(55775, 37700)"
1,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-13QBU-0926,Esophagus - Muscularis,clean_specimens,"6 pieces, all muscularis, good specimens",Esophagus,"(71711, 40331)"
2,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-18QFQ-0326,Muscle - Skeletal,atrophy,2 pieces; skeletal muscle with scant internal ...,Muscle,"(49799, 24496)"
3,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-139YR-1526,Pancreas,fibrosis,"2 pieces; islets well visualized, focal PanIN-...",Pancreas,"(63743, 37311)"
4,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-R55C-0526,Lung,emphysema,"2 pieces, 10x7 & 8x5 mm; patchy emphysema",Lung,"(39839, 32246)"
...,...,...,...,...,...,...,...
7723,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-1GTWX-0626,Lung,congestion,"2 pieces, moderate congestion",Lung,"(41831, 26034)"
7724,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-1AX8Z-2226,Stomach,gastritis,6 pieces: 2 without muscularis; chronic gastritis,Stomach,"(51791, 35651)"
7725,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-1J8Q3-1526,Adipose - Subcutaneous,fibrosis,2 pieces; <10% fibrous content,Adipose,"(53783, 38942)"
7726,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-UJHI-0126,Spleen,congestion,"2 pieces, 8x7 & 8x7mm; moderate congestion; di...",Spleen,"(45815, 32790)"


In [4]:
import string

# Fit a bag-of-words vocabulary on the free-text notes; `vocab` maps token -> index.
vectorizer = CountVectorizer()
vectorizer.fit(df.notes)
vocab = vectorizer.vocabulary_

# Re-use the vectorizer's own preprocessor and tokenizer so that `tokens`
# is consistent with the fitted vocabulary.
processor = vectorizer.build_preprocessor()
tokenizer = vectorizer.build_tokenizer()
df['tokens'] = df.notes.apply(lambda note: tokenizer(processor(note)))

# https://datagy.io/python-remove-punctuation-from-string/#:~:text=One%20of%20the%20easiest%20ways,maketrans()%20method.
# The translation table is built once instead of once per row.
punct_table = str.maketrans('', '', string.punctuation)
df['strp_punc'] = df['notes'].apply(lambda note: note.translate(punct_table))


def _leading_word(text):
    """Return the lower-cased first whitespace-delimited word of `text`, or None if there is none."""
    # split() without an argument skips leading whitespace, so a note that starts
    # with punctuation (stripped above) no longer yields an empty-string "word".
    words = text.split()
    return words[0].lower() if words else None


# The default tokenizer drops single-character tokens (e.g. the leading piece
# count "2"), so the first word is recovered separately.
# `1stword` is a one-element list: the word itself when the vocabulary does not
# already contain it, otherwise None (the word is then already part of `tokens`).
df['1stword'] = df['strp_punc'].apply(_leading_word)
df['1stword'] = df['1stword'].apply(lambda word: [word if word not in vocab else None])

# Prepend the recovered first word, skipping the None placeholder so that
# `complete_tokens` only ever contains real string tokens.
df['complete_tokens'] = (
    df['1stword'].apply(lambda first: [w for w in first if w is not None])
    + df['tokens']
)

df.to_pickle('../csv/tokenized_data.pkl')
df

Unnamed: 0,paths,patient_id,tissue,cat,notes,maj_tissue,size,tokens,strp_punc,1stword,complete_tokens
0,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-14DAR-0426,Adipose - Visceral (Omentum),hyperplasia,"2 pieces, ~5% fasca, delineated. Minute focus ...",Adipose,"(55775, 37700)","[pieces, fasca, delineated, minute, focus, of,...",2 pieces 5 fasca delineated Minute focus of me...,[2],"[2, pieces, fasca, delineated, minute, focus, ..."
1,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-13QBU-0926,Esophagus - Muscularis,clean_specimens,"6 pieces, all muscularis, good specimens",Esophagus,"(71711, 40331)","[pieces, all, muscularis, good, specimens]",6 pieces all muscularis good specimens,[6],"[6, pieces, all, muscularis, good, specimens]"
2,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-18QFQ-0326,Muscle - Skeletal,atrophy,2 pieces; skeletal muscle with scant internal ...,Muscle,"(49799, 24496)","[pieces, skeletal, muscle, with, scant, intern...",2 pieces skeletal muscle with scant internal f...,[2],"[2, pieces, skeletal, muscle, with, scant, int..."
3,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-139YR-1526,Pancreas,fibrosis,"2 pieces; islets well visualized, focal PanIN-...",Pancreas,"(63743, 37311)","[pieces, islets, well, visualized, focal, pani...",2 pieces islets well visualized focal PanIN1 s...,[2],"[2, pieces, islets, well, visualized, focal, p..."
4,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-R55C-0526,Lung,emphysema,"2 pieces, 10x7 & 8x5 mm; patchy emphysema",Lung,"(39839, 32246)","[pieces, 10x7, 8x5, mm, patchy, emphysema]",2 pieces 10x7 8x5 mm patchy emphysema,[2],"[2, pieces, 10x7, 8x5, mm, patchy, emphysema]"
...,...,...,...,...,...,...,...,...,...,...,...
7723,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-1GTWX-0626,Lung,congestion,"2 pieces, moderate congestion",Lung,"(41831, 26034)","[pieces, moderate, congestion]",2 pieces moderate congestion,[2],"[2, pieces, moderate, congestion]"
7724,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-1AX8Z-2226,Stomach,gastritis,6 pieces: 2 without muscularis; chronic gastritis,Stomach,"(51791, 35651)","[pieces, without, muscularis, chronic, gastritis]",6 pieces 2 without muscularis chronic gastritis,[6],"[6, pieces, without, muscularis, chronic, gast..."
7725,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-1J8Q3-1526,Adipose - Subcutaneous,fibrosis,2 pieces; <10% fibrous content,Adipose,"(53783, 38942)","[pieces, 10, fibrous, content]",2 pieces 10 fibrous content,[2],"[2, pieces, 10, fibrous, content]"
7726,/project/GutIntelligenceLab/ss4yd/gtex_data/ac...,GTEX-UJHI-0126,Spleen,congestion,"2 pieces, 8x7 & 8x7mm; moderate congestion; di...",Spleen,"(45815, 32790)","[pieces, 8x7, 8x7mm, moderate, congestion, dis...",2 pieces 8x7 8x7mm moderate congestion discre...,[2],"[2, pieces, 8x7, 8x7mm, moderate, congestion, ..."


In [5]:
# Extend the fitted vocabulary so that every entry of `complete_tokens` has an index.
# Only tokens that are not indexed yet receive a new (next free) index, so
# re-running this cell leaves existing indices untouched instead of overwriting
# them all with the same value.
for tokens in df['complete_tokens']:
    for word in tokens:
        if word not in vocab:
            vocab[word] = len(vocab)

# Persist the token -> index mapping.
with open('../csv/word2idx.pickle', 'wb') as file:
    pickle.dump(vocab, file)

In [6]:
# Root directory whose sub-directories (named by `pid`) hold the patch files.
patch_path = '/project/GutIntelligenceLab/ss4yd/gtex_data/process_path_level1/'
dirs = os.listdir(patch_path)

# Map every sub-directory name to the full paths of the files inside it.
dir_patch_dict = {
    pid: [
        os.path.join(patch_path, pid, fname)
        for fname in os.listdir(os.path.join(patch_path, pid))
    ]
    for pid in dirs
}

# Flatten the per-directory lists into one list, keeping directory order.
patch_paths = []
for pid_patches in dir_patch_dict.values():
    patch_paths.extend(pid_patches)

In [7]:
# One row per patch file; `pid` is the name of the file's parent directory.
pdf = pd.DataFrame(patch_paths, columns=['patch_paths'])
pdf['pid'] = pdf['patch_paths'].map(lambda p: p.split('/')[-2])

split = pd.read_csv('../csv/train_test_val_split.csv')

# Take a reproducible 10% sample of each partition of the split table.
train, val, test = (
    split[split['dtype'] == part].sample(frac=0.1, random_state=1)
    for part in ('train', 'val', 'test')
)

split_small = pd.concat([train, val, test])
print("length smaller split:{}".format(len(split_small)))

# Inner join: only patches whose `pid` was sampled are kept.
pdf = pd.merge(pdf, split_small, on='pid')
print("Number of patches: {}".format(len(pdf)))

length smaller split:773
Number of patches: 69757


In [8]:
# Persist the sampled patch table (without the index) for the later cells that reload it.
pdf.to_csv(path_or_buf='../csv/working_df.csv', index=False)

In [9]:
# Attach the slide-level columns of `df` to every patch row (`pid` == `patient_id`).
final_df = pd.merge(pdf, df, left_on='pid', right_on='patient_id')

In [10]:
# `patient_id` duplicates `pid` after the merge and `paths` is not needed downstream.
final_df = final_df.drop(['patient_id', 'paths'], axis=1)
# NOTE: the file is a pickle despite its `.csv` suffix; it is read back with pd.read_pickle.
final_df.to_pickle(path='../csv/final_starting_df.csv')

# Get representations

In [11]:
import pandas as pd
import numpy as np
import os
from PIL import Image

import torch
import torch.nn as nn
from torchvision.io import read_image, ImageReadMode
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms
import torchvision.models as models
import torchvision

from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import trange, tqdm

In [12]:
class GetRepsDataset(Dataset):
    """Dataset yielding ``(image, path)`` pairs for one partition of a patch table.

    Args:
        df: DataFrame with a ``dtype`` column and a ``patch_paths`` column.
        dtype: partition label; only rows whose ``dtype`` equals it are served.
        transform: optional callable applied to the decoded image tensor.
    """

    def __init__(self, df, dtype, transform=None):
        self.df, self.dtype, self.transform = df, dtype, transform
        # Rows that belong to the requested partition.
        in_partition = df['dtype'] == dtype
        self.typ_df = df[in_partition]

    def __len__(self):
        """Number of rows in the selected partition."""
        return self.typ_df.shape[0]

    def __getitem__(self, idx):
        """Return the RGB image at position ``idx`` (transformed if a transform is set) and its path."""
        img_path = self.typ_df['patch_paths'].iloc[idx]
        image = read_image(img_path, mode=ImageReadMode.RGB)
        return (self.transform(image) if self.transform else image), img_path

In [13]:
# Per-channel mean / std (the values commonly used with torchvision's pretrained models).
normalize = transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))

# Resize to 224x224, convert to float, then normalise.
transform = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ConvertImageDtype(torch.float),
    normalize,
])

train_dataset = GetRepsDataset(df=final_df, dtype='train', transform=transform)
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)

# Peek at one (image, path) sample to check the pipeline end to end.
train_loader.dataset[1]

(tensor([[[ 1.9235,  1.4098,  0.9646,  ..., -0.1143,  0.3481,  0.8447],
          [ 1.1700,  1.0673,  0.2111,  ..., -0.6452,  0.2111,  1.2728],
          [ 0.7762,  0.7591,  0.1083,  ..., -0.4739, -0.2342,  0.6734],
          ...,
          [ 1.2214,  1.3413,  0.6049,  ...,  0.5193,  0.5364,  0.6392],
          [ 1.5639,  1.6153,  0.5022,  ...,  0.1254,  0.3309,  0.1426],
          [ 1.6838,  1.5125,  0.8104,  ...,  0.2624,  0.6049, -0.0801]],
 
         [[ 1.3957,  0.1527, -0.6702,  ..., -1.4755, -1.3529, -0.8978],
          [-0.5126, -0.9503, -1.2129,  ..., -1.5105, -1.3004, -0.7052],
          [-1.1429, -1.1604, -1.4055,  ..., -1.5280, -1.3880, -1.0028],
          ...,
          [-0.2675, -0.3901, -0.8102,  ..., -1.1954, -1.2654, -1.1078],
          [-0.3025,  0.0301, -0.8978,  ..., -1.3354, -1.2654, -1.4580],
          [-0.2325, -0.0574, -0.8627,  ..., -1.2479, -1.1429, -1.4230]],
 
         [[ 1.5768,  0.7402,  0.1128,  ..., -0.6541, -0.5495,  0.0256],
          [ 0.3219, -0.0267,

In [14]:
class Resnet18backbone(nn.Module):
    """Pretrained ResNet-18 with its last child module removed, used as a feature extractor."""

    def __init__(self):
        super().__init__()
        # Keep every child of the pretrained network except the last one.
        pretrained = models.resnet18(pretrained=True)
        kept_layers = list(pretrained.children())[:-1]
        self.resnet_head = nn.Sequential(*kept_layers)

    def forward(self, x):
        """Run ``x`` through the truncated network and return its output."""
        return self.resnet_head(x)

In [15]:
# Smoke test: push one patch through the backbone and check the output shape.
model = Resnet18backbone()
# Switch to inference mode before any forward pass: in training mode the
# BatchNorm layers would update their running statistics from this single
# image and alter the pretrained weights used for feature extraction below.
model.eval()

normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
batch_size = 64

transform=transforms.Compose([
    transforms.Resize([224,224]),
    transforms.ConvertImageDtype(torch.float),
    normalize,
        ])

# NOTE: this rebinds `df` from the notes table to the sampled patch table;
# the later dataset cells rely on this binding.
df = pd.read_csv('../csv/working_df.csv')

train_dataset = GetRepsDataset(df, 'train', transform)
# shuffle=False keeps the extracted representations in the same order as the paths.
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True)

# No gradients are needed for a shape check.
with torch.no_grad():
    sample_shape = model(train_dataset[1][0].unsqueeze(0)).shape
sample_shape

torch.Size([1, 512, 1, 1])

In [16]:
def check_cuda():
    """Report whether a GPU is available and return the matching ``torch.device``."""
    if not torch.cuda.is_available():
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
    return torch.device("cuda")


# Device used by the representation-extraction cells.
device = check_cuda()

No GPU available, using the CPU instead.


In [17]:
# Extract one backbone representation per training patch and cache them as CSV.
model = model.to(device)
model.eval()

rep_list = []
path_list = []
# Inference only: disabling autograd avoids building a graph for every batch,
# which saves memory and makes the explicit detach() calls unnecessary.
with torch.no_grad():
    for img, path in tqdm(train_loader):
        img = img.to(device)
        reps = model(img)
        # Flatten each sample's output to a single feature vector.
        rep_list.append(reps.cpu().numpy().reshape(img.shape[0], -1))
        path_list.extend(path)

# One row per patch: feature columns 0..N-1 plus the patch path.
rep_array = np.concatenate(rep_list, axis=0)
repsdf = pd.DataFrame(rep_array)
repsdf['patch_paths'] = path_list

repsdf.to_csv('../csv/working_train_reps.csv', index=False)

  0%|          | 0/802 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Extract one backbone representation per validation patch and cache them as CSV.
valid_dataset = GetRepsDataset(df, 'val', transform)
valid_loader = torch.utils.data.DataLoader(valid_dataset,batch_size=64, shuffle=False, num_workers=1, pin_memory=True)

model = model.to(device)
model.eval()

rep_list = []
path_list = []
# Inference only: disabling autograd avoids building a graph for every batch,
# which saves memory and makes the explicit detach() calls unnecessary.
with torch.no_grad():
    for img, path in tqdm(valid_loader):
        img = img.to(device)
        reps = model(img)
        # Flatten each sample's output to a single feature vector.
        rep_list.append(reps.cpu().numpy().reshape(img.shape[0], -1))
        path_list.extend(path)

# One row per patch: feature columns 0..N-1 plus the patch path.
rep_array = np.concatenate(rep_list, axis=0)
valrepsdf = pd.DataFrame(rep_array)
valrepsdf['patch_paths'] = path_list

valrepsdf.to_csv('../csv/working_valid_reps.csv', index=False)

# Cluster using faiss

In [2]:
import faiss

In [18]:
# Reload the cached artefacts so the clustering section can run without the
# extraction cells. The second file is a pickle despite its `.csv` suffix.
FINAL_DF_PICKLE = '../csv/final_starting_df.csv'
TRAIN_REPS_CSV = '../csv/working_train_reps.csv'

final_df = pd.read_pickle(FINAL_DF_PICKLE)
repsdf = pd.read_csv(TRAIN_REPS_CSV)

In [None]:
# Display the reloaded training-representation table for inspection.
repsdf

In [19]:
# The 512 representation columns are named "0".."511" after the CSV round trip.
feature_cols = [str(i) for i in range(512)]
# C-contiguous float32 matrix for the faiss k-means below.
X = np.ascontiguousarray(repsdf[feature_cols]).astype('float32')

In [20]:
# K-means settings.
ncentroids, niter, verbose = 8, 300, False
# Feature dimensionality is the number of columns of X.
_, d = X.shape

kmeans = faiss.Kmeans(d, ncentroids, niter=niter, nredo=20, verbose=verbose)
# The value returned by train() is shown as the cell output.
kmeans.train(X)

255477.296875

In [None]:
# Look up the single nearest centroid for every training representation.
D, I = kmeans.index.search(X, 1)
repsdf['cluster_assignment'] = I

# Pack the 512 feature columns into one list-valued `reps` column.
# NOTE: the feature columns are dropped here, so this cell cannot be re-run as is.
rep_cols = [str(x) for x in range(512)]
repsdf['reps'] = repsdf[rep_cols].values.tolist()
repsdf = repsdf.drop(columns=rep_cols)

# `pid` is the name of the patch file's parent directory.
repsdf['pid'] = repsdf['patch_paths'].map(lambda p: p.split('/')[-2])
repsdf

In [None]:
# Assign validation patches to the centroids learned on the training set.
valreps = pd.read_csv('../csv/working_valid_reps.csv')
val_cols = [str(x) for x in range(512)]
X_val = np.ascontiguousarray(valreps[val_cols]).astype('float32')

# Look up the single nearest centroid for every validation representation.
D, I = kmeans.index.search(X_val, 1)
valreps['cluster_assignment'] = I

# Pack the 512 feature columns into one list-valued `reps` column.
valreps['reps'] = valreps[val_cols].values.tolist()
valreps = valreps.drop(columns=val_cols)

# `pid` is the name of the patch file's parent directory.
valreps['pid'] = valreps['patch_paths'].map(lambda p: p.split('/')[-2])
valreps

In [None]:
# Stack train and validation rows into one table.
# NOTE: `repsdf` is rebound, so re-running this cell appends `valreps` again.
repsdf = pd.concat((repsdf, valreps), axis=0)

In [None]:
# Inspect the `pid` column of the merged patch/notes table.
final_df['pid']

In [None]:
# Keep one row per `pid` from `final_df` so the merge attaches the slide-level
# columns without multiplying the patch rows of `repsdf`.
per_pid = final_df.drop_duplicates('pid')

generatingdf = (
    repsdf
    .drop(columns=['reps'])
    .merge(per_pid, on='pid')
    # `patch_paths` exists on both sides; the left-hand copy (`_x`) is kept.
    .drop(columns=['patch_paths_y', 'tokens', '1stword'])
)

generatingdf

In [None]:
# Columns kept in the generation table, in output order.
keep_cols = [
    'patch_paths_x',
    'pid',
    'cluster_assignment',
    'complete_tokens',
    'dtype',
    'notes',
]
generatingdf = generatingdf.loc[:, keep_cols]

generatingdf

In [None]:
# Pickle keeps the list-valued `complete_tokens` column intact.
generatingdf.to_pickle(path='../csv/generating_training_df.pickle')