In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import os
import glob
from tqdm import tqdm, tqdm_notebook
import pandas as pd
import fitsne
from sklearn.model_selection import cross_val_score

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from skimage.color import rgb2gray

from pywsi.io import WSIReader
from pywsi.io.operations import read_as_rgb

from pywsi.segmentation import poisson_deconvolve, perform_binary_cut, max_clustering
from pywsi.segmentation import collapse_labels, collapse_small_area, laplace_of_gaussian
from pywsi.segmentation import gmm_thresholding, label_nuclei, extract_features, summarize_region_properties

from pywsi.normalization import MacenkoNormalization
from pywsi.normalization import ReinhardNormalization
from pywsi.normalization import VahadaneNormalization
from pywsi.normalization import XuNormalization

from sklearn.decomposition import PCA, FastICA
from skimage.color import rgb2gray
from skimage.io import imread
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
import umap
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('paper', font_scale=2)

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import ElasticNet
from tpot import TPOTClassifier
import pickle
from multiprocessing import Pool

scaler = StandardScaler()

Populating the interactive namespace from numpy and matplotlib


In [2]:
normal_patches_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/normal_patches_test/level_0/'
tumor_patches_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/tumor_patches_test/level_0/'
segmented_tsv_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/normal_patches_test_segmented/level_0/'

In [2]:
np.random.seed(42)
list_of_tumor_files = list(glob.glob('{}*.png'.format(tumor_patches_dir)))
list_of_normal_files = list(glob.glob('{}*.png'.format(normal_patches_dir)))

#list_of_tumor_files = list(np.random.choice(list_of_tumor_files, 20000))
#list_of_normal_files = list(np.random.choice(list_of_normal_files, 20000))

In [None]:
def draw_nuclei(patch, local_max_search_radius=3, min_radius=5, max_radius=15, min_nucleus_area=100):
    patch = read_as_rgb(patch)
    label_nuclei(patch,
                 local_max_search_radius=local_max_search_radius, 
                 min_radius=min_radius,
                 max_radius=max_radius, 
                 min_nucleus_area=min_nucleus_area)
    

In [None]:
interact(draw_nuclei, patch=list_of_tumor_files+list_of_normal_files)


In [None]:
patch = read_as_rgb(list_of_tumor_files[0])

In [None]:
region_properties, fg_mask = label_nuclei(patch)

In [None]:
features_df = []
labels = []
def process_sample(sample):
    patch = read_as_rgb(sample)
    region_properties, _ = label_nuclei(patch, draw=False) 
    summary = summarize_region_properties(region_properties, 
                                          patch)
    return summary
#for sample in tqdm_notebook(list_of_normal_files):

with tqdm_notebook(total=len(list_of_tumor_files)) as pbar:
    with Pool(processes=32) as p:
        for i, summary in enumerate(p.imap_unordered(process_sample, list_of_tumor_files)):
            pbar.update()
            if summary is None:
                print('Nothing found for {}'.format(sample))
                continue
            else:
                labels.append('tumor')
    features_df.append(summary)
    
pickle.dump(features_df, open('normal.pickle', 'wb'))

In [None]:
with tqdm_notebook(total=len(list_of_normal_files)) as pbar:
    with Pool(processes=32) as p:
        for i, summary in enumerate(p.imap_unordered(process_sample, list_of_tumor_files)):
            pbar.update()
            if summary is None:
                print('Nothing found for {}'.format(sample))
                continue
            else:
                labels.append('normal')
    features_df.append(summary)
pickle.dump(features_df, open('tumor.pickle', 'wb'))

In [None]:
#tfile = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/normal_patches_test/level_0/tumor_048_33856_186816_256.png'
y = np.array([1  if label=='normal' else 0 for label in labels])

In [None]:
f = pd.DataFrame(features_df)
X = f.values

In [None]:
X_scaled = scaler.fit(X).transform(X)

In [None]:
X_scaled.shape

# PCA

We start of with doing PCA/tSNE on the features.


# UMAP

In [None]:

embedding = umap.UMAP(n_neighbors=20,
                      min_dist=0.3,
                      metric='correlation').fit_transform(X_scaled)
fig = plt.figure(figsize=(10, 10))

for color, i, target_name in zip(colors, [0, 1], label_matrix):
    plt.scatter(embedding[y == i, 0], embedding[y == i, 1], color=color, alpha=.8, lw=lw,
                label=target_name)
fig.tight_layout()
plt.title('UMAP')

In [None]:
Y = fitsne.FItSNE(X_scaled.copy(order='C'))# max_iter=500)

In [None]:
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2))
std_clf.fit(X)



In [None]:
colors = ['navy', 'darkorange']
lw = 0.2
label_matrix = ['normal', 'tumor']

fig = plt.figure(figsize=(10, 10))
for color, i, target_name in zip(colors, [0, 1], label_matrix):
    plt.scatter(Y[y == i, 0], Y[y == i, 1], color=color, alpha=.8, lw=lw,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)

fig.tight_layout()
plt.title('FIt-SNE')

# Random Forest

In [None]:
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)
rf = RandomForestClassifier(n_estimators=40)
print(clf.feature_importances_)

# LASSO

In [None]:
lasso = linear_model.Lasso(alpha = 0.1)
lasso.fit(X_train, y_train)

In [None]:
y_pred_lasso = lasso.predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)

In [None]:
r2_score_lasso

In [None]:
alpha = 0.001
enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
r2_score_enet

In [None]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)


In [None]:
print(pipeline_optimizer.score(X_test, y_test))


In [17]:
df = pd.DataFrame()
for f in list_of_normal_files:
    uid = f.replace('.png', '.tsv').replace(os.path.dirname(f), '').replace('/', '')
    temp_df = pd.read_table(os.path.join(segmented_tsv_dir, uid))
    df = pd.concat([df, temp_df])
    break


In [11]:
segmented_tsv_dir

'/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/normal_patches_test_segmented/level_0/'

In [18]:
df

Unnamed: 0,area,bbox_area,compactness,convex_area,eccentricity,equivalent_diameter,extent,fractal_dimension,inertia_tensor_eigvals_1,inertia_tensor_eigvals_2,...,moments_hu_6,moments_hu_7,nuclei,nuclei_intensity_over_entire_image,orientation,perimeter,solidity,texture,total_nuclei_area,total_nuclei_area_ratio
0,195.971831,346.746479,26.083501,249.605634,0.63141,15.552323,0.578918,0.82986,24.369564,13.537401,...,4e-06,-4.112912e-08,71,0.427526,0.326445,69.960757,0.789925,0.009722,13914.0,0.212311


# Load df from files

In [4]:
import pandas as pd
normal_segmented_tsv_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/normal_patches_test_segmented/level_0/'
tumor_segmented_tsv_dir = '/Z/personal-folders/interns/saket/histopath_data/CAMELYON16_patches/tumor_patches_test_segmented/level_0/'

In [7]:
df = pd.DataFrame()
y = []
for f in tqdm_notebook(glob.glob(normal_segmented_tsv_dir+'/*.tsv')):
    df = pd.concat((df, pd.read_table(f)))
    y.append(0)

for f in tqdm_notebook(glob.glob(tumor_segmented_tsv_dir+'/*.tsv')):
    df = pd.concat((df, pd.read_table(f)))
    y.append(1)
    
    

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=94100), HTML(value='')))

EmptyDataError: No columns to parse from file

In [None]:
len(df_with_label.index) - len(y)

In [37]:
len(df_with_label.index)

293952

In [None]:
y = np.array(y)
len(y[y==0])

In [72]:
label = [0 for x in range(len(glob.glob(normal_segmented_tsv_dir+'/*.tsv')))]
label += [1 for x in range(len(df.index)-len(label))]


In [23]:
len(label)

293952

In [66]:
df_with_label = df.copy().drop(columns=['0'])

In [67]:
df_with_label['label'] = np.nan

In [73]:
df_with_label['label'] = label

In [74]:
df_with_label.head()

Unnamed: 0,area,bbox_area,compactness,convex_area,eccentricity,equivalent_diameter,extent,fractal_dimension,inertia_tensor_eigvals_1,inertia_tensor_eigvals_2,...,moments_hu_7,nuclei,nuclei_intensity_over_entire_image,orientation,perimeter,solidity,texture,total_nuclei_area,total_nuclei_area_ratio,label
0,167.701754,311.280702,24.094043,210.54386,0.756238,14.397744,0.561444,0.765615,27.606664,9.096642,...,2.976246e-08,57.0,0.380603,0.131881,62.442148,0.810059,0.013039,9559.0,0.145859,0
0,158.887324,302.394366,24.32586,202.464789,0.746371,13.963734,0.549637,0.765657,25.505003,9.164804,...,1.133948e-06,71.0,0.413825,-0.042969,61.07399,0.804361,0.01224,11281.0,0.172134,0
0,178.083333,304.928571,24.220398,221.714286,0.686596,14.807577,0.595765,0.747148,23.847463,11.552167,...,5.420098e-09,84.0,0.390758,0.401353,63.516287,0.810729,0.012269,14959.0,0.228256,0
0,197.285714,333.085714,27.214328,250.057143,0.634482,15.467094,0.598656,0.75774,25.230365,13.524688,...,2.634835e-07,70.0,0.418451,0.07698,70.660689,0.788042,0.01229,13810.0,0.210724,0
0,199.652174,341.086957,25.63534,252.065217,0.675135,15.67927,0.584387,0.82027,27.24685,12.918863,...,-9.572924e-06,46.0,0.411748,0.197567,69.719296,0.791037,0.016963,9184.0,0.140137,0


In [75]:

df_with_label = df_with_label.dropna()
label = df_with_label['label']
df_with_label = df_with_label.drop(columns=['label'])
df_with_label.to_csv('normal_tumor_segmented_df.tsv', sep='\t', index=False, header=True)

In [77]:
matrix = df_with_label.as_matrix()
matrix.shape

(293939, 46)

In [78]:
X_scaled = scaler.fit(matrix).transform(matrix)


In [79]:
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(X_scaled, label,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)


In [80]:
lasso = linear_model.Lasso(alpha = 0.1)
lasso.fit(X_train, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [81]:
y_pred_lasso = lasso.predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)

In [82]:
r2_score_lasso

-3.3262134180311875e-07

In [83]:
alpha = 0.001
enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
r2_score_enet



0.19293760725386455

In [None]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)


Optimization Progress:  30%|███       | 36/120 [1:35:03<3:29:05, 149.35s/pipeline]

In [None]:
print(pipeline_optimizer.score(X_test, y_test))
