In [39]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from itertools import islice
import numpy as np
from skimage.external import tifffile
from skimage.external.tifffile import imread
import matplotlib.pyplot as plt
import ipywidgets as widgets
import random
from scipy.ndimage.morphology import binary_erosion as br
from skimage import morphology as skmor
from scipy import ndimage
from PIL import Image, ImageDraw
import numpy.ma as ma
from skimage.measure import label, regionprops

from sklearn.cluster import KMeans
import plotly.express as px
import seaborn as sns

# Pixel Distribution for HP1 signals

This notebook examines the pixel distributions of images randomly selected from two experiments that differ in the staining they used. 10 samples are chosen from each of the HP1 and HP1Beta groups, and for each image the HP1 (or HP1Beta) signal and the DAPI signal are assessed to find differences between the two HP1 stainings. 
<br>

# Contents
* [Data Loading](#data)
* [Sampling](#samples)
* [Data Analysis](#EDA)
    1. [first image](#1st)
    2. [second image](#2nd)
    3. [third image](#3rd)
* [Compare mean and max values](#meanMax)

## Pixel Distribution of HP1 (HP1B) and DAPI signal <a name="data"></a>
- Select 10 random cells from the chosen dataset
- Calculate the HP1 (HP1B) signal and the DAPI signal for each pixel of the image
- Plot the HP1 signal vs. the DAPI signal for each cell
- Look for any noticeable pattern, e.g. a linear relationship, grouping in the plots, etc.

In [2]:
# Root folder of the organized MCM-loading dataset; the CSV paths below are built from it.
# If the MCM_DATA_DIR environment variable is set it overrides the default location,
# so the notebook can also run on machines where the data is not mounted at Z:.
bigDir=os.environ.get('MCM_DATA_DIR', r'Z:\CookLab\Liu\20190816_organizedData_MCM_loading')

In [3]:
# CSV holding the signals of the "new dataset" folder.
# Both backslashes are escaped explicitly: '\s' is not a valid escape sequence and only
# worked by accident (newer Python versions warn about it); the resulting path is unchanged.
newfile=f'{bigDir}\\new dataset\\signals_new_v5.csv'

In [4]:
newdata=pd.read_csv(newfile)

In [5]:
# Drop the 'Unnamed: *' columns (presumably leftover index columns from earlier CSV saves).
# errors='ignore' keeps this cell re-runnable: without it a second execution raises
# KeyError because the columns have already been removed from `newdata`.
newdata=newdata.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [6]:
newdata.head()

Unnamed: 0,file,original cell name,movie,last frame movie position x,last frame movie position y,ab1 (MCM),ab1 ch,ab2 (heterochromatin),ab2 ch,DAPI ch,...,mol_age_mod,nuc_hp1b_total,inner_hp1b_total,outer_hp1b_total,nuc_hp1b_het,inner_hp1b_het,outer_hp1b_het,nuc_hp1b_eu,inner_hp1b_eu,outer_hp1b_eu
0,20191203_cell-00.czi,20191125-01-17,191125pcnat_dhbr_cdc6vxy01,678.301,200.931,MCM3,1,HP1beta,2,3,...,0.444,1415176000.0,425500564.0,989675900.0,550518042.0,321679433.0,228838609.0,864658400.0,103821131.0,760837300.0
1,20191203_cell-01.czi,20191125-01-20,191125pcnat_dhbr_cdc6vxy01,1552.152,240.479,MCM3,1,HP1beta,2,3,...,1.373,2175016000.0,804184935.0,1370831000.0,896267478.0,567124747.0,329142731.0,1278748000.0,237060188.0,1041688000.0
2,20191203_cell-02.czi,20191125-01-23,191125pcnat_dhbr_cdc6vxy01,1131.436,318.53,MCM3,1,HP1beta,2,3,...,1.371,1933001000.0,666820604.0,1266180000.0,732487366.0,444813519.0,287673847.0,1200514000.0,222007085.0,978506400.0
3,20191203_cell-03.czi,20191125-01-25,191125pcnat_dhbr_cdc6vxy01,1650.585,324.968,MCM3,1,HP1beta,2,3,...,0.208,1062142000.0,233589820.0,828552500.0,403943621.0,187675893.0,216267728.0,658198700.0,45913927.0,612284800.0
4,20191203_cell-04.czi,20191125-01-32,191125pcnat_dhbr_cdc6vxy01,1466.687,426.815,MCM3,1,HP1beta,2,3,...,0.345,1717888000.0,546330833.0,1171557000.0,697150763.0,424718652.0,272432111.0,1020738000.0,121612181.0,899125400.0


In [47]:
file=f'{bigDir}\\20200217_geminin\\200217_cellinfo_v8.csv'

In [48]:
geminin=pd.read_csv(file)

In [49]:
# Drop the 'Unnamed: *' columns (presumably leftover index columns from earlier CSV saves).
# errors='ignore' keeps this cell re-runnable: without it a second execution raises
# KeyError because the columns have already been removed from `geminin`.
geminin=geminin.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [50]:
geminin.head()

Unnamed: 0,Unnamed: 0.1.1,file,original cell name,movie,x,y,ab1 (MCM),ab1 ch,ab2 (heterochromatin),ab2 ch,...,mol_age_mod,nuc_hp1b_total,inner_hp1b_total,outer_hp1b_total,nuc_hp1b_het,inner_hp1b_het,outer_hp1b_het,nuc_hp1b_eu,inner_hp1b_eu,outer_hp1b_eu
0,0,20200217_cell_00.ciz,2020024-19-21-63X.czi,20200217pinducer gemininxy19,1312.741,235.893,MCM3,1,HP1,2,...,0.948,2693842000.0,931477800.0,1762364000.0,1339535000.0,829988300.0,509546448.0,1354307000.0,101489496.0,1252817000.0
1,1,20200217_cell_01.ciz,2020024-19-27-63X.czi,20200217pinducer gemininxy19,1323.444,397.761,MCM3,1,HP1,2,...,0.465,2172380000.0,824190100.0,1348189000.0,991509600.0,660230400.0,331279227.0,1180870000.0,163959707.0,1016910000.0
2,2,20200217_cell_02.ciz,2020024-19-39-63X.czi,20200217pinducer gemininxy19,1772.297,584.995,MCM3,1,HP1,2,...,2.001,3263571000.0,1319735000.0,1943836000.0,1458153000.0,1100649000.0,357504422.0,1805418000.0,219086613.0,1586331000.0
3,3,20200217_cell_03.ciz,2020024-19-49-63X.czi,20200217pinducer gemininxy19,163.044,727.256,MCM3,1,HP1,2,...,1.063,2334174000.0,996342400.0,1337832000.0,1103680000.0,851526100.0,252154253.0,1230494000.0,144816240.0,1085678000.0
4,4,20200217_cell_04.ciz,2020024-19-51-63X.czi,20200217pinducer gemininxy19,1791.955,755.367,MCM3,1,HP1,2,...,0.579,2112534000.0,675619900.0,1436915000.0,983729000.0,582832000.0,400897040.0,1128805000.0,92787952.0,1036018000.0


## Sampling  <a name="samples"></a>

sample = 10 samples from newdata; this data used ***HP1Beta***<br>
sample_geminin = 10 samples from geminin dataset; this data used ***HP1***

In [51]:
# Keep only the rows whose `decon` flag is False
# (presumably the non-deconvolved images — TODO confirm).
sample=newdata.loc[(newdata.decon==False),:]

In [52]:
# Draw 10 cells whose category contains 'G1' from the rows with decon == False.
# The selection starts from `newdata` rather than from `sample` itself, so the cell is
# idempotent: sampling from its own previous output would reshuffle the 10 rows on every
# re-run and change which cell `sample.iloc[i]` refers to.
# random_state is fixed so the same 10 cells are drawn each time.
non_decon=newdata.loc[(newdata.decon==False),:]
sample=non_decon.loc[(['G1' in x for x in non_decon.category]), :].sample(n=10, random_state=107)

In [53]:
sample

Unnamed: 0,file,original cell name,movie,last frame movie position x,last frame movie position y,ab1 (MCM),ab1 ch,ab2 (heterochromatin),ab2 ch,DAPI ch,...,mol_age_mod,nuc_hp1b_total,inner_hp1b_total,outer_hp1b_total,nuc_hp1b_het,inner_hp1b_het,outer_hp1b_het,nuc_hp1b_eu,inner_hp1b_eu,outer_hp1b_eu
102,20191203_cell-130.czi,20191125-08-101,191125pcnat_dhbr_cdc6vxy08,1237.094,1258.126,MCM3,1,HP1beta,2,3,...,0.721,1285848000.0,390107622.0,895740500.0,487715875.0,290984487.0,196731388.0,798132293.0,99123135.0,699009158.0
3,20191203_cell-03.czi,20191125-01-25,191125pcnat_dhbr_cdc6vxy01,1650.585,324.968,MCM3,1,HP1beta,2,3,...,0.208,1062142000.0,233589820.0,828552500.0,403943621.0,187675893.0,216267728.0,658198734.0,45913927.0,612284807.0
108,20191203_cell-137.czi,20191125-11-82,191125pcnat_dhbr_cdc6vxy11,916.914,921.865,MCM3,1,HP1beta,2,3,...,0.701,1340251000.0,442330664.0,897920000.0,509021593.0,325983023.0,183038570.0,831229094.0,116347641.0,714881453.0
58,20191203_cell-78.czi,20191125-05-44,191125pcnat_dhbr_cdc6vxy05,947.721,652.41,MCM3,1,HP1beta,2,3,...,0.422,1216596000.0,363577325.0,853018200.0,456375018.0,263518999.0,192856019.0,760220527.0,100058326.0,660162201.0
49,20191203_cell-64.czi,20191125-10-142,191125pcnat_dhbr_cdc6vxy10,520.447,1594.921,MCM3,1,HP1beta,2,3,...,1.009,1448302000.0,408907699.0,1039394000.0,561312449.0,288467325.0,272845124.0,886989714.0,120440374.0,766549340.0
67,20191203_cell-87.czi,20191125-05-88,191125pcnat_dhbr_cdc6vxy05,301.901,1450.886,MCM3,1,HP1beta,2,3,...,0.208,1173185000.0,255938367.0,917246300.0,438885958.0,196474325.0,242411633.0,734298719.0,59464042.0,674834677.0
81,20191203_cell-103.czi,20191125-07-55,191125pcnat_dhbr_cdc6vxy07,598.905,812.93,MCM3,1,HP1beta,2,3,...,0.462,1039684000.0,208705380.0,830979000.0,384549104.0,167188893.0,217360211.0,655135258.0,41516487.0,613618771.0
60,20191203_cell-80.czi,20191125-05-56,191125pcnat_dhbr_cdc6vxy05,467.635,889.574,MCM3,1,HP1beta,2,3,...,0.674,1373778000.0,453944387.0,919833800.0,519606383.0,326294309.0,193312074.0,854171789.0,127650078.0,726521711.0
95,20191203_cell-123.czi,20191125-07-125,191125pcnat_dhbr_cdc6vxy07,1258.357,1764.788,MCM3,1,HP1beta,2,3,...,0.649,1204104000.0,355183847.0,848919700.0,434319799.0,257178787.0,177141012.0,769783719.0,98005060.0,671778659.0
5,20191203_cell-05.czi,20191125-01-36,191125pcnat_dhbr_cdc6vxy01,1705.099,468.246,MCM3,1,HP1beta,2,3,...,0.632,1615431000.0,530009467.0,1085421000.0,699466569.0,454510713.0,244955856.0,915964380.0,75498754.0,840465626.0


In [54]:
# Restrict the geminin table to the 'CTL' group first ...
ctl_cells = geminin.loc[geminin.group == 'CTL', :]

# ... then keep the rows whose category contains 'G1' and draw 10 of them
# (fixed random_state so the draw is reproducible).
g1_mask = ['G1' in phase for phase in ctl_cells.category]
sample_geminin = ctl_cells.loc[g1_mask, :].sample(n=10, random_state=107)

sample_geminin

Unnamed: 0,Unnamed: 0.1.1,file,original cell name,movie,x,y,ab1 (MCM),ab1 ch,ab2 (heterochromatin),ab2 ch,...,mol_age_mod,nuc_hp1b_total,inner_hp1b_total,outer_hp1b_total,nuc_hp1b_het,inner_hp1b_het,outer_hp1b_het,nuc_hp1b_eu,inner_hp1b_eu,outer_hp1b_eu
55,55,20200217_cell_56.ciz,2020024-21-69-63X.czi,20200217pinducer gemininxy21,1679.774,853.901,MCM3,1,HP1,2,...,0.913,1451200000.0,339178991.0,1112021000.0,620575400.0,274960992.0,345614428.0,830624500.0,64217999.0,766406500.0
52,52,20200217_cell_53.ciz,2020024-21-56-63X.czi,20200217pinducer gemininxy21,491.477,686.327,MCM3,1,HP1,2,...,0.312,1595833000.0,727375695.0,868457800.0,762248300.0,609343947.0,152904329.0,833585200.0,118031748.0,715553400.0
41,41,20200217_cell_41.ciz,2020024-20-129-63X.czi,20200217pinducer gemininxy20,956.116,1670.198,MCM3,1,HP1,2,...,0.98,1639314000.0,637438506.0,1001876000.0,774370500.0,554396674.0,219973831.0,864943500.0,83041832.0,781901700.0
33,33,20200217_cell_33.ciz,2020024-20-72-63X.czi,20200217pinducer gemininxy20,693.39,929.068,MCM3,1,HP1,2,...,1.098,2252354000.0,977700193.0,1274653000.0,1093978000.0,836537223.0,257440725.0,1158376000.0,141162970.0,1017213000.0
0,0,20200217_cell_00.ciz,2020024-19-21-63X.czi,20200217pinducer gemininxy19,1312.741,235.893,MCM3,1,HP1,2,...,0.948,2693842000.0,931477774.0,1762364000.0,1339535000.0,829988278.0,509546448.0,1354307000.0,101489496.0,1252817000.0
39,39,20200217_cell_39.ciz,2020024-20-111-63X.czi,20200217pinducer gemininxy20,1306.021,1454.848,MCM3,1,HP1,2,...,0.282,1736642000.0,715961360.0,1020680000.0,808729900.0,632150280.0,176579605.0,927911900.0,83811080.0,844100900.0
59,59,20200217_cell_60.ciz,2020024-21-91-63X.czi,20200217pinducer gemininxy21,1292.148,1095.184,MCM3,1,HP1,2,...,0.222,1831720000.0,689306892.0,1142413000.0,824124200.0,565741020.0,258383221.0,1007595000.0,123565872.0,884029400.0
58,58,20200217_cell_59.ciz,2020024-21-84-63X.czi,20200217pinducer gemininxy21,1388.189,1001.713,MCM3,1,HP1,2,...,0.844,1489642000.0,577177890.0,912464400.0,676006000.0,480895722.0,195110316.0,813636200.0,96282168.0,717354100.0
9,9,20200217_cell_09.ciz,2020024-19-79-63X.czi,20200217pinducer gemininxy19,398.674,1037.764,MCM3,1,HP1,2,...,0.726,2211104000.0,900934774.0,1310169000.0,1084085000.0,767018423.0,317066281.0,1127019000.0,133916351.0,993103000.0
12,12,20200217_cell_12.ciz,2020024-19-93-63X.czi,20200217pinducer gemininxy19,1841.328,1182.675,MCM3,1,HP1,2,...,0.429,2158952000.0,705915706.0,1453037000.0,990794500.0,623138461.0,367656076.0,1168158000.0,82777245.0,1085381000.0


# Data Analysis <a name="EDA"></a>

## first image <a name="1st"></a>
* [Density plots](#1Density)
* [Sample plots](#1Sample)
* [Comparison](#1Comparison)

In [55]:
i=0

In [56]:
mycell=sample.iloc[i]
%matplotlib notebook
cell=imread(mycell.path)

nucPath=(mycell.path.replace('data_tiff','segmentation_nucleus_Otsu'))
nucPath=nucPath.replace('.tif','_nucleus.tif')
nuc=imread(nucPath)

print(mycell.file)
print('shape of the cell image: ', cell.shape)
print('shape of nucleus mask: ', nuc.shape)

nuc=nuc.astype(bool)
# selecting every slide of the selected channel, which is hp1beta channel.
hp1b=cell[:,1,:,:] 
## masking the image
hp1b[~nuc]=0
hp1b=hp1b.astype(np.ndarray)

# DAPI channel
dapiB=cell[:,2,:,:]
dapiB[~nuc]=0


## HP1, geminin dataset
i=0
mycellGeminin=sample_geminin.iloc[i]
%matplotlib notebook
cell=imread(mycellGeminin.path)

nucPath=(mycellGeminin.path.replace('data_tiff','segmentation_nucleus_Otsu'))
nucPath=nucPath.replace('.tif','_nucleus.tif')
nuc=imread(nucPath)

print(mycellGeminin.file)
print('shape of the cell image: ', cell.shape)
print('shape of nucleus mask: ', nuc.shape)

nuc=nuc.astype(bool)
# selecting every slide of the selected channel, which is hp1beta channel.
hp1=cell[:,1,:,:] 
## masking the image
hp1[~nuc]=0
hp1=hp1.astype(np.ndarray)

# DAPI channel
dapi=cell[:,2,:,:]
dapi[~nuc]=0


20191203_cell-130.czi
shape of the cell image:  (29, 3, 380, 380)
shape of nucleus mask:  (29, 380, 380)
20200217_cell_56.ciz
shape of the cell image:  (22, 3, 380, 380)
shape of nucleus mask:  (22, 380, 380)


#### Flattening the HP1beta and DAPI signals to 1-D (removing the slice and other axis structure)

In [57]:
hp1b.max()

65535

In [58]:
dapiB.max()

41919

By using `ravel()`, each signal's multidimensional array is flattened into a 1-dimensional array. This lets us plot the pixels without any grouping by slice or axis.

In [59]:
hp1b_test=hp1b.ravel()
hp1b_test.shape

(4187600,)

In [60]:
dapiB_test=dapiB.ravel()
dapiB_test.shape

(4187600,)

#### Flattening the HP1 and DAPI signals to 1-D (removing the slice and other axis structure)
By using `ravel()`, each signal's multidimensional array is flattened into a 1-dimensional array. This lets us plot the pixels without any grouping by slice or axis.

In [61]:
hp1.max()

47432

In [62]:
dapi.max()

31365

In [63]:
hp1_test=hp1.ravel()
hp1_test.shape

(3176800,)

In [64]:
dapi_test=dapi.ravel()
dapi_test.shape

(3176800,)

## remove some data points

- Remove the saturated pixels and the zero-valued pixels; the zeros are most likely background pixels

### HP1Beta

In [65]:
hp1b_test[hp1b_test==0].shape

(3952366,)

In [66]:
# Keep only the HP1beta pixels whose value is not 0.
hp1b_test_non_zero = hp1b_test[hp1b_test != 0]

In [67]:
hp1b_test_non_zero.shape

(235234,)

In [68]:
dapiB_test[dapiB_test==0].shape

(3952366,)

In [69]:
# Keep only the DAPI pixels whose value is not 0.
# NOTE(review): zeros are dropped per channel; a later cell indexes this array with a
# mask computed from the HP1beta array, which assumes both channels are zero at exactly
# the same pixels — confirm.
dapiB_test_non_zero = dapiB_test[dapiB_test != 0]

In [70]:
dapiB_test_non_zero.shape

(235234,)

### HP1

In [71]:
# Keep only the HP1 pixels whose value is not 0.
hp1_test_non_zero = hp1_test[hp1_test != 0]

In [72]:
hp1_test_non_zero.shape

(208612,)

In [73]:
# Keep only the DAPI pixels (geminin image) whose value is not 0.
# NOTE(review): filtered on DAPI's own zeros, independently of the HP1 array; pairing the
# two arrays index-by-index assumes identical zero positions in both channels — confirm.
dapi_test_non_zero = dapi_test[dapi_test != 0]

In [74]:
dapi_test_non_zero.shape

(208612,)

## density plot <a name="1Density"></a>

### HP1Beta

In [75]:
# Drop every HP1beta pixel that equals the brightest value in the array.
# NOTE(review): the array maximum stands in for "saturated"; if the image is not
# saturated this still removes its brightest pixels — confirm this is intended.
hp1b_brightest = hp1b_test_non_zero.max()
hp1b_test_final = hp1b_test_non_zero[hp1b_test_non_zero != hp1b_brightest]

In [76]:
# Apply the HP1beta-derived mask to the DAPI array (not a DAPI-based one), so the two
# *_final arrays stay the same length.
keep_pixel = hp1b_test_non_zero != hp1b_test_non_zero.max()
dapiB_test_final = dapiB_test_non_zero[keep_pixel]

In [77]:
print(hp1b_test_final.shape)
print(dapiB_test_final.shape)

(235219,)
(235219,)


I have plotted a scatter plot of HP1Beta vs DAPI signals for the image. 

In [78]:
# Per-cell nuclear totals of the HP1beta dataset, restricted to categories containing 'G1'.
# The row mask is an explicit list (as in the sampling cells) rather than a generator
# handed to .loc, and the axes are passed by keyword so x/y cannot be mixed up.
newdata_g1=newdata.loc[['G1' in x for x in newdata.category],:]
px.scatter(newdata_g1, x='nuc_hp1b_total', y='nuc_DAPI_total',
           title='HP1beta vs DAPI nuclear totals (G1 cells, new dataset)')

In [79]:
# Per-cell nuclear totals of the geminin dataset (HP1 staining), restricted to
# categories containing 'G1'. The row mask is an explicit list rather than a generator
# handed to .loc, and the axes are passed by keyword.
geminin_g1=geminin.loc[['G1' in x for x in geminin.category],:]
px.scatter(geminin_g1, x='nuc_hp1b_total', y='nuc_DAPI_total',
           title='HP1 vs DAPI nuclear totals (G1 cells, geminin dataset)')

In [80]:
# Distribution of the per-cell nuclear HP1beta total for categories containing 'G1'.
# The row mask is an explicit list rather than a generator handed to .loc.
newdata_g1=newdata.loc[['G1' in x for x in newdata.category],:]
px.box(newdata_g1, y='nuc_hp1b_total',
       title='HP1beta nuclear total (G1 cells, new dataset)')

In [81]:
# Distribution of the per-cell nuclear HP1 total (geminin dataset) for categories
# containing 'G1'. The row mask is an explicit list rather than a generator handed to .loc.
geminin_g1=geminin.loc[['G1' in x for x in geminin.category],:]
px.box(geminin_g1, y='nuc_hp1b_total',
       title='HP1 nuclear total (G1 cells, geminin dataset)')