In [1]:
import sys
# Relative location (three directories up) used both as an extra import
# search path and as the prefix for the data file paths in later cells.
base = "./../../../"
# Must run before the imports below that are not installed packages;
# presumably `pairs_flat_v2` and `helper` live under `base` — verify.
sys.path.append(base)
import pandas
from pathlib import Path
import pairs_flat_v2 as pairs
import json
import helper
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import tools
import plotly.graph_objs as go
import numpy as np

# Prepare the notebook for inline plotly figures (needed before `iplot` is used).
init_notebook_mode(connected=True)
from tqdm import tqdm_notebook as tqdm

# Human pluripotent stem cells (hPSC) - GSE53481
The samples analysed here are pluripotent cells from the study [Cell-cycle control of developmentally regulated transcription factors accounts for heterogeneity in human pluripotent cells](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3964279/).

In [2]:
# Load the expression matrix that serves as training data for the marker pairs.
gencounts_oscope = pandas.read_csv(Path(base + "data/GSE64016_H1andFUCCI_normalized_EC_human.csv"))

# The first CSV column has no header, so pandas names it "Unnamed: 0";
# use it as the row index (presumably the gene identifiers — verify).
gencounts_oscope = gencounts_oscope.set_index("Unnamed: 0")

# Keep only the columns whose name carries a phase tag (G1_, S_ or G2_).
gencounts_oscope_sorted = gencounts_oscope.iloc[
    :,
    [
        gencounts_oscope.columns.get_loc(c)
        for c in gencounts_oscope.columns
        if "G1_" in c or "G2_" in c or "S_" in c
    ],
]

# Positional column indices per phase, taken from the column-name tags.
is_G1 = [gencounts_oscope_sorted.columns.get_loc(c) for c in gencounts_oscope_sorted.columns if "G1_" in c]
is_S = [gencounts_oscope_sorted.columns.get_loc(c) for c in gencounts_oscope_sorted.columns if "S_" in c]
is_G2M = [gencounts_oscope_sorted.columns.get_loc(c) for c in gencounts_oscope_sorted.columns if "G2_" in c]

annotation = {
    "G1": list(is_G1),
    "S": list(is_S),
    "G2M": list(is_G2M)
}

# Read both gene lists inside `with` blocks so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open(base + "data/go_0007049_homoSapiens.csv", "r") as go_file:
    # One entry per line; strip the line terminators only.
    go_0007049 = [line.replace("\n", "").replace("\r", "") for line in go_file]

with open(base + "data/cyclebase_top1000_genes.tsv", "r") as cyclebase_file:
    # First tab-separated field of every line except the first (header) line.
    cycle_base = [line.split("\t")[0] for i, line in enumerate(cyclebase_file) if 0 < i]

# Union of both gene lists without duplicates.
cycle_genes = np.unique(np.concatenate((go_0007049, cycle_base), 0))

# Derive the marker pairs from the annotated matrix, restricted to the gene union above.
cc_marker = pairs.sandbag(gencounts_oscope_sorted, phases=annotation, subset_genes=list(cycle_genes), fraction=0.6, processes=10, verbose=True)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 8146 marker pairs (phase: count): {'G1': 2575, 'S': 4101, 'G2M': 1470}


In [3]:
# Load the tab-separated GSE53481 expression table.
gencounts_GSE53481 = pandas.read_csv(Path(base + "data/GSE53481_humanRNAseq.txt"), sep='\t')

# Keep only the text after the last underscore of each "GENE" entry.
# rpartition returns the whole string when there is no underscore, so such an
# identifier is kept as-is instead of raising ValueError as rindex would.
genes = [s.rpartition('_')[2] for s in gencounts_GSE53481["GENE"]]
gencounts_GSE53481["GENE"] = genes

# Index the table by the shortened gene identifiers.
gencounts_GSE53481 = gencounts_GSE53481.set_index("GENE")
gencounts_GSE53481.head(10)

Unnamed: 0_level_0,H1.DN,H1.KO2,H1.AzLow,H1.AzHigh,H2.DN,H2.KO2,H2.AzLow,H2.AzHigh,H3.DN,H3.KO2,H3.AzLow,H3.AzHigh
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LOC100289255,0.0,0.0,0.02,0.02,0.0,0.01,0.03,0.03,0.0,0.01,0.02,0.03
LOC644656,1.76,1.81,0.73,0.54,2.26,1.71,0.54,0.78,1.63,1.8,1.08,0.64
LOC646903,1.49,1.14,0.61,0.35,1.94,2.01,0.72,0.65,1.12,1.42,0.69,0.44
FLJ36644,0.11,0.12,0.0,0.01,0.12,0.2,0.07,0.03,0.15,0.16,0.12,0.05
LOC284454,0.5,0.99,0.19,0.22,0.85,0.72,0.38,0.4,0.66,0.81,0.36,0.58
LOC149773,0.01,0.0,0.03,0.01,0.03,0.01,0.06,0.02,0.07,0.02,0.04,0.03
LOC100131176,0.21,0.26,0.0,0.0,0.34,0.56,0.2,0.0,1.0,0.38,0.35,0.0
LOC100131366,0.19,0.03,0.31,0.29,0.0,0.3,0.96,0.71,0.3,0.0,0.71,0.94
FLJ42351,0.1,0.06,0.22,0.33,0.15,0.0,0.25,0.18,0.05,0.15,0.25,0.7
LOC392232,0.09,0.08,0.02,0.06,0.02,0.01,0.0,0.02,0.07,0.02,0.01,0.06


In [4]:
# Count the gene identifiers of this dataset that also occur in the cell cycle gene list.
n_cycle_genes_present = len(set(genes).intersection(cycle_genes))
print("{} cell cycle genes are present in the dataset".format(n_cycle_genes_present))

67 cell cycle genes are present in the dataset


In [5]:
# Transposed values: rows are the samples, columns are the genes.
x = gencounts_GSE53481.values.T

# QuantileTransformer works column by column, so here each gene is mapped
# across the samples (default settings of the installed scikit-learn version).
X_std = QuantileTransformer().fit_transform(x.astype(float))

# Transpose back and restore the gene index and sample columns.
gencounts_GSE53481_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_GSE53481.index,
    columns=gencounts_GSE53481.columns,
)

gencounts_GSE53481_Qnorm.head(10)

Unnamed: 0_level_0,H1.DN,H1.KO2,H1.AzLow,H1.AzHigh,H2.DN,H2.KO2,H2.AzLow,H2.AzHigh,H3.DN,H3.KO2,H3.AzLow,H3.AzHigh
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LOC100289255,1e-07,1e-07,0.6361361,0.6361361,1e-07,0.4194194,0.9999999,0.9999999,1e-07,0.4194194,0.636136,0.9999999
LOC644656,0.7273273,0.9089138,0.2728729,1e-07,0.9999999,0.6364466,1e-07,0.3634222,0.5455312,0.8185142,0.454506,0.1818422
LOC646903,0.8178995,0.6357958,0.1820635,1e-07,0.9094977,0.9999999,0.4544619,0.2727273,0.5455406,0.7275551,0.363697,0.09079514
FLJ36644,0.4547405,0.6361361,1e-07,0.09078309,0.6361361,0.9999999,0.3635214,0.1818182,0.81845,0.9089616,0.636136,0.2727273
LOC284454,0.4545657,0.9999999,1e-07,0.09058149,0.9089687,0.7271716,0.2727273,0.3634332,0.6364169,0.8183809,0.182107,0.5454545
LOC149773,0.1816817,1e-07,0.6361361,0.1816817,0.6361361,0.1816817,0.9092169,0.4094094,0.9999999,0.4094094,0.81804,0.6361361
LOC100131176,0.4544741,0.5454075,1e-07,1e-07,0.6365918,0.9089923,0.3642466,1e-07,0.9999999,0.8179049,0.727013,1e-07
LOC100131366,0.2728443,0.1814285,0.6356982,0.3641536,1e-07,0.5,0.9999999,0.7727728,0.5,1e-07,0.772773,0.9096284
FLJ42351,0.2726727,0.1814858,0.6364169,0.9089548,0.4094094,1e-07,0.7727728,0.5454278,0.0910485,0.4094094,0.772773,0.9999999
LOC392232,0.9999999,0.9090909,0.4089089,0.6816817,0.4089089,0.1361361,1e-07,0.4644645,0.8181818,0.4016517,0.136136,0.6816817


In [6]:
# Rescale every sample column so that its values add up to one million.
column_totals = gencounts_GSE53481.sum()
gencounts_GSE53481_Rnorm = gencounts_GSE53481 / (column_totals / 1000000)
gencounts_GSE53481_Rnorm.head()

Unnamed: 0_level_0,H1.DN,H1.KO2,H1.AzLow,H1.AzHigh,H2.DN,H2.KO2,H2.AzLow,H2.AzHigh,H3.DN,H3.KO2,H3.AzLow,H3.AzHigh
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LOC100289255,0.0,0.0,18.013961,15.353436,0.0,11.418523,23.257256,19.149262,0.0,9.501639,16.490901,19.088827
LOC644656,1570.64325,1836.388909,657.50957,414.542775,1969.687726,1952.567455,418.630613,497.880815,1278.280987,1710.295026,890.508662,407.228302
LOC646903,1329.692297,1156.620639,549.425805,268.685132,1690.793889,2295.123149,558.17415,414.900679,878.32804,1349.232743,568.93609,279.969458
FLJ36644,98.165203,121.749541,0.0,7.676718,104.585189,228.370463,54.266931,19.149262,117.63322,152.026225,98.945407,31.814711
LOC284454,446.205469,1004.433712,171.132628,168.887797,740.811755,822.133665,294.591913,255.323495,517.586166,769.632762,296.836221,369.050649


In [7]:
# Score the quantile-normalized samples against the marker pairs in `cc_marker`.
GSE53481_prediction = pairs.cyclone(
    gencounts_GSE53481_Qnorm,
    cc_marker,
    min_pairs=1,
    verbose=True,
)

[__set_matrix] Original Matrix 'x' has shape 510 x 12
[__set_matrix] Matrix truncation done. Working with 510 genes for 12 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 8102 marker pairs. 8146 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 5, S: 2, G2M: 5


In [8]:
# Turn the cyclone result into a table (presumably one row per sample — verify in helper).
GSE53481_prediction_table = helper.get_prediction_table(
    GSE53481_prediction
)
# Last expression of the cell: show the table through the helper's DataTable view.
helper.DataTable(GSE53481_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
H1.DN,0.585341,0.223447,0.682779,0.392434,0.149807,0.45776,G1
H1.KO2,0.682828,0.095,0.789634,0.435627,0.060608,0.503766,G1
H1.AzLow,0.43159,0.174,0.819738,0.3028,0.122077,0.575123,S
H1.AzHigh,0.078,0.943888,0.030151,0.074142,0.897199,0.028659,G2M
H2.DN,0.965966,0.182,0.350151,0.644787,0.121486,0.233728,G1
H2.KO2,0.864185,0.001016,0.644261,0.572512,0.000673,0.426815,G1
H2.AzLow,0.528169,0.741483,0.371457,0.321837,0.451818,0.226345,G2M
H2.AzHigh,0.083156,0.992979,0.0,0.077273,0.922727,0.0,G2M
H3.DN,0.663992,0.6,0.096057,0.488212,0.441161,0.070627,G1
H3.KO2,0.210262,0.039,0.305359,0.379109,0.070318,0.550573,S


In [9]:
# Expected phase per sample, in the column order of the expression table:
# DN, KO2, AzLow, AzHigh -> G1, G1, S, G2M, repeated for H1, H2 and H3.
GSE53481_labels = ['G1', 'G1', 'S', 'G2M'] * 3

In [10]:
# Compare the predicted phases with the expected labels; the helper prints
# F1, recall and precision per phase and its result is unpacked for plotting later.
GSE53481_evaluation = helper.evaluate_prediction(
    GSE53481_prediction_table,
    GSE53481_labels,
)

F1 Score: G1: 0.9090909090909091, S: 0.4, G2M: 0.7499999999999999
Reacall: G1: 0.8333333333333334, S: 0.3333333333333333, G2M: 1.0 
Precision: G1: 1.0, S: 0.5, G2M: 0.6 


In [11]:
# Plot the evaluation scores per phase; the evaluation result is unpacked
# into the helper's leading positional arguments.
helper.plot_evaluation(
    *GSE53481_evaluation,
    xaxis=["G1", "S", "G2M"],
    xaxislbl="Phase",
)

{'data': [{'marker': {'color': 'red', 'size': 10, 'symbol': 'circle'},
   'mode': 'markers',
   'name': 'F1-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.90909091, 0.4       , 0.75      ])},
  {'marker': {'color': 'blue', 'size': 10, 'symbol': 'square'},
   'mode': 'markers',
   'name': 'Recall-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.83333333, 0.33333333, 1.        ])},
  {'marker': {'color': 'green', 'size': 10, 'symbol': 'triangle-up'},
   'mode': 'markers',
   'name': 'Precision-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([1. , 0.5, 0.6])}],
 'layout': {'title': '',
  'xaxis': {'title': 'Phase'},
  'yaxis': {'title': 'F1, Recall, Precision Score'}}}

In [12]:
# Scores of the first four table rows (the H1 samples DN, KO2, AzLow, AzHigh).
# Column positions assume the table layout G1, G2M, S: 0 = G1, 1 = G2M, 2 = S.
sample1_g1 = [GSE53481_prediction_table.iloc[i, 0] for i in range(0, 4)]
sample1_s = [GSE53481_prediction_table.iloc[i, 2] for i in range(0, 4)]
sample1_g2m = [GSE53481_prediction_table.iloc[i, 1] for i in range(0, 4)]

# Keep the figure in its own name: assigning it to `plot` would overwrite the
# `plot` function imported from plotly.offline in the first cell.
fig_h1 = helper.plot_prediction(sample1_g1, sample1_s, sample1_g2m, t="pie", xaxis=['DN', 'KO2', 'AzLow', 'AzHigh'], xaxislbl="H1", title="Assignment of hESC H1 cells", width=950, height=950)
iplot(fig_h1, image='svg')

In [13]:
# Average each condition over the three cell lines: rows i, i+4 and i+8 hold
# the same condition (DN, KO2, AzLow, AzHigh) for H1, H2 and H3.
# Column positions assume the table layout G1, G2M, S: 0 = G1, 1 = G2M, 2 = S.
avg_g1 = [np.average(GSE53481_prediction_table.iloc[[i, i + 4, i + 8], 0].values) for i in range(0, 4)]
avg_s = [np.average(GSE53481_prediction_table.iloc[[i, i + 4, i + 8], 2].values) for i in range(0, 4)]
avg_g2m = [np.average(GSE53481_prediction_table.iloc[[i, i + 4, i + 8], 1].values) for i in range(0, 4)]

# Keep the figure in its own name: assigning it to `plot` would overwrite the
# `plot` function imported from plotly.offline in the first cell.
fig_avg = helper.plot_prediction(avg_g1, avg_s, avg_g2m, t="pie", xaxis=['DN', 'KO2', 'AzLow', 'AzHigh'], xaxislbl="Average", title="Average assignment of hESC all cells", width=950, height=950)
iplot(fig_avg, image='svg')