In [1]:
import sys
# Relative path from this notebook to the project root; it is used both to
# locate the local modules imported below and as the prefix for data files.
base = "./../../../"
# Only register the path once so that re-running this cell does not keep
# growing sys.path with duplicate entries.
if base not in sys.path:
    sys.path.append(base)

In [2]:
import json
import pandas
import pairs_flat_v2 as pairs
import helper
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from tqdm import tqdm_notebook as tqdm
from pathlib import Path

# Set tqdm's monitor interval to 0 (presumably to switch off tqdm's background
# monitor thread inside the notebook -- confirm against the tqdm docs).
tqdm.monitor_interval = 0
# Initialise plotly's offline notebook mode so figures render inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)

In [8]:
# Train cell-cycle marker pairs on the GSE64016 (Oscope) expression matrix.

# Load the matrix and use the unnamed first CSV column as the row index
# (chained instead of inplace=True so the cell has no hidden mutation step).
gencounts_oscope = pandas.read_csv(
    Path(base + "data/GSE64016_H1andFUCCI_normalized_EC_human.csv")
).set_index("Unnamed: 0")

# Keep only the columns whose name carries one of the phase tags.
phase_tags = ("G1_", "G2_", "S_")
gencounts_oscope_sorted = gencounts_oscope.iloc[
    :,
    [
        position
        for position, column in enumerate(gencounts_oscope.columns)
        if any(tag in column for tag in phase_tags)
    ],
]

# Column positions (within the subset) belonging to each phase.
is_G1 = [pos for pos, col in enumerate(gencounts_oscope_sorted.columns) if "G1_" in col]
is_S = [pos for pos, col in enumerate(gencounts_oscope_sorted.columns) if "S_" in col]
is_G2M = [pos for pos, col in enumerate(gencounts_oscope_sorted.columns) if "G2_" in col]

# Phase name -> list of column positions, handed to sandbag as 'phases'.
annotation = {
    "G1": list(is_G1),
    "S": list(is_S),
    "G2M": list(is_G2M)
}

# Read the second ';'-separated field of every line after the header line.
# The file is opened in a 'with' block so the handle is closed again, and
# blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of raising an IndexError.
with open(base + "data/cell_cycle_genes.csv", "r") as cycle_file:
    next(cycle_file, None)  # skip the first (header) line
    cycle_genes = [
        line.split(";")[1].replace("\n", "")
        for line in cycle_file
        if line.strip()
    ]

# Identify marker pairs on the annotated training matrix, restricted to the
# cell-cycle gene list.
cc_marker = pairs.sandbag(gencounts_oscope_sorted, phases=annotation, subset_genes=list(cycle_genes), fraction=0.6, processes=10, verbose=True)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16034 genes that were not in 'subset_genes'. 3050 genes remaining.
[__set_matrix] Removed 105 genes that were not expressed in any samples. 2945 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2945 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 11520 marker pairs (phase: count): {'G1': 3775, 'S': 5503, 'G2M': 2242}


In [9]:
# Load the E-MTAB-6142 count matrix (';'-separated) with the gene ID column
# as row index, then translate the gene IDs to gene names where possible.
gencounts_EMATB6142 = pandas.read_csv(
    Path(base + "data/E-MTAB-6142_human.csv"), sep=';'
).set_index("Gene_ID")

# Lookup built from the first two comma-separated fields of every line:
# field 0 -> field 1.
gene_map = {}

with open(base + "data/biomart_human-genes.txt", "r") as f:
    for line in f:
        info = line.rstrip("\r\n").split(",")
        # Skip lines that cannot provide a usable mapping (blank lines, lines
        # without a second field, or an empty second field) instead of
        # raising IndexError or replacing an ID with an empty string.
        if len(info) < 2 or not info[1]:
            continue
        gene_map[info[0]] = info[1]

# Map every index entry through gene_map. Anything after the first '.' in an
# ID is dropped before the lookup; IDs without a mapping are kept unchanged.
index_list = [
    gene_map.get(gene_id.split(".", 1)[0], gene_id)
    for gene_id in gencounts_EMATB6142.index.tolist()
]

gencounts_EMATB6142.index = index_list
# NOTE(review): the mapped index may contain duplicate names. Rows with
# duplicated names are deliberately NOT dropped here; an earlier draft did
#   gencounts_EMATB6142[~gencounts_EMATB6142.index.duplicated(keep=False)]
# -- confirm whether downstream steps need unique row labels.
gencounts_EMATB6142.head(10)

Unnamed: 0,S1_G1,S2_G1,S3_G1,S4_G1,S5_G1,S6_G1,S7_G1,S8_G1,S9_G1,S10_G1,...,S87_G2M,S88_G2M,S89_G2M,S90_G2M,S91_G2M,S92_G2M,S93_G2M,S94_G2M,S95_G2M,S96_G2M
TSPAN6,360,5,437,136,328,253,1101,39,157,253,...,391,429,148,397,424,317,403,280,470,725
TNMD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DPM1,111,421,70,179,93,57,60,35,174,91,...,63,228,173,115,87,308,92,179,164,104
SCYL3,2,5,0,0,1,1,3,0,5,1,...,3,0,2,1,2,0,0,0,38,1
C1orf112,179,448,0,0,135,47,0,0,0,159,...,151,68,147,84,151,11,0,0,169,77
FGR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CFH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34,0,0,0
FUCA2,293,0,0,389,65,229,106,95,112,205,...,175,110,375,46,55,291,283,224,293,93
GCLC,0,0,0,0,208,0,0,0,0,0,...,0,0,0,0,0,0,40,0,0,0
NFYA,0,118,0,0,0,0,0,60,0,0,...,10,0,0,98,108,149,60,125,46,2


In [10]:
# Quantile-transform the count matrix. The frame is transposed first, so the
# transformer receives one row per original column and one column per gene.
x = gencounts_EMATB6142.T.values

quantile_scaler = QuantileTransformer()
X_std = quantile_scaler.fit_transform(x.astype(float))

# Transpose back and re-attach the original row and column labels.
gencounts_EMATB6142_Qnorm = pandas.DataFrame(
    data=X_std.T,
    index=gencounts_EMATB6142.index,
    columns=gencounts_EMATB6142.columns,
)

gencounts_EMATB6142_Qnorm.head(10)

Unnamed: 0,S1_G1,S2_G1,S3_G1,S4_G1,S5_G1,S6_G1,S7_G1,S8_G1,S9_G1,S10_G1,...,S87_G2M,S88_G2M,S89_G2M,S90_G2M,S91_G2M,S92_G2M,S93_G2M,S94_G2M,S95_G2M,S96_G2M
TSPAN6,0.3473684,1e-07,0.526464,0.03131861,0.3050698,0.1526527,0.9789371,0.01029057,0.05297821,0.1526527,...,0.4210878,0.4945908,0.0421246,0.4369369,0.473536,0.2738559,0.4525139,0.221019,0.6156156,0.8947715
TNMD,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
DPM1,0.3683954,0.9890796,0.1736737,0.6896897,0.2945035,0.08958959,0.1052632,0.04754755,0.6416416,0.2631579,...,0.1157895,0.8105044,0.6210526,0.3895449,0.2316756,0.9473313,0.2787788,0.6896897,0.5735736,0.3313313
SCYL3,0.7157157,0.8683684,1e-07,1e-07,0.547047,0.547047,0.8108108,1e-07,0.8683684,0.547047,...,0.8108108,1e-07,0.7157157,0.547047,0.7157157,1e-07,1e-07,1e-07,0.9314314,0.547047
C1orf112,0.915836,0.9999999,1e-07,1e-07,0.821368,0.4843728,1e-07,1e-07,1e-07,0.8946219,...,0.8683684,0.568514,0.8426397,0.652642,0.8683684,0.3579496,1e-07,1e-07,0.9052632,0.631414
FGR,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
CFH,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.9894737,1e-07,1e-07,1e-07
FUCA2,0.9104104,1e-07,1e-07,0.9890842,0.2632633,0.7893646,0.4630937,0.4159159,0.4894895,0.705092,...,0.6315873,0.4738216,0.9789474,0.1683129,0.2367367,0.8947837,0.8738128,0.7789999,0.9104104,0.3948949
GCLC,1e-07,1e-07,1e-07,1e-07,0.9789681,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.9367956,1e-07,1e-07,1e-07
NFYA,1e-07,0.8368368,1e-07,1e-07,1e-07,1e-07,1e-07,0.6051051,1e-07,1e-07,...,0.505171,1e-07,1e-07,0.7266548,0.7788708,0.9259958,0.6051051,0.8633743,0.5788832,0.457958


In [11]:
# Score the quantile-transformed E-MTAB-6142 matrix against the marker pairs
# obtained from sandbag and predict a cell-cycle phase per sample.
EMATB6142_prediction = pairs.cyclone(gencounts_EMATB6142_Qnorm, cc_marker, verbose=True)

[__set_matrix] Original Matrix 'x' has shape 59838 x 96
[__set_matrix] Matrix truncation done. Working with 59838 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 121 marker pairs. 11520 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 32, G2M: 51, S: 13


In [12]:
# Turn the cyclone result into a per-sample table and display it.
# NOTE(review): helper is a project module not shown here; per the rendered
# output the table holds G1/G2M/S scores, their *_norm variants and the
# predicted phase for each sample.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.995,0.0,1.0,0.498747,0.0,0.501253,G1
S2_G1,0.999,0.172,0.0,0.853117,0.146883,0.0,G1
S3_G1,0.949,0.6,0.0,0.612653,0.387347,0.0,G1
S4_G1,0.992,0.719,0.0,0.579778,0.420222,0.0,G1
S5_G1,0.537,0.457,0.978,0.272312,0.231744,0.495943,G1
S6_G1,0.747,0.325,1.0,0.360521,0.156853,0.482625,G1
S7_G1,0.93,0.973,0.0,0.488702,0.511298,0.0,G2M
S8_G1,0.999,0.004,0.528,0.652515,0.002613,0.344873,G1
S9_G1,1.0,0.82,0.0,0.549451,0.450549,0.0,G1
S10_G1,0.989,0.674,0.0,0.594708,0.405292,0.0,G1


In [13]:
# Ground-truth phase labels: 32 x 'G1', then 32 x 'S', then 32 x 'G2M'.
# NOTE(review): this hard-codes the sample order and group sizes -- it assumes
# the samples are stored as three consecutive groups of 32; verify against
# the column names of the count matrix.
EMATB6142_labels = [phase for phase in ('G1', 'S', 'G2M') for _ in range(32)]

In [14]:
# Compare the predicted phases with the ground-truth labels. The helper
# prints per-phase F1 score, recall and precision (see the cell output); its
# return value is kept for plotting.
EMATB6142_evaluation = helper.evaluate_prediction(EMATB6142_prediction_table, EMATB6142_labels)

F1 Score: G1: 0.8125, S: 0.4, G2M: 0.6506024096385543
Reacall: G1: 0.8125, S: 0.28125, G2M: 0.84375 
Precision: G1: 0.8125, S: 0.6923076923076923, G2M: 0.5294117647058824 


In [15]:
# Plot the evaluation result per phase; the stored evaluation is unpacked into
# the helper's leading positional arguments.
helper.plot_evaluation(*EMATB6142_evaluation, xaxis=["G1","S","G2M"], xaxislbl="Phase", average=True)