In [1]:
import sys
# Relative path from this notebook to the directory that serves both as an
# extra import root and as the prefix of every data file path used below.
base = "./../../../"
# Make modules under `base` importable
# (presumably where `pairs_flat_v2` and `helper` live — confirm).
sys.path.append(base)

In [2]:
import json
import pandas
import pairs_flat_v2 as pairs
import helper
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from tqdm import tqdm_notebook as tqdm
from pathlib import Path

# Set tqdm's monitor interval to 0.
# NOTE(review): presumably this switches off tqdm's background monitor thread — confirm against the tqdm docs.
tqdm.monitor_interval = 0
# Initialise plotly's offline notebook mode.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of embedding it — confirm.
init_notebook_mode(connected=True)

In [3]:
def _column_positions(columns, *tags):
    """Return the integer positions of all columns whose name contains any of `tags`.

    Positions are returned in column order. Matching is a plain substring
    test, exactly like the original `"G1_" in c` checks.
    """
    return [pos for pos, name in enumerate(columns) if any(tag in name for tag in tags)]


# Load the GSE64016 expression matrix and index it by the first (unnamed)
# CSV column.
gencounts_oscope = pandas.read_csv(
    Path(base + "data/GSE64016_H1andFUCCI_normalized_EC_human.csv")
).set_index("Unnamed: 0")

# Keep only the columns whose name carries one of the phase tags
# (G1_, G2_ or S_); every other column is dropped.
gencounts_oscope_sorted = gencounts_oscope.iloc[
    :, _column_positions(gencounts_oscope.columns, "G1_", "G2_", "S_")
]

# Per-phase column positions, relative to the subset matrix above.
is_G1 = _column_positions(gencounts_oscope_sorted.columns, "G1_")
is_S = _column_positions(gencounts_oscope_sorted.columns, "S_")
is_G2M = _column_positions(gencounts_oscope_sorted.columns, "G2_")

# Phase name -> list of column positions; passed to sandbag as `phases`.
annotation = {
    "G1": list(is_G1),
    "S": list(is_S),
    "G2M": list(is_G2M)
}

# Candidate gene lists. Both files are read inside `with` blocks so the
# handles are closed once the lists are built.
with open(base + "data/go_0007049_homoSapiens.csv", "r") as go_file:
    # One entry per line, with line terminators stripped.
    go_0007049 = [line.replace("\n", "").replace("\r", "") for line in go_file]

with open(base + "data/cyclebase_top1000_genes.tsv", "r") as cyclebase_file:
    # First tab-separated field of every line except the first (header) line.
    cycle_base = [line.split("\t")[0] for i, line in enumerate(cyclebase_file) if 0 < i]

# Sorted union of both lists with duplicates removed.
cycle_genes = np.unique(np.concatenate((go_0007049, cycle_base), 0))

# Identify marker pairs on the annotated matrix, restricted to `cycle_genes`.
# NOTE(review): the exact meaning of `fraction` and `weighted` is defined in
# pairs_flat_v2 and not visible here — confirm before changing them.
cc_marker = pairs.sandbag(
    gencounts_oscope_sorted,
    phases=annotation,
    subset_genes=list(cycle_genes),
    fraction=0.5,
    processes=10,
    verbose=True,
    weighted=True
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 149824 marker pairs (phase: count): {'G1': 50750, 'S': 59280, 'G2M': 39794}


In [4]:
# Derive marker triplets from the marker pairs found by sandbag.
# The result is a mapping phase -> collection of triplets (it is iterated
# with `.items()` and `len()` in the next cell).
triplets = pairs.identify_triplets(cc_marker, weighted=True)

In [5]:
# Report how many triplets were identified for each phase.
for phase in triplets:
    n_triples = len(triplets[phase])
    print("{} contains {} triples".format(phase, n_triples))

G1 contains 419461 triples
S contains 614079 triples
G2M contains 229517 triples


In [6]:
# Load the E-MTAB-6142 matrix (semicolon-separated) and index it by gene ID.
gencounts_EMATB6142 = pandas.read_csv(
    Path(base + "data/E-MTAB-6142_human.csv"), sep=';'
).set_index("Gene_ID")

# Build the ID -> name lookup from the first two comma-separated fields of
# the biomart export.
gene_map = {}
with open(base + "data/biomart_human-genes.txt", "r") as f:
    for line in f:
        # Strip the line terminator before splitting so that neither the key
        # nor the value keeps a trailing "\r" or "\n" (the value used to keep
        # "\r" on CRLF files).
        info = line.rstrip("\r\n").split(",")
        if len(info) < 2:
            # Blank or malformed line: nothing to map.
            continue
        gene_map[info[0]] = info[1]

# Translate every index entry through `gene_map`. Anything after the first
# "." in an ID is ignored for the lookup; IDs without a mapping are kept
# unchanged (best effort, as before).
index_list = [
    gene_map.get(gene_id.split(".", 1)[0], gene_id)
    for gene_id in gencounts_EMATB6142.index.tolist()
]

gencounts_EMATB6142.index = index_list
# NOTE: duplicated names that may result from the mapping are kept; the
# de-duplication below was already disabled in the original analysis.
#gencounts_EMATB6142 = gencounts_EMATB6142[~gencounts_EMATB6142.index.duplicated(keep=False)]

# Transpose so that rows are the original columns (samples) and columns are
# the original rows (genes) before handing the values to scikit-learn.
x = gencounts_EMATB6142.T.values

# NOTE(review): QuantileTransformer works column-wise, so with this
# orientation each gene is presumably transformed across the samples —
# confirm that this is the intended normalisation direction.
X_std = QuantileTransformer().fit_transform(x.astype(float))

# Back to the original genes x samples layout with the original labels.
gencounts_EMATB6142_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_EMATB6142.index,
    columns=gencounts_EMATB6142.columns
)

In [7]:
# Classify every sample of the quantile-transformed E-MTAB-6142 matrix with
# the triplet markers.
# NOTE(review): processes=0 presumably means "no worker processes" — confirm in pairs_flat_v2.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    triplets,
    verbose=True,
    weighted=True,
    processes=0,
    triplets=True
)

[__set_matrix] Original Matrix 'x' has shape 59838 x 96
[__set_matrix] Matrix truncation done. Working with 59838 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 89407 marker pairs. 1263057 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 39, S: 18, G2M: 39


In [8]:
# Convert the cyclone result into a per-sample table; the rendered output
# shows the columns G1, G2M, S, their *_norm counterparts and `prediction`.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
# Last expression of the cell: render the table via the helper's DataTable view.
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.98,0.001,0.949,0.507772,0.000518,0.49171,G1
S2_G1,0.975,0.082,0.0,0.922422,0.077578,0.0,G1
S3_G1,1.0,0.63,0.0,0.613497,0.386503,0.0,G1
S4_G1,0.999,0.124,0.0,0.889581,0.110419,0.0,G1
S5_G1,0.428,0.006,0.995,0.29951,0.004199,0.696291,S
S6_G1,0.314,0.0,1.0,0.238965,0.0,0.761035,S
S7_G1,0.975,0.95,0.0,0.506494,0.493506,0.0,G1
S8_G1,0.969,0.0,0.546,0.639604,0.0,0.360396,G1
S9_G1,0.998,0.994,0.0,0.501004,0.498996,0.0,G1
S10_G1,0.808,0.982,0.0,0.451397,0.548603,0.0,G2M


In [9]:
# Ground-truth phase per sample: 32 x G1, then 32 x S, then 32 x G2M.
# NOTE(review): assumes the sample columns of the E-MTAB-6142 matrix come in
# exactly this order — confirm against the data file.
EMATB6142_labels = [phase_label for phase_label in ('G1', 'S', 'G2M') for _ in range(32)]

In [10]:
# Compare the predictions with the ground-truth labels. The helper prints
# F1, recall and precision per phase; its return value is unpacked into
# helper.plot_evaluation in the next cell.
EMATB6142_evaluation = helper.evaluate_prediction(EMATB6142_prediction_table, EMATB6142_labels)

F1 Score: G1: 0.7323943661971831, S: 0.6, G2M: 0.7323943661971831
Reacall: G1: 0.8125, S: 0.46875, G2M: 0.8125 
Precision: G1: 0.6666666666666666, S: 0.8333333333333334, G2M: 0.6666666666666666 


In [11]:
# Plot F1, recall and precision per phase. With average=True the returned
# figure also carries "Average ..." line traces (see the output below).
helper.plot_evaluation(
    *EMATB6142_evaluation,
    xaxis=["G1","S","G2M"],
    xaxislbl="Phase",
    average=True
)

{'data': [{'marker': {'color': 'red', 'size': 10, 'symbol': 'circle'},
   'mode': 'markers',
   'name': 'F1-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.73239437, 0.6       , 0.73239437])},
  {'marker': {'color': 'blue', 'size': 10, 'symbol': 'square'},
   'mode': 'markers',
   'name': 'Recall-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.8125 , 0.46875, 0.8125 ])},
  {'marker': {'color': 'green', 'size': 10, 'symbol': 'triangle-up'},
   'mode': 'markers',
   'name': 'Precision-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.66666667, 0.83333333, 0.66666667])},
  {'marker': {'color': 'red', 'size': 10},
   'mode': 'lines',
   'name': 'Average F1',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': [0.6882629107981221, 0.6882629107981221, 0.6882629107981221]},
  {'marker': {'color': 'blue', 'size': 10},
   'mode': 'lines',
   'name': 'Average Recall',
   'type': 'scatter',
   'x': ['G1', 'S', 'G

Cross Check

In [12]:
# Cross check: same classification, but fed with the marker pairs from
# sandbag (`cc_marker`) instead of the derived triplets.
# This overwrites the EMATB6142_prediction produced by the triplet run.
# NOTE(review): `triplets=True` is still passed although `cc_marker` holds
# pairs — confirm this is intended, since it may explain the poor scores below.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    cc_marker,
    verbose=True,
    weighted=True,
    processes=0,
    triplets=True
)

[__set_matrix] Original Matrix 'x' has shape 59838 x 96
[__set_matrix] Matrix truncation done. Working with 59838 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 6680 marker pairs. 149824 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 47, S: 37, G2M: 12


In [13]:
# Rebuild the per-sample table from the cross-check prediction; this
# replaces the table created for the triplet run.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
# Last expression of the cell: render the table via the helper's DataTable view.
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.987207,0.074389,0.971338,0.485607,0.036592,0.477801,G1
S2_G1,1.0,0.014493,0.063784,0.927406,0.013441,0.059153,G1
S3_G1,1.0,0.0,0.483974,0.673866,0.0,0.326134,G1
S4_G1,1.0,0.458606,0.524272,0.504318,0.231283,0.2644,G1
S5_G1,0.807983,0.484459,0.24604,0.525182,0.314894,0.159924,G1
S6_G1,0.223656,0.371398,1.0,0.140218,0.232844,0.626938,S
S7_G1,0.087368,0.420652,0.23444,0.117674,0.566565,0.315761,S
S8_G1,0.977941,0.348387,0.992639,0.421714,0.150234,0.428052,G1
S9_G1,0.390426,0.13587,0.348565,0.446272,0.155304,0.398424,S
S10_G1,0.958378,0.243562,0.615957,0.52719,0.13398,0.33883,G1


In [14]:
# Score the cross-check predictions against the same ground-truth labels;
# this replaces the evaluation of the triplet run.
EMATB6142_evaluation = helper.evaluate_prediction(EMATB6142_prediction_table, EMATB6142_labels)

F1 Score: G1: 0.48101265822784806, S: 0.26086956521739135, G2M: 0.13636363636363635
Reacall: G1: 0.59375, S: 0.28125, G2M: 0.09375 
Precision: G1: 0.40425531914893614, S: 0.24324324324324326, G2M: 0.25 


In [15]:
# Plot per-phase F1, recall and precision of the cross-check run, again with
# the "Average ..." line traces enabled via average=True.
helper.plot_evaluation(
    *EMATB6142_evaluation,
    xaxis=["G1","S","G2M"],
    xaxislbl="Phase",
    average=True
)

{'data': [{'marker': {'color': 'red', 'size': 10, 'symbol': 'circle'},
   'mode': 'markers',
   'name': 'F1-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.48101266, 0.26086957, 0.13636364])},
  {'marker': {'color': 'blue', 'size': 10, 'symbol': 'square'},
   'mode': 'markers',
   'name': 'Recall-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.59375, 0.28125, 0.09375])},
  {'marker': {'color': 'green', 'size': 10, 'symbol': 'triangle-up'},
   'mode': 'markers',
   'name': 'Precision-Score',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': array([0.40425532, 0.24324324, 0.25      ])},
  {'marker': {'color': 'red', 'size': 10},
   'mode': 'lines',
   'name': 'Average F1',
   'type': 'scatter',
   'x': ['G1', 'S', 'G2M'],
   'y': [0.2927486199362919, 0.2927486199362919, 0.2927486199362919]},
  {'marker': {'color': 'blue', 'size': 10},
   'mode': 'lines',
   'name': 'Average Recall',
   'type': 'scatter',
   'x': ['G1', 'S', 'G