In [1]:
import sys
# Relative prefix pointing three directory levels above the working directory.
# It is appended to the module search path and is also the prefix used to build
# the data file paths in the cells below.
# NOTE(review): presumably the project-local modules (pairs_flat_v2, helper)
# live in that directory — confirm.
base = "./../../../"
sys.path.append(base)

In [2]:
import json
import pandas
import pairs_flat_v2 as pairs
import helper
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from tqdm import tqdm_notebook as tqdm
from pathlib import Path

# NOTE(review): presumably intended to switch off tqdm's background monitor;
# `tqdm` here is the `tqdm_notebook` alias imported above — confirm that setting
# the attribute on it has the intended effect.
tqdm.monitor_interval = 0
# Initialise plotly's offline notebook mode so figures can be rendered inline.
init_notebook_mode(connected=True)

In [3]:
# Load the expression matrix and use its first (unnamed) column as the index.
# Chaining set_index avoids mutating the frame in place.
gencounts_oscope = pandas.read_csv(
    Path(base + "data/GSE64016_H1andFUCCI_normalized_EC_human.csv")
).set_index("Unnamed: 0")

# Keep only the columns whose name carries one of the phase tags
# ("G1_", "G2_" or "S_"); the original column order is preserved.
phase_column_mask = [
    any(tag in column for tag in ("G1_", "G2_", "S_"))
    for column in gencounts_oscope.columns
]
gencounts_oscope_sorted = gencounts_oscope.loc[:, phase_column_mask]

# Define annotation: positional column indices (within the subset) per phase.
sorted_columns = list(gencounts_oscope_sorted.columns)
is_G1 = [pos for pos, column in enumerate(sorted_columns) if "G1_" in column]
is_S = [pos for pos, column in enumerate(sorted_columns) if "S_" in column]
is_G2M = [pos for pos, column in enumerate(sorted_columns) if "G2_" in column]

annotation = {
    "G1": list(is_G1),
    "S": list(is_S),
    "G2M": list(is_G2M)
}

# Read the two gene lists inside `with` blocks so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open(base + "data/go_0007049_homoSapiens.csv", "r") as go_file:
    # One entry per line, with line terminators removed.
    go_0007049 = [line.replace("\n", "").replace("\r", "") for line in go_file]

with open(base + "data/cyclebase_top1000_genes.tsv", "r") as cyclebase_file:
    next(cyclebase_file, None)  # skip the first (header) line
    # First tab-separated field of every remaining line; the line terminator is
    # stripped first so a single-column line does not keep a trailing newline.
    cycle_base = [line.rstrip("\r\n").split("\t")[0] for line in cyclebase_file]

# Union of both gene lists without duplicates (np.unique also sorts them).
cycle_genes = np.unique(np.concatenate((go_0007049, cycle_base), 0))

# Identify marker pairs on the phase-annotated subset, restricted to the
# cell-cycle gene list.
cc_marker = pairs.sandbag(
    gencounts_oscope_sorted,
    phases=annotation,
    subset_genes=list(cycle_genes),
    fraction=0.5,
    processes=10,
    verbose=True,
    weighted=True,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 149824 marker pairs (phase: count): {'G1': 50750, 'S': 59280, 'G2M': 39794}


In [4]:
# NOTE(review): in the recorded run this call failed with
# "TypeError: '>' not supported between instances of 'list' and 'int'", so
# `triplets` was never assigned and every later cell that reads it cannot run.
# The cause lies inside pairs.identify_triplets (not visible here); presumably
# it does not accept the weighted marker structure returned by sandbag — confirm
# against the pairs_flat_v2 implementation before re-running.
triplets = pairs.identify_triplets(cc_marker, weighted=True)

TypeError: '>' not supported between instances of 'list' and 'int'

In [None]:
# Print one line per phase with the number of entries stored for it.
for phase_name, phase_triplets in triplets.items():
    n_triplets = len(phase_triplets)
    print("{} contains {} triples".format(phase_name, n_triplets))

In [None]:
# Load the ';'-separated count matrix and index it by the "Gene_ID" column
# (chained instead of in-place so the cell does not mutate a displayed frame).
gencounts_EMATB6142 = pandas.read_csv(
    Path(base + "data/E-MTAB-6142_human.csv"), sep=';'
).set_index("Gene_ID")

# Build a lookup from the first to the second comma-separated field of the
# biomart export.
gene_map = {}

with open(base + "data/biomart_human-genes.txt", "r") as f:
    for line in f:
        # Strip the line terminator once so key and value are cleaned alike.
        info = line.rstrip("\r\n").split(",")
        if len(info) < 2:
            # Blank or malformed line: nothing to map (previously an IndexError).
            continue
        gene_map[info[0]] = info[1]

# Translate each index entry through gene_map. A suffix starting at the first
# "." is dropped before the lookup; entries without a match are kept unchanged.
index_list = [
    gene_map.get(gene_id.split(".", 1)[0], gene_id)
    for gene_id in gencounts_EMATB6142.index.tolist()
]

gencounts_EMATB6142.index = index_list
# NOTE(review): the mapping can yield repeated index labels; the de-duplication
# below was left disabled by the author — confirm whether duplicates matter
# downstream.
#gencounts_EMATB6142 = gencounts_EMATB6142[~gencounts_EMATB6142.index.duplicated(keep=False)]

# Transpose so that the frame's columns become rows before the transform.
x = gencounts_EMATB6142.T.values

X_std = QuantileTransformer().fit_transform(x.astype(float))

# Transpose back and restore the original row/column labels.
gencounts_EMATB6142_Qnorm = pandas.DataFrame(X_std.T, index=gencounts_EMATB6142.index, columns=gencounts_EMATB6142.columns)

In [None]:
# Run cyclone on the quantile-normalised matrix using the identified triplets.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    triplets,
    verbose=True,
    weighted=True,
    processes=0,
    triplets=True,
)

In [None]:
# Convert the cyclone result into a table via the project helper.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
# Last expression of the cell: its value is what the notebook displays.
helper.DataTable(EMATB6142_prediction_table)

In [None]:
# Reference labels: 32 consecutive entries each for G1, S and G2M, in that order.
# NOTE(review): assumes the sample columns are ordered the same way — confirm.
EMATB6142_labels = [
    phase_label
    for phase_label in ("G1", "S", "G2M")
    for _ in range(32)
]

In [None]:
# Score the prediction table against the reference labels. The result is
# unpacked with * when passed to helper.plot_evaluation, so it is a sequence.
EMATB6142_evaluation = helper.evaluate_prediction(EMATB6142_prediction_table, EMATB6142_labels)

In [None]:
# Plot the evaluation results with one x-axis entry per phase.
helper.plot_evaluation(
    *EMATB6142_evaluation,
    xaxis=["G1", "S", "G2M"],
    xaxislbl="Phase",
    average=True,
)

Cross Check

In [None]:
# Cross check: run cyclone again, this time with the sandbag marker pairs.
# This overwrites the earlier EMATB6142_prediction.
# NOTE(review): cc_marker holds marker *pairs*, yet triplets=True is still
# passed — confirm this flag is intended for the cross check.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    cc_marker,
    verbose=True,
    weighted=True,
    processes=0,
    triplets=True,
)

In [None]:
# Rebuild the prediction table from the current EMATB6142_prediction; this
# overwrites the table created earlier in the notebook.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
# Last expression of the cell: its value is what the notebook displays.
helper.DataTable(EMATB6142_prediction_table)

In [None]:
# The prediction table was rebuilt from the cross-check prediction in the cell
# above, so it has to be evaluated again here. Plotting EMATB6142_evaluation
# directly would just re-draw the result of the earlier prediction.
EMATB6142_crosscheck_evaluation = helper.evaluate_prediction(
    EMATB6142_prediction_table, EMATB6142_labels
)
helper.plot_evaluation(
    *EMATB6142_crosscheck_evaluation,
    xaxis=["G1", "S", "G2M"],
    xaxislbl="Phase",
    average=True,
)