# Testing weighted triplets

<div id="toc"></div>

## Necessary Imports

In [1]:
%%javascript
// Fetch and execute a remotely hosted script via jQuery's $.getScript.
// Presumably it fills the <div id="toc"> placeholder with a table of contents - verify.
// NOTE(review): needs network access and runs third-party code in the notebook page.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import sys
# Relative locations of the project's code and data directories.
code = "./../../code/"
data = "./../../data/"
# Extend the module search path before the imports below; presumably the
# project-local modules (`helper`, possibly `pypairs`) live in `code` - TODO confirm.
sys.path.append(code)
import pandas
import pypairs as pairs
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): the next import repeats the QuantileTransformer import two lines above.
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
import helper
import timeit

# Set up plotly's offline notebook mode; done once here, ahead of the `iplot` calls further down.
init_notebook_mode(connected=True)

## Load oscope marker pairs

In [3]:
# Load the weighted marker set in triplet form. The result is a mapping from
# cell-cycle phase to its markers (it is iterated with `.items()` right after).
triplets = helper.load_ocope_marker(
    data,
    fraction=0.6,
    triplets=True,
    weighted=True,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 1584 marker pairs (phase: count): {'G1': 679, 'S': 820, 'G2M': 85}


## Identify Triplets

In [4]:
# Disabled alternative: build the triplets from `cc_marker` via pypairs directly.
# `triplets` is already produced by helper.load_ocope_marker(..., triplets=True) above,
# and `cc_marker` is only defined further down ("Regular pairs for comparison"),
# so re-enabling this line as-is would fail on a fresh kernel.
#triplets = pairs.identify_triplets(cc_marker, weighted=True, fraction=0.21)

In [5]:
# Report how many triplets were loaded for each phase.
for phase_name in triplets:
    phase_triplets = triplets[phase_name]
    print("{} contains {} triples".format(phase_name, len(phase_triplets)))

G1 contains 679 triples
S contains 820 triples
G2M contains 85 triples


## Predict on ML sc dataset

In [6]:
# Load the E-MTAB-6142 count table (semicolon-separated) and index it by gene ID.
gencounts_EMATB6142 = pandas.read_csv(Path(data + "E-MTAB-6142_human.csv"), sep=';')
gencounts_EMATB6142 = gencounts_EMATB6142.set_index("Gene_ID")

# Build a lookup from the first comma-separated column of the biomart export to
# the second one. Line endings are stripped from the whole line up front so keys
# and values are cleaned consistently, and lines with fewer than two fields
# (e.g. a trailing blank line) are skipped instead of raising IndexError.
gene_map = {}
with open(data + "biomart_human-genes.txt", "r") as f:
    for line in f:
        info = line.rstrip("\r\n").split(",")
        if len(info) < 2:
            continue
        gene_map[info[0]] = info[1]

# Translate every index entry through gene_map. Anything from the first "."
# onwards is dropped for the lookup; entries without a mapping keep their
# original value (deliberate best-effort behaviour).
index_list = [
    gene_map.get(gene_id.split(".", 1)[0], gene_id)
    for gene_id in gencounts_EMATB6142.index.tolist()
]

gencounts_EMATB6142.index = index_list
# Disabled: would drop every row whose (mapped) index value occurs more than once.
#gencounts_EMATB6142 = gencounts_EMATB6142[~gencounts_EMATB6142.index.duplicated(keep=False)]

# Transpose so each row of `x` is one column of the count table, then apply
# scikit-learn's QuantileTransformer with its default settings.
# NOTE(review): the transformer works column-wise on `x`, i.e. per original row
# (gene) across samples - confirm this is the intended normalisation.
x = gencounts_EMATB6142.T.values

X_std = QuantileTransformer().fit_transform(x.astype(float))

# Transpose back to the original orientation and restore the row/column labels.
gencounts_EMATB6142_Qnorm = pandas.DataFrame(X_std.T, index=gencounts_EMATB6142.index, columns=gencounts_EMATB6142.columns)

In [7]:
# Classify the quantile-transformed E-MTAB-6142 samples with the weighted triplet markers.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    triplets,
    min_pairs=1,
    verbose=True,
    weighted=True,
    processes=0,
    triplets=True,
)

[__set_matrix] Original Matrix 'x' has shape 59838 x 96
[__set_matrix] Matrix truncation done. Working with 59838 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 115 marker pairs. 1469 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 30, S: 29, G2M: 37


In [8]:
# Turn the cyclone result into a table and display it; the rendered table holds the
# per-phase scores, their normalised counterparts and the predicted phase per sample.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.85657,0.0,0.998,0.46187,0.0,0.53813,G1
S2_G1,0.96,0.094737,0.0,0.91018,0.08982,0.0,G1
S3_G1,0.77,0.243688,0.001,0.758854,0.240161,0.000986,G1
S4_G1,0.727455,0.0,0.0,1.0,0.0,0.0,G1
S5_G1,0.17,0.094439,0.852,0.15227,0.084589,0.763141,S
S6_G1,0.575576,0.026767,0.999,0.359433,0.016715,0.623852,G1
S7_G1,0.903,0.749471,0.0,0.546454,0.453546,0.0,G1
S8_G1,0.834669,0.0,0.339,0.711162,0.0,0.288838,G1
S9_G1,0.947,0.228125,0.0,0.805872,0.194128,0.0,G1
S10_G1,0.865,0.25756,0.0,0.77056,0.22944,0.0,G1


In [9]:
# Ground-truth phases: 32 consecutive samples each for G1, S and G2M (96 in total).
EMATB6142_labels = [
    phase_label
    for phase_label in ('G1', 'S', 'G2M')
    for _ in range(32)
]

In [10]:
# Compare the triplet-based predictions with the known labels. The helper prints F1,
# recall and precision per phase; its return value is unpacked into plot_evaluation next.
EMATB6142_evaluation = helper.evaluate_prediction(EMATB6142_prediction_table, EMATB6142_labels)

F1 Score: G1: 0.8064516129032259, S: 0.7213114754098361, G2M: 0.7246376811594203
Reacall: G1: 0.78125, S: 0.6875, G2M: 0.78125 
Precision: G1: 0.8333333333333334, S: 0.7586206896551724, G2M: 0.6756756756756757 


In [11]:
# Plot the per-phase evaluation metrics of the triplet-based prediction.
iplot(
    helper.plot_evaluation(
        *EMATB6142_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        average=True,
    )
)

Cross Check

## Regular pairs for comparison

In [12]:
# Baseline run: load the weighted markers without the triplets option and classify the same samples.
cc_marker = helper.load_ocope_marker(
    data,
    fraction=0.6,
    weighted=True,
)
# NOTE: this rebinds EMATB6142_prediction, replacing the triplet-based result computed earlier.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    cc_marker,
    verbose=True,
    weighted=True,
    processes=0,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 8146 marker pairs (phase: count): {'G1': 2575, 'S': 4101, 'G2M': 1470}
[__set_matrix] Original Matrix 'x' has shape 59838 x 96
[__set_matrix] Matrix truncation done. Working with 59838 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 412 marker pairs. 7734 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (p

In [13]:
# Tabulate and display the pair-based prediction.
# NOTE: rebinds EMATB6142_prediction_table, replacing the triplet-based table from earlier.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.997,0.0,1.0,0.499249,0.0,0.500751,G1
S2_G1,0.996,0.224,0.0,0.816393,0.183607,0.0,G1
S3_G1,0.943,0.307,0.0,0.7544,0.2456,0.0,G1
S4_G1,0.985,0.499,0.0,0.663747,0.336253,0.0,G1
S5_G1,0.274,0.293,0.995,0.175416,0.18758,0.637004,S
S6_G1,0.676,0.066,0.999,0.388283,0.037909,0.573808,G1
S7_G1,0.93,0.94,0.0,0.497326,0.502674,0.0,G2M
S8_G1,0.984,0.001,0.42,0.700356,0.000712,0.298932,G1
S9_G1,1.0,0.863,0.0,0.536769,0.463231,0.0,G1
S10_G1,0.835,0.496,0.0,0.627348,0.372652,0.0,G1


In [14]:
# Score the pair-based predictions against the same ground-truth labels
# (rebinds EMATB6142_evaluation; the helper prints F1, recall and precision per phase).
EMATB6142_evaluation = helper.evaluate_prediction(EMATB6142_prediction_table, EMATB6142_labels)

F1 Score: G1: 0.8307692307692308, S: 0.4888888888888889, G2M: 0.6585365853658537
Reacall: G1: 0.84375, S: 0.34375, G2M: 0.84375 
Precision: G1: 0.8181818181818182, S: 0.8461538461538461, G2M: 0.54 


In [15]:
# Plot the per-phase evaluation metrics of the pair-based prediction (iplot is asked for an SVG image).
iplot(
    helper.plot_evaluation(
        *EMATB6142_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        average=True,
    ),
    image="svg",
)

## Testing GSE53481

In [16]:
# Read the tab-separated GSE53481 expression table.
gencounts_GSE53481 = pandas.read_csv(Path(data + "GSE53481_humanRNAseq.txt"), sep='\t')

# Keep only the text after the last underscore of every GENE entry
# (rindex raises ValueError for an entry without an underscore).
genes = []
for gene_label in gencounts_GSE53481["GENE"]:
    last_underscore = gene_label.rindex('_')
    genes.append(gene_label[last_underscore + 1:])

# Store the shortened names and use them as the row index.
gencounts_GSE53481 = gencounts_GSE53481.assign(GENE=genes).set_index("GENE")

# Transpose, quantile-transform with scikit-learn's defaults, and transpose back.
x = gencounts_GSE53481.transpose().values
quantile_scaler = QuantileTransformer()
X_std = quantile_scaler.fit_transform(x.astype(float))

gencounts_GSE53481_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_GSE53481.index,
    columns=gencounts_GSE53481.columns,
)

In [17]:
# Classify the GSE53481 samples with the weighted triplet markers.
# NOTE(review): the recorded output of this cell reports 1583 marker pairs removed with
# only 1 remaining, and every score in the resulting table is 0 - the gene identifiers
# of this dataset probably do not match the markers; verify before trusting the result.
GSE53481_prediction = pairs.cyclone(
    gencounts_GSE53481_Qnorm,
    triplets,
    min_pairs=1,
    min_iter=1,
    weighted=True,
    triplets=True,
    verbose=True,
)

[__set_matrix] Original Matrix 'x' has shape 510 x 12
[__set_matrix] Matrix truncation done. Working with 510 genes for 12 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 1583 marker pairs. 1 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): S: 12


In [18]:
# Tabulate and display the GSE53481 prediction (scores, normalised scores, predicted phase).
GSE53481_prediction_table = helper.get_prediction_table(GSE53481_prediction)
helper.DataTable(GSE53481_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
H1.DN,0.0,0.0,0.0,,,,S
H1.KO2,0.0,0.0,0.0,,,,S
H1.AzLow,0.0,0.0,0.0,,,,S
H1.AzHigh,0.0,0.0,0.0,,,,S
H2.DN,0.0,0.0,0.0,,,,S
H2.KO2,0.0,0.0,0.0,,,,S
H2.AzLow,0.0,0.0,0.0,,,,S
H2.AzHigh,0.0,0.0,0.0,,,,S
H3.DN,0.0,0.0,0.0,,,,S
H3.KO2,0.0,0.0,0.0,,,,S


In [19]:
# Expected phase per sample: the same four-label pattern repeated three times (12 labels).
GSE53481_labels = ['G1', 'G1', 'S', 'G2M'] * 3
# Score the predictions against those labels, then plot the per-phase metrics.
GSE53481_evaluation = helper.evaluate_prediction(
    GSE53481_prediction_table,
    GSE53481_labels,
)
iplot(
    helper.plot_evaluation(
        *GSE53481_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
    )
)


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



F1 Score: G1: 0.0, S: 0.4, G2M: 0.0
Reacall: G1: 0.0, S: 1.0, G2M: 0.0 
Precision: G1: 0.0, S: 0.25, G2M: 0.0 


## GSE71456

In [20]:
# Read the tab-separated GSE71456 RPKM table: file column 1 becomes the index and
# file columns 4 through 16 are kept as data columns.
gencounts_GSE71456 = pandas.read_csv(
    Path(data + "GSE71456_Samples_RPKM.csv"),
    sep='\t',
    index_col=0,
    usecols=[1, *range(4, 17)],
)

# Transpose, quantile-transform with scikit-learn's defaults, and transpose back.
# NOTE(review): the recorded output of this cell shows an "invalid value encountered in
# subtract" warning - the input may contain NaN/inf values; worth checking.
x = gencounts_GSE71456.transpose().values
quantile_scaler = QuantileTransformer()
X_std = quantile_scaler.fit_transform(x.astype(float))

gencounts_GSE71456_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_GSE71456.index,
    columns=gencounts_GSE71456.columns,
)


invalid value encountered in subtract



In [21]:
# Classify the GSE71456 samples with the weighted triplet markers.
GSE71456_prediction = pairs.cyclone(
    gencounts_GSE71456_Qnorm,
    triplets,
    min_pairs=1,
    weighted=True,
    triplets=True,
    verbose=True,
)

[__set_matrix] Original Matrix 'x' has shape 63657 x 13
[__set_matrix] Matrix truncation done. Working with 63657 genes for 13 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 6 marker pairs. 1578 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): S: 3, G1: 7, G2M: 3


In [22]:
# Tabulate and display the GSE71456 prediction (scores, normalised scores, predicted phase).
GSE71456_prediction_table = helper.get_prediction_table(GSE71456_prediction)
helper.DataTable(GSE71456_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pES10 h-G1 rep1,0.445445,0.465839,0.717,0.273567,0.286092,0.440341,S
pES10 h-G1 rep2,0.995,0.833853,0.0,0.544057,0.455943,0.0,G1
pES10 d-G1 rep1,0.791,0.742678,0.12,0.478328,0.449107,0.072566,G1
pES10 d-G1 rep2,0.634,0.0,1.0,0.388005,0.0,0.611995,G1
h-pES10 d-G2/M,0.75,0.901468,0.007,0.452225,0.543555,0.004221,G2M
d-pES10 d-G2/M,0.005,1.0,0.002,0.004965,0.993049,0.001986,G2M
pES12 h-G1 rep1,0.699,0.688912,0.003,0.502548,0.495295,0.002157,G1
pES12 h-G1 rep2,0.097,0.21134,0.808,0.086891,0.189315,0.723794,S
pES12 d-G1 rep1,0.988,0.0,0.994,0.498486,0.0,0.501514,G1
pES12 d-G1 rep2,0.967,0.107143,0.053,0.857921,0.095057,0.047022,G1


In [23]:
# Expected phase per sample, in table order: four G1, two G2M, then seven G1 (13 labels).
# There is no S-labelled sample, so S recall/F1 are reported as ill-defined by the metrics.
GSE71456_labels = ['G1'] * 4 + ['G2M'] * 2 + ['G1'] * 7
# Score the predictions against those labels, then plot the per-phase metrics.
GSE71456_evaluation = helper.evaluate_prediction(
    GSE71456_prediction_table,
    GSE71456_labels,
)
iplot(
    helper.plot_evaluation(
        *GSE71456_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
    )
)


F-score is ill-defined and being set to 0.0 in labels with no true samples.


Recall is ill-defined and being set to 0.0 in labels with no true samples.



F1 Score: G1: 0.7777777777777778, S: 0.0, G2M: 0.8
Reacall: G1: 0.6363636363636364, S: 0.0, G2M: 1.0 
Precision: G1: 1.0, S: 0.0, G2M: 0.6666666666666666 
