# Testing weighted triplets

<div id="toc"></div>

## Necessary Imports

In [18]:
%%javascript
// Fetch and run the table-of-contents helper script; presumably it fills the
// <div id="toc"> element declared at the top of this notebook — verify.
// NOTE(review): this executes third-party JavaScript fetched at runtime;
// consider vendoring the script if the notebook must work offline.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [19]:
import sys
# Relative locations of the project's code and data directories; `data` is
# reused by later cells as the prefix for every input file.
code = "./../../code/"
data = "./../../data/"
# Make the code directory importable. Presumably the `pypairs` and `helper`
# imports below are resolved from it, so this must run first — verify.
sys.path.append(code)
import pandas
import pypairs as pairs
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): duplicate of the QuantileTransformer import two lines up;
# harmless but redundant.
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
import helper
import timeit

# Set up plotly's offline notebook mode so that the `iplot` calls further down
# can render figures inline in the notebook.
init_notebook_mode(connected=True)

## Load oscope marker pairs

In [20]:
# Weighted marker pairs that serve as input for the triplet search below.
cc_marker = helper.load_ocope_marker(
    data,
    fraction=0.5,
    weighted=True,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 149824 marker pairs (phase: count): {'G1': 50750, 'S': 59280, 'G2M': 39794}


## Identify Triplets

In [21]:
# Combine the marker pairs into weighted triplets.
triplets = pairs.identify_triplets(
    cc_marker,
    weighted=True,
    fraction=0.21,
)

In [22]:
# Report how many triplets were identified for each phase.
for phase_name, phase_triplets in triplets.items():
    n_triplets = len(phase_triplets)
    print("{} contains {} triples".format(phase_name, n_triplets))

G1 contains 176029 triples
S contains 328590 triples
G2M contains 74502 triples


## Predict on ML sc dataset

In [23]:
# Load the E-MTAB-6142 expression table (';'-separated) and index its rows by
# the "Gene_ID" column. Chaining `set_index` instead of `inplace=True` keeps
# the cell idempotent on re-run.
gencounts_EMATB6142 = pandas.read_csv(
    Path(data) / "E-MTAB-6142_human.csv", sep=';'
).set_index("Gene_ID")

# Build a lookup from the first to the second comma-separated field of every
# line in the BioMart export (presumably gene ID -> gene name — verify against
# the file header).
gene_map = {}
with open(Path(data) / "biomart_human-genes.txt", "r") as f:
    for line in f:
        info = line.rstrip("\r\n").split(",")
        if len(info) < 2:
            # Blank or single-field line: nothing to map. Previously this
            # raised an IndexError and aborted the whole cell.
            continue
        gene_map[info[0]] = info[1]

# Translate every row label through `gene_map`. Anything after the first "."
# in a label (presumably a version suffix) is ignored for the lookup. Labels
# without a mapping are deliberately kept unchanged.
index_list = [
    gene_map.get(gene_id.split(".", 1)[0], gene_id)
    for gene_id in gencounts_EMATB6142.index.tolist()
]
gencounts_EMATB6142.index = index_list

# NOTE(review): labels that collide after the mapping are NOT removed; the
# filter below was disabled by the original author and is kept for reference.
#gencounts_EMATB6142 = gencounts_EMATB6142[~gencounts_EMATB6142.index.duplicated(keep=False)]

# QuantileTransformer works column-wise, so transpose to (samples x genes),
# transform, and transpose back when rebuilding the frame.
x = gencounts_EMATB6142.T.values

# A fixed random_state keeps the transform reproducible should the
# transformer ever subsample the input.
X_std = QuantileTransformer(random_state=0).fit_transform(x.astype(float))

gencounts_EMATB6142_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_EMATB6142.index,
    columns=gencounts_EMATB6142.columns,
)

In [24]:
# Score the quantile-normalised samples against the weighted triplets.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    triplets,
    verbose=True,
    weighted=True,
    processes=0,
    triplets=True,
)

[__set_matrix] Original Matrix 'x' has shape 59838 x 96
[__set_matrix] Matrix truncation done. Working with 59838 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 47877 marker pairs. 579121 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 35, S: 18, G2M: 43


In [25]:
# Convert the raw prediction into a table, then show it as the cell output.
EMATB6142_prediction_table = helper.get_prediction_table(
    EMATB6142_prediction
)
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.995,0.002,0.994,0.499749,0.001005,0.499247,G1
S2_G1,0.988,0.183,0.0,0.843723,0.156277,0.0,G1
S3_G1,1.0,0.695,0.0,0.589971,0.410029,0.0,G1
S4_G1,0.999,0.273,0.0,0.785377,0.214623,0.0,G1
S5_G1,0.371,0.084,0.994,0.256039,0.057971,0.68599,S
S6_G1,0.541,0.003,1.0,0.350389,0.001943,0.647668,G1
S7_G1,0.982,0.965,0.0,0.504366,0.495634,0.0,G1
S8_G1,0.976,0.0,0.666,0.594397,0.0,0.405603,G1
S9_G1,0.999,0.997,0.0,0.500501,0.499499,0.0,G1
S10_G1,0.875,0.945,0.0,0.480769,0.519231,0.0,G2M


In [26]:
# Reference labels: 32 x "G1", then 32 x "S", then 32 x "G2M".
# Assumes the samples are ordered the same way in the expression table — TODO confirm.
EMATB6142_labels = [
    phase_label
    for phase_label in ('G1', 'S', 'G2M')
    for _ in range(32)
]

In [27]:
# Compare the triplet-based predictions with the reference labels.
EMATB6142_evaluation = helper.evaluate_prediction(
    EMATB6142_prediction_table,
    EMATB6142_labels,
)

F1 Score: G1: 0.8059701492537314, S: 0.64, G2M: 0.6933333333333334
Reacall: G1: 0.84375, S: 0.5, G2M: 0.8125 
Precision: G1: 0.7714285714285715, S: 0.8888888888888888, G2M: 0.6046511627906976 


In [28]:
# Plot the evaluation results for the triplet-based prediction, one x-axis entry per phase.
iplot(
    helper.plot_evaluation(
        *EMATB6142_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        average=True,
    )
)

Cross Check

## Regular pairs for comparison

In [37]:
# Baseline run with plain marker pairs instead of triplets.
# NOTE(review): this rebinds `cc_marker` and `EMATB6142_prediction`, which
# earlier cells assigned for the triplet run; re-running those cells after
# this one will pick up the values set here.
cc_marker = helper.load_ocope_marker(
    data,
    fraction=0.6,
    weighted=True,
)
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    cc_marker,
    verbose=True,
    weighted=True,
    processes=0,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 8146 marker pairs (phase: count): {'G1': 2575, 'S': 4101, 'G2M': 1470}
[__set_matrix] Original Matrix 'x' has shape 59838 x 96
[__set_matrix] Matrix truncation done. Working with 59838 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 412 marker pairs. 8146 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (p

In [38]:
# Convert the pair-based prediction into a table, then show it as the cell output.
EMATB6142_prediction_table = helper.get_prediction_table(
    EMATB6142_prediction
)
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.999,0.0,0.998,0.50025,0.0,0.49975,G1
S2_G1,0.995,0.235,0.0,0.808943,0.191057,0.0,G1
S3_G1,0.959,0.299,0.0,0.762321,0.237679,0.0,G1
S4_G1,0.986,0.514,0.0,0.657333,0.342667,0.0,G1
S5_G1,0.268,0.277,0.99,0.174593,0.180456,0.644951,S
S6_G1,0.647,0.062,0.998,0.379028,0.036321,0.584651,G1
S7_G1,0.952,0.928,0.0,0.506383,0.493617,0.0,G1
S8_G1,0.979,0.002,0.401,0.708394,0.001447,0.290159,G1
S9_G1,1.0,0.882,0.0,0.53135,0.46865,0.0,G1
S10_G1,0.839,0.527,0.0,0.614202,0.385798,0.0,G1


In [39]:
# Compare the pair-based predictions with the same reference labels.
EMATB6142_evaluation = helper.evaluate_prediction(
    EMATB6142_prediction_table,
    EMATB6142_labels,
)

F1 Score: G1: 0.8484848484848485, S: 0.4888888888888889, G2M: 0.6666666666666666
Reacall: G1: 0.875, S: 0.34375, G2M: 0.84375 
Precision: G1: 0.8235294117647058, S: 0.8461538461538461, G2M: 0.5510204081632653 


In [40]:
# Plot the evaluation results for the pair-based prediction, one x-axis entry per phase.
iplot(
    helper.plot_evaluation(
        *EMATB6142_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        average=True,
    )
)