# Internal testing

Here we trained and predicted on the sample dataset to evaluate the optimal fraction parameter and to decide whether we should only consider cell-cycle-annotated marker pairs.

<div id="toc"></div>

## Necessary Imports

In [14]:
%%javascript
// Fetch and run a script from an external GitHub Pages site inside the notebook page.
// NOTE(review): presumably this fills the `#toc` div with a table of contents -- confirm,
// and consider vendoring the script, since it is executed from a third-party host.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import sys
# Add the directory three levels up (relative path) to the module search path.
base = "./../../../"
sys.path.append(base)

In [3]:
import sys
# Relative locations of the code and data directories.
code = "./../../code/"
# `data` is used as a string prefix when input file paths are built (data + "<file name>").
data = "./../../data/"
# Extend the module search path before the imports below
# (presumably `pypairs` and `helper` are found through `code` -- confirm).
sys.path.append(code)
import pandas
import pypairs as pairs
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA as sklearnPCA
import plotly.graph_objs as go
import numpy as np
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
from pandas import Series
import helper
import timeit

# Initialise plotly's offline notebook mode; the figures in this notebook are shown with `iplot`.
init_notebook_mode(connected=True)

## Loading the oscope dataset

In [5]:
# Load the Oscope expression matrix; the CSV column "Unnamed: 0" becomes the index.
gencounts_oscope = (
    pandas.read_csv(data + "GSE64016_H1andFUCCI_normalized_EC_human.csv")
    .set_index("Unnamed: 0")
)

# Keep only the columns whose name carries a phase tag (G1_, S_ or G2_),
# preserving their original order.
is_annotated = [
    any(tag in c for tag in ("G1_", "G2_", "S_"))
    for c in gencounts_oscope.columns
]
gencounts_oscope_sorted = gencounts_oscope.loc[:, is_annotated]

# Number of columns per tag. These are counts, so they get their own names
# instead of reusing `is_G1`/`is_S`/`is_G2M`, which hold index lists elsewhere.
n_G1 = sum("G1_" in c for c in gencounts_oscope.columns)
n_S = sum("S_" in c for c in gencounts_oscope.columns)
n_G2M = sum("G2_" in c for c in gencounts_oscope.columns)
n_NA = sum("H1_" in c for c in gencounts_oscope.columns)

height = [n_G1, n_S, n_G2M, n_NA]

# The trace list must not be called `data`: that name holds the data-directory
# prefix that other cells use to build file paths.
phase_bars = [go.Bar(
    x=["G1", "S", "G2M", "Not annotated"],
    y=height,
    marker=dict(
        color=['blue', 'orange', 'green', 'gray'],
    )
)]

layout = go.Layout(
    title='Distribution of phases within the oscope dataset',
    xaxis=dict(
        title='Phase',
    ),
    yaxis=dict(
        title='No. of samples',
    )
)

fig = go.Figure(data=phase_bars, layout=layout)
iplot(fig)

## PCA

In [6]:
# Phase label for every annotated column. Tags are tested in the order
# G1_, S_, G2_ and the first match wins; a column matching no tag is skipped.
_PHASE_TAGS = (("G1_", "G1"), ("S_", "S"), ("G2_", "G2M"))

phases = Series([
    next(label for tag, label in _PHASE_TAGS if tag in column)
    for column in gencounts_oscope_sorted.columns
    if any(tag in column for tag, _ in _PHASE_TAGS)
])

In [7]:
# Transpose the matrix and append the phase labels as the last column.
oscope_dataset_sorted_trans = gencounts_oscope_sorted.T.assign(phase=phases.values)

# Split the label column (last) from the expression columns (all others).
expression_part = oscope_dataset_sorted_trans.iloc[:, :-1]
label_part = oscope_dataset_sorted_trans.iloc[:, -1]
X = expression_part.values
y = label_part.values

# Quantile-transform the expression values (cast to float first).
X_std = QuantileTransformer().fit_transform(X.astype(float))

In [8]:
# Fit a two-component PCA on the quantile-transformed matrix and keep the
# projected coordinates for plotting.
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)

In [9]:
# One scatter trace per phase, drawn from the two PCA components.
# Plain dicts replace the deprecated go.Marker / go.Line / go.Data / go.XAxis /
# go.YAxis wrappers; plotly accepts them for the same properties.
colors = ['blue', 'orange', 'green']

traces = [
    go.Scatter(
        x=Y_sklearn[y == name, 0],
        y=Y_sklearn[y == name, 1],
        mode='markers',
        name=name,
        # The phase colour is applied to the marker outline, as before.
        marker=dict(
            size=5,
            line=dict(
                color=color,
                width=0.5)
        ))
    for name, color in zip(('G1', 'S', 'G2M'), colors)
]

layout = go.Layout(
    title='PCA of the (annotated) Oscope dataset',
    xaxis=dict(title='PC1', showline=False),
    yaxis=dict(title='PC2', showline=False)
)
# The traces are passed directly instead of being bound to `data`, which holds
# the data-directory prefix that other cells use to build file paths.
fig = go.Figure(data=traces, layout=layout)
iplot(fig, filename='oscope_pca')

## 5-fold cross validation

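Here we used fractions in the range of 0.4 to 0.8 and all genes. For every fraction, five random splits are evaluated; each split trains on about 2/3 of the samples and tests on the remaining samples.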

Note: this code takes a long time to run; the results are stored with the `%store` magic.

In [14]:
# Repeated random-split validation over all genes.
# For every fraction, five random splits are drawn; marker pairs are learned on
# the training columns with `sandbag` and the held-out columns are classified
# with `cyclone`. The per-phase scores of the five runs are then averaged.
avg_f1 = []
avg_recall = []
avg_precision = []

frac = [0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5, 0.45, 0.4]
no_samples = len(gencounts_oscope_sorted.columns)

for f in tqdm(frac):
    print("### Using Fraction of: {}".format(f))

    cross_f1_scores = []
    cross_recall_scores = []
    cross_precision_scores = []

    for i in tqdm(range(0, 5)):
        print("#### Cross run no.: {}".format(i+1))

        # Column positions of the training samples (about 2/3 of all columns).
        # NOTE(review): no seed is set here, so splits differ between runs
        # unless helper.random_subset seeds internally -- confirm.
        sub = helper.random_subset(range(0, no_samples), (2 / 3 * no_samples))
        training_set = gencounts_oscope_sorted.iloc[:, sub]

        # Define annotation: column positions inside the training set per phase.
        is_G1 = [training_set.columns.get_loc(c) for c in training_set.columns if "G1_" in c]
        is_S = [training_set.columns.get_loc(c) for c in training_set.columns if "S_" in c]
        is_G2M = [training_set.columns.get_loc(c) for c in training_set.columns if "G2_" in c]

        annotation = {
            "G1": list(is_G1),
            "S": list(is_S),
            "G2M": list(is_G2M)
        }

        # Use the imported `pairs` module (pypairs) and its `verbose` keyword,
        # matching the cell-cycle-genes run further down.
        # processes=0: the recorded log shows 15 worker processes for this
        # setting; presumably 0 means "use all available" -- confirm.
        marker = pairs.sandbag(training_set, phases=annotation, fraction=f, processes=0, verbose=True)

        # Everything that was not used for training is the test set.
        training_positions = set(sub)
        rev_sub = [idx for idx in range(0, no_samples) if idx not in training_positions]
        testing_set = gencounts_oscope_sorted.iloc[:, rev_sub]

        prediction = pairs.cyclone(testing_set, marker, verbose=True)

        # Reference labels for the test columns, derived from the column names.
        label = []

        for c in testing_set.columns:
            if "G1_" in c:
                label.append("G1")
            elif "S_" in c:
                label.append("S")
            elif "G2_" in c:
                label.append("G2M")

        pred_table = helper.get_prediction_table(prediction)

        f1, recall, precision = helper.evaluate_prediction(pred_table, label=label)

        cross_f1_scores.append(f1)
        cross_recall_scores.append(recall)
        cross_precision_scores.append(precision)

    # Average each of the three per-phase scores over the runs, then average
    # those three means into one number per fraction.
    avg_f1.append(np.average([np.average([run[k] for run in cross_f1_scores]) for k in range(0, 3)]))
    avg_recall.append(np.average([np.average([run[k] for run in cross_recall_scores]) for k in range(0, 3)]))
    avg_precision.append(np.average([np.average([run[k] for run in cross_precision_scores]) for k in range(0, 3)]))

### Using Fraction of: 0.8


#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2242 genes that were not expressed in any samples. 16842 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16842 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 403 marker pairs (phase: count): {'G1': 318, 'S': 42, 'G2M': 43}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 403 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): None: 34, G1: 48



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



F1 Score: G1: 0.8292682926829268, S: 0.0, G2M: 0.0
Reacall: G1: 1.0, S: 0.0, G2M: 0.0 
Precision: G1: 0.7083333333333334, S: 0.0, G2M: 0.0 
#### Cross run no.: 2
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2242 genes that were not expressed in any samples. 16842 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16842 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 424 marker pairs (phase: count): {'G1': 359, 'S': 43, 'G2M': 22}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 424 marker pairs remaining.
[cyclone] Calculating scores and predicting cell

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2262 genes that were not expressed in any samples. 16822 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16822 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 1197 marker pairs (phase: count): {'G1': 828, 'S': 203, 'G2M': 166}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 1197 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 26, S: 30, G1: 26
F1 Score: G1: 0.9811320754716981

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2237 genes that were not expressed in any samples. 16847 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16847 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 3632 marker pairs (phase: count): {'G1': 1696, 'S': 1464, 'G2M': 472}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 3632 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): S: 25, G2M: 23, G1: 34
F1 Score: G1: 1.0, S: 0.92, G2

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2267 genes that were not expressed in any samples. 16817 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16817 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 21622 marker pairs (phase: count): {'G1': 8363, 'S': 9494, 'G2M': 3765}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 21622 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 21, S: 35, G1: 26
F1 Score: G1: 1.0, S: 0.906

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2250 genes that were not expressed in any samples. 16834 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16834 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 183123 marker pairs (phase: count): {'G1': 67272, 'S': 81280, 'G2M': 34571}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 183123 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 25, S: 23, G1: 34
F1 Score: G1: 1.0, S: 

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2275 genes that were not expressed in any samples. 16809 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16809 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 883409 marker pairs (phase: count): {'G1': 320527, 'S': 364521, 'G2M': 198361}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 883409 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 26, S: 28, G1: 28
F1 Score: G1: 1.0, 

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2253 genes that were not expressed in any samples. 16831 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16831 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 5417495 marker pairs (phase: count): {'G1': 1828801, 'S': 2124282, 'G2M': 1464412}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 5417495 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 15, None: 12, S: 20, G1: 35
F1 S

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2261 genes that were not expressed in any samples. 16823 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16823 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 8647944 marker pairs (phase: count): {'G1': 2814969, 'S': 3346403, 'G2M': 2486572}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 8647944 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): None: 47, G2M: 3, S: 8, G1: 24
F1 Sco

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 2235 genes that were not expressed in any samples. 16849 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 16849 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 15 processes...
 Done!
[sandbag] Identified 13062237 marker pairs (phase: count): {'G1': 5896350, 'S': 4405680, 'G2M': 2760207}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 13062237 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): None: 46, S: 6, G1: 30
F1 Score: G1

In [15]:
# Persist the averaged scores with IPython's %store magic so that they can be
# restored with `%store -r` without re-running the cross validation.
%store avg_f1
%store avg_recall
%store avg_precision

Stored 'avg_f1' (list)
Stored 'avg_recall' (list)
Stored 'avg_precision' (list)


## Plotting results all genes

In [10]:
%store -r

In [6]:
# Fractions in the same (descending) order in which the cross-validation loop
# appended the averaged scores, so x and y line up element by element.
frac = [0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5, 0.45, 0.4]


def _score_trace(scores, name, symbol, color):
    """Return a lines+markers trace of `scores` plotted against `frac`."""
    return go.Scatter(
        x=frac,
        y=scores,
        mode='lines+markers',
        marker=dict(
            symbol=symbol,
            size=10,
            color=color,
        ),
        name=name
    )


# Create traces
trace0 = _score_trace(avg_f1, 'Average F1-Score', 'circle', 'red')
trace1 = _score_trace(avg_recall, 'Average Recall-Score', 'square', 'blue')
trace2 = _score_trace(avg_precision, 'Average Precision-Score', 'triangle-up', 'green')

layout = go.Layout(
    title='Parameter tuning of fraction',
    xaxis=dict(
        title='fraction f',
    ),
    yaxis=dict(
        title='F1, Recall, Precision Score',
    )
)
# The figure is bound to `fig`, not `data`: `data` holds the data-directory
# prefix that other cells use to build file paths.
fig = go.Figure(data=[trace0, trace1, trace2], layout=layout)

iplot(fig, filename='oscope_fraction')

## Cell Cycle genes only

Note: this code takes a long time to run; the results are stored with the `%store` magic.

In [4]:
# Both inputs are read inside `with` blocks so the file handles are closed.
# `data` must be the data-directory prefix string here.

# go_0007049_homoSapiens.csv: one entry per line, line terminators removed.
with open(data + "go_0007049_homoSapiens.csv", "r") as go_file:
    go_0007049 = [line.replace("\n", "").replace("\r", "") for line in go_file]

# cyclebase_top1000_genes.tsv: first tab-separated field of every line except
# the first line.
with open(data + "cyclebase_top1000_genes.tsv", "r") as cyclebase_file:
    cycle_base = [line.split("\t")[0] for i, line in enumerate(cyclebase_file) if i > 0]

# Sorted union of both lists without duplicates.
cycle_genes = np.unique(np.concatenate((go_0007049, cycle_base), 0))
print(len(cycle_genes))
cycle_genes

2561


array(['A0A0U1RQJ8', 'A1BG', 'AAAS', ..., 'ccny-cdk14_human',
       'cdk5-p35_human', 'escrt-iii_human'], dtype='<U18')

In [5]:
# Number of distinct cell-cycle gene names that also occur in the index of the
# expression matrix.
genes_in_matrix = set(gencounts_oscope_sorted.index.tolist())
len(genes_in_matrix.intersection(cycle_genes))

2395

In [8]:
# Repeated random-split validation restricted to the cell-cycle gene list.
# Per fraction: five random splits, `sandbag` on the training columns
# (subset_genes=cycle_genes), `cyclone` on the held-out columns, then the
# per-phase scores of the five runs are averaged.
_PHASE_BY_TAG = (("G1_", "G1"), ("S_", "S"), ("G2_", "G2M"))


def _mean_over_phases_cc(runs):
    """Average each of the first three per-phase scores over `runs`, then average those three means."""
    return np.average([
        np.average([run[phase_idx] for run in runs])
        for phase_idx in range(0, 3)
    ])


avg_f1_cc, avg_recall_cc, avg_precision_cc = [], [], []

frac = [0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5, 0.45, 0.4]
for f in tqdm(frac):
    print("### Using Fraction of: {}".format(f))

    cross_f1_scores, cross_recall_scores, cross_precision_scores = [], [], []

    for i in tqdm(range(0, 5)):
        print("#### Cross run no.: {}".format(i+1))

        # Column positions of the training samples (about 2/3 of all columns).
        no_samples = len(gencounts_oscope_sorted.columns)
        sub = helper.random_subset(range(0, no_samples), (2 / 3 * no_samples))
        training_set = gencounts_oscope_sorted.iloc[:, sub]

        # Define annotation: column positions inside the training set per phase.
        annotation = {
            phase: [training_set.columns.get_loc(c) for c in training_set.columns if tag in c]
            for tag, phase in _PHASE_BY_TAG
        }

        marker = pairs.sandbag(
            training_set,
            phases=annotation,
            fraction=f,
            subset_genes=list(cycle_genes),
            processes=10,
            verbose=True,
        )

        # Everything that was not used for training is the test set.
        rev_sub = [idx for idx in range(0, no_samples) if idx not in sub]
        testing_set = gencounts_oscope_sorted.iloc[:, rev_sub]

        prediction = pairs.cyclone(testing_set, marker, processes=10, verbose=True)

        # Reference labels for the test columns: first matching tag wins,
        # columns without a tag are skipped.
        label = [
            next(phase for tag, phase in _PHASE_BY_TAG if tag in c)
            for c in testing_set.columns
            if any(tag in c for tag, _ in _PHASE_BY_TAG)
        ]

        pred_table = helper.get_prediction_table(prediction)

        f1, recall, precision = helper.evaluate_prediction(pred_table, label=label)

        cross_f1_scores.append(f1)
        cross_recall_scores.append(recall)
        cross_precision_scores.append(precision)

    avg_f1_cc.append(_mean_over_phases_cc(cross_f1_scores))
    avg_recall_cc.append(_mean_over_phases_cc(cross_recall_scores))
    avg_precision_cc.append(_mean_over_phases_cc(cross_precision_scores))

### Using Fraction of: 0.8


#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 71 genes that were not expressed in any samples. 2324 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2324 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 122 marker pairs (phase: count): {'G1': 103, 'S': 13, 'G2M': 6}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 122 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and p


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



F1 Score: G1: 0.9508196721311475, S: 0.7228915662650602, G2M: 0.0
Reacall: G1: 0.90625, S: 1.0, G2M: 0.0 
Precision: G1: 1.0, S: 0.5660377358490566, G2M: 0.0 
#### Cross run no.: 2
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 74 genes that were not expressed in any samples. 2321 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2321 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 94 marker pairs (phase: count): {'G1': 77, 'S': 14, 'G2M': 3}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 77 genes that were not expressed in any samples. 2318 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2318 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 225 marker pairs (phase: count): {'G1': 135, 'S': 61, 'G2M': 29}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 225 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and 

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 70 genes that were not expressed in any samples. 2325 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2325 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 628 marker pairs (phase: count): {'G1': 308, 'S': 231, 'G2M': 89}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 628 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 76 genes that were not expressed in any samples. 2319 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2319 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 2452 marker pairs (phase: count): {'G1': 901, 'S': 1097, 'G2M': 454}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 2452 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 76 genes that were not expressed in any samples. 2319 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2319 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 10957 marker pairs (phase: count): {'G1': 3706, 'S': 4854, 'G2M': 2397}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 10957 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated sc

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 74 genes that were not expressed in any samples. 2321 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2321 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 41439 marker pairs (phase: count): {'G1': 14541, 'S': 17426, 'G2M': 9472}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 41439 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated 

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 77 genes that were not expressed in any samples. 2318 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2318 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 165589 marker pairs (phase: count): {'G1': 58574, 'S': 65963, 'G2M': 41052}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 165589 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculat

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 72 genes that were not expressed in any samples. 2323 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2323 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 246415 marker pairs (phase: count): {'G1': 87488, 'S': 106907, 'G2M': 52020}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 246415 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calcula

#### Cross run no.: 1
[__set_matrix] Original Matrix 'x' has shape 19084 x 165
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 81 genes that were not expressed in any samples. 2314 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 165 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2314 genes for 165 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 365730 marker pairs (phase: count): {'G1': 162092, 'S': 118703, 'G2M': 84935}
[__set_matrix] Original Matrix 'x' has shape 19084 x 82
[__set_matrix] Matrix truncation done. Working with 19084 genes for 82 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 365730 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calcul

In [12]:
# Persist the averaged cell-cycle-only scores with IPython's %store magic so
# that they can be restored with `%store -r` without re-running the loop above.
%store avg_f1_cc
%store avg_recall_cc
%store avg_precision_cc

Stored 'avg_f1_cc' (list)
Stored 'avg_recall_cc' (list)
Stored 'avg_precision_cc' (list)


## Plotting results cc only

In [None]:
%store -r

In [14]:

# Fractions in the (descending) order in which the cross-validation loop
# appended avg_f1_cc / avg_recall_cc / avg_precision_cc.
frac = [0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5, 0.45, 0.4]


def _trace_by_fraction(scores, name, symbol, color):
    """Return a lines+markers trace of `scores` against `frac`, ordered by ascending fraction.

    Each score is paired with its own fraction before sorting. Sorting `frac`
    alone would attach every score to the wrong fraction, because the score
    lists are in descending-fraction order.
    """
    paired = sorted(zip(frac, scores), key=lambda pair: pair[0])
    return go.Scatter(
        x=[fraction for fraction, _ in paired],
        y=[score for _, score in paired],
        mode='lines+markers',
        marker=dict(
            symbol=symbol,
            size=10,
            color=color,
        ),
        name=name
    )


# Create traces
trace0 = _trace_by_fraction(avg_f1_cc, 'Average F1-Score', 'circle', 'red')
trace1 = _trace_by_fraction(avg_recall_cc, 'Average Recall-Score', 'square', 'blue')
trace2 = _trace_by_fraction(avg_precision_cc, 'Average Precision-Score', 'triangle-up', 'green')

layout = go.Layout(
    title='Parameter tuning of fraction',
    xaxis=dict(
        title='fraction f',
    ),
    yaxis=dict(
        title='F1, Recall, Precision Score',
    )
)
# The figure is bound to `fig`, not `data`: `data` holds the data-directory
# prefix that other cells use to build file paths.
fig = go.Figure(data=[trace0, trace1, trace2], layout=layout)

iplot(fig, filename='oscope_fraction')