# Running Random Forest with Pairs on ML

First evaluated internally on the Oscope dataset, then on the single-cell (sc) dataset [EMATB6142](./../3.%20Evaluation/3.4.2%20Single%20cell%20-%20EMATB6142.ipynb)

<div id="toc"></div>

## Necessary Imports

In [1]:
%%javascript
// NOTE(review): fetches and executes a third-party script at run time (needs network access);
// presumably it renders the table of contents into the `toc` div - consider vendoring a pinned copy.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import sys
# Relative locations of the project's code and data directories, as seen from this notebook.
code = "./../../code/"
data = "./../../data/"
# Extend the module search path with the code directory
# (presumably where the project-local `helper` module lives - confirm).
sys.path.append(code)
import pandas
import pypairs as pairs
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): duplicate of the QuantileTransformer import two lines above; harmless but redundant.
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
import helper
import timeit

# Initialise plotly's offline notebook mode so that `iplot` figures render inline.
init_notebook_mode(connected=True)

## Loading oscope marker pairs

In [3]:
# Derive the Oscope marker pairs through the project helper. `fraction=0.65` is passed
# straight to the helper; the log below reports how many pairs per phase it yields.
oscope_marker_pairs = helper.load_ocope_marker(
    data,
    fraction=0.65,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 1920 marker pairs (phase: count): {'G1': 646, 'S': 920, 'G2M': 354}


## Random Forest with pairs as Markers

In [4]:
# Read the Oscope expression table; the unnamed first CSV column becomes the row index.
oscope_gencounts = pandas.read_csv(
    Path(data + "GSE64016_H1andFUCCI_normalized_EC_human.csv")
).set_index("Unnamed: 0")

# Keep only the columns whose name carries one of the phase tags, in their original order.
phase_tags = ("G1_", "G2_", "S_")
phase_column_positions = [
    oscope_gencounts.columns.get_loc(column)
    for column in oscope_gencounts.columns
    if any(tag in column for tag in phase_tags)
]
oscope_gencounts_sorted = oscope_gencounts.iloc[:, phase_column_positions]

# Empty feature frame with one row per retained sample; filled in by the next cell.
oscope_pairs_features = pandas.DataFrame(index=oscope_gencounts_sorted.columns)

In [5]:
# Phase names; a sample's integer label is the position of its phase in this list.
classes = ["G1", "S", "G2M"]

# Sample names start with "<phase>_"; the data spells the last phase "G2", which is
# mapped to "G2M" before looking up its position in `classes`.
label = [classes.index(i[:i.index('_')].replace("G2", "G2M")) for i in oscope_gencounts_sorted.columns]

# Build every column first and create the frame once: calling `assign` per pair copies
# the whole frame each time, and comparing sample by sample is a slow Python loop.
sample_ids = oscope_gencounts_sorted.columns
feature_columns = {"label": label}

for phase, marker in tqdm(oscope_marker_pairs.items()):
    for pair in tqdm(marker):
        name = "{}_{}-{}".format(phase, *pair)

        # 1 where the first gene of the pair is expressed strictly higher than the second.
        first_higher = oscope_gencounts_sorted.loc[pair[0]] > oscope_gencounts_sorted.loc[pair[1]]
        feature_columns[name] = first_higher.values.astype(int)

# `columns=` pins the insertion order ("label" first, then the pairs) explicitly.
oscope_pairs_features = pandas.DataFrame(
    feature_columns, index=sample_ids, columns=list(feature_columns)
)

oscope_pairs_features.head()





Unnamed: 0,label,G1_AASDHPPT-BIRC6,G1_AASDHPPT-CDK1,G1_AASDHPPT-NUP205,G1_AASDHPPT-TCF7L2,G1_ABCF2-UBE2C,G1_TOB1-ACTR2,G1_ADH4-HIST1H1C,G1_AHR-NUP210,G1_AKAP9-CDK1,...,G2M_UBE2C-UBR5,G2M_UBE2C-USP10,G2M_UBE2C-VMP1,G2M_UBE2C-WNK1,G2M_UBE2C-WTAP,G2M_UBE2C-ZFP36L2,G2M_UBE2C-ZFR,G2M_UBE2C-ZRANB2,G2M_UBE2C-ZWILCH,G2M_UBE2C-ZWINT
G2_Exp1.059,2,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,1,1,0,0
G2_Exp1.069,2,0,0,0,0,0,0,0,1,0,...,0,1,1,1,1,1,1,1,1,1
G2_Exp1.075,2,0,0,1,0,0,0,1,1,0,...,1,1,1,1,1,1,1,1,1,0
G2_Exp1.063,2,1,1,1,1,0,1,0,0,0,...,1,1,1,1,1,0,1,1,0,1
G2_Exp1.029,2,1,0,0,1,0,1,0,0,0,...,1,1,1,1,1,1,1,1,1,1


## Internal validation - Random forest with pairs

In [6]:
# Fixed seed so the random ~75/25 split below is reproducible.
np.random.seed(0)

# One uniform draw per sample; draws <= 0.75 mark the sample as training data.
train_mask = np.random.uniform(0, 1, len(oscope_pairs_features)) <= .75
oscope_pairs_features['is_train'] = train_mask

train = oscope_pairs_features[oscope_pairs_features['is_train']]
test = oscope_pairs_features[~oscope_pairs_features['is_train']]

print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 196
Number of observations in the test data: 51


In [7]:
# Feature columns are all marker-pair columns. The bookkeeping columns are excluded by
# name rather than by position, so the selection stays correct even if `is_train` has
# not been added yet or the column order changes.
features = oscope_pairs_features.columns.drop(["label", "is_train"], errors="ignore")
features

Index(['G1_AASDHPPT-BIRC6', 'G1_AASDHPPT-CDK1', 'G1_AASDHPPT-NUP205',
       'G1_AASDHPPT-TCF7L2', 'G1_ABCF2-UBE2C', 'G1_TOB1-ACTR2',
       'G1_ADH4-HIST1H1C', 'G1_AHR-NUP210', 'G1_AKAP9-CDK1', 'G1_CDC25A-ALMS1',
       ...
       'G2M_UBE2C-UBR5', 'G2M_UBE2C-USP10', 'G2M_UBE2C-VMP1', 'G2M_UBE2C-WNK1',
       'G2M_UBE2C-WTAP', 'G2M_UBE2C-ZFP36L2', 'G2M_UBE2C-ZFR',
       'G2M_UBE2C-ZRANB2', 'G2M_UBE2C-ZWILCH', 'G2M_UBE2C-ZWINT'],
      dtype='object', length=1920)

In [8]:
# Integer phase labels of the training samples as a plain array.
y = train.loc[:, 'label'].values
y

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [9]:
# Random forest on the binary pair features of the training split.
# `random_state=0` fixes the forest's randomness; `n_jobs=10` is a hard-coded worker count.
clf = RandomForestClassifier(
    random_state=0,
    n_jobs=10,
)
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=10,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [10]:
# Integer phase predictions for the held-out samples (indices into the phase list).
clf.predict(test[features])

array([2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [11]:
# Phase names as an array so integer codes can be turned into names by indexing.
classes = np.array(["G1", "S", "G2M"])

predicted_codes = clf.predict(test[features])
preds = classes[predicted_codes]
labels = classes[test['label']]

# Confusion table: rows are the true phase, columns the predicted phase.
pandas.crosstab(
    labels,
    preds,
    rownames=['Actual Phase'],
    colnames=['Predicted Phase'],
)

Predicted Phase,G1,G2M,S
Actual Phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,16,1,0
G2M,0,16,2
S,1,1,14


In [12]:
# Pair each feature name with its importance and rank by importance, highest first.
# Iterating a DataFrame yields its column names, hence `zip(train[features], ...)`.
feature_importance = sorted(
    zip(train[features], clf.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
feature_importance[:25]

[('G1_HMGB2-HIST1H4C', 0.04607449631632277),
 ('G1_AMD1-HIST1H4C', 0.04378223524831897),
 ('G1_PSMD7-HIST1H4C', 0.041126809845305506),
 ('G1_UBE2L3-HIST1H4C', 0.041057402510946875),
 ('G1_DKC1-HIST1H4C', 0.03882549654629427),
 ('G1_MAD2L1-HIST1H4C', 0.03759077740580439),
 ('G1_PPP1R12A-HIST1H4C', 0.03579703895394749),
 ('S_PPP1CA-CKS2', 0.03564986517590482),
 ('G1_PSMA5-HIST1H4C', 0.03446903608327911),
 ('G1_PSMC6-HIST1H4C', 0.033400812412163416),
 ('S_CFL1-CKS2', 0.03259968086493399),
 ('G1_CKS1B-HIST1H4C', 0.030249069332181584),
 ('G2M_UBE2C-GINS2', 0.02598186902164954),
 ('G2M_UBE2C-PHB2', 0.0246685016434847),
 ('G2M_TOP2A-SON', 0.02293695225319157),
 ('G2M_UBE2C-POLD2', 0.02247558533685431),
 ('S_INSR-CKS2', 0.019817008946765868),
 ('S_MIF-CKS2', 0.01833204554375219),
 ('S_CFLAR-CCNB1', 0.01768699459990451),
 ('S_RUVBL1-CKS2', 0.017668333003202648),
 ('S_RRM2-RPS15A', 0.01388032680146967),
 ('G2M_UBE2C-WNK1', 0.013730420106911787),
 ('S_MYH9-ARHGAP11A', 0.01281100561328569),
 ('G2M

## Internal validation - Cyclone

In [13]:
# Run cyclone with the same marker pairs, restricted to the held-out samples,
# so it can be compared with the random forest on the same test set.
oscope_cyclone = pairs.cyclone(
    x=oscope_gencounts_sorted,
    marker_pairs=oscope_marker_pairs,
    subset_samples=test.index.tolist(),
    verbose=True,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 196 samples that were not in 'subset_samples'. 51 samples remaining.
[__set_matrix] Matrix truncation done. Working with 19084 genes for 51 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 0 marker pairs. 1920 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): S: 18, G2M: 15, G1: 18


In [14]:
# `labels` (from the random-forest cell) follows the row order of `test`. Look each
# prediction up by sample name in that same order instead of relying on the iteration
# order of the cyclone result, which would otherwise pair labels with the wrong samples.
preds = np.array([oscope_cyclone["prediction"][sample] for sample in test.index])

# Confusion table: rows are the true phase, columns the phase predicted by cyclone.
pandas.crosstab(labels, preds, rownames=['Actual Phase'], colnames=['Predicted Phase'])

Predicted Phase,G1,G2M,S
Actual Phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,17,0,0
G2M,0,15,3
S,1,0,15


## Predicting on sc dataset - cyclone

In [15]:
# Read the E-MTAB-6142 count table (semicolon separated), indexed by gene ID.
gencounts_EMATB6142 = pandas.read_csv(Path(data + "E-MTAB-6142_human.csv"), sep=';')
gencounts_EMATB6142.set_index("Gene_ID", inplace=True)

# Lookup from the first comma-separated field of each biomart line to the second
# (used below to replace gene IDs by names).
gene_map = {}
with open(data + "biomart_human-genes.txt", "r") as f:
    for line in f:
        info = line.split(",")
        gene_id = info[0].replace("\n","").replace("\r","")
        gene_map[gene_id] = info[1].replace("\n","")

# Translate every index entry. Anything after the first "." (a version suffix,
# presumably) is ignored for the lookup, and IDs without a mapping are kept as they are.
index_list = [
    gene_map.get(gene_id.split(".", 1)[0], gene_id)
    for gene_id in gencounts_EMATB6142.index.tolist()
]

gencounts_EMATB6142.index = index_list

# Drop every gene name that occurs more than once (keep=False removes all occurrences).
has_unique_name = ~gencounts_EMATB6142.index.duplicated(keep=False)
gencounts_EMATB6142 = gencounts_EMATB6142[has_unique_name]
gencounts_EMATB6142.head(10)

Unnamed: 0,S1_G1,S2_G1,S3_G1,S4_G1,S5_G1,S6_G1,S7_G1,S8_G1,S9_G1,S10_G1,...,S87_G2M,S88_G2M,S89_G2M,S90_G2M,S91_G2M,S92_G2M,S93_G2M,S94_G2M,S95_G2M,S96_G2M
TSPAN6,360,5,437,136,328,253,1101,39,157,253,...,391,429,148,397,424,317,403,280,470,725
TNMD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DPM1,111,421,70,179,93,57,60,35,174,91,...,63,228,173,115,87,308,92,179,164,104
SCYL3,2,5,0,0,1,1,3,0,5,1,...,3,0,2,1,2,0,0,0,38,1
C1orf112,179,448,0,0,135,47,0,0,0,159,...,151,68,147,84,151,11,0,0,169,77
FGR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CFH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34,0,0,0
FUCA2,293,0,0,389,65,229,106,95,112,205,...,175,110,375,46,55,291,283,224,293,93
GCLC,0,0,0,0,208,0,0,0,0,0,...,0,0,0,0,0,0,40,0,0,0
NFYA,0,118,0,0,0,0,0,60,0,0,...,10,0,0,98,108,149,60,125,46,2


In [16]:
# Transposed matrix: one row per sample, one column per gene.
x = gencounts_EMATB6142.T.values

# QuantileTransformer works column by column, so each gene is transformed across samples here.
# NOTE(review): the EBV cell further down passes the matrix untransposed (transforming
# within each sample instead) - confirm which axis is intended.
X_std = QuantileTransformer().fit_transform(x.astype(float))

# Back to genes x samples, with the original labels.
gencounts_EMATB6142_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_EMATB6142.index,
    columns=gencounts_EMATB6142.columns,
)

gencounts_EMATB6142_Qnorm.head(10)

Unnamed: 0,S1_G1,S2_G1,S3_G1,S4_G1,S5_G1,S6_G1,S7_G1,S8_G1,S9_G1,S10_G1,...,S87_G2M,S88_G2M,S89_G2M,S90_G2M,S91_G2M,S92_G2M,S93_G2M,S94_G2M,S95_G2M,S96_G2M
TSPAN6,0.3473684,1e-07,0.526464,0.03131861,0.3050698,0.1526527,0.9789371,0.01029057,0.05297821,0.1526527,...,0.4210878,0.4945908,0.0421246,0.4369369,0.473536,0.2738559,0.4525139,0.221019,0.6156156,0.8947715
TNMD,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
DPM1,0.3683954,0.9890796,0.1736737,0.6896897,0.2945035,0.08958959,0.1052632,0.04754755,0.6416416,0.2631579,...,0.1157895,0.8105044,0.6210526,0.3895449,0.2316756,0.9473313,0.2787788,0.6896897,0.5735736,0.3313313
SCYL3,0.7157157,0.8683684,1e-07,1e-07,0.547047,0.547047,0.8108108,1e-07,0.8683684,0.547047,...,0.8108108,1e-07,0.7157157,0.547047,0.7157157,1e-07,1e-07,1e-07,0.9314314,0.547047
C1orf112,0.915836,0.9999999,1e-07,1e-07,0.821368,0.4843728,1e-07,1e-07,1e-07,0.8946219,...,0.8683684,0.568514,0.8426397,0.652642,0.8683684,0.3579496,1e-07,1e-07,0.9052632,0.631414
FGR,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
CFH,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.9894737,1e-07,1e-07,1e-07
FUCA2,0.9104104,1e-07,1e-07,0.9890842,0.2632633,0.7893646,0.4630937,0.4159159,0.4894895,0.705092,...,0.6315873,0.4738216,0.9789474,0.1683129,0.2367367,0.8947837,0.8738128,0.7789999,0.9104104,0.3948949
GCLC,1e-07,1e-07,1e-07,1e-07,0.9789681,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.9367956,1e-07,1e-07,1e-07
NFYA,1e-07,0.8368368,1e-07,1e-07,1e-07,1e-07,1e-07,0.6051051,1e-07,1e-07,...,0.505171,1e-07,1e-07,0.7266548,0.7788708,0.9259958,0.6051051,0.8633743,0.5788832,0.457958


In [17]:
# Cyclone prediction on the normalised single-cell matrix, using the Oscope marker pairs.
EMATB6142_prediction = pairs.cyclone(
    gencounts_EMATB6142_Qnorm,
    oscope_marker_pairs,
    verbose=True,
    processes=0,
)

[__set_matrix] Original Matrix 'x' has shape 56365 x 96
[__set_matrix] Matrix truncation done. Working with 56365 genes for 96 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 88 marker pairs. 1832 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 31, S: 18, G2M: 47


In [18]:
# Turn the cyclone result into a table (per-sample scores and predicted phase, see the
# output below) and render it with the project's DataTable helper.
EMATB6142_prediction_table = helper.get_prediction_table(EMATB6142_prediction)
helper.DataTable(EMATB6142_prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1_G1,0.969,0.0,0.999,0.492378,0.0,0.507622,G1
S2_G1,0.963,0.198397,0.002,0.827749,0.170532,0.001719,G1
S3_G1,0.729,0.239394,0.0,0.752793,0.247207,0.0,G1
S4_G1,0.921,0.396378,0.0,0.699116,0.300884,0.0,G1
S5_G1,0.215,0.46046,0.929,0.134001,0.286988,0.579011,S
S6_G1,0.633,0.168517,0.995,0.352349,0.093802,0.55385,G1
S7_G1,0.887,0.882766,0.0,0.501196,0.498804,0.0,G1
S8_G1,0.957,0.0,0.438,0.686022,0.0,0.313978,G1
S9_G1,0.99,0.483483,0.0,0.671877,0.328123,0.0,G1
S10_G1,0.868,0.348394,0.0,0.713585,0.286415,0.0,G1


In [19]:
# Ground-truth phases: 32 G1 samples, then 32 S, then 32 G2M (96 in total).
# NOTE(review): assumes the columns of the count table are ordered this way - confirm.
EMATB6142_labels = [phase for phase in ("G1", "S", "G2M") for _ in range(32)]

In [20]:
# Score the cyclone predictions against the known phases; the helper prints
# F1 / recall / precision per phase (see output below).
EMATB6142_evaluation = helper.evaluate_prediction(EMATB6142_prediction_table, EMATB6142_labels)

F1 Score: G1: 0.7936507936507936, S: 0.56, G2M: 0.6835443037974684
Reacall: G1: 0.78125, S: 0.4375, G2M: 0.84375 
Precision: G1: 0.8064516129032258, S: 0.7777777777777778, G2M: 0.574468085106383 


In [21]:
# Plot the per-phase scores of the cyclone (pairs) prediction.
iplot(
    helper.plot_evaluation(
        *EMATB6142_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        average=True,
        title="Pairs prediction scores for EMATB6142"
    )
)

## Predicting on sc dataset - Random forest with pairs

In [22]:
# Refit on *all* Oscope samples (no hold-out) before predicting external datasets.
# NOTE: this rebinds `clf`, replacing the model from the internal validation above.
y = oscope_pairs_features['label'].values
clf = RandomForestClassifier(
    random_state=0,
    n_jobs=10,
)
clf.fit(oscope_pairs_features[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=10,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [23]:
# NOTE(review): `drop_duplicates` compares row *values*, not gene names, so this removes
# every gene whose normalised profile is identical to another gene's (e.g. rows that are
# constant across all samples) - confirm that this is intended. It also mutates the frame
# that the cyclone cell above has already used.
gencounts_EMATB6142_Qnorm.drop_duplicates(keep=False, inplace=True)

# Collect all columns first and build the frame once: calling `assign` per pair copies
# the whole frame each time, and comparing sample by sample is a slow Python loop.
sample_ids = gencounts_EMATB6142_Qnorm.columns
known_genes = gencounts_EMATB6142_Qnorm.index
pair_columns = {}

for phase, marker in tqdm(oscope_marker_pairs.items()):
    for pair in tqdm(marker):
        name = "{}_{}-{}".format(phase, *pair)

        if pair[0] in known_genes and pair[1] in known_genes:
            # 1 where the first gene is strictly higher than the second in that sample.
            first_higher = gencounts_EMATB6142_Qnorm.loc[pair[0]] > gencounts_EMATB6142_Qnorm.loc[pair[1]]
            pair_columns[name] = first_higher.values.astype(int)
        else:
            # Pairs with a gene missing from this dataset count as 0 for every sample.
            pair_columns[name] = np.zeros(len(sample_ids), dtype=int)

# `columns=` pins the insertion order of the pair columns explicitly.
EMATB6142_pairs_features = pandas.DataFrame(
    pair_columns, index=sample_ids, columns=list(pair_columns)
)




In [24]:
# Integer phase predictions of the random forest; selecting `[features]` puts the
# columns in the same order the classifier was fitted on.
EMATB6142_rf_prediction = clf.predict(EMATB6142_pairs_features[features])
EMATB6142_rf_prediction

array([0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2,
       2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       0, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [25]:
# Map the integer predictions back to phase names.
classes = np.array(["G1", "S", "G2M"])
preds = classes[EMATB6142_rf_prediction]

# One-column table (sample -> predicted phase), the shape the evaluation helper is fed below.
EMATB6142_rf_prediction_table = pandas.DataFrame(
    {"prediction": preds},
    index=gencounts_EMATB6142_Qnorm.columns,
)

helper.DataTable(EMATB6142_rf_prediction_table)

Unnamed: 0,prediction
S1_G1,G1
S2_G1,G1
S3_G1,G2M
S4_G1,G1
S5_G1,S
S6_G1,S
S7_G1,G1
S8_G1,G1
S9_G1,G1
S10_G1,G1


In [26]:
# Score the random-forest predictions against the known phases and plot the result.
EMATB6142_rf_evaluation = helper.evaluate_prediction(
    EMATB6142_rf_prediction_table,
    EMATB6142_labels,
)
iplot(
    helper.plot_evaluation(
        *EMATB6142_rf_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        average=True,
        title="Random Forest with Pairs prediction scores for EMATB6142"
    )
)

F1 Score: G1: 0.6792452830188678, S: 0.6944444444444444, G2M: 0.8059701492537314
Reacall: G1: 0.5625, S: 0.78125, G2M: 0.84375 
Precision: G1: 0.8571428571428571, S: 0.625, G2M: 0.7714285714285715 


## EBV

In [27]:
# Load matrix
ebv_gencounts = pandas.read_csv(Path(data + "Non_norm.PolyA_NamedByAlex_human.csv"))

# Set index right
ebv_gencounts.set_index("Unnamed: 0", inplace=True)

x = ebv_gencounts.values

X_std = QuantileTransformer().fit_transform(x.astype(float))

ebv_gencounts_norm_qu = pandas.DataFrame(X_std, index=ebv_gencounts.index, columns=ebv_gencounts.columns)

ebv_gencounts_norm_qu

Unnamed: 0_level_0,Day0_1,Day1_1,Day2_1,Day3_1,Day4_1,Day5_1,Day8_1,Day14_1,Day0_2,Day1_2,...,Day8_2,Day14_2,Day0_3,Day1_3,Day2_3,Day3_3,Day4_3,Day5_3,Day8_3,Day14_3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,6.883960e-01,6.733400e-01,6.596597e-01,6.433100e-01,6.476476e-01,6.493129e-01,8.399452e-01,6.581582e-01,6.800133e-01,6.830163e-01,...,6.493994e-01,6.572573e-01,6.951952e-01,6.696697e-01,6.446446e-01,6.524024e-01,6.401401e-01,6.476476e-01,6.409743e-01,6.501502e-01
5_8S_rRNA,1.000000e-07,1.000000e-07,1.000000e-07,4.399399e-01,4.074074e-01,1.000000e-07,1.151151e-01,3.063063e-01,4.004004e-01,3.608609e-01,...,1.000000e-07,3.678679e-01,1.000000e-07,1.000000e-07,3.473473e-01,3.963964e-01,3.468468e-01,4.284284e-01,1.000000e-07,1.000000e-07
6M1-18,1.000000e-07,1.000000e-07,2.907908e-01,1.000000e-07,1.000000e-07,1.000000e-07,2.862863e-01,1.000000e-07,1.000000e-07,1.000000e-07,...,3.363363e-01,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07
7M1-2,1.000000e-07,1.000000e-07,1.000000e-07,3.148148e-01,1.000000e-07,1.000000e-07,1.301301e-01,1.000000e-07,1.000000e-07,1.000000e-07,...,1.000000e-07,1.000000e-07,4.024024e-01,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07
7SK,8.218218e-01,7.771104e-01,6.846847e-01,6.791500e-01,6.646647e-01,6.779439e-01,8.709209e-01,6.846847e-01,8.024691e-01,7.483363e-01,...,6.806807e-01,6.659993e-01,8.076648e-01,7.544424e-01,7.029029e-01,6.676677e-01,6.859237e-01,6.836837e-01,7.003003e-01,6.893892e-01
A1BG,6.856857e-01,6.995556e-01,6.761474e-01,6.599016e-01,6.816708e-01,7.014515e-01,6.800630e-01,6.861862e-01,6.601602e-01,6.462010e-01,...,6.753440e-01,6.429763e-01,7.075108e-01,6.783448e-01,6.451451e-01,6.411411e-01,6.494494e-01,6.931932e-01,6.809667e-01,6.823490e-01
A1BG-AS1,6.703370e-01,6.556557e-01,6.426426e-01,6.476476e-01,6.476476e-01,6.706707e-01,6.226226e-01,6.677936e-01,6.591592e-01,6.382050e-01,...,6.704818e-01,6.568569e-01,7.037037e-01,6.649983e-01,6.614815e-01,6.471471e-01,6.638580e-01,6.819319e-01,6.722321e-01,6.671672e-01
A1CF,4.279279e-01,3.903904e-01,4.504505e-01,4.574575e-01,4.074074e-01,3.633634e-01,6.066066e-01,4.349349e-01,4.449449e-01,1.000000e-07,...,3.698699e-01,3.443443e-01,4.159159e-01,1.000000e-07,4.819820e-01,4.654655e-01,4.629630e-01,3.858859e-01,4.524525e-01,4.434434e-01
A2M,5.995996e-01,4.739740e-01,5.480480e-01,5.225225e-01,5.250250e-01,5.530531e-01,6.516517e-01,5.265265e-01,5.405405e-01,4.389389e-01,...,5.450450e-01,5.605606e-01,5.765766e-01,4.904905e-01,5.485485e-01,5.050050e-01,5.575576e-01,5.805806e-01,5.565566e-01,5.515516e-01
A2M-AS1,5.410410e-01,4.519520e-01,5.075075e-01,5.050050e-01,4.639640e-01,5.630631e-01,3.693694e-01,5.155155e-01,5.665666e-01,4.009009e-01,...,5.110110e-01,5.385385e-01,5.660661e-01,4.224224e-01,5.195195e-01,5.180180e-01,5.280280e-01,5.485485e-01,5.065065e-01,5.565566e-01


In [28]:
# Collect all columns first and build the frame once: calling `assign` per pair copies
# the whole frame each time, and comparing sample by sample is a slow Python loop.
sample_ids = ebv_gencounts_norm_qu.columns
known_genes = ebv_gencounts_norm_qu.index
pair_columns = {}

for phase, marker in tqdm(oscope_marker_pairs.items()):
    for pair in tqdm(marker):
        name = "{}_{}-{}".format(phase, *pair)

        if pair[0] in known_genes and pair[1] in known_genes:
            # 1 where the first gene is strictly higher than the second in that sample.
            first_higher = ebv_gencounts_norm_qu.loc[pair[0]] > ebv_gencounts_norm_qu.loc[pair[1]]
            pair_columns[name] = first_higher.values.astype(int)
        else:
            # Pairs with a gene missing from this dataset count as 0 for every sample.
            pair_columns[name] = np.zeros(len(sample_ids), dtype=int)

# `columns=` pins the insertion order of the pair columns explicitly.
EBV_pairs_features = pandas.DataFrame(
    pair_columns, index=sample_ids, columns=list(pair_columns)
)




In [29]:
# Integer phase predictions of the random forest for the EBV samples.
ebv_rf_prediction = clf.predict(EBV_pairs_features[features])
ebv_rf_prediction

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [30]:
# Map the integer predictions back to phase names.
classes = np.array(["G1", "S", "G2M"])
preds = classes[ebv_rf_prediction]

# One-column table (sample -> predicted phase), the shape the evaluation helper is fed below.
ebv_rf_prediction_table = pandas.DataFrame(
    {"prediction": preds},
    index=ebv_gencounts_norm_qu.columns,
)

helper.DataTable(ebv_rf_prediction_table)

Unnamed: 0,prediction
Day0_1,S
Day1_1,G1
Day2_1,S
Day3_1,S
Day4_1,S
Day5_1,S
Day8_1,S
Day14_1,S
Day0_2,S
Day1_2,G1


In [31]:
# Expected phases per replicate: the first 3 time points G1, the remaining 5 S; 3 replicates.
# NOTE(review): assumes the columns are ordered replicate by replicate - confirm.
ebv_labels = (["G1"] * 3 + ["S"] * 5) * 3

In [32]:
# Score the random-forest predictions for EBV and plot the per-phase results.
ebv_rf_evaluation = helper.evaluate_prediction(
    ebv_rf_prediction_table,
    ebv_labels,
)
iplot(
    helper.plot_evaluation(
        *ebv_rf_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        title="Random Forest with Pairs prediction scores for EBV"
    )
)


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no true samples.


Recall is ill-defined and being set to 0.0 in labels with no true samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



F1 Score: G1: 0.5, S: 0.8333333333333333, G2M: 0.0
Reacall: G1: 0.3333333333333333, S: 1.0, G2M: 0.0 
Precision: G1: 1.0, S: 0.7142857142857143, G2M: 0.0 


In [33]:
# Cyclone (plain marker pairs, no random forest) on the EBV samples, for comparison
# with the random-forest result above.
prediction = pairs.cyclone(ebv_gencounts_norm_qu, oscope_marker_pairs, verbose=True)
ebv_prediction_table_cyclone = helper.get_prediction_table(prediction)
ebv_cyclone_evaluation = helper.evaluate_prediction(ebv_prediction_table_cyclone, ebv_labels)

# The title names the method actually plotted here (cyclone), matching the cyclone plot
# for EMATB6142; it previously repeated the random-forest title.
iplot(
    helper.plot_evaluation(
        *ebv_cyclone_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        title="Pairs prediction scores for EBV"
    )
)

[__set_matrix] Original Matrix 'x' has shape 28730 x 24
[__set_matrix] Matrix truncation done. Working with 28730 genes for 24 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 6 marker pairs. 1914 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 9, S: 15



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no true samples.


Recall is ill-defined and being set to 0.0 in labels with no true samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



F1 Score: G1: 1.0, S: 1.0, G2M: 0.0
Reacall: G1: 1.0, S: 1.0, G2M: 0.0 
Precision: G1: 1.0, S: 1.0, G2M: 0.0 


## GSE53481

In [34]:
# Second marker-pair set with a lower `fraction` (0.6 instead of 0.65); the log below
# shows it yields considerably more pairs.
# NOTE(review): `clf` was fitted on columns derived from `oscope_marker_pairs`, not from
# this set - later cells rely on every fitted pair name also being present here.
cc_marker = helper.load_ocope_marker(
    data,
    fraction=0.6,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 8146 marker pairs (phase: count): {'G1': 2575, 'S': 4101, 'G2M': 1470}


In [35]:
# Read the tab-separated GSE53481 table.
gencounts_GSE53481 = pandas.read_csv(Path(data + "GSE53481_humanRNAseq.txt"), sep='\t')

# Keep only the part of each GENE entry after its last underscore and index by it.
genes = [raw_name[raw_name.rindex('_') + 1:] for raw_name in gencounts_GSE53481["GENE"]]
gencounts_GSE53481 = gencounts_GSE53481.assign(GENE=genes).set_index("GENE")

# Transposed to samples x genes: the transformer works column by column, i.e. per gene.
x = gencounts_GSE53481.T.values
X_std = QuantileTransformer().fit_transform(x.astype(float))

gencounts_GSE53481_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_GSE53481.index,
    columns=gencounts_GSE53481.columns,
)

# For gene names that occur several times, keep only the last row.
is_earlier_duplicate = gencounts_GSE53481_Qnorm.index.duplicated(keep='last')
gencounts_GSE53481_Qnorm = gencounts_GSE53481_Qnorm[~is_earlier_duplicate]
gencounts_GSE53481_Qnorm

Unnamed: 0_level_0,H1.DN,H1.KO2,H1.AzLow,H1.AzHigh,H2.DN,H2.KO2,H2.AzLow,H2.AzHigh,H3.DN,H3.KO2,H3.AzLow,H3.AzHigh
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LOC100289255,1.000000e-07,1.000000e-07,6.361361e-01,6.361361e-01,1.000000e-07,4.194194e-01,9.999999e-01,9.999999e-01,1.000000e-07,4.194194e-01,6.361361e-01,9.999999e-01
LOC644656,7.273273e-01,9.089138e-01,2.728729e-01,1.000000e-07,9.999999e-01,6.364466e-01,1.000000e-07,3.634222e-01,5.455312e-01,8.185142e-01,4.545062e-01,1.818422e-01
LOC646903,8.178995e-01,6.357958e-01,1.820635e-01,1.000000e-07,9.094977e-01,9.999999e-01,4.544619e-01,2.727273e-01,5.455406e-01,7.275551e-01,3.636970e-01,9.079514e-02
FLJ36644,4.547405e-01,6.361361e-01,1.000000e-07,9.078309e-02,6.361361e-01,9.999999e-01,3.635214e-01,1.818182e-01,8.184500e-01,9.089616e-01,6.361361e-01,2.727273e-01
LOC284454,4.545657e-01,9.999999e-01,1.000000e-07,9.058149e-02,9.089687e-01,7.271716e-01,2.727273e-01,3.634332e-01,6.364169e-01,8.183809e-01,1.821066e-01,5.454545e-01
LOC149773,1.816817e-01,1.000000e-07,6.361361e-01,1.816817e-01,6.361361e-01,1.816817e-01,9.092169e-01,4.094094e-01,9.999999e-01,4.094094e-01,8.180403e-01,6.361361e-01
LOC100131176,4.544741e-01,5.454075e-01,1.000000e-07,1.000000e-07,6.365918e-01,9.089923e-01,3.642466e-01,1.000000e-07,9.999999e-01,8.179049e-01,7.270127e-01,1.000000e-07
LOC100131366,2.728443e-01,1.814285e-01,6.356982e-01,3.641536e-01,1.000000e-07,5.000000e-01,9.999999e-01,7.727728e-01,5.000000e-01,1.000000e-07,7.727728e-01,9.096284e-01
FLJ42351,2.726727e-01,1.814858e-01,6.364169e-01,9.089548e-01,4.094094e-01,1.000000e-07,7.727728e-01,5.454278e-01,9.104850e-02,4.094094e-01,7.727728e-01,9.999999e-01
LOC392232,9.999999e-01,9.090909e-01,4.089089e-01,6.816817e-01,4.089089e-01,1.361361e-01,1.000000e-07,4.644645e-01,8.181818e-01,4.016517e-01,1.361361e-01,6.816817e-01


In [36]:
# Collect all columns first and build the frame once: calling `assign` per pair copies
# the whole frame each time, and comparing sample by sample is a slow Python loop.
# NOTE(review): the pairs come from `cc_marker`, while `clf` was fitted on columns derived
# from `oscope_marker_pairs`; the prediction cell selects `[features]`, so every fitted
# pair name must also occur in `cc_marker` - confirm.
sample_ids = gencounts_GSE53481_Qnorm.columns
known_genes = gencounts_GSE53481_Qnorm.index
pair_columns = {}

for phase, marker in tqdm(cc_marker.items()):
    for pair in tqdm(marker):
        name = "{}_{}-{}".format(phase, *pair)

        if pair[0] in known_genes and pair[1] in known_genes:
            # 1 where the first gene is strictly higher than the second in that sample.
            first_higher = gencounts_GSE53481_Qnorm.loc[pair[0]] > gencounts_GSE53481_Qnorm.loc[pair[1]]
            pair_columns[name] = first_higher.values.astype(int)
        else:
            # Pairs with a gene missing from this dataset count as 0 for every sample.
            pair_columns[name] = np.zeros(len(sample_ids), dtype=int)

# `columns=` pins the insertion order of the pair columns explicitly.
GSE53481_pairs_features = pandas.DataFrame(
    pair_columns, index=sample_ids, columns=list(pair_columns)
)




In [37]:
# Random-forest prediction; `[features]` selects the columns the classifier was fitted on.
GSE53481_rf_prediction = clf.predict(GSE53481_pairs_features[features])

# Integer codes -> phase names (`classes` is the name array defined in an earlier cell).
preds = classes[GSE53481_rf_prediction]

# One-column table (sample -> predicted phase), the shape the evaluation helper is fed below.
GSE53481_rf_prediction_table = pandas.DataFrame(
    {"prediction": preds},
    index=gencounts_GSE53481_Qnorm.columns,
)

helper.DataTable(GSE53481_rf_prediction_table)

Unnamed: 0,prediction
H1.DN,G2M
H1.KO2,G2M
H1.AzLow,G2M
H1.AzHigh,G2M
H2.DN,G2M
H2.KO2,G2M
H2.AzLow,G2M
H2.AzHigh,G2M
H3.DN,G2M
H3.KO2,G2M


In [38]:
# Expected phases per replicate (H1, H2, H3): two G1 samples, one S, one G2M.
# NOTE(review): assumes the columns are ordered replicate by replicate - confirm.
GSE53481_labels = (["G1"] * 2 + ["S"] + ["G2M"]) * 3

In [39]:
# Score the random-forest predictions for GSE53481 and plot the per-phase results.
GSE53481_rf_evaluation = helper.evaluate_prediction(
    GSE53481_rf_prediction_table,
    GSE53481_labels,
)
iplot(
    helper.plot_evaluation(
        *GSE53481_rf_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        title="Random Forest with Pairs prediction scores for GSE53481"
    )
)


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



F1 Score: G1: 0.0, S: 0.0, G2M: 0.4
Reacall: G1: 0.0, S: 0.0, G2M: 1.0 
Precision: G1: 0.0, S: 0.0, G2M: 0.25 


In [40]:
# Cyclone on GSE53481 with the larger `cc_marker` set. `min_pairs=1` is passed through
# to cyclone; the log below shows that only a few marker pairs remain for this dataset.
GSE53481_prediction = pairs.cyclone(
    gencounts_GSE53481_Qnorm,
    cc_marker,
    min_pairs=1,
    verbose=True,
)
GSE53481_prediction_table_cyclone = helper.get_prediction_table(GSE53481_prediction)
helper.DataTable(GSE53481_prediction_table_cyclone)

[__set_matrix] Original Matrix 'x' has shape 498 x 12
[__set_matrix] Matrix truncation done. Working with 498 genes for 12 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 8102 marker pairs. 44 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 5, S: 2, G2M: 5


Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
H1.DN,0.575301,0.238716,0.672379,0.387044,0.160601,0.452355,G1
H1.KO2,0.683417,0.085256,0.74645,0.451064,0.05627,0.492666,G1
H1.AzLow,0.40321,0.166,0.802817,0.293879,0.120989,0.585132,S
H1.AzHigh,0.067067,0.933801,0.031345,0.064974,0.904659,0.030367,G2M
H2.DN,0.965829,0.2,0.364004,0.63133,0.130733,0.237937,G1
H2.KO2,0.877264,0.003046,0.677116,0.563278,0.001956,0.434766,G1
H2.AzLow,0.505506,0.753507,0.354125,0.313368,0.467107,0.219525,G2M
H2.AzHigh,0.077005,0.993982,0.0,0.071901,0.928099,0.0,G2M
H3.DN,0.660643,0.568,0.084677,0.503032,0.432492,0.064476,G1
H3.KO2,0.208249,0.029,0.295363,0.390996,0.054449,0.554555,S


In [41]:
# Score the cyclone predictions for GSE53481 and plot the per-phase results.
GSE53481_cyclone_evaluation = helper.evaluate_prediction(GSE53481_prediction_table_cyclone, GSE53481_labels)

# The title names the method actually plotted here (cyclone), matching the cyclone plot
# for EMATB6142; it previously repeated the random-forest title.
iplot(
    helper.plot_evaluation(
        *GSE53481_cyclone_evaluation, 
        xaxis=["G1","S","G2M"], 
        xaxislbl="Phase", 
        title="Pairs prediction scores for GSE53481"
    )
)

F1 Score: G1: 0.9090909090909091, S: 0.4, G2M: 0.7499999999999999
Reacall: G1: 0.8333333333333334, S: 0.3333333333333333, G2M: 1.0 
Precision: G1: 1.0, S: 0.5, G2M: 0.6 


## GSE71456

In [42]:
# Read file column 1 (used as the index) and file columns 4-16 of the tab-separated RPKM table.
selected_columns = [1] + list(range(4, 17))
gencounts_GSE71456 = pandas.read_csv(
    Path(data + "GSE71456_Samples_RPKM.csv"),
    sep='\t',
    index_col=0,
    usecols=selected_columns,
)

# Transposed to samples x genes: the transformer works column by column, i.e. per gene.
x = gencounts_GSE71456.T.values
X_std = QuantileTransformer().fit_transform(x.astype(float))

gencounts_GSE71456_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_GSE71456.index,
    columns=gencounts_GSE71456.columns,
)

# For gene names that occur several times, keep only the last row.
is_earlier_duplicate = gencounts_GSE71456_Qnorm.index.duplicated(keep='last')
gencounts_GSE71456_Qnorm = gencounts_GSE71456_Qnorm[~is_earlier_duplicate]


invalid value encountered in subtract



In [43]:
# Collect all columns first and build the frame once: calling `assign` per pair copies
# the whole frame each time, and comparing sample by sample is a slow Python loop.
# NOTE(review): the pairs come from `cc_marker`, while `clf` was fitted on columns derived
# from `oscope_marker_pairs`; the prediction cell selects `[features]`, so every fitted
# pair name must also occur in `cc_marker` - confirm.
sample_ids = gencounts_GSE71456_Qnorm.columns
known_genes = gencounts_GSE71456_Qnorm.index
pair_columns = {}

for phase, marker in tqdm(cc_marker.items()):
    for pair in tqdm(marker):
        name = "{}_{}-{}".format(phase, *pair)

        if pair[0] in known_genes and pair[1] in known_genes:
            # 1 where the first gene is strictly higher than the second in that sample
            # (comparisons involving NaN are False, so they yield 0).
            first_higher = gencounts_GSE71456_Qnorm.loc[pair[0]] > gencounts_GSE71456_Qnorm.loc[pair[1]]
            pair_columns[name] = first_higher.values.astype(int)
        else:
            # Pairs with a gene missing from this dataset count as 0 for every sample.
            pair_columns[name] = np.zeros(len(sample_ids), dtype=int)

# `columns=` pins the insertion order of the pair columns explicitly.
GSE71456_pairs_features = pandas.DataFrame(
    pair_columns, index=sample_ids, columns=list(pair_columns)
)




In [44]:
# Run the already-fitted classifier on the selected pair-feature columns.
GSE71456_rf_prediction = clf.predict(GSE71456_pairs_features[features])

# Look the raw predictions up in `classes`.
# NOTE(review): presumably `classes` holds the phase labels indexed by class
# code - confirm against the cell that defines it.
preds = classes[GSE71456_rf_prediction]

# One row per sample, with a single "prediction" column.
GSE71456_rf_prediction_table = pandas.DataFrame(
    index=gencounts_GSE71456_Qnorm.columns
).assign(prediction=preds)

helper.DataTable(GSE71456_rf_prediction_table)

Unnamed: 0,prediction
pES10 h-G1 rep1,S
pES10 h-G1 rep2,G1
pES10 d-G1 rep1,S
pES10 d-G1 rep2,S
h-pES10 d-G2/M,G1
d-pES10 d-G2/M,G2M
pES12 h-G1 rep1,G1
pES12 h-G1 rep2,S
pES12 d-G1 rep1,G1
pES12 d-G1 rep2,S


In [45]:
# Ground-truth phase for the 13 samples, in column order:
# four G1, two G2M, then seven G1.
GSE71456_labels = ["G1"] * 4 + ["G2M"] * 2 + ["G1"] * 7

# Score the random-forest predictions against the labels and plot the result.
GSE71456_rf_evaluation = helper.evaluate_prediction(
    GSE71456_rf_prediction_table,
    GSE71456_labels,
)
iplot(
    helper.plot_evaluation(
        *GSE71456_rf_evaluation,
        xaxis=["G1","S","G2M"],
        xaxislbl="Phase",
        title="Random Forest with Pairs prediction scores for GSE71456",
    )
)


F-score is ill-defined and being set to 0.0 in labels with no true samples.


Recall is ill-defined and being set to 0.0 in labels with no true samples.



F1 Score: G1: 0.5000000000000001, S: 0.0, G2M: 0.5
Reacall: G1: 0.36363636363636365, S: 0.0, G2M: 0.5 
Precision: G1: 0.8, S: 0.0, G2M: 0.5 


In [46]:
# Run Cyclone on the normalised GSE71456 matrix with the same marker pairs,
# then convert its result into a prediction table for display and evaluation.
GSE71456_prediction = pairs.cyclone(
    gencounts_GSE71456_Qnorm,
    cc_marker,
    min_pairs=1,
    verbose=True,
)
GSE71456_prediction_table_cyclone = helper.get_prediction_table(
    GSE71456_prediction
)
helper.DataTable(GSE71456_prediction_table_cyclone)

[__set_matrix] Original Matrix 'x' has shape 56626 x 13
[__set_matrix] Matrix truncation done. Working with 56626 genes for 13 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 64 marker pairs. 8082 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): S: 2, G1: 8, G2M: 3


Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pES10 h-G1 rep1,0.391,0.212,0.896,0.260841,0.141428,0.597732,S
pES10 h-G1 rep2,0.998,0.851,0.0,0.539751,0.460249,0.0,G1
pES10 d-G1 rep1,0.693,0.653,0.497,0.376017,0.354314,0.269669,G1
pES10 d-G1 rep2,0.536,0.013,1.0,0.34603,0.008393,0.645578,G1
h-pES10 d-G2/M,0.358,1.0,0.0,0.263623,0.736377,0.0,G2M
d-pES10 d-G2/M,0.0,1.0,0.001,0.0,0.999001,0.000999,G2M
pES12 h-G1 rep1,0.802,0.698,0.0,0.534667,0.465333,0.0,G1
pES12 h-G1 rep2,0.355,0.02,0.997,0.258746,0.014577,0.726676,S
pES12 d-G1 rep1,0.999,0.0,1.0,0.49975,0.0,0.50025,G1
pES12 d-G1 rep2,0.931,0.091,0.317,0.695295,0.067961,0.236744,G1


In [47]:
# Score the Cyclone predictions for GSE71456 against the known labels and plot
# the per-phase scores. The title names Cyclone (not the random forest) because
# the table being evaluated here is the Cyclone prediction table.
GSE71456_cyclone_evaluation = helper.evaluate_prediction(GSE71456_prediction_table_cyclone, GSE71456_labels)
iplot(
    helper.plot_evaluation(
        *GSE71456_cyclone_evaluation, 
        xaxis=["G1","S","G2M"], 
        xaxislbl="Phase", 
        title="Cyclone prediction scores for GSE71456"
    )
)


F-score is ill-defined and being set to 0.0 in labels with no true samples.


Recall is ill-defined and being set to 0.0 in labels with no true samples.



F1 Score: G1: 0.8421052631578948, S: 0.0, G2M: 0.8
Reacall: G1: 0.7272727272727273, S: 0.0, G2M: 1.0 
Precision: G1: 1.0, S: 0.0, G2M: 0.6666666666666666 


## Plot comparison

In [48]:
# Inspect the first element of the EBV Cyclone evaluation; the comparison
# below concatenates this element from each dataset and plots it as F1 score.
ebv_cyclone_evaluation[0]

array([1., 1., 0.])

In [49]:
# Flatten the first element of each Cyclone evaluation into one list, ordered
# EBV, GSE53481, GSE71456.
cyclone = [
    score
    for evaluation in (
        ebv_cyclone_evaluation,
        GSE53481_cyclone_evaluation,
        GSE71456_cyclone_evaluation,
    )
    for score in evaluation[0]
]
cyclone

[1.0,
 1.0,
 0.0,
 0.9090909090909091,
 0.4,
 0.7499999999999999,
 0.8421052631578948,
 0.0,
 0.8]

In [50]:
# Flatten the first element of each random-forest evaluation into one list,
# ordered EBV, GSE53481, GSE71456.
rf = [
    score
    for evaluation in (
        ebv_rf_evaluation,
        GSE53481_rf_evaluation,
        GSE71456_rf_evaluation,
    )
    for score in evaluation[0]
]
rf

[0.5, 0.8333333333333333, 0.0, 0.0, 0.0, 0.4, 0.5000000000000001, 0.0, 0.5]

In [51]:
# Scatter plot comparing the per-phase F1 scores of Cyclone and the random
# forest across the three bulk datasets (EBV, GSE53481, GSE71456).

# Category labels on the x axis, one per (phase, dataset) combination, in the
# same order as the `cyclone` and `rf` score lists.
xrange = ["G1 EBV","S EBV","G2M EBV", "G1 GSE53481", "S GSE53481", "G2M GSE53481", "G1 GSE71456", "S GSE71456", "G2M GSE71456"] 
cyclone_trace = go.Scatter(
    x=xrange,
    y=cyclone,
    mode='markers',
    marker=dict(
        symbol='square',
        size=10,
        color='blue',
    ),
    name='Cyclone'
)

rf_trace = go.Scatter(
    x=xrange,
    y=rf,
    mode='markers',
    marker=dict(
        symbol='triangle-up',
        size=10,
        color='green',
    ),
    name='Random Forest with Pairs'
)

layout = {
    'title': "Comparison Cyclone vs Random Forest with pairs on human bulk data",
    'xaxis': {
        'title': 'Phase'
    },
    'yaxis': {
        'title': 'F1 Score [0-1]'
    },
    'width': 950,
    'height': 500,
    'shapes': [
        # Two dotted vertical lines at x = 2.5 and x = 5.5, i.e. after the
        # third and sixth categories, separating the three datasets.
        {
            'type': 'path',
            'path': 'M 2.5,0 L 2.5,1 M 5.5,0 L 5.5,1',
            'line': {
                'color': 'red',
                'width': 1,
                'dash': 'dot'
            }
        }
       
    ]
}

# Use a dedicated name for the figure: `data` is the data-directory path set
# in the imports cell, and rebinding it here would break every earlier
# `Path(data + ...)` cell on re-run.
comparison_figure = go.Figure(data=[cyclone_trace, rf_trace], layout=layout)

iplot(comparison_figure, image="svg")