# Using pretrained mouse marker on oscope dataset

Here we use the pre-trained mouse marker pairs to predict the cell cycle phases of the Oscope dataset.

<div id="toc"></div>

## Necessary Imports

In [1]:
%%javascript
// Fetch and execute an external script through jQuery's $.getScript. Going by its file
// name it presumably fills the <div id="toc"> placeholder with a table of contents
// (TODO confirm).
// NOTE(review): this runs remote code whenever the cell executes and needs network access.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import sys
# Relative locations of the project's code and data directories; they are resolved
# against the notebook's working directory, so the notebook must be run from its own folder.
code = "./../../code/"
data = "./../../data/"
# Extend the module search path with the code directory so the imports below can find
# project-local modules (presumably `helper` and/or `pypairs` live there -- TODO confirm).
sys.path.append(code)
import pandas
import json
import pypairs as pairs
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
import helper

# Enable plotly's offline notebook mode so that `iplot` figures render inline.
# Per the plotly documentation, connected=True loads plotly.js from a CDN instead of
# embedding it in the notebook, so rendering needs network access.
init_notebook_mode(connected=True)

## Loading Mouse-Human Orthologues

In [3]:
# Build a mouse -> human gene mapping from the comma-separated orthologue export.
# Column 0 is used as the mouse gene key and column 2 as the human gene value;
# rows with an empty column 2 have no orthologue and are skipped.
# Mouse genes that occur on more than one mapped row are ambiguous and are dropped
# from the mapping entirely.
genes_mouse_to_human = {}
count = 0            # number of rows that repeated an already-seen mouse gene
ambigious = set()    # mouse genes seen with more than one mapped row

# The context manager guarantees the file handle is closed, even on errors.
with open(data + "biomart_mouse-human-orthologs.txt") as ortholog_file:
    next(ortholog_file, None)  # skip the header row (no-op for an empty file)
    for line in ortholog_file:
        # Strip the line terminator before splitting: otherwise the last field keeps
        # its "\n", which would defeat the empty-field check below and leak into the
        # mapped gene names whenever column 2 is the final column.
        infos = line.rstrip("\r\n").split(",")
        # Ignore blank/short rows instead of failing with an IndexError.
        if len(infos) < 3 or infos[2] == "":
            continue
        if infos[0] in genes_mouse_to_human:
            ambigious.add(infos[0])
            count += 1
        genes_mouse_to_human[infos[0]] = infos[2]

# Remove every mouse gene that had more than one mapped row.
for gene in ambigious:
    del genes_mouse_to_human[gene]

print("{} ambiguous mappings skipped".format(count))
print("{} genes remain".format(len(genes_mouse_to_human)))

6912 ambiguous mappings skipped
18962 genes remain


## Loading Mouse Marker Pairs

In [4]:
# Load the pre-trained mouse marker pairs and translate them to human gene names.
# The JSON holds, per phase, two parallel lists ("first" and "second") whose
# elements are zipped into (first, second) marker pairs.

# The context manager closes the JSON file once it has been parsed.
with open(data + 'mouse_pretrained-pairs.json') as marker_file:
    marker_json = json.load(marker_file)

# Marker pairs per phase, still expressed with the mouse gene identifiers.
mouse_marker_pairs = {
    phase: list(
        zip(
            marker_json[phase]["first"],
            marker_json[phase]["second"]
        )
    )
    for phase in ("G1", "S", "G2M")
}

# Translate both genes of every pair via the orthologue mapping; pairs in which
# either gene has no (unambiguous) human orthologue are dropped.
# The loop variables are deliberately not called `pairs`, which is the alias of
# the imported pypairs module.
mm_marker_pairs = {
    phase: [
        (
            genes_mouse_to_human[first_gene],
            genes_mouse_to_human[second_gene]
        )
        for first_gene, second_gene in phase_pairs
        if first_gene in genes_mouse_to_human
        and second_gene in genes_mouse_to_human
    ]
    for phase, phase_pairs in mouse_marker_pairs.items()
}

print("The {} marker pairs loaded".format(
          len(mm_marker_pairs["G1"]) +
          len(mm_marker_pairs["S"]) +
          len(mm_marker_pairs["G2M"])
      )
     )
print("Split up into: "
      "{} G1 , {} S and {} G2M pairs".format(
          len(mm_marker_pairs["G1"]),
          len(mm_marker_pairs["S"]),
          len(mm_marker_pairs["G2M"])
      )
     )

The 26545 marker pairs loaded
Split up into: 11259 G1 , 5920 S and 9366 G2M pairs


## Loading Oscope Dataset

In [5]:
# Read the expression matrix and use its first, unnamed CSV column as the row index.
gencounts_oscope = pandas.read_csv(
    data + "GSE64016_H1andFUCCI_normalized_EC_human.csv"
).set_index("Unnamed: 0")

# Keep only the columns whose name carries a phase tag ("G1_", "G2_" or "S_"),
# preserving their original order; all other columns are left out.
gencounts_oscope_sorted = gencounts_oscope.iloc[
    :,
    [
        position
        for position, column_name in enumerate(gencounts_oscope.columns)
        if any(tag in column_name for tag in ("G1_", "G2_", "S_"))
    ],
]
gencounts_oscope.head(10)

Unnamed: 0_level_0,H1_Exp1.001,H1_Exp1.002,H1_Exp1.003,H1_Exp1.004,H1_Exp1.006,H1_Exp1.007,H1_Exp1.008,H1_Exp1.009,H1_Exp1.010,H1_Exp1.011,...,G1_Exp1.008,G1_Exp1.055,G1_Exp1.050,G1_Exp1.076,G1_Exp1.011,G1_Exp1.063,G1_Exp1.083,G1_Exp1.030,G1_Exp1.018,G1_Exp1.046
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MKL2,24.148634,285.530829,6.481959,107.12962,0.0,5.316709,42.809004,0.0,267.202286,2.838761,...,3.887628,84.337868,69.192927,1.126491,1.13733,0.0,36.741767,11.218839,152.79286,123.041274
CD109,2.414863,2.238421,341.512799,14.896119,16.807235,115.372585,7.991014,154.389316,16.663439,12.022155,...,4.956726,7.208501,4.299846,7.626347,5.936864,2.797575,149.063512,2.80471,15.996667,7.077119
ABTB1,0.0,49.351007,0.0,2.550705,0.0,23.92519,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MAST2,0.0,234.417285,88.586769,0.0,0.0,5.316709,0.0,0.0,0.0,11.355046,...,0.971907,15.069427,0.0,60.830538,22.746604,0.0,0.720427,0.0,5.589983,33.230125
KAT5,0.0,12.443504,114.341752,51.422218,0.0,16.72105,0.0,0.0,151.218876,173.817364,...,0.0,96.041736,0.0,0.0,0.0,0.0,1.03021,55.09151,4.826018,0.0
WWC2,205.118496,8.81268,658.999139,104.70645,374.666774,394.180805,0.0,219.069622,539.029228,725.303552,...,169.11182,233.122597,656.509085,412.712674,1075.186455,661.889223,104.641994,322.527589,47.514853,67.124853
CD163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MYL2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UBE2Z,43.467541,257.41838,230.606485,22.956347,36.938978,295.077349,23.78278,43.813924,39.6399,102.195412,...,138.010796,58.87738,538.955243,5.632457,71.651801,0.656708,2.881707,610.984967,0.0,3.215239
RGPD4,0.0,8.759804,0.0,0.0,0.0,0.0,0.0,8.708017,3.083103,0.0,...,0.0,0.0,1.91928,3.559713,0.0,0.0,2.845686,0.0,17.505962,0.0


## Prediction - raw

In [6]:
# Run pypairs' cyclone on the phase-tagged columns, using the counts exactly as loaded
# from the CSV, scored against the orthologue-translated marker pairs.
prediction = pairs.cyclone(
    gencounts_oscope_sorted, 
    mm_marker_pairs, 
    # NOTE(review): presumably the number of worker processes -- confirm against
    # pypairs and adjust to the machine the notebook runs on.
    processes=10,
    verbose=True
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Matrix truncation done. Working with 19084 genes for 247 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 2836 marker pairs. 26545 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): S: 100, G2M: 3, G1: 144


In [7]:
# Convert the cyclone result into a table via the project helper and display it.
# Judging by the rendered output, the table is indexed by sample and holds per-phase
# scores, their *_norm counterparts and the predicted phase.
prediction_table = helper.get_prediction_table(prediction)
helper.DataTable(prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G2_Exp1.059,0.488,0.008,0.404,0.542222,0.008889,0.448889,S
G2_Exp1.069,0.064,0.551,0.052,0.095952,0.826087,0.077961,G2M
G2_Exp1.075,0.005,0.258,0.633,0.00558,0.287946,0.706473,S
G2_Exp1.063,0.038,0.422,0.134,0.063973,0.710438,0.225589,S
G2_Exp1.029,0.253,0.036,0.265,0.456679,0.064982,0.478339,S
G2_Exp1.076,0.033,0.424,0.24,0.047346,0.608321,0.344333,S
G2_Exp1.013,0.366,0.025,0.411,0.456359,0.031172,0.512469,S
G2_Exp1.037,0.869,0.002,0.243,0.780072,0.001795,0.218133,G1
G2_Exp1.057,0.635,0.002,0.218,0.74269,0.002339,0.254971,G1
G2_Exp1.018,0.068,0.495,0.094,0.103501,0.753425,0.143075,S


In [8]:
# Scatter view of the per-sample phase scores; the three score columns are passed
# positionally in the order G1, S, G2M.
iplot(
    helper.get_prediction_plot(
        *(prediction_table.loc[:, phase] for phase in ("G1", "S", "G2M")),
        samples=list(prediction_table.index),
        t="scatter",
        title="Phase assignment",
    )
)

In [9]:
p = ["G1", "S", "G2M"]

# Derive the reference phase of every column from the tag embedded in its name.
# Tags are tested in this order and the first hit wins; a column matching no tag
# contributes no label. Columns tagged "G2_" are labelled with the phase "G2M".
label = []
for column_name in gencounts_oscope_sorted.columns:
    for tag, phase in (("G1_", "G1"), ("S_", "S"), ("G2_", "G2M")):
        if tag in column_name:
            label.append(phase)
            break

In [10]:
# Compare the predicted phases with the labels derived from the column names.
# The helper's return value is unpacked positionally into the plotting helper below.
# NOTE(review): the F1 / recall / precision lines in the output are presumably printed
# by one of these helper calls -- confirm in helper.
evaluation = helper.evaluate_prediction(prediction_table, label)

# Plot the per-phase evaluation for the run on the counts as loaded.
iplot(helper.plot_evaluation(*evaluation, average=True, xaxislbl=["G1","S","G2M"], title="Prediction Scores per Phase for unnormalized gene counts"))

F1 Score: G1: 0.502127659574468, S: 0.14444444444444446, G2M: 0.07594936708860758
Reacall: G1: 0.6483516483516484, S: 0.1625, G2M: 0.039473684210526314 
Precision: G1: 0.4097222222222222, S: 0.13, G2M: 1.0 


## Prediction - Quantile normalized

In [11]:
# Quantile-transform the expression values. The matrix is transposed before the
# transform and transposed back afterwards, so the transformer sees the original
# columns as rows. The transformer is fitted on every column of the loaded matrix;
# the phase-tagged columns are selected only afterwards.
x = gencounts_oscope.T.values
X_std = QuantileTransformer().fit_transform(x.astype(float))

# Rebuild a frame with the original orientation, index and column labels.
gencounts_oscope_normalized = pandas.DataFrame(
    X_std.T,
    index=gencounts_oscope.index,
    columns=gencounts_oscope.columns,
)

# Keep only the columns whose name carries a phase tag, in their original order.
gencounts_oscope_normalized_sorted = gencounts_oscope_normalized.iloc[
    :,
    [
        position
        for position, column_name in enumerate(gencounts_oscope_normalized.columns)
        if any(tag in column_name for tag in ("G1_", "G2_", "S_"))
    ],
]
gencounts_oscope_normalized_sorted.head(10)

Unnamed: 0_level_0,G2_Exp1.059,G2_Exp1.069,G2_Exp1.075,G2_Exp1.063,G2_Exp1.029,G2_Exp1.076,G2_Exp1.013,G2_Exp1.037,G2_Exp1.057,G2_Exp1.018,...,G1_Exp1.008,G1_Exp1.055,G1_Exp1.050,G1_Exp1.076,G1_Exp1.011,G1_Exp1.063,G1_Exp1.083,G1_Exp1.030,G1_Exp1.018,G1_Exp1.046
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MKL2,0.5948365,0.4251952,0.2353241,0.45062,0.3248365,0.8494862,0.6035429,1e-07,0.564266,0.9781442,...,0.3512828,0.7757691,0.7407407,0.2723152,0.2772233,1e-07,0.6386019,0.4684572,0.868922,0.8305803
CD109,0.100305,0.3201135,0.7909013,0.03080467,0.08501255,0.5030089,0.7447545,0.9152499,0.05667066,0.357342,...,0.2592593,0.3856201,0.2323796,0.40253,0.3136761,0.126431,0.8998772,0.1290721,0.5925926,0.3766069
ABTB1,0.9020715,1e-07,1e-07,0.9211207,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
MAST2,0.4792734,0.4836901,0.8627706,1e-07,0.7818149,1e-07,0.8561407,1e-07,1e-07,0.9150147,...,0.5272325,0.7318324,1e-07,0.845494,0.7754613,1e-07,0.5014795,1e-07,0.6666667,0.8039964
KAT5,0.5337311,0.4465861,1e-07,0.9829656,1e-07,1e-07,0.7804196,0.509615,0.975995,1e-07,...,1e-07,0.853874,1e-07,1e-07,1e-07,1e-07,0.4663023,0.766857,0.5425416,1e-07
WWC2,0.1460928,0.8388357,0.7306529,0.1270348,0.5951589,0.5510902,0.8825024,0.3772122,0.5226564,0.8779768,...,0.2723594,0.346951,0.8539372,0.6712327,0.9738533,0.8583905,0.157137,0.5029423,0.06746602,0.1111111
CD163,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
MYL2,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
UBE2Z,0.9478934,0.6929424,0.1960402,0.1851852,0.4357608,0.5512878,0.919791,0.6493849,0.6947714,0.2418189,...,0.6818743,0.4377827,0.9805805,0.1266152,0.477079,0.06317808,0.1021025,0.9890268,1e-07,0.1048298
RGPD4,1e-07,0.6426691,1e-07,1e-07,0.9545437,0.8388375,1e-07,1e-07,0.5947432,1e-07,...,1e-07,1e-07,0.6145896,0.6882182,1e-07,1e-07,0.6585361,1e-07,0.9129127,1e-07


In [12]:
# Re-run cyclone on the quantile-transformed matrix with the same marker pairs.
# This rebinds `prediction`, replacing the result of the earlier run.
# `processes` is not passed here (the earlier run used processes=10), so the library
# default applies.
prediction = pairs.cyclone(gencounts_oscope_normalized_sorted, mm_marker_pairs, verbose=True)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Matrix truncation done. Working with 19084 genes for 247 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 2836 marker pairs. 26545 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 93, S: 33, G1: 121


In [13]:
# Rebuild the prediction table from the run on the quantile-transformed matrix
# (this rebinds `prediction_table`) and display it.
prediction_table = helper.get_prediction_table(prediction)
helper.DataTable(prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G2_Exp1.059,0.167,0.743,0.484,0.119799,0.532999,0.347202,G2M
G2_Exp1.069,0.002,1.0,0.179,0.001693,0.84674,0.151566,G2M
G2_Exp1.075,0.0,0.99,0.918,0.0,0.518868,0.481132,G2M
G2_Exp1.063,0.018,0.993,0.092,0.016319,0.900272,0.083409,G2M
G2_Exp1.029,0.182,0.917,0.383,0.122807,0.618758,0.258435,G2M
G2_Exp1.076,0.009,1.0,0.254,0.007126,0.791766,0.201108,G2M
G2_Exp1.013,0.466,0.467,0.482,0.329329,0.330035,0.340636,S
G2_Exp1.037,0.832,0.169,0.594,0.52163,0.105956,0.372414,G1
G2_Exp1.057,0.685,0.015,0.728,0.479692,0.010504,0.509804,G1
G2_Exp1.018,0.025,0.995,0.229,0.020016,0.796637,0.183347,G2M


In [15]:
# Pie view of the phase assignment for the quantile-transformed run; the three score
# columns are passed positionally in the order G1, S, G2M.
iplot(
    helper.get_prediction_plot(
        *(prediction_table.loc[:, phase] for phase in ("G1", "S", "G2M")),
        samples=list(prediction_table.index),
        t="pie",
        title="Phase assignment",
        width=600,
        height=600,
    )
)

In [16]:
# Compare the phases predicted from the quantile-transformed matrix with the labels
# derived from the column names. The helper's return value is unpacked positionally
# into the plotting helper below.
evaluation = helper.evaluate_prediction(prediction_table, label=label)

# The title names the quantile-normalized run; it previously repeated the
# "unnormalized" title of the first evaluation, mislabelling this figure.
iplot(helper.plot_evaluation(*evaluation, average=True, xaxislbl=["G1","S","G2M"], title="Prediction Scores per Phase for quantile-normalized gene counts"))

F1 Score: G1: 0.4811320754716981, S: 0.23008849557522124, G2M: 0.6863905325443788
Reacall: G1: 0.5604395604395604, S: 0.1625, G2M: 0.7631578947368421 
Precision: G1: 0.4214876033057851, S: 0.3939393939393939, G2M: 0.6236559139784946 
