# Running EBV

Here we tested the dataset provided by Paulina Mrozek-Gorska

<div id="toc"></div>

## Necessary Imports

In [30]:
%%javascript
// Fetch and execute a remote script via jQuery; presumably it fills the
// <div id="toc"> placeholder in the first markdown cell — confirm.
// NOTE(review): this runs third-party code from an external URL every time
// the cell executes and needs network access; consider vendoring the script.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [31]:
import sys
# Relative locations of the project's code and data directories
# (resolved against the notebook's working directory).
code = "./../../code/"
data = "./../../data/"
# Make modules in the code directory importable; presumably this is where
# the local `helper` module imported below lives — confirm.
sys.path.append(code)
import pandas
import pypairs as pairs
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
import helper
import timeit
from plotly import tools

# Enable plotly's offline notebook rendering so that iplot() can draw inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN, which
# needs network access when the notebook is viewed — confirm.
init_notebook_mode(connected=True)

## Loading Oscope marker pairs

In [32]:
# Load the Oscope-derived marker pairs through the project helper; the log
# printed below shows it running pypairs' sandbag step to identify the pairs.
# NOTE(review): `load_ocope_marker` is the helper's actual (misspelled) name.
# NOTE(review): fraction=0.65 is presumably the sandbag fraction threshold — confirm.
cc_marker = helper.load_ocope_marker(data, fraction=0.65)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 1920 marker pairs (phase: count): {'G1': 646, 'S': 920, 'G2M': 354}


## Load and normalize EBV dataset

Normalization is done with the [QuantileTransformer()](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html) of the scikit-learn python package 

In [33]:
# Read the raw (non-normalized) EBV count matrix. The first CSV column is
# labelled "Unnamed: 0" by pandas (presumably because its header is empty);
# it holds the row labels, so it is promoted to the index.
counts_path = Path(data) / "Non_norm.PolyA_NamedByAlex_human.csv"
ebv_gencounts = pandas.read_csv(counts_path).set_index("Unnamed: 0")

ebv_gencounts

Unnamed: 0_level_0,Day0_1,Day1_1,Day2_1,Day3_1,Day4_1,Day5_1,Day8_1,Day14_1,Day0_2,Day1_2,...,Day8_2,Day14_2,Day0_3,Day1_3,Day2_3,Day3_3,Day4_3,Day5_3,Day8_3,Day14_3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,350,230,217,175,197,187,2282,218,237,252,...,231,276,269,163,185,234,240,238,194,240
5_8S_rRNA,0,0,0,7,4,0,11,1,2,1,...,0,2,0,0,1,3,1,7,0,0
6M1-18,0,0,1,0,0,0,81,0,0,0,...,2,0,0,0,0,0,0,0,0,0
7M1-2,0,0,0,1,0,0,15,0,0,0,...,0,0,3,0,0,0,0,0,0,0
7SK,1068,761,315,300,255,273,2894,304,728,533,...,358,308,717,446,417,300,475,378,443,405
A1BG,340,330,278,227,322,356,713,310,190,149,...,333,225,304,183,186,197,279,416,349,371
A1BG-AS1,283,179,168,190,197,251,469,247,187,133,...,314,274,292,153,240,215,351,371,313,304
A1CF,8,3,9,9,4,2,424,6,7,0,...,3,1,4,0,15,11,13,3,9,7
A2M,113,11,39,25,27,41,576,26,33,6,...,39,56,55,11,41,21,62,82,47,44
A2M-AS1,46,8,21,19,10,47,122,22,49,3,...,23,40,47,3,26,26,38,50,21,48


## Quantile Normalization

In [34]:
# Quantile-normalize the raw counts. QuantileTransformer treats every column
# as one feature, so each sample (column) is mapped independently onto the
# transformer's default uniform output distribution across its genes (rows).
x = ebv_gencounts.values

# Pin the seed: the transformer may randomly subsample rows when estimating
# the quantiles, which would otherwise make the normalized values (and all
# downstream predictions) differ between runs.
X_std = QuantileTransformer(random_state=0).fit_transform(x.astype(float))

# Re-attach the original gene (row) and sample (column) labels.
ebv_gencounts_norm_qu = pandas.DataFrame(X_std, index=ebv_gencounts.index, columns=ebv_gencounts.columns)

ebv_gencounts_norm_qu

Unnamed: 0_level_0,Day0_1,Day1_1,Day2_1,Day3_1,Day4_1,Day5_1,Day8_1,Day14_1,Day0_2,Day1_2,...,Day8_2,Day14_2,Day0_3,Day1_3,Day2_3,Day3_3,Day4_3,Day5_3,Day8_3,Day14_3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,6.883960e-01,6.733400e-01,6.596597e-01,6.433100e-01,6.476476e-01,6.493129e-01,8.399452e-01,6.581582e-01,6.800133e-01,6.830163e-01,...,6.493994e-01,6.572573e-01,6.951952e-01,6.696697e-01,6.446446e-01,6.524024e-01,6.401401e-01,6.476476e-01,6.409743e-01,6.501502e-01
5_8S_rRNA,1.000000e-07,1.000000e-07,1.000000e-07,4.399399e-01,4.074074e-01,1.000000e-07,1.151151e-01,3.063063e-01,4.004004e-01,3.608609e-01,...,1.000000e-07,3.678679e-01,1.000000e-07,1.000000e-07,3.473473e-01,3.963964e-01,3.468468e-01,4.284284e-01,1.000000e-07,1.000000e-07
6M1-18,1.000000e-07,1.000000e-07,2.907908e-01,1.000000e-07,1.000000e-07,1.000000e-07,2.862863e-01,1.000000e-07,1.000000e-07,1.000000e-07,...,3.363363e-01,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07
7M1-2,1.000000e-07,1.000000e-07,1.000000e-07,3.148148e-01,1.000000e-07,1.000000e-07,1.301301e-01,1.000000e-07,1.000000e-07,1.000000e-07,...,1.000000e-07,1.000000e-07,4.024024e-01,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07,1.000000e-07
7SK,8.218218e-01,7.771104e-01,6.846847e-01,6.791500e-01,6.646647e-01,6.779439e-01,8.709209e-01,6.846847e-01,8.024691e-01,7.483363e-01,...,6.806807e-01,6.659993e-01,8.076648e-01,7.544424e-01,7.029029e-01,6.676677e-01,6.859237e-01,6.836837e-01,7.003003e-01,6.893892e-01
A1BG,6.856857e-01,6.995556e-01,6.761474e-01,6.599016e-01,6.816708e-01,7.014515e-01,6.800630e-01,6.861862e-01,6.601602e-01,6.462010e-01,...,6.753440e-01,6.429763e-01,7.075108e-01,6.783448e-01,6.451451e-01,6.411411e-01,6.494494e-01,6.931932e-01,6.809667e-01,6.823490e-01
A1BG-AS1,6.703370e-01,6.556557e-01,6.426426e-01,6.476476e-01,6.476476e-01,6.706707e-01,6.226226e-01,6.677936e-01,6.591592e-01,6.382050e-01,...,6.704818e-01,6.568569e-01,7.037037e-01,6.649983e-01,6.614815e-01,6.471471e-01,6.638580e-01,6.819319e-01,6.722321e-01,6.671672e-01
A1CF,4.279279e-01,3.903904e-01,4.504505e-01,4.574575e-01,4.074074e-01,3.633634e-01,6.066066e-01,4.349349e-01,4.449449e-01,1.000000e-07,...,3.698699e-01,3.443443e-01,4.159159e-01,1.000000e-07,4.819820e-01,4.654655e-01,4.629630e-01,3.858859e-01,4.524525e-01,4.434434e-01
A2M,5.995996e-01,4.739740e-01,5.480480e-01,5.225225e-01,5.250250e-01,5.530531e-01,6.516517e-01,5.265265e-01,5.405405e-01,4.389389e-01,...,5.450450e-01,5.605606e-01,5.765766e-01,4.904905e-01,5.485485e-01,5.050050e-01,5.575576e-01,5.805806e-01,5.565566e-01,5.515516e-01
A2M-AS1,5.410410e-01,4.519520e-01,5.075075e-01,5.050050e-01,4.639640e-01,5.630631e-01,3.693694e-01,5.155155e-01,5.665666e-01,4.009009e-01,...,5.110110e-01,5.385385e-01,5.660661e-01,4.224224e-01,5.195195e-01,5.180180e-01,5.280280e-01,5.485485e-01,5.065065e-01,5.565566e-01


## Prediction using cyclone

Cyclone is run on the quantile-normalized EBV matrix, using only the cell-cycle (cc) marker pairs derived from the Oscope dataset.

In [35]:
# Score every sample of the normalized matrix against the marker pairs and
# predict its cell cycle phase (see the [cyclone] log lines printed below).
# NOTE(review): the meaning of processes=0 is not visible here — presumably
# it selects pypairs' default/automatic parallelism; confirm.
prediction = pairs.cyclone(ebv_gencounts_norm_qu, cc_marker, verbose=True, processes=0)

[__set_matrix] Original Matrix 'x' has shape 28730 x 24
[__set_matrix] Matrix truncation done. Working with 28730 genes for 24 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 6 marker pairs. 1920 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G1: 9, S: 15


## Show table with scores and prediction 

In [36]:
# Convert the cyclone result into a table via the project helper. As rendered
# below, it has one row per sample with the columns G1, G2M, S, their *_norm
# counterparts and the predicted phase.
prediction_table = helper.get_prediction_table(prediction)
# Display the table through the helper's DataTable wrapper.
helper.DataTable(prediction_table)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Day0_1,0.72,0.0,1.0,0.418605,0.0,0.581395,G1
Day1_1,0.857,0.0,0.999,0.461746,0.0,0.538254,G1
Day2_1,0.823,0.0,1.0,0.451454,0.0,0.548546,G1
Day3_1,0.381,0.002002,0.905,0.295807,0.001554,0.702639,S
Day4_1,0.225,0.103,0.3,0.35828,0.164013,0.477707,S
Day5_1,0.192,0.273,0.17,0.302362,0.429921,0.267717,S
Day8_1,0.215,0.11,0.468,0.271122,0.138714,0.590164,S
Day14_1,0.203,0.151,0.383,0.275441,0.204885,0.519674,S
Day0_2,0.77,0.0,0.999,0.435274,0.0,0.564726,G1
Day1_2,0.931,0.0,1.0,0.482134,0.0,0.517866,G1


## Visualization per group

In [37]:
# Sampling days of the time course; every replicate group was sampled on all of them.
days = [0, 1, 2, 3, 4, 5, 8, 14]


def build_group_figure(table, days, n_groups=3):
    """Build a one-row subplot grid with the phase scores of each group.

    The rows of ``table`` are expected to be ordered group by group, each
    group contributing ``len(days)`` consecutive rows (Day0_1 ... Day14_1,
    Day0_2 ..., as in the rendered prediction table). Columns are addressed
    by position: 0 = G1, 1 = G2M, 2 = S.

    Parameters
    ----------
    table : pandas.DataFrame
        Prediction table holding the raw phase scores in its first three columns.
    days : list
        X-axis values; also defines how many rows belong to one group.
    n_groups : int, optional
        Number of replicate groups, i.e. subplots (default 3).

    Returns
    -------
    The plotly figure containing one subplot per group, each with a G1, a
    G2M and an S trace (added in that order).
    """
    n_days = len(days)

    # (phase label, column position in `table`, marker symbol, colour),
    # listed in the order in which the traces are added to every subplot.
    phase_styles = [
        ('G1', 0, 'circle', 'red'),
        ('G2M', 1, 'square', 'blue'),
        ('S', 2, 'triangle-up', 'green'),
    ]

    fig = tools.make_subplots(
        rows=1,
        cols=n_groups,
        subplot_titles=tuple('Group {}'.format(g + 1) for g in range(n_groups)),
    )

    for group in range(n_groups):
        first_row = group * n_days
        # Only the first group's traces appear in the legend; they carry the
        # plain phase name, because colours/symbols are shared by all groups.
        in_legend = group == 0
        for phase, column, symbol, color in phase_styles:
            trace = go.Scatter(
                x=days,
                # Exactly this group's rows. The hand-written version sliced
                # 16:243 for the last G2M trace, which only worked because
                # positional slices are clamped to the table length.
                y=table.iloc[first_row:first_row + n_days, column].values,
                mode='lines+markers',
                marker=dict(symbol=symbol, size=10, color=color),
                name=phase if in_legend else 'Group {} - {}'.format(group + 1, phase),
                showlegend=in_legend,
            )
            fig.append_trace(trace, 1, group + 1)

        # Same x-axis title and range for every subplot.
        fig['layout']['xaxis{}'.format(group + 1)].update(title='Day', range=[0, 15])

    # Only the left-most subplot gets a y-axis title.
    fig['layout']['yaxis1'].update(title='Score [0-1]')
    return fig


fig = build_group_figure(prediction_table, days)

iplot(fig, filename='ebv')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]



## Visualization averaged

In [38]:
def _mean_over_groups(column):
    """Average one score column over the three groups, separately per day.

    Rows d, d + 8 and d + 16 of ``prediction_table`` are the same day in
    group 1, 2 and 3 respectively; returns a list with one mean per day.
    """
    scores = prediction_table.iloc[:, column]
    return [
        np.average(scores.iloc[[day, day + 8, day + 16]].values)
        for day in range(8)
    ]


# Column positions in prediction_table: 0 = G1, 1 = G2M, 2 = S.
avg_g1 = _mean_over_groups(0)
avg_g2m = _mean_over_groups(1)
avg_s = _mean_over_groups(2)

In [39]:
# Plot the group-averaged phase scores over the time course, using the
# helper's default plot type.
timeline_fig = helper.get_prediction_plot(
    avg_g1,
    avg_s,
    avg_g2m,
    xaxis=[0, 1, 2, 3, 4, 5, 8, 14],
    xaxislbl="Day",
    title="Time line of phase scores for EBV infected B cells",
    width=950,
    height=600,
)
iplot(timeline_fig)

In [40]:
# Same group-averaged scores, rendered with the helper's "pie" plot type.
pie_fig = helper.get_prediction_plot(
    avg_g1,
    avg_s,
    avg_g2m,
    t="pie",
    xaxis=[0, 1, 2, 3, 4, 5, 8, 14],
    xaxislbl="Day",
    title="Prediction of EBV infected B cells",
    width=950,
    height=950,
)
iplot(pie_fig)