name: predict_on_NRC_data.ipynb

The purpose of this notebook is to create a dataset containing the 12-element feature vector output by the MPRA-DragoNN Kipoi model for each input sequence. This dataset is then used in the linear mapping step.

---

http://kipoi.org/docs/tutorials/python-api/  
http://kipoi.org/docs/using/python/

In [None]:
import logging
import os
import warnings

import kipoi
import pandas as pd

# Keep the notebook output readable.
# Level 1000 is well above logging.CRITICAL (50), so log records at every
# standard level are dropped.
logging.disable(level=1000)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Show the model sources Kipoi knows about (source name, type, local location, model counts).
kipoi.list_sources()

Unnamed: 0,source,type,location,local_size,n_models,n_dataloaders
0,kipoi,git,/Users/sarahdavis/.kipoi/models/,14M,2130,2130
1,github-permalink,github-permalink,/Users/sarahdavis/.kipoi/github-permalink/,,0,0


In [None]:
# Alternative (disabled): load the model from a local directory instead of the
# default Kipoi source -- presumably what source="dir" selects; TODO confirm.
# MODEL = "kipoi/ConvModel"
# model = kipoi.get_model(MODEL, source="dir")  # to get model from the repo

# Identifier of the model to load; "MPRA-DragoNN/DeepFactorizedModel" can be
# substituted here to use the other MPRA-DragoNN model.
MODEL = "MPRA-DragoNN/ConvModel"
model = kipoi.get_model(MODEL)    # or "MPRA-DragoNN/DeepFactorizedModel"

Using downloaded and verified file: /Users/sarahdavis/.kipoi/models/MPRA-DragoNN/ConvModel/downloaded/model_files/arch/930692182c3fcbb4483115fa4ac386f8
Using downloaded and verified file: /Users/sarahdavis/.kipoi/models/MPRA-DragoNN/ConvModel/downloaded/model_files/weights/19fb17f943c3d6bcada8c5dc638092b4


Using TensorFlow backend.


In [None]:
# Attributes for inspecting the loaded model; uncomment a line to display it.
# model.info
# model.source
# model.source_dir
# model.default_dataloader
# model.default_dataloader.info

In [None]:
# Keep a handle on the model's default dataloader; later cells instantiate it
# with keyword arguments (DataLoader(**kwargs)).
DataLoader = model.default_dataloader
# Display the example keyword arguments (paths to the example intervals and FASTA files).
DataLoader.example_kwargs

{'intervals_file': '/Users/sarahdavis/.kipoi/models/MPRA-DragoNN/ConvModel/downloaded/example_files/intervals_file',
 'fasta_file': '/Users/sarahdavis/.kipoi/models/MPRA-DragoNN/ConvModel/downloaded/example_files/fasta_file'}

---
## Run on example data from the DragoNN repository
### Pipeline
Kipoi example

In [None]:
# Fetch the model's example input files and get the dataloader kwargs pointing at them.
# NOTE(review): 'example' is presumably the output directory for the download -- confirm.
dl_kwargs = model.default_dataloader.download_example('example')  # equivalent to DataLoader.example_kwargs

# Run the prediction through the pipeline, which takes the dataloader kwargs
# directly and processes the data in batches of 4.
pred = model.pipeline.predict(dl_kwargs, batch_size=4)

3it [00:00,  6.67it/s]


In [None]:
# Sanity check: one row per example, one column per model output node.
pred.shape  # (10x12) AKA (10 examples x 12 output nodes)

(10, 12)

In [None]:
# Output vector (12 values) for the first example.
pred[0]

array([-0.288252  , -0.28210288, -0.33340785, -0.13078065, -0.11779819,
       -0.14638898, -0.22221461, -0.07526954, -0.17996395, -0.03116388,
       -0.08795099, -0.07663752], dtype=float32)

Repo example

In [None]:
# Point the dataloader at the intervals/FASTA files under data/processed.
# NOTE(review): these exact paths are reused by the "test data" and "full-size
# data" sections below, yet the recorded outputs differ between sections, so the
# files at these paths were presumably swapped on disk between runs -- confirm
# and give each dataset its own path so the notebook can be re-run top to bottom.
dl_kwargs = {"intervals_file":"data/processed/intervals_file", "fasta_file":"data/processed/fasta_file"}

# Run the prediction through the pipeline in batches of 4.
pred = model.pipeline.predict(dl_kwargs, batch_size=4)

3it [00:00, 116.19it/s]


In [None]:
# Output vector for the first example; the recorded values match the Kipoi example run above.
pred[0]

array([-0.288252  , -0.28210288, -0.33340785, -0.13078065, -0.11779819,
       -0.14638898, -0.22221461, -0.07526954, -0.17996395, -0.03116388,
       -0.08795099, -0.07663752], dtype=float32)

In [None]:
# Sanity check: one row per example, one column per model output node.
pred.shape  # (10x12) AKA (10 examples x 12 output nodes)

(10, 12)

### Iterator Method

Kipoi example

In [None]:
# Instantiate the dataloader on the model's example files.
dl = DataLoader(**DataLoader.example_kwargs)

In [None]:
# Get a batch iterator over the example data, two examples per batch.
it = dl.batch_iter(batch_size=2)
# Pull only the first batch.
batch = next(it)
# Shape of this batch's input array.
print(batch['inputs'].shape)
# Dump the whole batch dict ('inputs', 'targets', 'metadata').
print(batch)
# Predict directly on the batch inputs (no pipeline); the cell displays the result.
model.predict_on_batch(batch['inputs'])

(2, 145, 4)
{'inputs': array([[[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]]]), 'targets': {}, 'metadata': {'ranges': {'chr': array(['chr22', 'chr22'], dtype='<U5'), 'start': array([135971, 136304]), 'end': array([136116, 136449]), 'id': array(['0', '1'], dtype='<U1'), 'strand': array(['*', '*'], dtype='<U1')}}}


array([[-0.28825194, -0.2821029 , -0.33340782, -0.13078062, -0.11779823,
        -0.14638893, -0.22221467, -0.07526959, -0.17996398, -0.03116385,
        -0.08795102, -0.07663757],
       [-0.4520552 , -0.5208955 , -0.5681222 , -0.20029847, -0.2777201 ,
        -0.28277114, -0.33968392, -0.30178902, -0.3873875 , -0.15831251,
        -0.26334903, -0.25061306]], dtype=float32)

Repo example

In [None]:
# Same iterator workflow, but with the intervals/FASTA files under data/processed.
dl_kwargs = {"intervals_file":"data/processed/intervals_file", "fasta_file":"data/processed/fasta_file"}
# Instantiate the dataloader on those files.
dl = DataLoader(**dl_kwargs)

In [None]:
# Get a batch iterator, two examples per batch.
it = dl.batch_iter(batch_size=2)
# Pull only the first batch.
batch = next(it)
# Shape of this batch's input array.
print(batch['inputs'].shape)
# Dump the whole batch dict ('inputs', 'targets', 'metadata').
print(batch)
# Predict directly on the batch inputs (no pipeline); the cell displays the result.
model.predict_on_batch(batch['inputs'])

(2, 145, 4)
{'inputs': array([[[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]]]), 'targets': {}, 'metadata': {'ranges': {'chr': array(['chr22', 'chr22'], dtype='<U5'), 'start': array([135971, 136304]), 'end': array([136116, 136449]), 'id': array(['0', '1'], dtype='<U1'), 'strand': array(['*', '*'], dtype='<U1')}}}


array([[-0.28825194, -0.2821029 , -0.33340782, -0.13078062, -0.11779823,
        -0.14638893, -0.22221467, -0.07526959, -0.17996398, -0.03116385,
        -0.08795102, -0.07663757],
       [-0.4520552 , -0.5208955 , -0.5681222 , -0.20029847, -0.2777201 ,
        -0.28277114, -0.33968392, -0.30178902, -0.3873875 , -0.15831251,
        -0.26334903, -0.25061306]], dtype=float32)

---
## Run on our test data
### Pipeline

In [None]:
# Dataloader kwargs for our own test data.
# NOTE(review): these are the same paths used in the "Repo example" cells above,
# but the recorded outputs differ, so the files were presumably replaced on disk
# between runs -- confirm and use a distinct path per dataset.
dl_kwargs = {"intervals_file":"data/processed/intervals_file", "fasta_file":"data/processed/fasta_file"}
# Run the prediction through the pipeline in batches of 4.
pred = model.pipeline.predict(dl_kwargs, batch_size=4)

3it [00:00, 64.59it/s]


In [None]:
# Sanity check: one row per example, one column per model output node.
pred.shape  # (10x12) AKA (10 examples x 12 output nodes)

(10, 12)

In [None]:
# Output vector (12 values) for the first test sequence.
pred[0]

array([ 0.0571927 ,  0.06287674,  0.06941664, -0.09135027, -0.13182783,
       -0.1274572 ,  0.03053482, -0.07282072, -0.02254407, -0.04860624,
       -0.12053218, -0.1046651 ], dtype=float32)

### Iterator Method

In [None]:
# Dataloader kwargs for our own test data (same paths as the pipeline cell in this section).
dl_kwargs = {"intervals_file":"data/processed/intervals_file", "fasta_file":"data/processed/fasta_file"}
dl = DataLoader(**dl_kwargs)  # specify our own files

In [None]:
# Get a batch iterator, four examples per batch.
it = dl.batch_iter(batch_size=4)
# Pull only the first batch.
batch = next(it)
# Shape of this batch's input array.
print(batch['inputs'].shape)
# Dump the whole batch dict ('inputs', 'targets', 'metadata').
print(batch)
# Predict directly on the batch inputs (no pipeline); the cell displays the result.
model.predict_on_batch(batch['inputs'])

(4, 145, 4)
{'inputs': array([[[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        ...,
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]],

       [[0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]],

       [[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]]]), 'targets': {}, 'metadata': {'ranges': {'chr': array(['chr1', 'chr1', 'chr1', 'chr1'], dtype='<U4'), 'start': array([ 0,  5, 10, 15]), 'end': array([145, 150, 155, 160]), 'id': array(['0', '1', '2', '3'], dtype='<U1'), 'strand': array(['*', '*', '*', '*'], dtype='<U1')}}}


array([[ 0.0571927 ,  0.06287674,  0.06941664, -0.09135027, -0.13182783,
        -0.1274572 ,  0.03053482, -0.07282072, -0.02254407, -0.04860624,
        -0.12053218, -0.1046651 ],
       [ 0.24802886,  0.31453475,  0.3285599 ,  0.12974916, -0.00247397,
         0.07746662,  0.3386457 ,  0.14567457,  0.296574  , -0.01673259,
         0.06336489,  0.01785175],
       [ 0.26566124,  0.22909397,  0.289731  , -0.02052839,  0.08377554,
         0.0450661 ,  0.35583344,  0.13505654,  0.29651985,  0.1337512 ,
         0.21370882,  0.19802292],
       [ 0.36996043,  0.42554772,  0.4643766 ,  0.0721544 ,  0.10909953,
         0.10980394,  0.3669311 ,  0.2792327 ,  0.39648372,  0.11275812,
         0.2543426 ,  0.20965862]], dtype=float32)

---
## Run on our full-size data

In [None]:
# Dataloader kwargs for the full-size dataset.
# NOTE(review): identical paths to the example/test sections above; the recorded
# progress bar (18722 batches) shows a different, larger file was in place for
# this run -- confirm and use a distinct path for the full-size data.
dl_kwargs = {"intervals_file":"data/processed/intervals_file", "fasta_file":"data/processed/fasta_file"}
# Run the prediction through the pipeline in batches of 4; this is the array saved to CSV below.
pred = model.pipeline.predict(dl_kwargs, batch_size=4)

18722it [01:12, 256.70it/s]


In [None]:
# Sanity check before export: rows are examples, columns are model output nodes.
pred.shape

In [None]:
# Persist the full-size predictions for the downstream linear-mapping step.
# Columns are named node1..node12 (one per model output node); the DataFrame's
# integer index is written as the first, unnamed CSV column (pandas default).
OUTPUT_CSV = "new_data/CanolaTargetsKipoi.csv"
# Create the output directory if it is missing so to_csv does not fail on a
# fresh checkout / clean re-run.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
pd.DataFrame(pred, columns=["node"+str(i) for i in range(1,13)]).to_csv(OUTPUT_CSV)