This notebook contains examples of how to interact with each of the scripts in `/src`.

In [None]:
# only run if this repo is stored on your GDrive, ignore if you are running these things locally

# Mount Google Drive inside the Colab VM so the shared project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')


# Make the repository root the working directory: the cells below use paths
# relative to it (src/, data/, models/, experiments/, evaluation/).
# NOTE(review): the saved output of this cell reports a different directory
# (".../NRC_Amii_Agronomics_Project/clean") -- confirm this path is current.
%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/plant_enhancer_and_accessibility_prediction'

Mounted at /content/drive
/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/clean


In [None]:
# !pip list -v

In [None]:
from subprocess import run
import pandas as pd
import time

# Data pre-processing

Running these cells calls a script that takes in a filepath to an unprocessed dataset in `data/raw/`, processes this dataset, and outputs it to a destination within `data/processed/`

In order to work with training, a dataset needs at least three columns: one called "sequence" which contains the nucleotide sequence, one or more columns whose names contain the word "target", and a column called "set" which contains either "train", "test", or "val".

## Canola

In [None]:
# call script to process canola starr dataset
# Reads data/raw/hidra.tsv and writes data/processed/bnapus_organelle_starr.csv.
# NOTE(review): in the saved preview of the output, rows whose "ref" equals the
# --organelle value (NC_016734.1) are labelled set == "test" -- presumably this
# flag selects the held-out reference; confirm in the script.
!python src/create_final_dataset-bnapus_starr.py \
        --data_input_path data/raw/hidra.tsv \
        --data_output_path data/processed/bnapus_organelle_starr.csv \
        --pad_up_to 0 \
        --num_chunks 70 \
        --organelle "NC_016734.1"

In [None]:
# preview the processed canola STARR dataset (rich display of the loaded frame)
pd.read_csv("data/processed/bnapus_organelle_starr.csv")

Unnamed: 0,ref,start_coord,end_coord,sequence,target,set
0,NC_016734.1,0,145,AATCATAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAAT...,0.26,test
1,NC_016734.1,5,150,TAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAAC...,0.27,test
2,NC_016734.1,10,155,ACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCG...,0.27,test
3,NC_016734.1,15,160,GTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGT...,0.27,test
4,NC_016734.1,20,165,GGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGTGAATT...,0.27,test
...,...,...,...,...,...,...
69029,NC_008285.1,221685,221830,ATGGAGTTGTGTTTTGCCACCTGGAGTTTTAATGGAAGTTTGAGTG...,0.47,val
69030,NC_008285.1,221690,221835,GTTGTGTTTTGCCACCTGGAGTTTTAATGGAAGTTTGAGTGCGTCC...,0.50,val
69031,NC_008285.1,221695,221840,GTTTTGCCACCTGGAGTTTTAATGGAAGTTTGAGTGCGTCCTAAAA...,0.47,val
69032,NC_008285.1,221700,221845,GCCACCTGGAGTTTTAATGGAAGTTTGAGTGCGTCCTAAAAGCCAA...,0.51,val


In [None]:
# call script to process canola genome ATAC dataset w/ real-valued targets
# Reads data/raw/atac.expression_with_seq.tsv and writes
# data/processed/bnapus_genome_atac_real.csv with a single real-valued "target" column.
!python src/create_final_dataset-bnapus_atac_real.py \
        --data_input_path data/raw/atac.expression_with_seq.tsv \
        --data_output_path data/processed/bnapus_genome_atac_real.csv \
        --sample 32 \
        --target "averaged" \
        --accessible "all" \
        --inaccessible "all"

# note: accessible and inaccessible datasets contain overlapping sequences
# be careful about the train/test/val split (e.g. training on accessible, testing on the inaccessible)

In [None]:
# preview the processed canola ATAC dataset with real-valued targets
pd.read_csv("data/processed/bnapus_genome_atac_real.csv")

Unnamed: 0,ref,start_coord,end_coord,sequence,target,set
0,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,2.116333e-08,train
1,N1,544,697,ATTGTTTTGGTTCACAATGGCGTCCACTCCTTCTCAAAATTCGAAG...,1.382833e-07,train
2,N1,1088,1241,ATGCCCAGAAAGAAACAGGCTTATGTCCAGGATGTAAAGAACAATA...,1.424333e-07,train
3,N1,1632,1785,GGAGAAATGAACAGTGGTGGCTTATCTCAGGAACAAGCTCTCACCT...,1.502278e-07,train
4,N1,2176,2329,TAATCAAATAATATGCACTTATTCAAAATCTTTTTTGTTTTGTTTA...,9.752222e-08,train
...,...,...,...,...,...,...
1206877,N9,39585112,39585265,CCAGTTCTTACGTTATGTTCCAAATATGCTATATTTGCATGAGACT...,6.388333e-08,train
1206878,N9,39585656,39585809,CCCCCGTTTCTCCGCCGGGAAGACCAAGACACAGCTCAGTCTCTCT...,1.680389e-07,train
1206879,N9,39586744,39586897,CTTAAAACCAAAAATATTGAAGATTTGTCTGTCTGTTATGTTCTAC...,7.905556e-08,train
1206880,N9,39587288,39587441,ACACATATAAACGCTCCATTGCTCGTTGTAATTTTAAAATAATAAC...,8.317222e-08,train


In [None]:
# call script to process canola genome ATAC dataset w/ binary targets
# Reads data/raw/atac.expression.binary.reformatted.tsv and writes a CSV for the
# single target named by --target ("bud-green"); the output filename carries the
# same target name so that files for different targets do not overwrite each other.
!python src/create_final_dataset-bnapus_atac_binary.py \
        --data_input_path data/raw/atac.expression.binary.reformatted.tsv \
        --data_output_path data/processed/bnapus_genome_atac_binary_bud-green.csv \
        --sample 64 \
        --target "bud-green"

In [None]:
# preview the processed canola ATAC dataset with the binary "bud-green" target
pd.read_csv("data/processed/bnapus_genome_atac_binary_bud-green.csv")

Unnamed: 0,ref,start_coord,end_coord,sequence,target,set
0,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,0,train
1,N1,1088,1241,ATGCCCAGAAAGAAACAGGCTTATGTCCAGGATGTAAAGAACAATA...,0,train
2,N1,2176,2329,TAATCAAATAATATGCACTTATTCAAAATCTTTTTTGTTTTGTTTA...,0,train
3,N1,3264,3417,AACAAGTTTTTGGCATGAAAACGCATTTTTTGCGATTTTGGCGGGA...,0,train
4,N1,4352,4505,ATGATCCATCTGAATGAGTTGCACTTTTAGTGCCTAAACAAACAAA...,0,train
...,...,...,...,...,...,...
604010,N9,40285240,40285393,GGAGTTTTAAGAAGTGGAAGCTTGACTTTTCCTTTATGACAACACG...,0,train
604011,N9,40286328,40286481,ATTAAAGAAATCAGAATATCTATAAATTAGGTTTGGAATGGTTATT...,0,train
604012,N9,40287416,40287569,TTAAATATACCAAAATTTTAGCAAATTTATTACTTCATCTGAAGAT...,0,train
604013,N9,40288504,40288657,CAGAATTAGTGATGTTTTATAAATAGAATAAGACGTACCTTTATAC...,0,train


## Arabidopsis

In [None]:
# call script to process unannotated arabidopsis genome starr dataset
# Reads data/raw/athal_starr_hidra.tsv and writes data/processed/athaliana_genome_starr.csv,
# the dataset used by both parameter-sweep examples further down.
!python src/create_final_dataset-athaliana_genome_unannotated.py \
        --data_input_path data/raw/athal_starr_hidra.tsv \
        --data_output_path data/processed/athaliana_genome_starr.csv \
        --sample 64 \
        --pad_up_to 0 \
        --control_threshold 30 \
        --treatment_threshold 5

In [None]:
# preview the processed (unannotated) arabidopsis genome STARR dataset
pd.read_csv("data/processed/athaliana_genome_starr.csv")

Unnamed: 0,ref,start_coord,end_coord,sequence,target,set
0,Chr1,3520,3665,TAAAACCATACCAATTAAACCGGAGATCCATATTAATTTAATTAAG...,-0.439532,train
1,Chr1,4160,4305,GTGGTAAGATTGGTCATAAAAGGGTTTTGGTGTTCCTCGATGGAAG...,-0.285909,train
2,Chr1,4480,4625,TACAGAGGACATATGTCATCTGCAGACTTGAGTACAAGGGTGATGA...,-0.628684,train
3,Chr1,4800,4945,CATTATGCAGCAGCAACCACTTCAAGGATCATTCAACCCTCTCCTT...,-0.542793,train
4,Chr1,5120,5265,TCACAACCTATATCGCTTCTATATCTCACACGCTGAATTTTGGCTT...,0.104023,train
...,...,...,...,...,...,...
257866,Chr5,26974040,26974185,TTGTTTTCTGTACTTGTTGCCACCATGATGCGGCTTGCCCTCGAAA...,-0.098684,train
257867,Chr5,26974360,26974505,ATGTGTCTGTGTGGTGATAATATTAATGATGTAATAGATGTATGAT...,0.337695,train
257868,Chr5,26974680,26974825,CAAAACACACACTCTAGTCGAAGAATTTAGGCAAAACTCACACGCC...,0.328623,train
257869,Chr5,26975000,26975145,AAATAAGGTAGAGTGACAATTCTTTCTAATGTTCATTTGAAATAAA...,0.800391,train


In [None]:
# call script to process arabidopsis genome starr dataset, split for accessibility
# Unlike the other pre-processing scripts, this one takes two output paths:
# one CSV for accessible regions and one for inaccessible regions.
# NOTE(review): --trim/--downsample/--save_separate are passed as the strings
# "True"/"False"; how they are converted to booleans depends on the script's
# argument parser -- confirm that "False" is really treated as false.
!python src/create_final_dataset-athaliana_genome_annotated.py \
        --data_input_path data/raw/athal_starr.acr.regions.bed \
        --accessible_data_output_path data/processed/athaliana_genome_starr_accessible.csv \
        --inaccessible_data_output_path data/processed/athaliana_genome_starr_inaccessible.csv \
        --sample 1 \
        --pad_up_to 0 \
        --trim True \
        --downsample True \
        --save_separate False \
        --control_threshold 30 \
        --treatment_threshold 5

# note: resulting accessible and inaccessible datasets contain no sequences in common if trim == True
# e.g. one could train on inaccessible and test on accessible with no issues

In [None]:
# preview the inaccessible-region CSV; it carries two extra annotation
# columns ("ACR_flag" and "region") compared with the unannotated dataset
pd.read_csv("data/processed/athaliana_genome_starr_inaccessible.csv")

Unnamed: 0,ref,start_coord,end_coord,sequence,ACR_flag,region,target,set
0,Chr1,3519,3672,GTAAAACCATACCAATTAAACCGGAGATCCATATTAATTTAATTAA...,0,proximal,-0.439532,train
1,Chr1,3536,3689,AAACCGGAGATCCATATTAATTTAATTAAGAAAATAAAAATAAAAG...,0,proximal,-0.435215,train
2,Chr1,3553,3706,TAATTTAATTAAGAAAATAAAAATAAAAGGAATAAATTGTCTTATT...,0,proximal,-0.479610,train
3,Chr1,3570,3723,TAAAAATAAAAGGAATAAATTGTCTTATTTAAACGCTGACTTCACT...,0,genic,-0.436099,train
4,Chr1,3570,3723,TAAAAATAAAAGGAATAAATTGTCTTATTTAAACGCTGACTTCACT...,0,genic,-0.436099,train
...,...,...,...,...,...,...,...,...
127802,Chr5,26958362,26958515,ATTTGGAAATTAACAGGTGATAATATACTCACCTGCTAGAACAGCT...,0,genic,0.007654,train
127803,Chr5,26958379,26958532,TGATAATATACTCACCTGCTAGAACAGCTACCCAGTCAGGATCTGG...,0,genic,0.022840,train
127804,Chr5,26961405,26961558,GGCTCCTTTCAGGAGTTTTATGGAAAGAGGAAGGAAATGTATCTGA...,0,genic,0.474372,train
127805,Chr5,26961422,26961575,TTATGGAAAGAGGAAGGAAATGTATCTGATTCAGACGGTGGAGATT...,0,genic,0.342525,train


In [None]:
# call script to process arabidopsis BAC starr dataset
# Reads data/raw/athal_bac.intro.tsv and writes data/processed/athaliana_bac_starr.csv.
!python src/create_final_dataset-athaliana_bac.py \
        --data_input_path data/raw/athal_bac.intro.tsv \
        --data_output_path data/processed/athaliana_bac_starr.csv \
        --pad_up_to 0 \
        --num_chunks 10

In [None]:
# preview the processed arabidopsis BAC STARR dataset
pd.read_csv("data/processed/athaliana_bac_starr.csv")

Unnamed: 0,ref,start_coord,end_coord,sequence,target,set
0,AC002387.3,0,153,GAATTCTTAAGATTGATCTGAGTTTTCCTCTTACACCGAATGTTTC...,-0.326286,test
1,AC002387.3,17,170,CTGAGTTTTCCTCTTACACCGAATGTTTCAGAAGAAGCCAAAAATC...,-0.432610,test
2,AC002387.3,34,187,ACCGAATGTTTCAGAAGAAGCCAAAAATCTTATCAGTCAGGTACAC...,-0.385478,test
3,AC002387.3,51,204,AAGCCAAAAATCTTATCAGTCAGGTACACACCCAACCAAGCTAAAG...,-0.330750,test
4,AC002387.3,68,221,AGTCAGGTACACACCCAACCAAGCTAAAGCATACCCACAACGACTC...,-0.341424,test
...,...,...,...,...,...,...
6986,AC002387.3,122706,122859,ACATGATCAACAAAAGGTACTTACTTCTTCTTGTCTATATTTGGGA...,-0.361640,val
6987,AC002387.3,122723,122876,TACTTACTTCTTCTTGTCTATATTTGGGACTTCACTTTTCTCAGCC...,-0.346007,val
6988,AC002387.3,122740,122893,CTATATTTGGGACTTCACTTTTCTCAGCCTTTTCCACAATCACCTG...,-0.434803,val
6989,AC002387.3,122757,122910,CTTTTCTCAGCCTTTTCCACAATCACCTGCAACCAAACTACAATGT...,-0.457397,val


## Drosophila

In [None]:
# call script to process drosophila starr dataset
!python src/create_final_dataset-dmelanogaster.py \
        --train_activity data/raw/Sequences_activity_Train.txt \
        --val_activity data/raw/Sequences_activity_Val.txt \
        --test_activity data/raw/Sequences_activity_Test.txt \
        --train_sequences data/raw/Sequences_Train.fa \
        --val_sequences data/raw/Sequences_Val.fa \
        --test_activity data/raw/Sequences_Test.fa \
        --data_output_path data/processed/dmelanogaster.csv \
        --target "multi"

In [None]:
# preview the processed drosophila dataset; it has two target columns
# ("dev_target" and "hk_target"), i.e. it is a multi-task dataset
pd.read_csv("data/processed/dmelanogaster.csv")

Unnamed: 0,dev_target,hk_target,set,sequence
0,6.046261,1.711032,train,ATTCAGATTGCCTCTCATTGTCTCACCCATATTATGGGAACCAAAT...
1,5.604215,1.974276,train,AAATGGCCGCTCAAGAAAAGGCTCGAATATATATTGCCTGCCTCTC...
2,2.312282,0.606119,train,ATAAGGATCAAAAAGTCCTGATTTCCGAAATGGCGGTTCTCCTTCA...
3,1.254361,3.780414,train,TTTCCATGACTGACTGGAATGGGTGGAGAACATCGCTTTGGGAGTG...
4,2.658029,0.714676,train,TCTATCGACCCATAGCCGTAGTCGCTAGACCCGCCCTTCGGAGCAT...
...,...,...,...,...
484029,1.238013,-0.517860,val,GCAAATCATGAAGGTGGAATATATATATAGCAGATCTAAACCCTTC...
484030,0.052251,-1.155432,val,AAGTATTTTCCCAATTTCCCATACCGCACCCAGCTGCATTTCCCAT...
484031,0.235486,-0.150081,val,TTAATTTAAAGTACTTCCCATCAGCTTTTTGTGCATTTTTCCCATG...
484032,-0.456804,0.882345,val,GTCGTCTTACACTCGCAGCAAAACTTGTAAATTAAAAATGCGTCCA...


## Tobacco

In [None]:
# call script to process tobacco 35S starr dataset
# Reads data/raw/tobacco.csv and writes data/processed/ntabacum_35s_starr.csv.
!python src/create_final_dataset-ntabacum.py \
        --data_input_path data/raw/tobacco.csv \
        --data_output_path data/processed/ntabacum_35s_starr.csv \
        --pad_up_to 0

# NOTE: does not create "set" attribute
# (so this file lacks the "set" column that training requires; in this notebook
# it is only used as input to the prediction/evaluation script)

In [None]:
# preview the processed tobacco dataset (only "sequence" and "target" columns, no "set")
pd.read_csv("data/processed/ntabacum_35s_starr.csv")

Unnamed: 0,sequence,target
0,AAGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCAC...,-0.073000
1,ACGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCAC...,0.006000
2,AGGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCAC...,-0.054000
3,ATGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCAC...,-0.184000
4,AGAATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCAC...,-0.037000
...,...,...
1201,AGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCACG...,-0.140368
1202,AGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCACG...,-0.068632
1203,AGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCACG...,0.158253
1204,AGATCTCTCTGCCGACAGTGGTCCCAAAGATGGACCCCCACCCACG...,-0.126364


# Model training

If there is a single column in the input dataset labeled "target", this will be inferred as the attribute to predict. If multiple columns contain the word "target", these will all be taken as multi-task target attributes, predicted simultaneously, and evaluated separately.

## Nucleotide Frequency Models

### Feed-Forward Neural Network

In [None]:
# train feed-forward neural network to predict real-valued or binary target(s)
# Features are nucleotide-frequency counts, switched on/off by the three
# --include_*_freq flags (1 = include, 0 = exclude); here only mononucleotide
# frequencies are used.
# NOTE(review): the input file bnapus_genome_atac_binary_multi.csv is not
# produced by any pre-processing cell shown in this notebook (the binary ATAC
# example writes ..._bud-green.csv) -- presumably it comes from running that
# script with a multi-target setting; confirm and generate it before running this.
!python src/model_train-frequency_features.py \
        --data_input_path data/processed/bnapus_genome_atac_binary_multi.csv \
        --save_path "experiments/bnapus_ffnn/" \
        --model "ffnn" \
        --task "binclass" \
        --include_mononuc_freq 1 \
        --include_dinuc_freq 0 \
        --include_trinuc_freq 0 \
        --learning_rate 0.002 \
        --batch_size 512 \
        --num_epochs 500 \
        --patience 20 \
        --layer_1_size 24 \
        --layer_1_activation "relu" \
        --layer_2_size 0 \
        --layer_2_activation "relu" \
        --layer_3_size 0 \
        --layer_3_activation "relu"

# note: make sure dataset and task match!
# (binary targets go with --task "binclass", real-valued targets with --task "regression")

targets: ['bud-green_target', 'bud-yellow_target', 'peduncle-down-15cm_target', 'seed-21d_target', 'silique-1week_target', 'silique-2week_target', 'silique-3week_target', 'silique-4week_target', 'stem-down-15cm_target']
got data
2023-01-05 23:51:53.871391: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch

### Parameter sweep example

In [None]:
# test many settings in one go

models = [(24, 0), (64, 24)]  # (layer_1_size, layer_2_size)
nts = ["100", "010", "001", "110", "101", "011", "111"]  # "include_mononuc_freq|include_dinuc_freq|include_trinuc_freq"

for nt in nts:
  for model in models:
    print(nt, model)

    t0 = time.time()
    !python src/model_train-frequency_features.py \
            --data_input_path data/processed/athaliana_genome_starr.csv \
            --save_path "experiments/athal_genome_ffnn/" \
            --model "ffnn" \
            --task "regression" \
            --include_mononuc_freq {int(nt[0])} \
            --include_dinuc_freq {int(nt[1])} \
            --include_trinuc_freq {int(nt[2])} \
            --learning_rate 0.002 \
            --batch_size 512 \
            --num_epochs 500 \
            --patience 20 \
            --layer_1_size {model[0]} \
            --layer_1_activation "relu" \
            --layer_2_size {model[1]} \
            --layer_2_activation "relu" \
            --layer_3_size 0 \
            --layer_3_activation "relu"
    print(time.time()-t0)
    print()

## Nucleotide Sequence Models

### Convolutional Model

In [None]:
# train convolutional neural network to predict real-valued or binary target(s)
# The architecture and initial weights are loaded from the files given by
# --model_path / --weights_path (under models/).
# NOTE(review): --num_epochs is 5 here while --patience is 20, so if patience is
# an early-stopping window it can never trigger in this short demo run -- raise
# --num_epochs (the other examples use 500) for a real training run.
# NOTE(review): the --conv_*_set / --linear_mapping values are single digits
# (0, 1 or 2 in the sweep example below); their meaning is defined by the
# training script, not visible here.
!python src/model_train-sequence_features.py \
        --data_input_path data/processed/dmelanogaster.csv \
        --save_path "experiments/dros_cnn/" \
        --model "cnn" \
        --task "regression" \
        --learning_rate 0.002 \
        --batch_size 512 \
        --num_epochs 5 \
        --patience 20 \
        --shuffle 0 \
        --model_path "models/mpra_dragonn_model.json" \
        --weights_path "models/mpra_dragonn_weights.hdf5" \
        --conv_1_set 0 \
        --conv_2_set 0 \
        --conv_3_set 0 \
        --linear_mapping 0 \
        --last_conv_layer 1

# note: make sure dataset and task match!

2023-01-06 17:04:11.646665: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
targets: ['dev_target', 'hk_target']
tcmalloc: large alloc 3205357568 bytes == 0xcfa42000 @  0x7f0e8601b1e7 0x7f0e5084314e 0x7f0e508a10b5 0x7f0e508a16f9 0x7f0e5094239f 0x5aae14 0x4997c7 0x5d8868 0x4990ca 0x5d8868 0x4990ca 0x55cd91 0x55d743 0x642630 0x6426ae 0x644b78 0x64511c 0x677e5e 0x678029 0x7f0e85c18c87 0x5e1baa
got data
2023-01-06 17:05:48.011583: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-06 17:05:

### Recurrent Model

In [None]:
# train recurrent neural network to predict real-valued or binary target(s)
# Same training script as the convolutional example, selected with --model "rnn";
# the only architecture flag passed here is --layer_size.
!python src/model_train-sequence_features.py \
        --data_input_path data/processed/dmelanogaster.csv \
        --save_path "experiments/dros_rnn/" \
        --model "rnn" \
        --task "regression" \
        --learning_rate 0.002 \
        --batch_size 512 \
        --num_epochs 500 \
        --patience 20 \
        --shuffle 0 \
        --layer_size 128

# note: make sure dataset and task match!

2023-01-06 17:14:03.910000: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
targets: ['dev_target', 'hk_target']
tcmalloc: large alloc 3205357568 bytes == 0xcf604000 @  0x7fa21190a1e7 0x7fa1dc13214e 0x7fa1dc1900b5 0x7fa1dc1906f9 0x7fa1dc23139f 0x5aae14 0x4997c7 0x5d8868 0x4990ca 0x5d8868 0x4990ca 0x55cd91 0x55d743 0x642630 0x6426ae 0x644b78 0x64511c 0x677e5e 0x678029 0x7fa211507c87 0x5e1baa
got data
2023-01-06 17:15:37.128894: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-06 17:15:

### Parameter sweep example

In [None]:
# test many settings in one go

setups = ["0000", "1110", "2220", "2210", "2200", "2221"]

for setup in setups:
  print(setup)

  t0 = time.time()
  !python src/model_train-sequence_features.py \
          --data_input_path data/processed/athaliana_genome_starr.csv \
          --save_path "experiments/athal_genome_cnn/" \
          --model "cnn" \
          --task "regression" \
          --learning_rate 0.002 \
          --batch_size 512 \
          --num_epochs 500 \
          --patience 20 \
          --shuffle 0 \
          --model_path "models/mpra_dragonn_model.json" \
          --weights_path "models/mpra_dragonn_weights.hdf5" \
          --conv_1_set {int(setup[0])} \
          --conv_2_set {int(setup[1])} \
          --conv_3_set {int(setup[2])} \
          --linear_mapping {int(setup[3])} \
          --last_conv_layer 1
  print(time.time()-t0)
  print()

# Model Evaluation and Prediction Generation
Takes in a dataset and a model and outputs:
*   Model predictions and various performance metrics if target is provided
*   Model predictions if target is not provided

In other words, to use this script, the dataset needs (at minimum) a column with the header "sequence". The length of these sequences must be compatible with the layer dimensions of the model specified.



### Generate Predictions and Evaluate

In [None]:
# generate predictions and evaluations given a model and a dataset
# The dataset here has a "target" column, so metrics are computed as well as predictions.
# NOTE(review): --experiment_path points under models/, whereas the training
# examples above save to experiments/ -- presumably a trained run was copied
# there; adjust the path to wherever your trained experiment folder lives.
!python src/predict_and_evaluate.py \
        --data_input_path data/processed/ntabacum_35s_starr.csv \
        --experiment_path "models/cnn_20230211-200556_out1_lr0.002_bs512_sh0_0000" \
        --save_path "evaluation/ntaba_35S/"

# note: experiment path must contain
#    model_architecture.json
#    best_weights.h5
#    settings.txt (args which contain instructions for how to transform the sequences into features)
# if you want to test many models on a single dataset, see test_streetlight_models.ipynb in /notebooks

2023-01-06 17:42:38.714483: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
targets: ['target']
2023-01-06 17:42:42.195363: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-06 17:42:42.962269: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-01-06 17:42:42.962357: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:loc

### Generate Predictions

In [None]:
# generate predictions only (no evaluation) given a model and a dataset without a target column
# NOTE(review): ntabacum_35s_starr_notarget.csv is not created by any cell in
# this notebook -- presumably the tobacco dataset with its "target" column
# removed; create it before running this cell.
!python src/predict_and_evaluate.py \
        --data_input_path data/processed/ntabacum_35s_starr_notarget.csv \
        --experiment_path "models/cnn_20230211-200556_out1_lr0.002_bs512_sh0_0000" \
        --save_path "evaluation/ntaba_35S_notarget/"

2023-01-06 17:45:38.629832: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
targets: []
2023-01-06 17:45:42.088387: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-06 17:45:42.850532: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-01-06 17:45:42.850595: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/r