This notebook contains functionality to perform the following:

This notebook covers three steps of the streetlight dataset creation process. First, the napus dataset is generated three different ways (from `atac.expression.binary.reformatted.tsv` and `atac.expression_with_seq.tsv`). Second, an accessibility prediction model (either freq or conv) is trained on the napus ATAC dataset (`atac.expression_with_seq.tsv`). Third, we use this model to create and restrict the Arabidopsis dataset (`athal_starr_hidra.tsv`) based on its predictions.

In [None]:
# mount google drive
# Colab-only setup: mounts Google Drive and makes the shared project folder the working
# directory, so the relative "data/..." and "experiments/..." paths used below resolve there.

from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

Mounted at /content/drive
/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics


In [None]:
import argparse
import keras
import warnings, logging
import numpy as np
import pandas as pd
import datetime, time, os
import json
import random
import tensorflow as tf
import math

from keras.models import Sequential, load_model, model_from_json
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, Dropout, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam  # https://stackoverflow.com/questions/62707558/importerror-cannot-import-name-adam-from-keras-optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping  # https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
from collections import Counter

from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, roc_auc_score
from scipy.stats import spearmanr  # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

# Quiet the notebook: disable every log record up to level 1000 and hide Python warnings.
logging.disable(1000)
warnings.filterwarnings('ignore')

# tf.random.set_seed(1202)  # https://www.tensorflow.org/api_docs/python/tf/random/set_seed
# from numpy.random import seed
# seed(1202)

# Only Python's own `random` module is seeded here (the TF / numpy seeds above are disabled).
random.seed(1234)

# list of single nucleotides
nts = [
    "A",
    "T",
    "C",
    "G",
]

# One-hot vector per base; "X" maps to all zeros.
# cross referenced with kipoi data loader
mapping = {
    "A": [1, 0, 0, 0],
    "T": [0, 0, 0, 1],
    "C": [0, 1, 0, 0],
    "G": [0, 0, 1, 0],
    "X": [0, 0, 0, 0],
}

def Spearman(y_true, y_pred):
    """Keras-style metric that evaluates scipy's `spearmanr` through `tf.py_function`.

    Both tensors are cast to float32 and passed to `spearmanr` as (y_pred, y_true).
    NOTE(review): `spearmanr` returns a (correlation, p-value) result while a single
    float32 output is declared - confirm that only the correlation is reported.
    """
    pred_f32 = tf.cast(y_pred, tf.float32)
    true_f32 = tf.cast(y_true, tf.float32)
    return tf.py_function(spearmanr, [pred_f32, true_f32], Tout=tf.float32)

## Create Napus Dataset

### Self-Defined Single-Target

In [None]:
# read in every nth row: the callable is truthy (row skipped) unless the row index is a
# multiple of 128, so only rows 0, 128, 256, ... are loaded
df_atac = pd.read_csv(
    "data/raw/new_data/atac.expression_with_seq.tsv",
    sep="\t",
    header=None,
    skiprows=lambda row_idx: row_idx % 128,
)

# sample (tissue + replicate) names; column names are assigned positionally below,
# so this order has to match the column order of the file
samples = [
    "bud-green_rep1", "bud-green_rep2",
    "bud-yellow_rep1", "bud-yellow_rep2",
    "peduncle-down-15cm_rep1", "peduncle-down-15cm_rep2",
    "seed-21d_rep1", "seed-21d_rep2",
    "silique-1week_rep1", "silique-1week_rep2",
    "silique-2week_rep1", "silique-2week_rep2",
    "silique-3week_rep1", "silique-3week_rep2",
    "silique-4week_rep1", "silique-4week_rep2",
    "stem-down-15cm_rep1", "stem-down-15cm_rep2",
]

# four locus columns followed by a "_raw" and a "_norm_target" column per sample
column_names = ["ref", "start_coord", "end_coord", "sequence"]
column_names.extend(
    item + suffix
    for item in samples
    for suffix in ("_raw", "_norm_target")
)

df_atac.columns = column_names

In [None]:
# get rid of rows (not columns) whose `ref` is one of the excluded references
excluded_refs = ["chr_contigs", "napus_chloroplast", "napus_mitochondrion"]
df_atac = df_atac[~df_atac.ref.isin(excluded_refs)]

In [None]:
df_atac.shape  # napus sequences are length 153

(333563, 40)

In [None]:
# PROCESS

# calculate number of ns in each row
# df_atac["num_n"] = df_atac.sequence.str.count("N")
# df_atac["presence_n"] = [1 if "N" in s else 0 for s in df_atac.sequence]

# Flag sequences that contain anything other than A/T/C/G.
# A fixed character class is used rather than a pattern assembled from the characters found
# in the data: when the data holds no odd characters, "|".join([]) is the empty pattern,
# which matches every row and would silently drop the whole table; characters such as "."
# or "*" would also be read as regex syntax.
mask = df_atac.sequence.str.contains("[^ATCG]", regex=True)  # true if contains weird characters, false if contains only ATCG
# keep only rows without weird characters; copy so the new column below is not set on a slice
df_atac = df_atac[np.logical_not(mask)].copy()

# per-row mean over every column whose name contains "_norm" (the normalised read columns)
norm_columns = [s for s in df_atac.columns if "_norm" in s]
df_atac["norm_read_avg"] = df_atac[norm_columns].mean(axis=1)
# df_atac["raw_read_sum"] = df_atac.iloc[:,[True if "_raw" in s else False for s in df_atac.columns]].sum(axis=1)

# ascending order: the later percentile slicing relies on the highest averages being last
df_atac = df_atac.sort_values("norm_read_avg")

# definitely good up til this point

In [None]:
df_atac

Unnamed: 0,ref,start_coord,end_coord,sequence,bud-green_rep1_raw,bud-green_rep1_norm_target,bud-green_rep2_raw,bud-green_rep2_norm_target,bud-yellow_rep1_raw,bud-yellow_rep1_norm_target,...,silique-3week_rep2_norm_target,silique-4week_rep1_raw,silique-4week_rep1_norm_target,silique-4week_rep2_raw,silique-4week_rep2_norm_target,stem-down-15cm_rep1_raw,stem-down-15cm_rep1_norm_target,stem-down-15cm_rep2_raw,stem-down-15cm_rep2_norm_target,norm_read_avg
154429,N16,25282400,25282553,TATAAATAAATAATTTAAAAATATAAAAAATTTAAAAATAGTTTCA...,0,0.000000,0,0.000000,0,0.000000,...,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0.000000
157548,N16,32069344,32069497,CGAGGAAAGAAGAAATTCCGAGGAATTTCCGAGGAAAGAAGAAATT...,0,0.000000,0,0.000000,0,0.000000,...,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0.000000
234117,N2,9354607,9354760,TTATGAAAGATGAGGATTCTTTTAGAATTTTTGATCAATAGGATGT...,0,0.000000,0,0.000000,0,0.000000,...,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0.000000
209841,N19,12512918,12513071,TGATATATTTCTAACATATAAAATTAAAAAGATAATATAATTAAAT...,0,0.000000,0,0.000000,0,0.000000,...,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0.000000
157549,N16,32071520,32071673,GAGGAAAGAAGAAATTCCGAGGAATTTCCGAGGAAAGAAGAAATTC...,0,0.000000,0,0.000000,0,0.000000,...,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212047,N19,17313174,17313327,TTTCGCTCGCCGCTACTACGGGAATCGCTTTTGCTTTCTTTTCCTC...,12339,0.000160,19681,0.000238,18564,0.000160,...,0.000204,17992,0.000139,18636,0.000227,23043,0.000211,22758,0.000303,0.000200
219421,N19,33358998,33359151,AAGTTTGGTTTCAAACCCCGGTTCGAACAGGAGGAGTACGCCATGC...,14087,0.000183,20858,0.000253,19749,0.000170,...,0.000228,18976,0.000146,20998,0.000256,24196,0.000221,23889,0.000318,0.000215
212046,N19,17310998,17311151,CCGTCGGATCACTAAGGCCGACTTTCGTCCCTGCTCGACGGGTGGG...,14252,0.000185,21998,0.000266,19636,0.000169,...,0.000243,18711,0.000144,21845,0.000266,24672,0.000226,25157,0.000335,0.000223
219419,N19,33354646,33354799,CCAACCCTTGGAACATACTACAGCCCCAGGTGGCGAAGAGCCGACA...,14563,0.000189,22729,0.000275,20497,0.000177,...,0.000250,19267,0.000149,22309,0.000272,25777,0.000236,26400,0.000352,0.000232


In [None]:
# # # BEFORE
# # # get rid of rows that aren't in the top and bottom 2%
# # bottom_indices = [i for i in range(0,int(df_atac.shape[0]*(0.02)))]
# # top_indices = [i for i in range(int(df_atac.shape[0]*(1-0.021)),int(df_atac.shape[0]*(1-0.001)))]

# # # create target
# # df_atac = df_atac.iloc[bottom_indices+top_indices, :]
# # df_atac["target"] = [0]*len(bottom_indices) + [1]*len(top_indices)  # create 0/1 attribute

# # AFTER
# # create new column
# middle_index = int(df_atac.shape[0]/2)
# df_atac["target"] = [0]*df_atac[:middle_index].shape[0] + [1]*df_atac[middle_index:].shape[0]

# # remove middle section if n>2
# n = 2
# index = int(df_atac.shape[0]/n)
# df_atac = pd.concat([df_atac.iloc[[i for i in range(index)],:], df_atac.iloc[[i for i in range(index*(n-1), df_atac.shape[0])],:]])

In [None]:
# df_atac is sorted ascending by norm_read_avg, so positional slices are percentile bands
n_rows = df_atac.shape[0]

# accessible band: from the 97.9% position up to (not including) the 99.9% position
first_index, final_index = int(n_rows*0.979), int(n_rows*0.999)
accessible = df_atac.iloc[first_index:final_index, :]

# inaccessible band: both bounds are 0, so this slice is empty by construction
first_index = final_index = 0
inaccessible = df_atac.iloc[first_index:final_index, :]

In [None]:
# stack the two bands row-wise, accessible rows first
bands = [accessible, inaccessible]
df_atac = pd.concat(bands)

In [None]:
df_atac

Unnamed: 0,ref,start_coord,end_coord,sequence,bud-green_rep1_raw,bud-green_rep1_norm_target,bud-green_rep2_raw,bud-green_rep2_norm_target,bud-yellow_rep1_raw,bud-yellow_rep1_norm_target,...,silique-3week_rep2_norm_target,silique-4week_rep1_raw,silique-4week_rep1_norm_target,silique-4week_rep2_raw,silique-4week_rep2_norm_target,stem-down-15cm_rep1_raw,stem-down-15cm_rep1_norm_target,stem-down-15cm_rep2_raw,stem-down-15cm_rep2_norm_target,norm_read_avg
36158,N11,33544706,33544859,GGTTTGGTGCTATTGTCCTGTCACGAAATGGTCAGGGATATTATAT...,44,5.710000e-07,42,5.080000e-07,78,6.730000e-07,...,7.200000e-07,178,1.370000e-06,38,4.630000e-07,275,2.510000e-06,155,2.070000e-06,0.000001
129309,N15,20964995,20965148,TTTTAAAACTCTCTCCCTCACCGTGAGTGGACCCCCCTTCCGGCAC...,29,3.760000e-07,38,4.600000e-07,41,3.540000e-07,...,2.080000e-06,295,2.270000e-06,59,7.190000e-07,64,5.850000e-07,32,4.270000e-07,0.000001
22818,N11,4516866,4517019,GATTTTCGGATCTGCAAATTTAGCCCCGTAGTTACGGGACGTTATA...,85,1.100000e-06,104,1.260000e-06,170,1.470000e-06,...,9.790000e-07,185,1.430000e-06,31,3.780000e-07,125,1.140000e-06,53,7.060000e-07,0.000001
57735,N12,33850502,33850655,TCCTCCTGATGGAGGTGCACTGAAGGTGTCATCAATTAACCTAATA...,67,8.690000e-07,79,9.560000e-07,173,1.490000e-06,...,1.320000e-06,251,1.940000e-06,49,5.970000e-07,102,9.330000e-07,61,8.130000e-07,0.000001
185732,N18,5169479,5169632,AACTTCAAGAGATACAAAACACATCAACTTAAACATATTGTCTTCT...,97,1.260000e-06,123,1.490000e-06,165,1.420000e-06,...,1.080000e-06,128,9.870000e-07,46,5.610000e-07,52,4.750000e-07,32,4.270000e-07,0.000001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189967,N18,14384839,14384992,TGATCAGAATGGATCATGGGAAAACTAAGTCTAGGTCTGGAAATTG...,400,5.190000e-06,335,4.060000e-06,789,6.800000e-06,...,6.060000e-06,840,6.480000e-06,665,8.110000e-06,812,7.420000e-06,435,5.800000e-06,0.000006
193915,N18,22975687,22975840,AAGGAGATATTTTAGAGCGCCAGGTGTCCGAACTGTACGCATGAAG...,295,3.830000e-06,545,6.600000e-06,680,5.860000e-06,...,6.630000e-06,747,5.760000e-06,153,1.860000e-06,561,5.130000e-06,305,4.070000e-06,0.000006
242768,N2,28179183,28179336,TTTTCCAATCAAGGTTTTTAATGAGGCCACAAGTTACTTCTCTTAC...,282,3.660000e-06,307,3.720000e-06,699,6.030000e-06,...,5.800000e-06,1091,8.410000e-06,703,8.570000e-06,822,7.520000e-06,423,5.640000e-06,0.000006
159532,N16,36386528,36386681,ATCAGACACCTTGATCCATCATCCAAGGATCAGGTCGTCCATCCTT...,693,8.990000e-06,346,4.190000e-06,557,4.800000e-06,...,6.660000e-06,777,5.990000e-06,724,8.820000e-06,680,6.220000e-06,492,6.560000e-06,0.000006


In [None]:
# add set attribute

# references N1-N10 and N11-N19, each group shuffled in place with the same fixed seed
picked1 = [f"N{i}" for i in range(1, 11)]
picked2 = [f"N{i}" for i in range(11, 20)]
for group in (picked1, picked2):
    random.Random(1202).shuffle(group)

# every row is training data unless its reference was drawn for validation or test
df_atac["set"] = "train"

# first shuffled reference of each group -> validation, second -> test
df_atac.loc[df_atac.ref.isin([picked1[0], picked2[0]]), "set"] = "val"
df_atac.loc[df_atac.ref.isin([picked1[1], picked2[1]]), "set"] = "test"

In [None]:
df_atac[[column for column in df_atac.columns if "_raw" not in column]]

Unnamed: 0,ref,start_coord,end_coord,sequence,bud-green_rep1_norm_target,bud-green_rep2_norm_target,bud-yellow_rep1_norm_target,bud-yellow_rep2_norm_target,peduncle-down-15cm_rep1_norm_target,peduncle-down-15cm_rep2_norm_target,...,silique-2week_rep1_norm_target,silique-2week_rep2_norm_target,silique-3week_rep1_norm_target,silique-3week_rep2_norm_target,silique-4week_rep1_norm_target,silique-4week_rep2_norm_target,stem-down-15cm_rep1_norm_target,stem-down-15cm_rep2_norm_target,norm_read_avg,set
36158,N11,33544706,33544859,GGTTTGGTGCTATTGTCCTGTCACGAAATGGTCAGGGATATTATAT...,5.710000e-07,5.080000e-07,6.730000e-07,6.000000e-07,1.940000e-06,1.700000e-06,...,0.000001,0.000001,1.230000e-06,7.200000e-07,1.370000e-06,4.630000e-07,2.510000e-06,2.070000e-06,0.000001,train
129309,N15,20964995,20965148,TTTTAAAACTCTCTCCCTCACCGTGAGTGGACCCCCCTTCCGGCAC...,3.760000e-07,4.600000e-07,3.540000e-07,4.920000e-07,5.670000e-07,4.680000e-07,...,0.000001,0.000002,2.100000e-06,2.080000e-06,2.270000e-06,7.190000e-07,5.850000e-07,4.270000e-07,0.000001,train
22818,N11,4516866,4517019,GATTTTCGGATCTGCAAATTTAGCCCCGTAGTTACGGGACGTTATA...,1.100000e-06,1.260000e-06,1.470000e-06,1.910000e-06,9.770000e-07,5.760000e-07,...,0.000002,0.000001,1.500000e-06,9.790000e-07,1.430000e-06,3.780000e-07,1.140000e-06,7.060000e-07,0.000001,train
57735,N12,33850502,33850655,TCCTCCTGATGGAGGTGCACTGAAGGTGTCATCAATTAACCTAATA...,8.690000e-07,9.560000e-07,1.490000e-06,1.060000e-06,7.220000e-07,6.000000e-07,...,0.000001,0.000001,1.750000e-06,1.320000e-06,1.940000e-06,5.970000e-07,9.330000e-07,8.130000e-07,0.000001,train
185732,N18,5169479,5169632,AACTTCAAGAGATACAAAACACATCAACTTAAACATATTGTCTTCT...,1.260000e-06,1.490000e-06,1.420000e-06,2.640000e-06,1.980000e-07,2.760000e-07,...,0.000001,0.000002,9.960000e-07,1.080000e-06,9.870000e-07,5.610000e-07,4.750000e-07,4.270000e-07,0.000001,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189967,N18,14384839,14384992,TGATCAGAATGGATCATGGGAAAACTAAGTCTAGGTCTGGAAATTG...,5.190000e-06,4.060000e-06,6.800000e-06,6.600000e-06,6.660000e-06,7.360000e-06,...,0.000007,0.000006,6.730000e-06,6.060000e-06,6.480000e-06,8.110000e-06,7.420000e-06,5.800000e-06,0.000006,train
193915,N18,22975687,22975840,AAGGAGATATTTTAGAGCGCCAGGTGTCCGAACTGTACGCATGAAG...,3.830000e-06,6.600000e-06,5.860000e-06,7.700000e-06,4.800000e-06,4.170000e-06,...,0.000006,0.000008,6.650000e-06,6.630000e-06,5.760000e-06,1.860000e-06,5.130000e-06,4.070000e-06,0.000006,train
242768,N2,28179183,28179336,TTTTCCAATCAAGGTTTTTAATGAGGCCACAAGTTACTTCTCTTAC...,3.660000e-06,3.720000e-06,6.030000e-06,4.000000e-06,7.210000e-06,6.880000e-06,...,0.000009,0.000006,9.210000e-06,5.800000e-06,8.410000e-06,8.570000e-06,7.520000e-06,5.640000e-06,0.000006,train
159532,N16,36386528,36386681,ATCAGACACCTTGATCCATCATCCAAGGATCAGGTCGTCCATCCTT...,8.990000e-06,4.190000e-06,4.800000e-06,4.510000e-06,7.070000e-06,7.260000e-06,...,0.000007,0.000008,5.680000e-06,6.660000e-06,5.990000e-06,8.820000e-06,6.220000e-06,6.560000e-06,0.000006,train


In [None]:
# Wide -> long: one row per (sequence, tissue) with that tissue's normalised value as target.
df_atac = (
    df_atac[[name for name in df_atac.columns if "_raw" not in name]]  # get rid of _raw columns
    .rename(columns=lambda name: name.replace("_norm_target", ""))  # clean up _norm column names
    .melt(  # https://stackoverflow.com/questions/28654047/convert-columns-into-rows-with-pandas
        id_vars=["ref", "start_coord", "end_coord", "sequence", "set", "norm_read_avg"],
        var_name="tissue",
        value_name="target",
    )
    [["start_coord", "end_coord", "sequence", "tissue", "target", "set"]]
)

In [None]:
df_atac

Unnamed: 0,start_coord,end_coord,sequence,tissue,target,set
0,33544706,33544859,GGTTTGGTGCTATTGTCCTGTCACGAAATGGTCAGGGATATTATAT...,bud-green_rep1,5.710000e-07,train
1,20964995,20965148,TTTTAAAACTCTCTCCCTCACCGTGAGTGGACCCCCCTTCCGGCAC...,bud-green_rep1,3.760000e-07,train
2,4516866,4517019,GATTTTCGGATCTGCAAATTTAGCCCCGTAGTTACGGGACGTTATA...,bud-green_rep1,1.100000e-06,train
3,33850502,33850655,TCCTCCTGATGGAGGTGCACTGAAGGTGTCATCAATTAACCTAATA...,bud-green_rep1,8.690000e-07,train
4,5169479,5169632,AACTTCAAGAGATACAAAACACATCAACTTAAACATATTGTCTTCT...,bud-green_rep1,1.260000e-06,train
...,...,...,...,...,...,...
108715,14384839,14384992,TGATCAGAATGGATCATGGGAAAACTAAGTCTAGGTCTGGAAATTG...,stem-down-15cm_rep2,5.800000e-06,train
108716,22975687,22975840,AAGGAGATATTTTAGAGCGCCAGGTGTCCGAACTGTACGCATGAAG...,stem-down-15cm_rep2,4.070000e-06,train
108717,28179183,28179336,TTTTCCAATCAAGGTTTTTAATGAGGCCACAAGTTACTTCTCTTAC...,stem-down-15cm_rep2,5.640000e-06,train
108718,36386528,36386681,ATCAGACACCTTGATCCATCATCCAAGGATCAGGTCGTCCATCCTT...,stem-down-15cm_rep2,6.560000e-06,train


In [None]:
# restrict columns based on what we care about
kept_columns = ["sequence", "target", "set"]
df_atac = df_atac[kept_columns]
# df_atac = df_atac[["sequence", "norm_read_avg", "raw_read_sum", "target", "set"]]

In [None]:
df_atac

Unnamed: 0,sequence,target,set
907996,CCGGATCCGGATCCGGATAATAAAATGTTGGATCCGTCAAGGCCGG...,0,train
1455203,TTATTTTATTTTATTTTATTTTTATTTTTATTTTTATTTTTATTTT...,0,val
166727,CCCTAAGCCCTATACCCTAAGCCCTATACCCTAAGCCCTATACCCT...,0,train
1997001,AACAATTTTCGATATAATCATAATTTTAAAATTATTAAATAAAATA...,0,test
2233691,TTACGAAATGAATCAATGGTATTAATTTAAATAAAATATTTTAAAA...,0,train
...,...,...,...
1755351,GCTACCGGAGGCTCTGGGGAAGTCGGAATAGGAGAGCACTCATCTT...,1,train
1755371,ACGGTGGCAACTAAACACGAGGGTTGCGCTCGTTGCGGGACTTAAC...,1,train
2583525,TGTCCTATTCCGAAACTGGGAAACTGGAATCACCTGATTTGAAAGT...,1,train
2583526,GATTCTCCACCACTTTATGTATCCAAATCAAGCTTCTTACAAAGTG...,1,train


In [None]:
# write the self-defined single-target napus table (sequence, target, set) without the index column
df_atac.to_csv("data/processed/napus_spotlight.csv", index=False)

### NRC-Defined Multi-Task

In [None]:
# the callable is truthy (row skipped) unless the row index is a multiple of 32,
# so only rows 0, 32, 64, ... of the binary table are read
df = pd.read_csv(
    "data/raw/new_data/atac.expression.binary.reformatted.tsv",
    sep="\t",
    header=None,
    skiprows=lambda row_idx: row_idx % 32,
)

In [None]:
# one binary column per tissue; names are assigned positionally after the locus columns
samples = [
    "bud-green", "bud-yellow", "peduncle-down-15cm",
    "seed-21d",
    "silique-1week", "silique-2week", "silique-3week", "silique-4week",
    "stem-down-15cm",
]

column_names = ["ref", "start_coord", "end_coord", "sequence"]

df.columns = [*column_names, *samples]

In [None]:
# get rid of rows (not columns) whose `ref` is one of the excluded references
excluded_refs = ["chr_contigs", "napus_chloroplast", "napus_mitochondrion"]
df = df[~df.ref.isin(excluded_refs)]

In [None]:
# calculate number of ns in each row
df["num_n"] = df.sequence.str.count("N")
df["presence_n"] = [1 if "N" in s else 0 for s in df.sequence]

# remove all rows that contain anything other than A/T/C/G (this includes every row with an N)
# A fixed character class is used rather than a pattern assembled from the characters found
# in the data: when the data holds no odd characters, "|".join([]) is the empty pattern,
# which matches every row and would silently drop the whole table; characters such as "."
# or "*" would also be read as regex syntax.
mask = df.sequence.str.contains("[^ATCG]", regex=True)  # true if contains weird characters, false if contains only ATCG
# keep only rows without weird characters; copy so later column assignments do not target a slice
df = df[np.logical_not(mask)].copy()

In [None]:
(df.iloc[:,4:].sum()/df.shape[0])*100

bud-green             2.355121
bud-yellow            3.577214
peduncle-down-15cm    1.981970
seed-21d              3.408270
silique-1week         3.395771
silique-2week         3.390721
silique-3week         3.609414
silique-4week         2.199917
stem-down-15cm        2.541365
num_n                 0.000000
presence_n            0.000000
dtype: float64

In [None]:
# add set attribute

# references N1-N10 and N11-N19, each group shuffled in place with the same fixed seed
picked1 = [f"N{i}" for i in range(1, 11)]
picked2 = [f"N{i}" for i in range(11, 20)]
for group in (picked1, picked2):
    random.Random(1202).shuffle(group)

# every row is training data unless its reference was drawn for validation or test
df["set"] = "train"

# first shuffled reference of each group -> validation, second -> test
df.loc[df.ref.isin([picked1[0], picked2[0]]), "set"] = "val"
df.loc[df.ref.isin([picked1[1], picked2[1]]), "set"] = "test"

In [None]:
df.groupby("set").count()

Unnamed: 0_level_0,ref,start_coord,end_coord,sequence,bud-green,bud-yellow,peduncle-down-15cm,seed-21d,silique-1week,silique-2week,silique-3week,silique-4week,stem-down-15cm,num_n,presence_n
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
test,173632,173632,173632,173632,173632,173632,173632,173632,173632,173632,173632,173632,173632,173632,173632
train,923421,923421,923421,923421,923421,923421,923421,923421,923421,923421,923421,923421,923421,923421,923421
val,111038,111038,111038,111038,111038,111038,111038,111038,111038,111038,111038,111038,111038,111038,111038


In [None]:
# create franken dataset
# the N-count helper columns were only needed for the sanity check above
helper_columns = ["num_n", "presence_n"]
df = df.drop(helper_columns, axis=1)

In [None]:
df.head()

Unnamed: 0,ref,start_coord,end_coord,sequence,bud-green,bud-yellow,peduncle-down-15cm,seed-21d,silique-1week,silique-2week,silique-3week,silique-4week,stem-down-15cm,set
0,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,0,0,0,0,0,0,0,0,0,train
1,N1,544,697,ATTGTTTTGGTTCACAATGGCGTCCACTCCTTCTCAAAATTCGAAG...,0,0,0,0,0,0,0,0,0,train
2,N1,1088,1241,ATGCCCAGAAAGAAACAGGCTTATGTCCAGGATGTAAAGAACAATA...,0,0,0,0,0,0,0,0,0,train
3,N1,1632,1785,GGAGAAATGAACAGTGGTGGCTTATCTCAGGAACAAGCTCTCACCT...,0,0,0,0,0,0,0,0,0,train
4,N1,2176,2329,TAATCAAATAATATGCACTTATTCAAAATCTTTTTTGTTTTGTTTA...,0,0,0,0,0,0,0,0,0,train


In [None]:
# locus columns are no longer needed once the split label has been assigned from `ref`
locus_columns = ["ref", "start_coord", "end_coord"]
df = df.drop(locus_columns, axis=1)

In [None]:
# save dataset
# written without the index column; this is the file the training section loads via data_in_path
df.to_csv("data/processed/napus_multi.csv", index=False)

### NRC-Defined Single-Target

In [None]:
# the callable is truthy (row skipped) unless the row index is a multiple of 64,
# so only rows 0, 64, 128, ... of the binary table are read
df = pd.read_csv(
    "data/raw/new_data/atac.expression.binary.reformatted.tsv",
    sep="\t",
    header=None,
    skiprows=lambda row_idx: row_idx % 64,
)

In [None]:
# one binary column per tissue; names are assigned positionally after the locus columns
samples = [
    "bud-green", "bud-yellow", "peduncle-down-15cm",
    "seed-21d",
    "silique-1week", "silique-2week", "silique-3week", "silique-4week",
    "stem-down-15cm",
]

column_names = ["ref", "start_coord", "end_coord", "sequence"]

df.columns = [*column_names, *samples]

In [None]:
# get rid of rows (not columns) whose `ref` is one of the excluded references
excluded_refs = ["chr_contigs", "napus_chloroplast", "napus_mitochondrion"]
df = df[~df.ref.isin(excluded_refs)]

In [None]:
# calculate number of ns in each row
# df["num_n"] = df.sequence.str.count("N")
# df["presence_n"] = [1 if "N" in s else 0 for s in df.sequence]

# remove all rows that contain anything other than A/T/C/G (this includes every row with an N)
# A fixed character class is used rather than a pattern assembled from the characters found
# in the data: when the data holds no odd characters, "|".join([]) is the empty pattern,
# which matches every row and would silently drop the whole table; characters such as "."
# or "*" would also be read as regex syntax.
mask = df.sequence.str.contains("[^ATCG]", regex=True)  # true if contains weird characters, false if contains only ATCG
# keep only rows without weird characters; copy so later column assignments do not target a slice
df = df[np.logical_not(mask)].copy()

In [None]:
(df.iloc[:,4:].sum()/df.shape[0])*100

bud-green             2.359544
bud-yellow            3.580540
peduncle-down-15cm    1.975117
seed-21d              3.421273
silique-1week         3.393624
silique-2week         3.394287
silique-3week         3.609844
silique-4week         2.200111
stem-down-15cm        2.529904
num_n                 0.000000
presence_n            0.000000
dtype: float64

In [None]:
# add set attribute

# references N1-N10 and N11-N19, each group shuffled in place with the same fixed seed
picked1 = [f"N{i}" for i in range(1, 11)]
picked2 = [f"N{i}" for i in range(11, 20)]
for group in (picked1, picked2):
    random.Random(1202).shuffle(group)

# every row is training data unless its reference was drawn for validation or test
df["set"] = "train"

# first shuffled reference of each group -> validation, second -> test
df.loc[df.ref.isin([picked1[0], picked2[0]]), "set"] = "val"
df.loc[df.ref.isin([picked1[1], picked2[1]]), "set"] = "test"

In [None]:
df.groupby("set").count()

Unnamed: 0_level_0,ref,start_coord,end_coord,sequence,bud-green,bud-yellow,peduncle-down-15cm,seed-21d,silique-1week,silique-2week,silique-3week,silique-4week,stem-down-15cm,num_n,presence_n
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
test,86852,86852,86852,86852,86852,86852,86852,86852,86852,86852,86852,86852,86852,86852,86852
train,461638,461638,461638,461638,461638,461638,461638,461638,461638,461638,461638,461638,461638,461638,461638
val,55525,55525,55525,55525,55525,55525,55525,55525,55525,55525,55525,55525,55525,55525,55525


In [None]:
# create franken dataset
# The lines that create num_n / presence_n are commented out in this section, so on a fresh
# run the columns do not exist; errors="ignore" drops them when present and is a no-op otherwise
# (a plain drop raises KeyError here).
df = df.drop(columns=["num_n", "presence_n"], errors="ignore")

In [None]:
df.head()

Unnamed: 0,ref,start_coord,end_coord,sequence,bud-green,bud-yellow,peduncle-down-15cm,seed-21d,silique-1week,silique-2week,silique-3week,silique-4week,stem-down-15cm,set
0,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,0,0,0,0,0,0,0,0,0,train
1,N1,1088,1241,ATGCCCAGAAAGAAACAGGCTTATGTCCAGGATGTAAAGAACAATA...,0,0,0,0,0,0,0,0,0,train
2,N1,2176,2329,TAATCAAATAATATGCACTTATTCAAAATCTTTTTTGTTTTGTTTA...,0,0,0,0,0,0,0,0,0,train
3,N1,3264,3417,AACAAGTTTTTGGCATGAAAACGCATTTTTTGCGATTTTGGCGGGA...,0,0,0,0,0,0,0,0,0,train
4,N1,4352,4505,ATGATCCATCTGAATGAGTTGCACTTTTAGTGCCTAAACAAACAAA...,0,0,0,0,0,0,0,0,0,train


In [None]:
# Wide -> long: every tissue column becomes its own row, labelled in "tissue" with its 0/1 value in "target".
# https://stackoverflow.com/questions/28654047/convert-columns-into-rows-with-pandas
locus_and_split = ["ref", "start_coord", "end_coord", "sequence", "set"]
df = pd.melt(
    df,
    id_vars=locus_and_split,
    var_name="tissue",
    value_name="target",
)

In [None]:
df.sort_values(["ref", "start_coord", "sequence"])[:20]

Unnamed: 0,ref,start_coord,end_coord,sequence,set,tissue,target
0,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,bud-green,0
604015,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,bud-yellow,0
1208030,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,peduncle-down-15cm,0
1812045,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,seed-21d,0
2416060,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,silique-1week,0
3020075,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,silique-2week,0
3624090,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,silique-3week,0
4228105,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,silique-4week,0
4832120,N1,0,153,ATTATAGCTGCAAGCAATGTAATGTTTGTGCATTAACAGTTTCGAG...,train,stem-down-15cm,0
1,N1,1088,1241,ATGCCCAGAAAGAAACAGGCTTATGTCCAGGATGTAAAGAACAATA...,train,bud-green,0


In [None]:
# take a look at overall accessibility
(df["target"].sum()/df.shape[0])*100

2.9404714930736637

In [None]:
# save dataset
# only these four columns are written (ref / start_coord / end_coord are left out), without the index
df[["sequence", "set", "tissue", "target"]].to_csv("data/processed/napus_melted.csv", index=False)

## Train Napus Model

In [None]:
# load in napus data
# Exactly one of the processed napus tables is used for training; the commented lines are the
# alternatives. data_in_path is also recorded in the run settings ("path") further down.
# df_binary = pd.read_csv("data/processed/napus_spotlight.csv")
# df_binary = pd.read_csv("data/processed/napus_processed_every4_binary_new.csv")
# df_binary = pd.read_csv("data/processed/napus_melted.csv")
data_in_path = "data/processed/napus_multi.csv"
df_binary = pd.read_csv(data_in_path)

### Define & Train Frequency Napus Model

In [None]:
def get_model(args, in_dim):  # initializes model architecture
    """Build (without compiling) a fully connected Sequential model from `args`.

    Args:
        args: settings dict providing "layer_1_size" / "layer_1_activation" and, for
            layers 2, 3 and the output layer, a "<layer>_size" entry plus a
            "<layer>_activation" entry that is only read when the size is > 0.
        in_dim: number of input features.

    Returns:
        The Sequential model.
    """
    mdl = Sequential()

    # this is the only layer that is enforced. to test linear regression only, set layer_1_size to 1 and layer_1_activation to "linear"
    first_layer = Dense(args["layer_1_size"], input_dim=in_dim, activation=args["layer_1_activation"])
    mdl.add(first_layer)

    # every remaining layer is optional and skipped when its configured size is 0 (or negative)
    optional_layers = (
        ("layer_2_size", "layer_2_activation"),
        ("layer_3_size", "layer_3_activation"),
        ("output_layer_size", "output_layer_activation"),
    )
    for size_key, activation_key in optional_layers:
        units = args[size_key]
        if units > 0:
            mdl.add(Dense(units, activation=args[activation_key]))

    return mdl


def train_test_val(args, df):
    """Turn `df` into k-mer count features and targets for the train / val / test splits.

    For each enabled k-mer size (mono-, di-, tri-nucleotide) one column per k-mer is added
    to `df` itself (the input frame is modified) holding `sequence.str.count(kmer)`.
    Rows are then split on the "set" column.

    Args:
        args: settings dict with the "include_mononuc_freq", "include_dinuc_freq" and
            "include_trinuc_freq" flags (enabled when equal to 1) and "target_name",
            the target column name or list of names.
        df: frame with at least "sequence", "set" and the target column(s).

    Returns:
        Tuple (X_train, y_train, X_val, y_val, X_test, y_test) of numpy arrays.
    """
    # captures all sequences we are including as input features
    include = []
    if args["include_mononuc_freq"] == 1:
        include.extend(nts)
    if args["include_dinuc_freq"] == 1:
        include.extend(a + b for a in nts for b in nts)
    if args["include_trinuc_freq"] == 1:
        include.extend(a + b + c for a in nts for b in nts for c in nts)

    # create new columns with the counts of sequences in "include"
    for kmer in include:
        df[kmer] = df.sequence.str.count(kmer)

    # features first, then targets, for each split in the order the caller unpacks them
    arrays = []
    for split_name in ("train", "val", "test"):
        split_df = df[df.set == split_name]
        arrays.append(np.array(split_df[include]))
        arrays.append(np.array(split_df[args["target_name"]]))

    return tuple(arrays)

In [None]:
# define frequency-based model
# Settings consumed by train_test_val (feature flags, target columns), get_model (layer sizes
# and activations) and the training cell (loss, metric, optimiser settings, bookkeeping).
args = {"target_name":[s for s in df_binary.columns if "-" in s],  # "target" or [s for s in df.columns if "-" in s]
        "include_mononuc_freq":1,
        "include_dinuc_freq":0,
        "include_trinuc_freq":1,
        "layer_1_size":64,
        "layer_1_activation":"relu",
        "layer_2_size":24,
        "layer_2_activation":"relu",
        "layer_3_size":0,  # 0 disables the third hidden layer
        # this key was previously a second "layer_2_activation", which left "layer_3_activation"
        # undefined and made get_model raise KeyError as soon as layer_3_size was set above 0
        "layer_3_activation":"relu",
        "output_layer_size":9,  # either 1 or 9
        "output_layer_activation":"sigmoid",  # change to change ["linear", "sigmoid"]
        "loss":"binary_crossentropy",  # change to change ["mean_squared_error", "binary_crossentropy"]
        "extra_metric":[tf.keras.metrics.AUC()],  # change to change [Spearman, "accuracy", tf.keras.metrics.AUC()]
        'learning_rate':0.002,
        'batch_size':512,
        'num_epochs':500,
        'patience':20,
        'max_batch_steps':-1,
        'optimizer':'adam',
        'verbose_training':1,
        "path":data_in_path}

# note: train_test_val also adds the k-mer count columns to df_binary itself
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val(args, df_binary)

In [None]:
model = get_model(args, X_train.shape[1])  # initalize model

inner_path = "streetlight/napus_models"

# create path to folder with results
# (timestamp + the main hyper-parameters, so every run gets its own folder)
dir_path = ("experiments/"+inner_path+"/nucfreq"
            +"_"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            +"_out"+str(args["output_layer_size"])
            +"_nuc"
            +str(args["include_mononuc_freq"])
            +str(args["include_dinuc_freq"])
            +str(args["include_trinuc_freq"])
            +"_lay"+str(args["layer_1_size"])
            +"-"+str(args["layer_2_size"])
            +"-"+str(args["layer_3_size"])
            +"-"+str(args["output_layer_size"])
            +"_lr"+str(args["learning_rate"])
            +"_bs"+str(args["batch_size"]))

print(dir_path)

# Create the run folder explicitly so that the checkpoint, model.json, settings.txt and the
# history CSV written below do not depend on a callback having created it as a side effect.
os.makedirs(dir_path, exist_ok=True)

# for binary task
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in Keras optimizers
model.compile(optimizer=Adam(learning_rate=args["learning_rate"]),  # CHANGE IF WE WANT TO CHANGE OPTIM
              loss=args["loss"],
              metrics=[args["extra_metric"]])

logdir = os.path.join(dir_path, "logs")
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
# stop once val_loss has not improved for `patience` epochs and roll back to the best weights
es_callback = EarlyStopping(monitor='val_loss', verbose=1, patience=args["patience"], restore_best_weights=True)
# keep only the best (lowest val_loss) checkpoint on disk
mc_callback = ModelCheckpoint(dir_path+'/best_weights.h5', monitor='val_loss', save_best_only=True)

history = model.fit(X_train, y_train,
                    epochs=args["num_epochs"],
                    batch_size=args["batch_size"],
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

# architecture as JSON; the weights live in best_weights.h5
model_json = model.to_json()
with open(dir_path+"/model.json", "w") as json_file:
    json_file.write(model_json)

# The metric objects are not JSON serialisable, so they are left out of the saved settings.
# A filtered copy is written instead of deleting the key from `args`, which keeps `args`
# intact and lets this cell be re-run without re-creating it.
settings = {key: value for key, value in args.items() if key != "extra_metric"}
with open(dir_path+"/settings.txt", 'w') as settings_file:
     settings_file.write(json.dumps(settings))  # https://www.geeksforgeeks.org/write-a-dictionary-to-a-file-in-python/

# per-epoch losses / metrics
hist_df = pd.DataFrame(history.history) 
hist_df.to_csv(dir_path+'/training_history.csv')

experiments/streetlight/napus_models/nucfreq_20221109-021034_out9_nuc101_lay64-24-0-9_lr0.002_bs512
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 45: early stopping


**Pause here:** restart the runtime, then re-run the notebook up to the data loading cell before continuing.

In [None]:
# restart runtime, and run the next few cells

# grab this from printed statement in training cell vvv
dir_path = "experiments/streetlight/napus_models/nucfreq_20221109-021034_out9_nuc101_lay64-24-0-9_lr0.002_bs512"  

# rebuild the architecture from its JSON description ...
with open(dir_path+"/model.json", "r") as json_file:
    saved_model = model_from_json(json_file.read())

# ... then restore the best checkpoint written during training
saved_model.load_weights(dir_path+"/best_weights.h5")

In [None]:
def get_results(y, pred):  # positive = accessible
  """Summarise binary predictions as confusion-matrix counts and derived rates.

  Args:
    y: true binary labels (0 = negative, 1 = positive) for one output.
    pred: predicted binary labels, same length as ``y``.

  Returns:
    dict with the counts "TN", "FP", "FN", "TP" and the rates
    "recall-sensitivity", "specificity" and "precision". A rate whose
    denominator is zero is reported as NaN.
  """
  # labels=[0, 1] forces a 2x2 matrix. Without it, a split that contains a single
  # class yields a 1x1 matrix and the four-way unpacking below raises ValueError.
  tn, fp, fn, tp = confusion_matrix(y, pred, labels=[0, 1]).ravel()  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
  results = {"TN": tn, "FP": fp, "FN": fn, "TP": tp}

  def _ratio(numerator, denominator):
    # Explicit NaN instead of relying on numpy's (silenced) divide-by-zero warning.
    return numerator/denominator if denominator else float("nan")

  # Sensitivity, hit rate, recall, or true positive rate
  results["recall-sensitivity"] = _ratio(tp, tp+fn)
  # Specificity or true negative rate
  results["specificity"] = _ratio(tn, tn+fp)
  # Precision or positive predictive value
  results["precision"] = _ratio(tp, tp+fp)

  return results

# Predicted probabilities of the reloaded model for each split; the results cell
# below reads column i as the prediction for args["target_name"][i].
train_prob = saved_model.predict(X_train)
val_prob = saved_model.predict(X_val)
test_prob = saved_model.predict(X_test)



In [None]:
# One results_<target>.csv per output column: rows are metrics, columns are the train/val/test splits.
for i, name in enumerate(args["target_name"]):
  print(name)

  # hard class calls at a 0.5 probability threshold
  train_pred = (train_prob[:,i] > 0.5).astype("int32")
  val_pred = (val_prob[:,i] > 0.5).astype("int32")
  test_pred = (test_prob[:,i] > 0.5).astype("int32")

  train_results = get_results(y_train[:,i], train_pred)
  val_results = get_results(y_val[:,i], val_pred)
  test_results = get_results(y_test[:,i], test_pred)
  split_results = (train_results, val_results, test_results)

  with open(dir_path+"/results_"+name+".csv", "w") as f:
    f.write(",train,val,test\n")
    f.write("accuracy," + ",".join(map(str, (
        accuracy_score(y_train[:,i], train_pred),
        accuracy_score(y_val[:,i], val_pred),
        accuracy_score(y_test[:,i], test_pred)))) + "\n")
    # AUC is computed from the probabilities, not the thresholded calls
    f.write("AUC," + ",".join(map(str, (
        roc_auc_score(y_train[:,i], train_prob[:,i]),
        roc_auc_score(y_val[:,i], val_prob[:,i]),
        roc_auc_score(y_test[:,i], test_prob[:,i])))) + "\n")
    for metric in ("precision", "recall-sensitivity", "specificity", "TN", "FN", "TP"):
      f.write(metric + "," + ",".join(str(res[metric]) for res in split_results) + "\n")
    # last row is written without a trailing newline
    f.write("FP," + ",".join(str(res["FP"]) for res in split_results))

bud-green
bud-yellow
peduncle-down-15cm
seed-21d
silique-1week
silique-2week
silique-3week
silique-4week
stem-down-15cm


In [None]:
# with open(dir_path+"/results.csv", "w") as f:
#   train_prob = saved_model.predict(X_train)
#   val_prob = saved_model.predict(X_val)
#   test_prob = saved_model.predict(X_test)

#   train_pred = (train_prob > 0.5).astype("int32")
#   val_pred = (val_prob > 0.5).astype("int32")
#   test_pred = (test_prob > 0.5).astype("int32")

#   train_results = get_results(saved_model, y_train, train_pred)
#   val_results = get_results(saved_model, y_val, val_pred)
#   test_results = get_results(saved_model, y_test, test_pred)

#   f.write(",train,val,test\n")
#   f.write("accuracy,"+str(accuracy_score(y_train, train_pred))+","+str(accuracy_score(y_val, val_pred))+","+str(accuracy_score(y_test, test_pred))+"\n")
#   f.write("AUC,"+str(roc_auc_score(y_train, train_prob))+","+str(roc_auc_score(y_val, val_prob))+","+str(roc_auc_score(y_test, test_prob))+"\n")
#   f.write("precision,"+str(train_results["precision"])+","+str(val_results["precision"])+","+str(test_results["precision"])+"\n")
#   f.write("recall-sensitivity,"+str(train_results["recall-sensitivity"])+","+str(val_results["recall-sensitivity"])+","+str(test_results["recall-sensitivity"])+"\n")
#   f.write("specificity,"+str(train_results["specificity"])+","+str(val_results["specificity"])+","+str(test_results["specificity"])+"\n")
#   f.write("TN,"+str(train_results["TN"])+","+str(val_results["TN"])+","+str(test_results["TN"])+"\n")
#   f.write("FN,"+str(train_results["FN"])+","+str(val_results["FN"])+","+str(test_results["FN"])+"\n")
#   f.write("TP,"+str(train_results["TP"])+","+str(val_results["TP"])+","+str(test_results["TP"])+"\n")
#   f.write("FP,"+str(train_results["FP"])+","+str(val_results["FP"])+","+str(test_results["FP"])+"\n")

In [None]:
# new_model_json = "models/streetlight/freq_model.json"
# new_model_h5 = "models/streetlight/freq_weights.h5"

# model_json = saved_model.to_json()
# with open(new_model_json, "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights(new_model_h5)

### Define & Train Conv Napus Model

In [None]:
def train_test_val(args, df):  # splits dataframe into all the sets
    """Split ``df`` by its "set" column and encode each split for the model.

    Args:
        args: settings dict; reads ``args["shuffle"]`` (1 = shuffle the characters
            within every sequence) and ``args["target_name"]`` (label column name
            or list of names).
        df: DataFrame with a "sequence" column, a "set" column holding
            "train" / "val" / "test", and the target column(s). It is never
            modified by this function.

    Returns:
        X_train, y_train, X_val, y_val, X_test, y_test as numpy arrays, where each
        X stacks ``get_ohe(sequence)`` for the rows of that split.
    """
    if args["shuffle"] == 1:  # shuffles NTs within each sequence
        # assign() returns a new frame, so the caller's DataFrame keeps its original
        # sequences; writing through df.loc here used to scramble them permanently.
        df = df.assign(sequence=[''.join(random.sample(s, len(s))) for s in df["sequence"]])

    def _encode_split(split_name):
        # rows tagged with this split -> (encoded sequences, targets)
        part = df[df["set"] == split_name]
        X = np.array([get_ohe(sqnc) for sqnc in part["sequence"]])
        y = np.array(part[args["target_name"]])
        return X, y

    X_train, y_train = _encode_split("train")
    X_val, y_val = _encode_split("val")
    X_test, y_test = _encode_split("test")

    return X_train, y_train, X_val, y_val, X_test, y_test


def get_ohe(sequence):
    """Encode a sequence by looking up every character in the module-level ``mapping``.

    Returns a numpy array with one row per character of ``sequence``
    (presumably 4 columns per row, matching the model's input - confirm against ``mapping``).
    """
    encoded_rows = []
    for nt in sequence:
        encoded_rows.append(mapping[nt])
    return np.array(encoded_rows)


def get_model(args):
    """Build the (uncompiled) convolutional Sequential model described by ``args``.

    Reads from ``args``: "input_sequence_length", "conv_one_set", "conv_two_set",
    "conv_three_set" (only when "last_conv_layer" is 1), "last_conv_layer",
    "linear_mapping", "output_layer_size" and "output_layer_activation".
    """
    net = Sequential()

    def add_conv_block(index, setting_key, **conv_kwargs):
        # Conv1D -> BatchNorm -> Dropout; a setting of 2 marks the block as not trainable.
        is_trainable = args[setting_key] != 2
        net.add(Conv1D(120, 5, activation='relu', name="1DConv_%d" % index, trainable=is_trainable, **conv_kwargs))
        net.add(BatchNormalization(name="batchNorm%d" % index, trainable=is_trainable))
        net.add(Dropout(0.1, name="drop%d" % index))

    add_conv_block(1, "conv_one_set", input_shape=(args["input_sequence_length"], 4))
    add_conv_block(2, "conv_two_set")
    if args["last_conv_layer"] == 1:  # the third conv block is optional
        add_conv_block(3, "conv_three_set")

    net.add(Flatten(name="flat"))

    # optional fixed (non-trainable) linear projection before the output layer
    if args["linear_mapping"] == 1:
        net.add(Dense(12, activation='linear', name="dense1", trainable=False))

    # output layer
    net.add(Dense(args["output_layer_size"], activation=args['output_layer_activation'], name="dense2"))

    return net

In [None]:
# define conv-based model
# Settings shared by get_model(), train_test_val() and the training cell below.
# - conv_*_set: get_model() treats the value 2 as "do not train this conv block".
# - last_conv_layer / linear_mapping: 1 enables the third conv block / the fixed Dense(12) layer.
# - shuffle: 1 makes train_test_val() shuffle the characters inside every sequence.
# - extra_metric holds metric objects; the training cell removes it before dumping args as JSON.
# NOTE(review): "number_of_outputs" is 1 while "output_layer_size" is 9, and nothing in the
# conv cells of this section reads "number_of_outputs" - confirm whether it is still needed.
args = {"input_sequence_length":153,  # CHANGE WITH DATASET
        "target_name":[s for s in df_binary.columns if "-" in s],  # "target" or [s for s in df.columns if "-" in s]
        "number_of_outputs":1,
        "conv_one_set":0,
        "conv_two_set":0,
        "conv_three_set":0,
        "linear_mapping":0,
        "last_conv_layer":1,
        "shuffle":0,
        "output_layer_size":9,  # 1 or 9
        "output_layer_activation":"sigmoid",  # change to change ["linear", "sigmoid"]
        "loss":"binary_crossentropy",  # change to change ["mean_squared_error", "binary_crossentropy"]
        "extra_metric":[tf.keras.metrics.AUC()],  # change to change [Spearman, "accuracy"]
        'learning_rate':0.002,
        'batch_size':512,
        'num_epochs':500,
        'patience':20,
        'max_batch_steps':-1,
        'optimizer':'adam',
        'verbose_training':1,
        "path":data_in_path}

# Encode the sequences and split them into train / val / test by the "set" column of df_binary.
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val(args, df_binary)

done
done
done


In [None]:
model = get_model(args)  # initialize model

# create path to folder with results

inner_path = "streetlight/napus_models"

# The run folder name encodes the timestamp and the main hyper-parameters.
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
arch_settings = str(args["conv_one_set"])+str(args["conv_two_set"])+str(args["conv_three_set"])+str(args["linear_mapping"])
dir_path = "experiments/"+inner_path+"/conv_validation_"+date+"_out"+str(args["output_layer_size"])+"_"+arch_settings+"_lr"+str(args['learning_rate'])+"_bs"+str(args['batch_size'])+"_ep"+str(args['num_epochs'])

print(dir_path)

# Create the run folder up front so the writes below do not depend on a callback
# having created it as a side effect.
os.makedirs(dir_path, exist_ok=True)

# for binary task
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model.compile(optimizer=Adam(learning_rate=args["learning_rate"]),  # CHANGE IF WE WANT TO CHANGE OPTIM
              loss=args["loss"],
              metrics=[args["extra_metric"]])

logdir = os.path.join(dir_path, "logs")
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
# Stop once val_loss has not improved for `patience` epochs and roll back to the best weights.
es_callback = EarlyStopping(monitor='val_loss', verbose=1, patience=args["patience"], restore_best_weights=True)
# Keep only the weights with the lowest validation loss on disk.
mc_callback = ModelCheckpoint(dir_path+'/best_weights.h5', monitor='val_loss', save_best_only=True)

history = model.fit(X_train, y_train,
                    epochs=args["num_epochs"],
                    batch_size=args["batch_size"],
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

# Persist the architecture next to the checkpointed weights.
model_json = model.to_json()
with open(dir_path+"/model.json", "w") as json_file:
    json_file.write(model_json)

# The metric objects in args["extra_metric"] cannot be dumped as JSON. Write a filtered
# copy rather than deleting the key, so `args` stays intact and this cell can be
# re-run (compile() above reads args["extra_metric"]).
serializable_args = {key: value for key, value in args.items() if key != "extra_metric"}
with open(dir_path+"/settings.txt", 'w') as settings_file:
    settings_file.write(json.dumps(serializable_args))  # https://www.geeksforgeeks.org/write-a-dictionary-to-a-file-in-python/

# Per-epoch loss/metric curves returned by fit().
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(dir_path+'/training_history.csv')

experiments/streetlight/napus_models/conv_validation_20221109-024110_out9_0000_lr0.002_bs512_ep500
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 26: early stopping


**Pause here.** Restart the runtime, re-run the notebook up to (and including) the data-loading cell, then continue with the cells below.

In [None]:
# restart runtime, and run the next few cells

# Folder of the training run to evaluate. Copy it from the path printed by the
# training cell: the name embeds a timestamp, so it changes on every run.
dir_path = "experiments/streetlight/napus_models/conv_validation_20221109-024110_out9_0000_lr0.002_bs512_ep500"  

# load json and create model
# (model.json holds the architecture only; the weights are loaded separately below)
with open(dir_path+"/model.json", "r") as json_file:
    loaded_model_json = json_file.read()
saved_model = model_from_json(loaded_model_json)
# load weights into new model
# (best_weights.h5 is the file written by the ModelCheckpoint callback during training)
saved_model.load_weights(dir_path+"/best_weights.h5")

In [None]:
def get_results(y, pred):  # positive = accessible
  """Summarise binary predictions as confusion-matrix counts and derived rates.

  Args:
    y: true binary labels (0 = negative, 1 = positive) for one output.
    pred: predicted binary labels, same length as ``y``.

  Returns:
    dict with the counts "TN", "FP", "FN", "TP" and the rates
    "recall-sensitivity", "specificity" and "precision". A rate whose
    denominator is zero is reported as NaN.
  """
  # labels=[0, 1] forces a 2x2 matrix. Without it, a split that contains a single
  # class yields a 1x1 matrix and the four-way unpacking below raises ValueError.
  tn, fp, fn, tp = confusion_matrix(y, pred, labels=[0, 1]).ravel()  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
  results = {"TN": tn, "FP": fp, "FN": fn, "TP": tp}

  def _ratio(numerator, denominator):
    # Explicit NaN instead of relying on numpy's (silenced) divide-by-zero warning.
    return numerator/denominator if denominator else float("nan")

  # Sensitivity, hit rate, recall, or true positive rate
  results["recall-sensitivity"] = _ratio(tp, tp+fn)
  # Specificity or true negative rate
  results["specificity"] = _ratio(tn, tn+fp)
  # Precision or positive predictive value
  results["precision"] = _ratio(tp, tp+fp)

  return results

In [None]:
# Predicted probabilities of the reloaded model; column i belongs to args["target_name"][i].
train_prob = saved_model.predict(X_train)
val_prob = saved_model.predict(X_val)
test_prob = saved_model.predict(X_test)

# One results_<target>.csv per output column: rows are metrics, columns are the train/val/test splits.
for i, name in enumerate(args["target_name"]):
  print(name)

  # hard class calls at a 0.5 probability threshold
  train_pred = (train_prob[:,i] > 0.5).astype("int32")
  val_pred = (val_prob[:,i] > 0.5).astype("int32")
  test_pred = (test_prob[:,i] > 0.5).astype("int32")

  train_results = get_results(y_train[:,i], train_pred)
  val_results = get_results(y_val[:,i], val_pred)
  test_results = get_results(y_test[:,i], test_pred)
  split_results = (train_results, val_results, test_results)

  with open(dir_path+"/results_"+name+".csv", "w") as f:
    f.write(",train,val,test\n")
    f.write("accuracy," + ",".join(map(str, (
        accuracy_score(y_train[:,i], train_pred),
        accuracy_score(y_val[:,i], val_pred),
        accuracy_score(y_test[:,i], test_pred)))) + "\n")
    # AUC is computed from the probabilities, not the thresholded calls
    f.write("AUC," + ",".join(map(str, (
        roc_auc_score(y_train[:,i], train_prob[:,i]),
        roc_auc_score(y_val[:,i], val_prob[:,i]),
        roc_auc_score(y_test[:,i], test_prob[:,i])))) + "\n")
    for metric in ("precision", "recall-sensitivity", "specificity", "TN", "FN", "TP"):
      f.write(metric + "," + ",".join(str(res[metric]) for res in split_results) + "\n")
    # last row is written without a trailing newline
    f.write("FP," + ",".join(str(res["FP"]) for res in split_results))

bud-green
bud-yellow
peduncle-down-15cm
seed-21d
silique-1week
silique-2week
silique-3week
silique-4week
stem-down-15cm


In [None]:
# with open(dir_path+"/results.csv", "w") as f:
#   train_prob = saved_model.predict(X_train)
#   val_prob = saved_model.predict(X_val)
#   test_prob = saved_model.predict(X_test)

#   train_pred = (train_prob > 0.5).astype("int32")
#   val_pred = (val_prob > 0.5).astype("int32")
#   test_pred = (test_prob > 0.5).astype("int32")

#   train_results = get_results(saved_model, y_train, train_pred)
#   val_results = get_results(saved_model, y_val, val_pred)
#   test_results = get_results(saved_model, y_test, test_pred)

#   f.write(",train,val,test\n")
#   f.write("accuracy,"+str(accuracy_score(y_train, train_pred))+","+str(accuracy_score(y_val, val_pred))+","+str(accuracy_score(y_test, test_pred))+"\n")
#   f.write("AUC,"+str(roc_auc_score(y_train, train_prob))+","+str(roc_auc_score(y_val, val_prob))+","+str(roc_auc_score(y_test, test_prob))+"\n")
#   f.write("precision,"+str(train_results["precision"])+","+str(val_results["precision"])+","+str(test_results["precision"])+"\n")
#   f.write("recall-sensitivity,"+str(train_results["recall-sensitivity"])+","+str(val_results["recall-sensitivity"])+","+str(test_results["recall-sensitivity"])+"\n")
#   f.write("specificity,"+str(train_results["specificity"])+","+str(val_results["specificity"])+","+str(test_results["specificity"])+"\n")
#   f.write("TN,"+str(train_results["TN"])+","+str(val_results["TN"])+","+str(test_results["TN"])+"\n")
#   f.write("FN,"+str(train_results["FN"])+","+str(val_results["FN"])+","+str(test_results["FN"])+"\n")
#   f.write("TP,"+str(train_results["TP"])+","+str(val_results["TP"])+","+str(test_results["TP"])+"\n")
#   f.write("FP,"+str(train_results["FP"])+","+str(val_results["FP"])+","+str(test_results["FP"])+"\n")



In [None]:
# new_model_json = "models/streetlight/conv_model.json"
# new_model_h5 = "models/streetlight/conv_weights.h5"

In [None]:
# # # save model to new place
# # !mkdir models/streetlight

# model_json = saved_model.to_json()
# with open(new_model_json, "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights(new_model_h5)

In [None]:
# # load json and create model
# with open(new_model_json, "r") as json_file:
#   loaded_model_json = json_file.read()
# saved_model = model_from_json(loaded_model_json)
# # load weights into new model
# saved_model.load_weights(new_model_h5)

In [None]:
# print(str(accuracy_score(y_test, (saved_model.predict(X_test) > 0.5).astype("int32"))))
# print(str(accuracy_score(y_val, (saved_model.predict(X_val) > 0.5).astype("int32"))))
# print(str(accuracy_score(y_train, (saved_model.predict(X_train) > 0.5).astype("int32"))))

0.7195299252336986


In [None]:
# # saved_model = model

# with open("results.csv", "w") as f:
#   f.write(",train,val,test\n")
#   f.write("accuracy,"+str(accuracy_score(y_train, (saved_model.predict(X_train) > 0.5).astype("int32")))+","+str(accuracy_score(y_val, (saved_model.predict(X_val) > 0.5).astype("int32")))+","+str(accuracy_score(y_test, (saved_model.predict(X_test) > 0.5).astype("int32")))+"\n")

## Create & Restrict Arabidopsis Dataset

In [None]:
# load in arabidopsis data we have (this code is taken from https://colab.research.google.com/drive/15zjI0_FTUnZKCSZFK48310b0i_dFqOtR?authuser=2#scrollTo=vtcXIATY2q-8)

downsize = 16  # keep every 16th row; a falsy value (0 / None) keeps all rows

column_names = ["chromosome", 
                "start_coord", 
                "end_coord", 
                "sequence", 
                "raw_control_coverage", 
                "raw_treatment_coverage", 
                "norm_control_coverage", 
                "norm_treatment_coverage"]

# the raw file has no header row, so the column names are attached afterwards
df = pd.read_csv('data/raw/athal_starr_hidra.tsv', sep="\t", header=None)
df.columns = column_names

df = df[df.chromosome.isin(["Chr"+str(i) for i in range(1,6)])]  # keep only rows on Chr1..Chr5

if downsize:
  df = df.iloc[::downsize, :]  # select every nth row

# True for sequences containing any character other than A/T/C/G.
# A fixed negated character class replaces the previous "|".join(odd_chars) pattern:
# that pattern became the empty regex (which matches every row, so the whole frame
# was dropped) when no odd character existed, and it did not escape regex metacharacters.
mask = df.sequence.str.contains("[^ATCG]")
df = df[np.logical_not(mask)]  # keep only rows without weird characters

# minimum raw coverage; .copy() so the column assignments below act on an independent frame
df = df[(df.raw_control_coverage >= 30) & (df.raw_treatment_coverage >= 5)].copy()

# NOTE(review): the ratio is control/treatment - confirm this is the intended orientation.
df["target"] = np.log2(df.norm_control_coverage/df.norm_treatment_coverage)

df["set"] = "train"
picked = ["Chr2","Chr4"]
random.Random(1202).shuffle(picked)  # fixed seed: the val/test chromosome assignment is reproducible
# set val and test by chromosome we want
df.loc[df.chromosome == picked[0], "set"] = "val"
df.loc[df.chromosome == picked[1], "set"] = "test"

df = df[["sequence", "target", "set"]].copy()
# pads every sequence with 8 "X" characters (presumably to match the napus model's input length - confirm)
df["sequence"] += "XXXXXXXX"   # pads

In [None]:
# save regular schmegular arabidopsis dataset - arabidopsis_full.csv
df.to_csv("data/processed/arabidopsis_full.csv", index=False)  # write to file

### Restrict using freq model

In [None]:
# definitions
def freq(df, mono, di, tri):
    """Count k-mers in ``df.sequence`` and return the counts as a 2-D numpy array.

    ``mono`` / ``di`` / ``tri`` switch on the 1-, 2- and 3-nucleotide patterns built
    from the module-level ``nts`` list. One count column per pattern is added to
    ``df`` itself (the caller's frame is modified), and the returned array holds
    those columns in the order mono, di, tri. Counts come from ``str.count`` and
    are therefore non-overlapping.
    """
    kmers = []  # every pattern that becomes an input feature

    if mono:
        kmers.extend(nts)
    if di:
        kmers.extend(first+second for first in nts for second in nts)
    if tri:
        kmers.extend(first+second+third for first in nts for second in nts for third in nts)

    # one new column per pattern, holding its count in each sequence
    for kmer in kmers:
        df[kmer] = df.sequence.str.count(kmer)

    return np.array(df[kmers])

In [None]:
# load in data
df = pd.read_csv("data/processed/arabidopsis_full.csv")

In [None]:
# load in freq model
# NOTE(review): the cell that would write these two files is commented out earlier in
# this notebook, so they must already exist on disk - confirm they hold the intended run.
new_model_json = "models/streetlight/freq_model.json"
new_model_h5 = "models/streetlight/freq_weights.h5"

# load json and create model
# (the json file holds the architecture only; the weights are loaded separately below)
with open(new_model_json, "r") as json_file:
    loaded_model_json = json_file.read()
saved_model = model_from_json(loaded_model_json)
# load weights into new model
saved_model.load_weights(new_model_h5)

In [None]:
# run napus model on arabidopsis dataset, see how many it predicts to be accessible / inaccessible

# build the k-mer count feature matrix (mono- and tri-nucleotide counts) as a numpy array;
# freq() also adds one count column per k-mer to df itself
freqs = freq(df, True, False, True)

In [None]:
# run model on this array
probs = saved_model.predict(freqs)
preds = (probs > 0.5).astype("int32")

In [None]:
# add predictions to df
df["probs"] = probs
df["preds"] = preds

In [None]:
df.shape

(1031576, 73)

In [None]:
# see what this looks like with the set
df.groupby("set").count()

Unnamed: 0_level_0,sequence,target,A,T,C,G,AAA,AAT,AAC,AAG,...,GCA,GCT,GCC,GCG,GGA,GGT,GGC,GGG,probs,preds
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
test,161665,161665,161665,161665,161665,161665,161665,161665,161665,161665,...,161665,161665,161665,161665,161665,161665,161665,161665,161665,161665
train,702867,702867,702867,702867,702867,702867,702867,702867,702867,702867,...,702867,702867,702867,702867,702867,702867,702867,702867,702867,702867
val,167044,167044,167044,167044,167044,167044,167044,167044,167044,167044,...,167044,167044,167044,167044,167044,167044,167044,167044,167044,167044


In [None]:
# drop all rows that are predicted to be inaccessible

# "class":  keep rows whose thresholded prediction is 1
# "cutoff": keep the `take_top` fraction of rows with the highest predicted probability
do = "cutoff"

# class pred
if do == "class":
  df = df.loc[df.preds == 1,:]
elif do == "cutoff":
  take_top = 0.02
  n_keep = int(take_top*df.shape[0])
  # tail(0) is empty, whereas iloc[-0:] would silently keep every row when the frame
  # is so small that the fraction rounds down to zero
  df = df.sort_values(by="probs").tail(n_keep)
else:
  # an unknown mode used to leave df unrestricted without any warning
  raise ValueError("unknown restriction mode: "+repr(do))

In [None]:
df.shape

(20631, 73)

In [None]:
# see what this looks like with the set
df.groupby("set").count()

Unnamed: 0_level_0,sequence,target,A,T,C,G,AAA,AAT,AAC,AAG,...,GCA,GCT,GCC,GCG,GGA,GGT,GGC,GGG,probs,preds
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
test,3410,3410,3410,3410,3410,3410,3410,3410,3410,3410,...,3410,3410,3410,3410,3410,3410,3410,3410,3410,3410
train,13738,13738,13738,13738,13738,13738,13738,13738,13738,13738,...,13738,13738,13738,13738,13738,13738,13738,13738,13738,13738
val,3483,3483,3483,3483,3483,3483,3483,3483,3483,3483,...,3483,3483,3483,3483,3483,3483,3483,3483,3483,3483


In [None]:
# save new arabidopsis dataset; the file name follows the restriction mode chosen above
# (previously the "top2percent" name was used even for the class-based restriction)
if do == "class":
  df.to_csv("data/processed/arabidopsis_freq_streetlight.csv", index=False)  # write to file
elif do == "cutoff":
  df.to_csv("data/processed/arabidopsis_freq_streetlight_top2percent.csv", index=False)  # write to file

In [None]:
728736/1031576

0.7064297734728222

### Restrict using conv model

In [None]:
def ohes(df):
    """Encode every entry of ``df["sequence"]`` with ``get_ohe`` and stack the results into one numpy array."""
    return np.array(list(map(get_ohe, df["sequence"])))

def get_ohe(sequence):
    """Encode a sequence by looking up every character in the module-level ``mapping``.

    Returns a numpy array with one row per character of ``sequence``
    (presumably 4 columns per row, matching the model's input - confirm against ``mapping``).
    """
    encoded_rows = []
    for nt in sequence:
        encoded_rows.append(mapping[nt])
    return np.array(encoded_rows)

In [None]:
# load in data
df = pd.read_csv("data/processed/arabidopsis_full.csv")

In [None]:
# load in conv model
# NOTE(review): the cell that would write these two files is commented out earlier in
# this notebook, so they must already exist on disk - confirm they hold the intended run.
new_model_json = "models/streetlight/conv_model.json"
new_model_h5 = "models/streetlight/conv_weights.h5"

# load json and create model
# (the json file holds the architecture only; the weights are loaded separately below)
with open(new_model_json, "r") as json_file:
    loaded_model_json = json_file.read()
saved_model = model_from_json(loaded_model_json)
# load weights into new model
saved_model.load_weights(new_model_h5)

In [None]:
# run napus model on arabidopsis dataset, see how many it predicts to be accessible / inaccessible

# one hot encode data into numpy array
ohe_sqncs = ohes(df)

In [None]:
# run model on this array
probs = saved_model.predict(ohe_sqncs)
preds = (probs > 0.5).astype("int32")

In [None]:
# add predictions to df
df["probs"] = probs
df["preds"] = preds

In [None]:
df.shape

(1031576, 5)

In [None]:
# see what this looks like with the set
df.groupby("set").count()

Unnamed: 0_level_0,sequence,target,probs,preds
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,161665,161665,161665,161665
train,702867,702867,702867,702867
val,167044,167044,167044,167044


In [None]:
# drop all rows that are predicted to be inaccessible

# "class":  keep rows whose thresholded prediction is 1
# "cutoff": keep the `take_top` fraction of rows with the highest predicted probability
do = "cutoff"

if do == "class":
  df = df.loc[df.preds == 1,:]
elif do == "cutoff":
  take_top = 0.02
  n_keep = int(take_top*df.shape[0])
  # tail(0) is empty, whereas iloc[-0:] would silently keep every row when the frame
  # is so small that the fraction rounds down to zero
  df = df.sort_values(by="probs").tail(n_keep)
else:
  # an unknown mode used to leave df unrestricted without any warning
  raise ValueError("unknown restriction mode: "+repr(do))

In [None]:
df.shape

(20631, 5)

In [None]:
# see what this looks like with the set
df.groupby("set").count()

Unnamed: 0_level_0,sequence,target,probs,preds
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,3373,3373,3373,3373
train,13834,13834,13834,13834
val,3424,3424,3424,3424


In [None]:
# save new arabidopsis dataset - the output file depends on the restriction mode `do`

conv_out_paths = {
    "class": "data/processed/arabidopsis_conv_streetlight.csv",
    "cutoff": "data/processed/arabidopsis_conv_streetlight_top2percent.csv",
}
# any other mode writes nothing
if do in conv_out_paths:
  df.to_csv(conv_out_paths[do], index=False)  # write to file

In [None]:
df

Unnamed: 0,sequence,target,set,probs,preds
798858,GAGAGGAAAGAGATTGATGTTTGTTGGAGACTCGTTAAACAGAGGC...,0.244772,train,0.949927,1
502159,GGTGGTGGTGAAGCAAGCCCTTTTTGGTCATTCAAGCTTTGGCTCC...,0.604862,train,0.949929,1
836076,TGGGGTTGGGGTGGGTTGATGTAATTACAGAGATACAACCACAGTC...,-0.113891,train,0.949929,1
472760,AACTCTTAGCAGAGCAATTGGAAGGGGCATGGGTTAGGCCAAATGA...,0.011542,train,0.949929,1
53660,GGAACAATGCCACAACAGATCCGACCCGTGGGCCGAGCCAGTGTTC...,0.431339,train,0.949931,1
...,...,...,...,...,...
511228,CGTATGGTGGAGGTGATGGAGGTGGTGGCGGCGGCGAAGGATACAC...,0.807355,train,0.999464,1
107565,ACGGCGTCGGAGACGTGGGTAGAGACGGCGTCGGAGACGTGGGTCA...,-1.088559,train,0.999480,1
783067,AGGAGGAGGAGGTGGGGGTAGCGGAAATGGCTCAGGCCGTGGTCGG...,2.528379,test,0.999504,1
892423,GTGAAGGTGGACCCAGTGGTGGTGATGGTGAAGGTGGACCAAGTGG...,-1.173829,train,0.999584,1


In [None]:
705988/1031576

0.6843780778149162