In [2]:
import os, sys
import pandas as pd

from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# path 

In [3]:
PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/"
BIN_PATH = os.path.join(PATH, "bin")
DATA_PATH = os.path.join(PATH, "data", "legnet")
OUTPUT_PATH = os.path.join(DATA_PATH, "output")

LEGNET_SH = os.path.join(BIN_PATH, "legnet_inference.sh")

## test files 
- 3929 sequences, 80 bp, from GPRA

In [4]:
COMPLEX_NATIVE = os.path.join(DATA_PATH, "complex_Native.txt")  # original test set
COMPLEX_NATIVE_NOBIN = os.path.join(DATA_PATH, "complex_Native_nobin.txt")  # no bin activity column
COMPLEX_NATIVE_NOBIN_270 = os.path.join(DATA_PATH, "complex_Native_nobin.270.txt")  # no bin activity column +  include another 190 bp from promoterscaffold, plasmid sequence

In [7]:
def launchLegNet(input_file, seqsize):
    """ predict 18 bins using input sequence, write to output file, specify sequence size"""

    PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/bin/"
    LEGNET_SH = os.path.join(PATH, "legnet_inference.sh")

    OUTPUT_PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/"

    # str split to make output file a copy of the input file.
    infile = os.path.split(input_file)[1]
    outfile = os.path.join(OUTPUT_PATH, "inference." + ".".join(infile.split(".")[:-1]) + ".tsv")

    cmd = " ".join(["qsub -q gpu.q",
                   LEGNET_SH,
                   input_file,
                   outfile,
                   str(seqsize)
                   ])
    print(cmd)
    os.system(cmd)
    if os.path.exists(outfile) is False:
        print("running...")
        
    else:
        print("skipping...", outfile)

# test out of box

In [12]:
# out-of-box test set
launchLegNet(COMPLEX_NATIVE, 110)

qsub -q gpu.q /wynton/home/ahituv/fongsl/EMF/US/ml_emf/bin/legnet_inference.sh /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/complex_Native.txt /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/inference.complex_Native.tsv 110
skipping... /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/inference.complex_Native.tsv


In [6]:
# test set w/o bin activities
launchLegNet(COMPLEX_NATIVE_NOBIN, 150)

qsub -q gpu.q /wynton/home/ahituv/fongsl/EMF/US/ml_emf/bin/legnet_model_300.sh /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/complex_Native_nobin.txt /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/inference.complex_Native_nobin.tsv 150
running...


In [25]:
# test set with longer sequence? 

launchLegNet(COMPLEX_NATIVE_NOBIN_270, 270)

running...


# compare results 

In [27]:
%%bash
ls /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output

complex_Native.inference.nobin.200bp.txt
complex_Native.inference.nobin.txt
complex_Native.inference.txt
inference.complex_Native_nobin.270.tsv
inference.complex_Native_nobin.tsv
inference.complex_Native.tsv
US.seqs.inference.txt


In [28]:
out1 = os.path.join(OUTPUT_PATH, "inference.complex_Native.tsv")
out2 = os.path.join(OUTPUT_PATH, "inference.complex_Native_nobin.tsv")
out3 = os.path.join(OUTPUT_PATH, "inference.complex_Native_nobin.270.tsv")

In [29]:
df1 = pd.read_csv(out1, sep='\t', header=None, names=["seq", "pred1"])
df2 = pd.read_csv(out2, sep='\t', header=None, names=["seq", "pred2"])
df3 = pd.read_csv(out3, sep='\t', header=None, names=["seq", "pred3"])

In [30]:
df1 = pd.merge(df1, df2)
df1 = pd.merge(df1, df3)
df1

Unnamed: 0,seq,pred1,pred2,pred3
0,TGCATTTTTTTCACATCAAAAAAAAAAAGAAAAACAAAGAAAAAGG...,14.675456,14.883397,13.896811
1,TGCATTTTTTTCACATCAAAAAAAAAAAGAAAAAGAAACAAAAAGG...,14.855910,15.061954,13.804238
2,TGCATTTTTTTCACATCAAAAAAAAAAAGAAAAAGAAAGAAAAAGA...,14.958251,15.171226,13.844028
3,TGCATTTTTTTCACATCAAAAAAAAAAAGAAAAAGAAAGAAAAAGG...,12.072526,12.165880,12.038105
4,TGCATTTTTTTCACATCAAAAAAAAAAAGAAAAAGAAAGAAAAAGG...,15.284714,15.501473,13.764419
...,...,...,...,...
3924,TGCATTTTTTTCACATCTTTTTTGATGCGCTATCATCCATTATTCT...,6.664516,6.817938,5.017213
3925,TGCATTTTTTTCACATCTTTTTTGATGCGCTATCATCCATTATTCT...,5.736858,5.900172,4.682234
3926,TGCATTTTTTTCACATCTTTTTTGATGCGCTATCATCCATTATTCT...,6.379897,6.527392,4.868117
3927,TGCATTTTTTTCACATCTTTTTTGATGCGCTATCATCCCTTATTCT...,6.765811,6.887493,5.031179


# pearson
- pred 1 v. pred 2 = identical data
- pred 1 v. pred 3 = similar data, pred3 sequence size is larger. 

In [31]:
stats.pearsonr(df1["pred1"], df1["pred2"]) # identical data, nearly identical predictions

(0.9984217246143716, 0.0)

In [32]:
stats.pearsonr(df1["pred1"], df1["pred3"])  # increasing window size to 270 reduces prediction to 0.904

(0.904194538887592, 0.0)

# spearman

In [33]:
stats.spearmanr(df1["pred1"], df1["pred2"])   # identical data

SpearmanrResult(correlation=0.9988139517636504, pvalue=0.0)

In [34]:
stats.spearmanr(df1["pred1"], df1["pred3"])  # increasing window size to 200

SpearmanrResult(correlation=0.9807884141233121, pvalue=0.0)