In [1]:
#from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils import gc_fraction

# for plotting
from matplotlib import cm
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns

# for analysis
import numpy as np
import os
import pandas as pd

# for modeling
from scipy.cluster.hierarchy import dendrogram, linkage

from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_curve, roc_auc_score, mean_squared_error, precision_recall_curve, f1_score, auc

import statsmodels.api as sm

import sys
import config_readwrite as crw

In [3]:
# config.ini is expected two directory levels above the current working directory.
config, cfn = crw.read(
    os.path.join(
        os.path.dirname(os.path.dirname(os.getcwd())),
        "config.ini",
    )
)

In [2]:
# Values stored under the "sei" section of `config`.
section = "sei"
CHROM_PRED, CLASS_PRED, FASTA, LABEL = (
    config[section][key]
    for key in ("chrom_pred", "class_pred", "fasta", "LABELS")
)

# Value stored under the "data" section of `config`; later read as a
# tab-separated table with a "name" column.
section = "data"
MPRA = config[section]["hepg2.clean.trans.scaled"]

NameError: name 'config' is not defined

In [40]:
# Hard-coded location of the Sei outputs. These assignments overwrite any
# CLASS_PRED / LABEL values assigned earlier in the notebook.
PATH = "/wynton/home/ahituv/fongsl/EMF/US/data/sei_predictions/chromatin-profiles-hdf5"
CLASS_PRED, LABEL = [
    os.path.join(PATH, fname)
    for fname in (
        "ultrasound_final_no_adapter.sei_padded.raw_sequence_class_scores.npy",
        "ultrasound_final_no_adapter.sei_padded_row_labels.txt",
    )
]

# functions

In [9]:
def getHandles(fasta):
    """Build a dictionary of all input/output paths for one Sei run on a fasta file.

    Every path is derived from `fasta` by string/path manipulation only;
    nothing is read from or written to disk.

    Parameters
    ----------
    fasta : str
        Path to the input fasta file.

    Returns
    -------
    dict
        Maps these keys to path strings:
        "FASTA" (the input), "FASTA_CLEAN" (<base>.clean.fa),
        "FASTA_INDEX" (<base>.clean.index.txt), "PADDED"
        (<base>.clean.sei_padded.fa), "PATH" (directory of PADDED), "HANDLE"
        (file name of PADDED without its ".fa" extension), "SEI_PATH"
        (<fasta dir>/sei_predictions), "SEI_SRC" (hard-coded sei-framework
        directory), "CHROM_PATH" (<SEI_PATH>/chromatin-profiles-hdf5), and
        the files inside CHROM_PATH that are named from HANDLE: "CHROM_PRED",
        "CLASS_PRED", "LABELS", "CLASS_TABLE".
    """

    SEI_SRC = "/wynton/home/ahituv/fongsl/bin/sei-framework/"
    SEI_PATH = os.path.join(os.path.split(fasta)[0], "sei_predictions")

    FASTA_CLEAN = os.path.splitext(fasta)[0] + ".clean.fa"
    clean_root = os.path.splitext(FASTA_CLEAN)[0]
    FASTA_INDEX = clean_root + ".index.txt"

    PADDED = clean_root + ".sei_padded.fa"
    PATH, HANDLE = os.path.split(PADDED)
    # Remove only the trailing ".fa" extension. str.strip(".fa") would instead
    # strip any run of the characters '.', 'f', 'a' from BOTH ends, mangling
    # names such as "alpha..." (-> "lpha...").
    HANDLE = os.path.splitext(HANDLE)[0]

    CHROM_PATH = os.path.join(SEI_PATH, "chromatin-profiles-hdf5")

    CHROM_OUT = os.path.join(CHROM_PATH, f"{HANDLE}_predictions.h5")
    CLASS_OUT = os.path.join(CHROM_PATH, f"{HANDLE}.raw_sequence_class_scores.npy")
    LABEL_OUT = os.path.join(CHROM_PATH, f"{HANDLE}_row_labels.txt")
    CLASS_TABLE = os.path.join(CHROM_PATH, f"{HANDLE}.raw_sequence_class_scores.table.tsv.gz")

    path_dict = {
        "FASTA": fasta,
        "FASTA_CLEAN": FASTA_CLEAN,
        "FASTA_INDEX": FASTA_INDEX,
        "PADDED": PADDED,
        "PATH": PATH,
        "HANDLE": HANDLE,
        "SEI_PATH": SEI_PATH,
        "SEI_SRC": SEI_SRC,
        "CHROM_PATH": CHROM_PATH,
        "CHROM_PRED": CHROM_OUT,
        "CLASS_PRED": CLASS_OUT,
        "LABELS": LABEL_OUT,
        "CLASS_TABLE": CLASS_TABLE,
    }

    return path_dict

In [13]:
def returnSequenceClassLabels(file="/wynton/home/ahituv/fongsl/bin/sei-framework/sequence_class_labels.csv"):
    """Load the sequence-class label table as a DataFrame.

    Parameters
    ----------
    file : str, optional
        Path to the sequence class label CSV. Defaults to the location that
        was previously hard-coded, so existing zero-argument calls behave as
        before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as parsed by ``pd.read_csv`` with default settings.
    """
    return pd.read_csv(file)


def processLabel(label_file, index):
    """Map each labelled sequence to its tile coordinate.

    Parameters
    ----------
    label_file : str
        Path to the tab-separated row-label file for the sequences run
        through the DNN. Must contain an "index" column (dropped here as
        redundant); it is joined to the index file on the columns the two
        tables share.
    index : str
        Path to a header-less, tab-separated file with two columns, read as
        "name" and "tile.coor".

    Method
        1. open the sequence label file as a pd dataframe
        2. drop the redundant "index" column
        3. left-join the name -> tile.coor lookup onto the labels
        4. return only the "tile.coor" column

    Returns
    -------
    pandas.DataFrame
        Single-column frame ("tile.coor") with one row per label row, in
        label-file order (assuming names are unique in the index file).
        Labels without a match in the index file get NaN.
    """

    # read label file as pd dataframe
    lab = pd.read_csv(label_file, sep='\t')
    lab = lab.drop(columns=["index"])  # redundant index column

    ind = pd.read_csv(index, sep='\t', header=None, names=["name", 'tile.coor'])

    # Left join: keep every label row in its original position. makeTable()
    # merges this result with the score matrix by row position, so an inner
    # join that dropped unmatched rows would shift the remaining rows onto the
    # wrong scores.
    lab = pd.merge(lab, ind, how="left")
    return lab[["tile.coor"]]


def makeTable(class_pred, labels, index_file, table_file):
    """Combine tile coordinates with sequence-class scores and save the table.

    Parameters
    ----------
    class_pred : str
        Path to the .npy file of sequence class scores.
    labels : str
        Path to the row-label file (passed to processLabel).
    index_file : str
        Path to the name -> tile coordinate index (passed to processLabel).
    table_file : str
        Destination path; written as a gzip-compressed, tab-separated file
        without the row index.

    Returns
    -------
    pandas.DataFrame
        The coordinate column joined, by row position, to the score columns.
    """
    # column names: every entry of the label column except the last one
    class_names = list(returnSequenceClassLabels()["#Sequence class label"])[:-1]

    # scores -> dataframe with named columns
    scores = pd.DataFrame(np.load(class_pred, allow_pickle=True))
    scores.columns = class_names

    # coordinates for each row, then a positional (index-to-index) join
    coords = processLabel(labels, index_file)
    table = coords.merge(scores, left_index=True, right_index=True)

    table.to_csv(table_file, sep='\t', index=False, compression="gzip")

    return table

In [14]:
# NOTE(review): this function is also defined in an earlier cell of this
# notebook; this later definition is the one in effect after a top-to-bottom
# run. Keep a single definition.
def returnSequenceClassLabels(file="/wynton/home/ahituv/fongsl/bin/sei-framework/sequence_class_labels.csv"):
    """Load the sequence-class label table as a DataFrame.

    Parameters
    ----------
    file : str, optional
        Path to the sequence class label CSV. Defaults to the location that
        was previously hard-coded, so existing zero-argument calls behave as
        before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as parsed by ``pd.read_csv`` with default settings.
    """
    return pd.read_csv(file)

def seqClassLookup(annot_list):
    """Print the rows of the global `seqClass` table whose first-column value is in `annot_list`.

    Relies on a module-level `seqClass` DataFrame existing when called.
    Prints the matching rows; returns None.
    """
    first_col = seqClass.columns[0]
    is_requested = seqClass[first_col].isin(annot_list)
    print(seqClass.loc[is_requested])


def getColorMap(col, df, cmap):
    """Assign a colour to each distinct value of `df[col]`.

    Distinct values (in order of first appearance) are paired one-to-one with
    the entries of `cmap`; pairing stops at the shorter of the two. The
    lookup table is printed.

    Returns
    -------
    (pandas.Series, dict)
        `df[col]` mapped through the lookup table (values without a colour
        become NaN), and the value -> colour lookup table itself.
    """
    categories = df[col].unique()
    lut = {category: color for category, color in zip(categories, cmap)}
    print(lut)

    return df[col].map(lut), lut

def plotClusterMap(table, outhandle, seq_coor, outdir=None):
    """Draw a clustermap of `table`, save it as a PDF, and show it.

    Parameters
    ----------
    table : pandas.DataFrame or array-like
        Data passed straight to ``sns.clustermap``.
    outhandle : str
        First part of the output file name.
    seq_coor : str
        Second part of the output file name.
    outdir : str, optional
        Directory for the PDF. When omitted, the notebook-level constant
        ``RE`` is used, as before; ``RE`` must then be defined before this
        function is called.

    The figure is written to ``<outdir>/<outhandle>.<seq_coor>.pdf``.
    """
    if outdir is None:
        # Backward-compatible default: the global results directory.
        outdir = RE

    # Diverging colormap centred on 0.
    sns.clustermap(table, cmap="bwr", center=0,
                   figsize=(15, 15))

    # Saves the current pyplot figure.
    plt.savefig(os.path.join(outdir, f"{outhandle}.{seq_coor}.pdf"))
    plt.show()

# sei

## get labels

In [12]:
# Derive the Sei input/output paths for the fasta file stored in FASTA.
path_dict = getHandles(FASTA)

In [6]:
# Get sequence class labels. See the Methods section of Chen 2022 for
# interpretation of these PC labels.
# Apparently labels >40 are low active/heterochromatin.
# They make up <2% of the genome. But 2% of the genome can still be significant.
seqClass = returnSequenceClassLabels()

seqClass

Unnamed: 0,#Sequence class label,Sequence class name,Rank by size,Group
0,PC1,Polycomb / Heterochromatin,0,PC
1,L1,Low signal,1,L
2,TN1,Transcription,2,TN
3,TN2,Transcription,3,TN
4,L2,Low signal,4,L
...,...,...,...,...
57,L/HET,L/HET,56,L/HET
58,L/HET,L/HET,57,L/HET
59,L/HET,L/HET,58,L/HET
60,L/HET,L/HET,59,L/HET


In [16]:
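# Disabled: getHandles() returns no "INDEX" key (the index path is stored under "FASTA_INDEX"),
# so this call raises KeyError: 'INDEX'. The same steps are run inline in the cells below.
#df = makeTable(path_dict["CLASS_PRED"], path_dict["LABELS"], path_dict["INDEX"], path_dict["CLASS_TABLE"])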

KeyError: 'INDEX'

In [18]:
# sequence class label table (see returnSequenceClassLabels above)
seqClass = returnSequenceClassLabels()

# raw sequence class scores saved as .npy
data = np.load(path_dict["CLASS_PRED"], allow_pickle=True)

# wrap the scores in a dataframe; columns are named after every
# sequence class label except the last one
df = pd.DataFrame(
    data,
    columns=seqClass["#Sequence class label"].tolist()[:-1],
)

In [22]:
# Row labels for the scored sequences (tab-separated, with a "name" column).
lab = pd.read_csv(LABEL, sep='\t')

# The join below is positional (row index to row index), so the label file
# must have exactly one row per score row; otherwise rows would be silently
# dropped or paired with the wrong name.
assert len(lab) == len(df), (
    f"label rows ({len(lab)}) != score rows ({len(df)}); cannot align by position"
)

# Guard keeps this cell idempotent: re-running it must not merge a second
# copy of "name" into the already-labelled table.
if "name" not in df.columns:
    df = pd.merge(lab[["name"]], df, left_index=True, right_index=True)

In [23]:
# Preview the first rows of the name + sequence class score table.
df.head()

Unnamed: 0,name,PC1,L1,TN1,TN2,L2,E1,E2,E3,L3,...,L/HET,L/HET.1,L/HET.2,L/HET.3,L/HET.4,L/HET.5,L/HET.6,L/HET.7,L/HET.8,L/HET.9
0,bj_k27ac_down_chr3:29837526-29837795,0.022628,0.04163,0.006035,0.013955,0.047331,0.020433,0.013643,0.026056,0.024605,...,0.018757,0.019529,0.022906,0.017983,0.033052,0.007846,0.017115,0.008774,0.015631,0.024955
1,bj_k27ac_down_chr3:29837260-29837529,0.115338,0.282501,0.02929,0.076609,0.290853,0.090649,0.048598,0.132619,0.101007,...,0.008737,0.118401,0.103875,0.131711,0.124702,0.072847,0.067435,0.11527,0.067717,0.189769
2,bj_k27ac_down_chr3:29837792-29838061,0.265198,0.560469,0.18809,0.238264,0.539103,0.650885,1.054547,0.593886,0.195363,...,0.04571,0.18725,0.190351,0.195323,0.209486,0.188979,0.088443,0.107272,0.098169,0.305068
3,bj_k27ac_down_chr3:29837392-29837661,0.110181,0.260931,0.023877,0.065959,0.283169,0.081381,0.038789,0.121081,0.080591,...,0.012976,0.10749,0.113314,0.141739,0.108345,0.088501,0.073031,0.074532,0.060759,0.209045
4,bj_k27ac_down_chr3:29837659-29837928,0.287447,0.442067,0.293456,0.271424,0.37252,0.836546,1.112578,0.9595,0.182614,...,0.064244,0.089629,0.117874,0.143213,0.152323,0.277466,0.043765,0.080209,0.089555,0.180001


In [25]:
# Save the labelled score table as a gzip-compressed, tab-separated file (row index not written).
df.to_csv(path_dict["CLASS_TABLE"], sep='\t', index=False, compression="gzip")

# MPRA

In [28]:
# Load the tab-separated table at the path stored in MPRA.
mpra = pd.read_csv(MPRA, sep='\t')

In [29]:
# Preview the first rows of the MPRA table.
mpra.head()

Unnamed: 0,name,l2.ratio.1.ctrl,l2.ratio.2.ctrl,l2.ratio.3.ctrl,l2.ratio.1.us,l2.ratio.2.us,l2.ratio.3.us,l2.ratio.med.ctrl,l2.ratio.mean.ctrl,l2.ratio.std.ctrl,l2.ratio.med.us,l2.ratio.mean.us,l2.ratio.std.us,delta.med,delta.mean,pval,label.ctrl,label.us,response
0,SYNTHETIC:_Added_CCTTCCTG_GCCCGGGGG_TATACATA_a...,-1.734932,-1.567864,-1.754317,-1.296335,-1.276191,-1.294565,-1.734932,-1.685704,0.102512,-1.294565,-1.28903,0.011155,0.440367,0.396674,0.020518,False,False,False
1,hob_k27ac_up_chr1:67427680-67427949,-0.424418,-0.573461,-0.482278,-0.718239,-0.742376,-0.715355,-0.482278,-0.493385,0.07514,-0.718239,-0.725323,0.014838,-0.235962,-0.231938,0.029332,False,False,False
2,SYNTHETIC:_The_motif_ATTAAA_was_added_in_Pos:8...,-0.536486,-0.727045,-0.652711,-0.06323,-0.1,-0.030887,-0.652711,-0.638747,0.096044,-0.06323,-0.064706,0.03458,0.589482,0.574042,0.004667,False,False,False
3,SYNTHETIC:_Added_TATGACTCATA_ACAGGTGTAC_ACAGGT...,0.035693,-0.341065,-0.087809,-0.984858,-1.007522,-0.870307,-0.087809,-0.13106,0.192067,-0.984858,-0.954229,0.073557,-0.897049,-0.823169,0.009918,False,False,False
4,k562_atac_up_chr15:78056378-78056647,1.793414,1.153666,1.886389,1.402855,1.376605,1.427877,1.793414,1.611156,0.398917,1.402855,1.402446,0.025639,-0.39056,-0.208711,0.460596,True,True,False


## missing MPRA + sei 

In [38]:
# Compare the sequence names present in each table.
mpra_names = set(mpra["name"])
sei_names = set(df["name"])

print(len(mpra_names - sei_names))  # 24047 MPRA sequences missing sei data
print(len(sei_names - mpra_names))  # 38480 sei sequences missing MPRA data

24047
38480


The missing sequences are all synthetic. Need to look into these.

In [37]:
# MPRA sequence names that have no sei prediction.
missing = set(mpra["name"]).difference(set(df["name"]))

# NOTE(review): there is no separator between the fasta base name and the
# suffix, so the file is "<fasta-base>missing.sei.pred.txt". Left unchanged so
# the existing output path stays the same; confirm whether a "." was intended.
out = os.path.splitext(FASTA)[0] + "missing.sei.pred.txt"

# One name per line, in sorted order so the file is reproducible between runs
# (set iteration order is not). The with-block closes the file on exit.
with open(out, "w") as writer:
    writer.writelines(f"{name}\n" for name in sorted(missing, key=str))

In [39]:
# (rows, columns) of the name + sequence class score table.
df.shape

(82427, 62)

In [41]:
# Sequence names found in both the MPRA table and the sei score table.
shared = set(mpra["name"]) & set(df["name"])
print(len(shared))

43947


# merged sei + MPRA

In [43]:
# Inner join of MPRA measurements and sei scores on the sequence name.
# The key is spelled out so the join cannot silently pick up any other column
# that happens to exist in both tables.
merged = pd.merge(mpra, df, on="name", how="inner")
merged.shape

(43947, 80)

In [44]:
merged.head()

Unnamed: 0,name,l2.ratio.1.ctrl,l2.ratio.2.ctrl,l2.ratio.3.ctrl,l2.ratio.1.us,l2.ratio.2.us,l2.ratio.3.us,l2.ratio.med.ctrl,l2.ratio.mean.ctrl,l2.ratio.std.ctrl,...,L/HET,L/HET.1,L/HET.2,L/HET.3,L/HET.4,L/HET.5,L/HET.6,L/HET.7,L/HET.8,L/HET.9
0,hob_k27ac_up_chr1:67427680-67427949,-0.424418,-0.573461,-0.482278,-0.718239,-0.742376,-0.715355,-0.482278,-0.493385,0.07514,...,0.037978,0.178191,0.187819,0.251613,0.22826,0.160538,0.104082,0.13668,0.142096,0.367951
1,SYNTHETIC:_The_motif_ATTAAA_was_added_in_Pos:8...,-0.536486,-0.727045,-0.652711,-0.06323,-0.1,-0.030887,-0.652711,-0.638747,0.096044,...,0.012898,0.196193,0.240759,0.327502,0.256952,0.243622,0.159659,0.18568,0.160704,0.488359
2,k562_atac_up_chr15:78056378-78056647,1.793414,1.153666,1.886389,1.402855,1.376605,1.427877,1.793414,1.611156,0.398917,...,0.29026,0.937421,2.08414,2.350216,1.619657,1.858449,1.802296,1.689213,1.518893,2.364871
3,bj_atac_up_chr11:20163563-20163832,-1.687993,-1.50926,-1.550799,-0.583942,-0.776457,-0.714185,-1.550799,-1.582684,0.093535,...,0.195287,1.269946,3.019466,4.741728,2.949888,2.383966,2.337254,2.057321,4.017609,2.528643
4,hepg2_atac_up_chr11:57235711-57235980,1.124567,0.542172,1.040234,-0.266068,-0.045856,-0.005265,1.040234,0.902324,0.314739,...,0.014754,0.287805,0.995698,0.986103,0.466484,0.72135,1.081005,0.511769,0.715045,0.935783


# X and Y
x = sei predictions
y = delta activity

In [48]:
# Features: every column of the sei table after the first one ("name").
sei_cols = df.columns[1:].tolist()
X = merged[sei_cols]

# Target: the "delta.mean" column of the merged table.
y = merged["delta.mean"]

# NEED TO REFERENCE BED FILE BEFORE CONTINUING. PROBLEMS WITH DATA LEAKAGE ABOUND. 

## pipeline

In [53]:
# Two-step pipeline: standardise the features, then fit a linear regression.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("reg", LinearRegression()),
    ]
)