In [2]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import config_readwrite as crw
import glob
import h5py
import os, sys
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

In [3]:
# values interpolated into the config section name and the input file names below
#SAMPLE = 2
MER = 14
NMUTS = 2

In [7]:
# config.ini lives one directory above the current working directory
config_path = os.path.join(os.path.dirname(os.getcwd()), "config.ini")
config, cfn = crw.read(config_path)

# read: inputs registered for this MER / NMUTS combination
common_section = config[f"common.{MER}mer.{NMUTS}mut"]
FA = common_section['fa_fo-true']
TSV = common_section["tsv_fo-true"]

# write: results go under the "sei" section
# (crw.check presumably creates the section when it is missing - verify in config_readwrite)
section = "sei"
crw.check(config, section)

# directory holding the sei prediction chunks
PATH = "/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions/chromatin-profiles-hdf5"

# glob pattern matching every prediction chunk for this MER / NMUTS combination
QUERY_F = os.path.join(
    PATH,
    f"key_common.{MER}mers.{NMUTS}mut.nulls.fo.pam.ext4096.*_predictions.h5",
)

# text file with one sei prediction column name per line
COLNAMES = "/wynton/home/ahituv/fongsl/bin/sei-framework/seq_prediction_columns.txt"

# record both locations in the config and save it back to the same file
config[section]["path"] = PATH
config[section]["colnames"] = COLNAMES

crw.write(config, cfn)

In [5]:
def readHdF5(filename, path):
    """
    open an hdf5 results file and pull out its 'data' dataset

    input
        filename (str) - name of hdf5 results file
        path (str) - abs path to the directory holding the file

    require
        h5py

    method
        1. join directory and file name
        2. open the file read-only
        3. look up the 'data' dataset

    return
        f (h5py.File) - the open file; it is NOT closed here, closing is up to the caller
        dset - the object stored under the 'data' key of that file
    """
    # 1 + 2
    handle = h5py.File(os.path.join(path, filename), 'r')

    # 3
    return handle, handle['data']

def getIndexNames(colnames_file="/wynton/home/ahituv/fongsl/bin/sei-framework/seq_prediction_columns.txt"):

    """
    return list of index names corresponding to sei prediction features
    (the prediction arrays loaded in this notebook have 21907 columns)

    input
        colnames_file (str) - path to a text file with one feature name per line.
                              Defaults to the sei-framework column file, so
                              calling getIndexNames() with no argument behaves
                              as before.
    method
        1. open the index names file
        2. strip the trailing newline from every line and collect the names

    return
        idxnames (list) - list of index names, in file order
    """

    #1
    with open(colnames_file, "r") as reader:
        #2 only newlines are stripped, any other whitespace in a name is kept
        idxnames = [line.strip("\n") for line in reader]

    return idxnames

def arrayToDF(dset):
    """
    turn a prediction dataset into a dataframe with named feature columns

    input
        dset - iterable of prediction rows (e.g. the dataset returned by readHdF5)

    method
        1. stack the rows into one in-memory array and wrap it in a dataframe
        2. print the dataframe shape
        3. name the columns with getIndexNames()
        4. move the positional row index into an "index" column

    return
        dataframe with a leading "index" column followed by the feature columns
    """
    #1
    frame = pd.DataFrame(np.vstack(dset))

    #2
    print(frame.shape)

    #3
    frame.columns = getIndexNames()

    #4
    return frame.reset_index()

def getRowNames(row_textfile, path):
    """
    read a tab-separated row-label file into a dataframe of sample names

    input
        row_textfile (str) - name of the row label file (two tab-separated fields per line)
        path (str) - directory holding the file

    method
        1. assemble the file path
        2. read the file line by line
        3. map index -> name, skipping the line whose first field is "index"
        4. build a dataframe and cast the index to int

    return
        dataframe with columns "index_col" (int) and "id" (str)
    """
    #1
    labels_path = os.path.join(path, row_textfile)
    id_by_index = {}

    #2
    with open(labels_path, "r") as handle:
        for raw in handle:

            #3
            key, label = raw.strip("\n").split("\t")
            if key == "index":
                continue  # header line
            id_by_index[key] = label

    #4
    labels = pd.DataFrame(id_by_index.items(), columns=["index_col", "id"])
    labels["index_col"] = labels["index_col"].astype(int)
    return labels

# load data

## load hdf5

In [8]:
# count the prediction chunk files matching the QUERY_F glob pattern
n = len(glob.glob(QUERY_F))
n

10

In [14]:
# chunk number -> formatted prediction dataframe
collection = {}

# chunks are addressed by number, so this assumes the files matched by QUERY_F
# are numbered 0 .. (count - 1) without gaps
for n in np.arange(len(glob.glob(QUERY_F))):

    # predictions and their row labels share a file stem and differ only in the suffix
    F = f"key_common.{MER}mers.{NMUTS}mut.nulls.fo.pam.ext4096.{n}_predictions.h5"
    F_LABELS = f"key_common.{MER}mers.{NMUTS}mut.nulls.fo.pam.ext4096.{n}_row_labels.txt"

    # arrayToDF stacks the dataset into an in-memory array, so the hdf5 file can
    # be closed as soon as it returns instead of leaving one open handle per chunk
    f, dset = readHdF5(F, PATH)
    try:
        df = arrayToDF(dset)
    finally:
        f.close()

    ## get row names

    names = getRowNames(F_LABELS, PATH)

    # add to dataframe: label index <-> positional row index of the predictions
    df = pd.merge(names, df, how="left", left_on="index_col", right_on="index")

    # drop the index_col name (redundant)
    df = df.drop(columns=["index_col", "index"])

    # format dataframe
    # ids look like "chr17:77482474-77482674_ctrl-0.43":
    #   coor    = text before the first "_"
    #   pos     = text after the last "." (None when the id has no ".")
    #   pair_id = text between the first "_" and the following "."
    df["coor"] = df["id"].apply(lambda x: x.split("_")[0])
    df["pos"] = df["id"].apply(lambda x: x.split(".")[-1] if "." in x else None)
    df["pair_id"] = df["id"].apply(lambda x: (x.split("_")[1]).split(".")[0])

    # add kmer info
    df["nullomer"] = False
    df.loc[df["id"].str.contains("null"), "nullomer"] = True

    # remove all the extra repeats of the same kmer control data (rows without a pos)
    df = df.loc[df.pos.astype(str) != "None"]
    collection[n] = df

(1994, 21907)
(1998, 21907)
(1998, 21907)
(1998, 21907)
(1998, 21907)
(1998, 21907)
(1998, 21907)
(1998, 21907)
(1998, 21907)
(1458, 21907)


In [15]:
# combine
df = pd.concat(collection.values())
print(df.shape)
df.head(33)

(9718, 21912)


Unnamed: 0,id,HeLa_Epithelium_Cervix|BTAF1|ID:1,H9_Embryonic_Stem_Cell_Embryo|EOMES|ID:1006,H9_Embryonic_Stem_Cell_Embryo|EP300|ID:1007,H9_Embryonic_Stem_Cell_Embryo|FOXH1|ID:1010,H9_Embryonic_Stem_Cell_Embryo|FOXH1|ID:1011,H9_Embryonic_Stem_Cell_Embryo|FOXP1|ID:1012,H9_Embryonic_Stem_Cell_Embryo|FOXP1|ID:1013,H9_Embryonic_Stem_Cell_Embryo|H2BK12ac|ID:1014,H9_Embryonic_Stem_Cell_Embryo|H2BK15ac|ID:1017,...,Urothelia (UT189)|DNase|ENCODE,WERI-Rb-1|CTCF|ENCODE,WERI-Rb-1|DNase|ENCODE,WI-38|CTCF|ENCODE,WI-38 (4OHTAM_20nM_72hr)|DNase|ENCODE,WI-38|DNase|ENCODE,coor,pos,pair_id,nullomer
1,chr17:77482474-77482674_ctrl-0.43,7.9e-05,0.014717,0.00711,0.00132,0.003049,0.07998,0.090706,0.000235,8.7e-05,...,0.258364,0.023295,0.030718,0.001406,0.068632,0.048106,chr17:77482474-77482674,43,ctrl-0,False
3,chr17:77482474-77482674_ctrl-1.43,0.000103,0.0146,0.008232,0.001291,0.003212,0.082509,0.08608,0.00024,9.1e-05,...,0.275807,0.026247,0.029319,0.001568,0.077084,0.053802,chr17:77482474-77482674,43,ctrl-1,False
5,chr17:77482474-77482674_ctrl-2.43,6.9e-05,0.012112,0.005903,0.001169,0.002733,0.070554,0.077229,0.000214,8.1e-05,...,0.251006,0.020388,0.024467,0.001364,0.063907,0.043438,chr17:77482474-77482674,43,ctrl-2,False
7,chr17:77482474-77482674_ctrl-3.43,5.2e-05,0.008468,0.003654,0.00076,0.001958,0.054221,0.059875,0.000123,5.2e-05,...,0.220965,0.008236,0.016953,0.000628,0.035971,0.02686,chr17:77482474-77482674,43,ctrl-3,False
9,chr17:77482474-77482674_ctrl-4.43,5.3e-05,0.008991,0.004116,0.000795,0.002051,0.05832,0.06267,0.000134,5.6e-05,...,0.214432,0.008752,0.01911,0.000672,0.040682,0.030374,chr17:77482474-77482674,43,ctrl-4,False
11,chr17:77482474-77482674_ctrl-5.43,0.00013,0.020483,0.014169,0.001828,0.004116,0.096462,0.107205,0.000434,0.00013,...,0.281699,0.034101,0.043569,0.00176,0.1066,0.074197,chr17:77482474-77482674,43,ctrl-5,False
13,chr17:77482474-77482674_ctrl-6.43,5.2e-05,0.013457,0.005265,0.001224,0.00277,0.081919,0.078133,0.000147,6.1e-05,...,0.260772,0.021756,0.029353,0.001378,0.064923,0.048093,chr17:77482474-77482674,43,ctrl-6,False
15,chr17:77482474-77482674_ctrl-7.43,7.5e-05,0.014487,0.006841,0.00135,0.003126,0.08102,0.081532,0.000237,9.2e-05,...,0.263421,0.018692,0.028128,0.001293,0.07749,0.052894,chr17:77482474-77482674,43,ctrl-7,False
17,chr17:77482474-77482674_ctrl-8.43,6.6e-05,0.015343,0.005474,0.001373,0.00285,0.085446,0.079787,0.000155,7.1e-05,...,0.284547,0.013729,0.031843,0.001042,0.093562,0.066021,chr17:77482474-77482674,43,ctrl-8,False
19,chr17:77482474-77482674_ctrl-9.43,7.8e-05,0.014007,0.006599,0.00132,0.003033,0.081481,0.08308,0.00022,8.4e-05,...,0.276663,0.018308,0.030547,0.00121,0.078793,0.056937,chr17:77482474-77482674,43,ctrl-9,False


In [16]:
# unique loci (coor) that have at least one nullomer row
test_coor = set(df.loc[df["nullomer"], "coor"])

In [18]:
# how many loci are in test_coor, and a peek at three of them
print(len(test_coor))
list(test_coor)[:3]

294


['chr16:56657731-56657931',
 'chr7:45045392-45045592',
 'chr20:63879357-63879557']

# analyze one locus

In [19]:
def getPredValues(preds, track, kmer, null):
    """
    get raw prediction values for one track

    input
        preds (pd.DataFrame) - predictions with tracks as the row index and
                               one column per sequence (controls, kmer, null)
        track - index label of the track to summarize
        kmer - column name of the kmer sequence
        null - column name of the nullomer sequence

    method
        1. take the first row whose index equals track
        2. read the kmer and nullomer predictions from that row
        3. treat every other column as a control and compute mean / std

    return
        kmer_pred, null_pred, ctrl_pred_mean, ctrl_pred_std
        (the function previously computed these values but returned nothing)
    """
    #1
    row = preds.loc[preds.index == track].iloc[0]

    #2
    kmer_pred = row[kmer]
    null_pred = row[null]

    #3 controls are selected by name rather than by assuming kmer and null
    #  are the last two columns
    ctrl_pred = row.drop(labels=[kmer, null])
    ctrl_pred_mean = ctrl_pred.mean()
    ctrl_pred_std = ctrl_pred.std()

    return kmer_pred, null_pred, ctrl_pred_mean, ctrl_pred_std

In [20]:
# Per locus / position / track: write the kmer and nullomer predictions, the
# control mean and std, and the nullomer z-score to a tab-separated file.
out = os.path.join(PATH, "key_HepG2.K562.Pred.tsv")

# identifier columns; every other column of df is a prediction track
ID_VARS = ["nullomer", "coor", "pair_id", "id", "pos"]

# constrain to CLs we want to study (only the first two entries are matched)
CLS = ["K562", "HepG2", " WTC11"]

# Pick the K562 / HepG2 track columns once, up front. Melting every track
# column for every position and filtering afterwards yields the same rows but
# repeats the filtering work on the full table for each locus.
TRACK_COLS = [c for c in df.columns
              if c not in ID_VARS and (CLS[0] in c or CLS[1] in c)]

# NOTE: the last column is named "difNullKmer" but holds the nullomer z-score
# (null_z); the name is kept because the cells that read this file use it.
columns = ["#coord", "tsvCoor", "kmer", "null",
           "track", "predKmer", "predNull",
           "predCtrl_mean", 'predCtrl_std',
           "difNullKmer"
           ]

# the with-block closes the file; no explicit close() is needed afterwards
with open(out, "w") as writer:

    # header is written exactly once, before any data row
    writer.write("\t".join(columns)+"\n")

    v = 0  # number of data rows written so far
    for COOR in test_coor:

        # COOR = "chr1:10248183-10248383"
        TCOOR = '.'.join(COOR.split(":"))  # same coordinate with ":" replaced by "."

        coordf = df.loc[df["coor"] == COOR, ID_VARS + TRACK_COLS]

        # Separate by prediction positions
        # there can be multiple places where kmer/nullomer occurs in sequence.
        for position in set(coordf["pos"]):
            cdf = coordf.loc[coordf["pos"] == position]

            # melt dataframe: one row per (sequence, track)
            cldf = pd.melt(cdf, id_vars=ID_VARS,
                           var_name="track", value_name="pred_prob")

            # keep only datasets w/ controls (both nullomer and non-nullomer rows present)
            if len(set(cldf.nullomer)) > 1:

                # kmer_id
                kmer = cldf.loc[cldf['pair_id'].str.contains(
                    "kmer"), "pair_id"].iloc[0]

                # nullomer_id
                null = cldf.loc[cldf['pair_id'].str.contains(
                    "null"), "pair_id"].iloc[0]

                # pivot dataframe so that rows are sequences (pair_id), columns are
                # tracks, and values are the predicted probabilities
                preds = pd.pivot(cldf, index="pair_id", columns="track",
                                 values="pred_prob").reset_index()

                # z-score every track across all sequences at this position
                # (first column is pair_id, so it is skipped)
                dif = preds.copy()
                for track in list(dif)[1:]:
                    dif[track] = stats.zscore(dif[track])

                    # z scores for kmer and null
                    # (kmer_z used to carry a trailing comma, which made it a 1-tuple)
                    kmer_z = dif.loc[dif["pair_id"] == kmer, track].iloc[0]
                    null_z = dif.loc[dif["pair_id"] == null, track].iloc[0]

                    # get raw prediction values
                    kmer_pred = preds.loc[preds["pair_id"] == kmer, track].iloc[0]
                    null_pred = preds.loc[preds["pair_id"] == null, track].iloc[0]

                    # controls = every sequence that is neither the kmer nor the nullomer
                    ctrl_pred = preds.loc[~preds["pair_id"].isin([null, kmer]), track]

                    ctrl_pred_mean, ctrl_pred_std = ctrl_pred.mean(), ctrl_pred.std()

                    # if nullomer z score is 2 sd away and differs from the
                    # control mean by more than 0.05
                    if abs(null_z) > 2 and abs(null_pred - ctrl_pred_mean) > 0.05:
                        print(track, null_z, abs(null_pred - ctrl_pred_mean))

                        # NOTE(review): v counts rows written, not plots drawn, so
                        # plots appear only for hits among the first 5 rows -
                        # confirm whether a cap of 5 plots was intended
                        if v < 5:
                            # plot
                            fig, ax = plt.subplots(figsize=(4, 4))

                            # plot distribution of predictions across sequences
                            sns.histplot(preds[track], ax=ax)

                            # mark the nullomer prediction
                            ax.axvline(null_pred, ls="--", c="r")

                            # name the plot
                            ax.set(title=track,
                                   xlabel=f"pred - {COOR}"
                                   )
                            plt.show()
                            plt.close(fig)  # release the figure once shown

                    # write data to file
                    write_info = [
                        COOR, TCOOR,
                        kmer, null,  # pair ids
                        track,  # prediction track
                        str(kmer_pred),
                        str(null_pred),
                        str(ctrl_pred_mean),
                        str(ctrl_pred_std),
                        str(null_z)  # nullomer z-score
                    ]
                    writer.write("\t".join(write_info)+"\n")

                    v += 1

HepG2_Epithelium_Liver|CREB1|ID:46231
HepG2_Epithelium_Liver|CREB1|ID:56047
K562_Erythroblast_Bone_Marrow|H3K27ac|ID:35400
HepG2_Epithelium_Liver|EP300|ID:46225
HepG2_Epithelium_Liver|MAFK|ID:45815
K562_Erythroblast_Bone_Marrow|BACH1|ID:45900
K562_Erythroblast_Bone_Marrow|HMBOX1|ID:64414
K562_Erythroblast_Bone_Marrow|IKZF1|ID:63463


KeyboardInterrupt: 

In [None]:
# show the path of the prediction summary file
print(out)

# evaluate predictions

In [None]:
# columns to load from the prediction summary; the "kmer" and "null" pair-id
# columns of the file are not read
cols = [
    '#coord', "tsvCoor", 'track',
    'predKmer', 'predNull',
    'predCtrl_mean', 'predCtrl_std',
    'difNullKmer',
]

# read the summary and rename the coordinate column to "loci"
odf = (
    pd.read_csv(out, sep='\t', usecols=cols)
    .rename(columns={"#coord": "loci"})
)

In [None]:
# formatting
odf=odf.loc[odf["difNullKmer"]!="difNullKmer"]

odf[['predKmer',
 'predNull',
 'predCtrl_mean',
 'predCtrl_std',
 'difNullKmer']]=odf[['predKmer',
 'predNull',
 'predCtrl_mean',
 'predCtrl_std',
 'difNullKmer']].astype(float)

# label some columns
odf["cl"] = odf["track"].apply(
    lambda x: x.split("|")[0].split("_")[0])
odf["feature"] = odf["track"].apply(lambda x: x.split("|")[1])
odf["dataset_id"] = odf["track"].apply(
    lambda x: x.split("|")[2])

odf
odf.sort_values(by="difNullKmer")#, ascending = False)

In [None]:
# one kmer-vs-nullomer scatter plot per feature
features = ["H3K4me3", "H3K27ac", "H3K4me1", "H3K27me3"]
for feature in features:

    # rows for this feature only, exact duplicate rows removed
    is_feature = odf["feature"] == feature
    promoter = odf.loc[is_feature].drop_duplicates()
    print(promoter.shape)

    # kmer prediction (x) against nullomer prediction (y), colored by cell line
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.scatterplot(
        data=promoter,
        x="predKmer",
        y='predNull',
        hue="cl",
        alpha=0.2,
        ax=ax,
    )
    ax.set(title=feature)
    plt.show()

# add nullomer sequences to mix

In [None]:
# The scatter plot built from this sample is titled "H3K4me3", so select that
# feature explicitly instead of relying on `promoter`, which after the
# per-feature plotting loop holds whichever feature was plotted last.
promoter_h3k4me3 = odf.loc[odf["feature"] == "H3K4me3"].drop_duplicates()

# 1% subsample; fixed seed so the same rows are drawn on every run
test = promoter_h3k4me3.sample(frac=0.01, random_state=0)
print(test.shape)
test.head()

In [None]:
# Build the file name from MER / NMUTS (same naming scheme as the prediction
# chunk files) instead of hard-coding 14mers / 2mut; with MER=14, NMUTS=2 the
# path is unchanged.
# NOTE(review): this replaces the TSV value read from config.ini ("tsv_fo-true")
# - confirm whether the config entry points at this same file
TSV = f'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/key_common.{MER}mers.{NMUTS}mut.nulls.fo.pam.tsv'

tsv = pd.read_csv(TSV, sep='\t')

tsv.head()

In [None]:
# last two characters of each matched sequence
tsv['ends'] = tsv["matched_seq"].apply(lambda seq: seq[-2:])

# flag sequences ending in GG or CC, then keep only those rows
tsv['NGG'] = tsv["ends"].isin(['GG', "CC"])
tsv = tsv.loc[tsv["NGG"]]

In [None]:
# number of rows without / with a GG-or-CC ending
# NOTE(review): the preceding cell already keeps only NGG == True rows, so when
# the cells run in order the first count is 0 - the remark below presumably
# describes the table before that filter; confirm
len(tsv.loc[tsv["NGG"]==False]), len(tsv.loc[tsv["NGG"]==True])
# lots of NGG false because kmers that do mutate to NGG can also mutate to other nullomers (not NGG nullomers.)

In [None]:
# keep only the rows whose id contains 'null'
tsv = tsv.loc[tsv["id"].str.contains('null')]
tsv.head()

In [None]:
# loci present in both the nullomer table and the sampled predictions
i = set(tsv["loci"]) & set(test["loci"])

# merge null sequence data to prediction data
# no key is given, so pandas joins on the columns the two frames share
null_seqs = tsv.loc[tsv["loci"].isin(i), ["loci", "matched_seq"]]
plot = pd.merge(test, null_seqs, how="left")
plot.head()

In [None]:
# kmer prediction (x) against nullomer prediction (y), colored by nullomer sequence
fig, ax = plt.subplots(figsize=(6, 6))

sns.scatterplot(
    data=plot,
    x="predKmer",
    y='predNull',
    hue="matched_seq",
    ax=ax,
)
ax.set(title="H3K4me3")

# dashed diagonal from the lower-left to the upper-right corner of the axes
# NOTE(review): drawn in axes coordinates, so it matches y = x only when both
# axes span the same data range - confirm that is what is wanted
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="grey")
#ax.legend(bbox_to_anchor=(1,1))

In [None]:
# rows whose nullomer z-score column exceeds 1 in absolute value, duplicates removed
large_z = odf["difNullKmer"].abs() > 1
smalldf = odf.loc[large_z].drop_duplicates()
print(smalldf.shape)

# kmer prediction (x) against nullomer prediction (y)
fig, ax = plt.subplots(figsize=(4, 4))
sns.scatterplot(data=smalldf, x="predKmer", y='predNull', ax=ax)

# dashed corner-to-corner diagonal, drawn in axes coordinates
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="grey")