20240227

SarahFong

Take the summarized peak and count matrix output from DiffBind and:
1. Clean the DiffBind normalized count data (remove bad coordinates).
2. Perform PCA on the ATAC read count data.

In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import glob
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pbt

from scipy import stats
from scipy.stats import zscore, boxcox

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

# load data

In [2]:
# Working directory for the notebook: after the chdir, every relative file
# name used in later cells resolves against PATH.
PATH = "/wynton/group/ahituv/data/US-MPRA/ATAC-seq/Diffbind_results"
os.chdir(PATH)

# functions

## clean bed

In [3]:
def cleanBed(file):
    """Run the external bed-cleaning script on `file` unless a cleaned copy exists.

    Parameters
    ----------
    file : str
        Path to the bed file to clean.

    Returns
    -------
    str
        `file` with its last dot-separated suffix replaced by ".cleaned.bed".
        The path is returned whether or not the script was run or succeeded.
    """
    import subprocess  # local import: only this function needs it

    # name of the outfile: drop the last ".ext" and append ".cleaned.bed"
    outfile = ".".join(file.split('.')[:-1]) + ".cleaned.bed"

    # custom script that removes bad coordinates (start>end, start<0)
    # NOTE(review): the script is assumed to write `outfile` itself -- confirm
    cmd = [
        "python",
        "/wynton/home/ahituv/fongsl/tools/genome/bed_clean.py",
        file,
    ]

    if os.path.exists(outfile):  # already cleaned
        print("already made?", outfile)
    else:
        # Argument list with no shell: paths containing spaces or shell
        # metacharacters reach the script intact instead of being re-split.
        status = subprocess.run(cmd).returncode
        if status != 0:
            # Best-effort as before (no exception), but no longer silent.
            print("bed_clean.py exited with status", status, "for", file)

    return outfile

## compute replicate mean, peak centers

## count pbt obj lines

In [4]:
def pbtCounter(pbt_obj):
    """Tally the records yielded by iterating `pbt_obj`, print the tally, return it."""
    n_records = sum(1 for _ in pbt_obj)
    print(n_records)
    return n_records

## label elements w/ names

In [5]:
def labelDf(df, label_name, label_value):
    """Tag every row of `df` with a label and a per-row label id.

    Writes two columns into `df` itself (no copy is made):
      - "label": the constant `label_value`
      - `label_name`: "<label_value>.<row index>"

    Returns the same, now-modified, dataframe.
    """
    index_as_text = df.index.map(str)

    df["label"] = label_value
    prefix = label_value + "."
    df[label_name] = prefix + index_as_text

    return df

## make fa into pd.df

In [6]:
def fastaDf(file):
    """Read a fasta file into a two-column dataframe.

    Parameters
    ----------
    file : str
        Path to the fasta file; the path is printed before parsing.

    Returns
    -------
    pd.DataFrame
        Columns "seq.id" and "seq", one row per distinct fasta header.
        A fasta with no records gives an empty frame with those two columns.
    """
    print(file)

    # header -> sequence; a repeated header keeps only its last sequence
    seqs = {}
    with open(file, "r") as reader:
        for seqid, seq in SimpleFastaParser(reader):
            seqs[seqid] = seq

    # Name the columns in the constructor: assigning `df.columns` afterwards
    # raises a length-mismatch error when the fasta has no records.
    return pd.DataFrame(list(seqs.items()), columns=["seq.id", "seq"])

## make chromosome list

In [7]:
def chrList():
    """Return chromosome names chr1..chr22 followed by chrX and chrY."""
    autosomes = [f"chr{number}" for number in range(1, 23)]
    sex_chromosomes = ["chrX", "chrY"]
    return autosomes + sex_chromosomes

def filterConc(df, min_conc, col1, col2):
    """Keep rows where `col1` or `col2` is strictly above `min_conc`.

    Duplicate rows are dropped from the result, and the result is a copy, so
    `df` is left untouched. The shapes before and after filtering are printed.
    """
    print("before filter shape", df.shape)

    passes_col1 = df[col1] > min_conc
    passes_col2 = df[col2] > min_conc
    kept = df[passes_col1 | passes_col2].drop_duplicates().copy()

    print("after filter shape", kept.shape)
    return kept

def zscore(df, col):
    """Add a "<col>.zscore" column to `df` (in place) and return `df`.

    The score is (value - column mean) / column std, using pandas'
    `Series.mean()` and `Series.std()` with their default arguments.
    """
    # NOTE: `zscore` is also imported from scipy.stats at the top of the
    # notebook; this definition replaces that name once this cell runs.
    column = df[col]
    center = column.mean()
    spread = column.std()
    df[f'{col}.zscore'] = (column - center) / spread

    return df


In [8]:
def plotHist(df, fdf, CL, col="Conc_Ultrasound"):
    """Overlay histograms of `col` for the unfiltered and filtered tables.

    Parameters
    ----------
    df : pd.DataFrame
        Unfiltered table; must contain `col`.
    fdf : pd.DataFrame
        Filtered table; must contain `col`.
    CL : str
        Used as the plot title.
    col : str, optional
        Column to histogram. Defaults to "Conc_Ultrasound".
    """
    fig, ax = plt.subplots(figsize=(6, 6))

    # Draw on `ax` explicitly rather than relying on pyplot's current-axes state.
    sns.histplot(x=col, data=df, label="unfiltered", ax=ax)
    sns.histplot(x=col, data=fdf, label="filtered", ax=ax)

    ax.set(title=CL)
    ax.legend()

In [14]:
def encodeIntersection(cl, df, dif_bed, fdf):
    """Count overlaps between ATAC peak sets and ENCODE annotation bed files.

    Four peak sets are counted, first on their own and then after intersecting
    with each recognised annotation file in the cell line's ENCODE directory:
    all peaks (`df`), filtered peaks (`fdf`), diff accessible peaks
    (`dif_bed`), and diff accessible peaks that intersect the filtered peaks.

    Parameters
    ----------
    cl : str
        Cell line key: 'hepg2', 'k562', 'hob' or 'bj'. Other values raise KeyError.
    df : pd.DataFrame
        All peaks; converted with `BedTool.from_dataframe`.
    dif_bed : str
        Path to the bed file of diff accessible peaks.
    fdf : pd.DataFrame
        Filtered peaks; converted with `BedTool.from_dataframe`.

    Returns
    -------
    pd.DataFrame
        A "total.atac" row followed by one row per matched annotation, with
        columns cl, annotation, N_cCRE, N_allAcc, N_filterAcc, N_diffAcc,
        N_filterDiffAcc.
    """
    ENCODE_CCREs = {'hepg2': "/wynton/group/ahituv/data/encode/hepg2",
                    "k562": "/wynton/group/ahituv/data/encode/k562",
                    "hob": "/wynton/group/ahituv/data/encode/MG63",  # imperfect match, human osteosarcoma - https://www.sigmaaldrich.com/US/en/product/sigma/cb_86051601?utm_source=google&utm_medium=cpc&utm_campaign=8906396346&utm_content=88453247983&gclid=Cj0KCQiArrCvBhCNARIsAOkAGcVymGqSXznvXF-1hmqzVx8mK0z7eopHlS356QdcjYOyInUuoAHrmHEaAvWHEALw_wcB
                    "bj": "/wynton/group/ahituv/data/encode/bj"
                    }

    ENCODE_PATH = ENCODE_CCREs[cl]
    annots = glob.glob(os.path.join(ENCODE_PATH, "*.bed"))

    b = pbt.BedTool.from_dataframe(df)
    d = pbt.BedTool(dif_bed)
    f = pbt.BedTool.from_dataframe(fdf)
    # NOTE(review): intersect(wa=True) without u=True emits one record per
    # overlapping pair, so a peak overlapping several features is counted once
    # per overlap (here and in the per-annotation loop) -- confirm intended.
    dxf = d.intersect(f, wa=True)  # evaluate the filtered diff accessible elements

    # pbtCounter prints each count; order of the prints is b, d, f, dxf
    df_count, difacc_count, fdf_count, fdifacc_count = (
        pbtCounter(b), pbtCounter(d), pbtCounter(f), pbtCounter(dxf))

    # Annotation labels to report, matched against each file name up to its
    # first ".". The previous list repeated "dELS" and then had a bare
    # "CTCF-bound"; by parallel with the PLS/pELS pairs the intended label
    # appears to be "dELS,CTCF-bound", so it is now included. "CTCF-bound" is
    # kept so that anything matched before is still matched.
    reg_annots = {
        "CTCF-only,CTCF-bound",
        "PLS", "PLS,CTCF-bound",
        "pELS", "pELS,CTCF-bound",
        "dELS", "dELS,CTCF-bound",
        "CTCF-bound",
    }

    # keyed by annotation label, so a label seen twice keeps only its last row
    results = {}

    results["totals"] = [cl, "total.atac", None, df_count, fdf_count, difacc_count, fdifacc_count]

    for a in annots:

        annot = os.path.basename(a).split(".")[0]

        if annot not in reg_annots:
            continue

        # per annotation
        print("\n\n", cl,  annot)
        encode_bed = pbt.BedTool(a)
        encode_count = pbtCounter(encode_bed)

        result_count_vector = [cl, annot, encode_count]

        # per dataset; this order must line up with the N_* columns below
        for i in [b, f, d, dxf]:
            ixe = i.intersect(encode_bed, wa=True)
            result_count_vector.append(pbtCounter(ixe))

        results[annot] = result_count_vector

    colnames = ["cl", "annotation", "N_cCRE", "N_allAcc", "N_filterAcc", "N_diffAcc", "N_filterDiffAcc"]
    return pd.DataFrame(list(results.values()), columns=colnames)

# Main

## filter peaks by concentration and intersect with ENCODE annotations

In [15]:
# params
MIN_CONC = 3  # threshold passed to filterConc (a row is kept when either column exceeds it)
CLS = ['hepg2', 'k562', "hob", "bj"]

# cell line -> table returned by encodeIntersection
cl_results = {}
for CL in CLS:
    print(CL)

    ALL = f"{CL}_deseq2-nondiff.csv"
    # os.path.splitext removes only the ".csv" suffix. str.strip(".csv") strips
    # any of the characters ".", "c", "s", "v" from BOTH ends of the name, which
    # would mangle a cell-line name that starts or ends with one of them.
    ALL_BED = os.path.splitext(ALL)[0] + ".bed"

    # DESEQ2 diff acc
    DIFF = f"{CL}_deseq2.csv"
    DIFF_BED = os.path.splitext(DIFF)[0] + ".bed"

    DIFF_IN_ALL = f"{CL}_deseq2-nondiff.in.diffAcc.bed"

    # keep only rows whose seqnames is one of chr1-22, chrX, chrY
    df = pd.read_csv(ALL)
    df = df.loc[df["seqnames"].isin(chrList())]

    fdf = filterConc(df, MIN_CONC, "Conc_Ultrasound", "Conc_Control")

    #plotHist(df, fdf, CL)

    # intersect w/ encode

    cl_results[CL] = encodeIntersection(CL, df, DIFF_BED, fdf)


hepg2
before filter shape (127857, 11)
after filter shape (49249, 11)
127857
1916
49249
777


 hepg2 CTCF-only,CTCF-bound
20944
12332
6072
194
140


 hepg2 dELS
25435
15164
11296
196
149


 hepg2 PLS,CTCF-bound
2377
2017
1972
12
12


 hepg2 pELS
16656
5931
5331
50
42


 hepg2 PLS
13597
10324
10085
58
56


 hepg2 pELS,CTCF-bound
2613
1226
1159
7
6
k562
before filter shape (102300, 11)
after filter shape (82505, 11)
102300
890
82505
655


 k562 pELS,CTCF-bound
6580
2207
2206
1
0


 k562 CTCF-only,CTCF-bound
27649
19741
18179
130
116


 k562 PLS,CTCF-bound
6568
5146
5144
10
10


 k562 pELS
33276
6502
6461
31
31


 k562 PLS
12752
7926
7920
20
19


 k562 dELS
30119
16034
15794
154
150
hob
before filter shape (159246, 11)
after filter shape (124148, 11)
159246
2324
124148
2218


 hob PLS
18798
13636
13560
567
567
bj
before filter shape (117245, 11)
after filter shape (67339, 11)
117245
433
67339
242


 bj PLS,CTCF-bound
3835
3345
3330
12
12


 bj PLS
9445
7770
7680
23
23


In [18]:
# Stack the per-cell-line tables, then add the fraction of counts that remain
# after filtering (all peaks, and diff accessible peaks).
results = pd.concat(cl_results.values()).assign(
    frac_filtered=lambda tbl: tbl['N_filterAcc'] / tbl["N_allAcc"],
    frac_filtered_diff=lambda tbl: tbl['N_filterDiffAcc'] / tbl["N_diffAcc"],
)
results.round(2)

Unnamed: 0,cl,annotation,N_cCRE,N_allAcc,N_filterAcc,N_diffAcc,N_filterDiffAcc,frac_filtered,frac_filtered_diff
0,hepg2,total.atac,,127857,49249,1916,777,0.39,0.41
1,hepg2,"CTCF-only,CTCF-bound",20944.0,12332,6072,194,140,0.49,0.72
2,hepg2,dELS,25435.0,15164,11296,196,149,0.74,0.76
3,hepg2,"PLS,CTCF-bound",2377.0,2017,1972,12,12,0.98,1.0
4,hepg2,pELS,16656.0,5931,5331,50,42,0.9,0.84
5,hepg2,PLS,13597.0,10324,10085,58,56,0.98,0.97
6,hepg2,"pELS,CTCF-bound",2613.0,1226,1159,7,6,0.95,0.86
0,k562,total.atac,,102300,82505,890,655,0.81,0.74
1,k562,"pELS,CTCF-bound",6580.0,2207,2206,1,0,1.0,0.0
2,k562,"CTCF-only,CTCF-bound",27649.0,19741,18179,130,116,0.92,0.89


# filtering results

In [19]:
# one "total.atac" row per cell line
results[results["annotation"].eq("total.atac")].round(2)

Unnamed: 0,cl,annotation,N_cCRE,N_allAcc,N_filterAcc,N_diffAcc,N_filterDiffAcc,frac_filtered,frac_filtered_diff
0,hepg2,total.atac,,127857,49249,1916,777,0.39,0.41
0,k562,total.atac,,102300,82505,890,655,0.81,0.74
0,hob,total.atac,,159246,124148,2324,2218,0.78,0.95
0,bj,total.atac,,117245,67339,433,242,0.57,0.56


In [20]:
# rows whose annotation contains "LS", ordered by annotation
results[results["annotation"].str.contains("LS")].sort_values("annotation").round(2)

Unnamed: 0,cl,annotation,N_cCRE,N_allAcc,N_filterAcc,N_diffAcc,N_filterDiffAcc,frac_filtered,frac_filtered_diff
5,hepg2,PLS,13597.0,10324,10085,58,56,0.98,0.97
5,k562,PLS,12752.0,7926,7920,20,19,1.0,0.95
1,hob,PLS,18798.0,13636,13560,567,567,0.99,1.0
2,bj,PLS,9445.0,7770,7680,23,23,0.99,1.0
3,hepg2,"PLS,CTCF-bound",2377.0,2017,1972,12,12,0.98,1.0
3,k562,"PLS,CTCF-bound",6568.0,5146,5144,10,10,1.0,1.0
1,bj,"PLS,CTCF-bound",3835.0,3345,3330,12,12,1.0,1.0
2,hepg2,dELS,25435.0,15164,11296,196,149,0.74,0.76
6,k562,dELS,30119.0,16034,15794,154,150,0.99,0.97
4,hepg2,pELS,16656.0,5931,5331,50,42,0.9,0.84


# PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# NOTE(review): `ctrl` and `us` are not defined in any visible cell above; they
# must be created before this cell can run on a fresh kernel.
# Join the two tables on "coor". Both sides carry rep1/rep2, so pandas applies
# its default suffixes: _x for the left (ctrl) table and _y for the right (us).
values = pd.merge(ctrl[["coor", "rep1", "rep2", ]], us[["rep1", "rep2", "coor"]], left_on="coor", right_on="coor")
values

In [None]:
# PCA input: the four replicate columns from the merge (_x = left table, _y = right table)
X = values[["rep1_x", "rep2_x", "rep1_y", "rep2_y"]]#.pivot(columns = 'coor', values=['rep1', 'rep2'])

In [None]:
# Fit a 4-component PCA (as many components as X has columns) and project the rows of X
pca = PCA(n_components=4)
X_ = pca.fit_transform(X)

# fraction of the total variance attributed to each component
print(pca.explained_variance_ratio_) 

In [None]:
# display the PCA-transformed values returned by fit_transform
X_