20240227

SarahFong

Take the summarized peak and count matrix output by DiffBind and:
1. clean the DiffBind-normalized count data (remove bad coordinates)
2. perform PCA on the ATAC read count data.

In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pbt

from scipy import stats
from scipy.stats import zscore, boxcox

import seaborn as sns

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# that can point at real problems in the cells below.
warnings.filterwarnings('ignore')

# project-local plotting helper; fonts() presumably sets the matplotlib font
# defaults (the recorded output is ('sans-serif', 'Arial', 18)) - confirm in plot_params
import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

# load data

In [2]:
# every relative path below is resolved against this working directory
PATH = "/wynton/group/ahituv/data/US-MPRA/ATAC-seq"
os.chdir(PATH)

# params
CL = 'hepg2'  # cell-line prefix shared by the file names below
ALL = f"{CL}_counts.txt"
CONTROL = f"{CL}c_counts.txt"
TREATED = f"{CL}t_counts.txt"

# DESEQ2 information
DIFF = f'./diffbind_results/{CL}_deseq2.csv'
# Swap the extension with splitext. str.strip(".csv") removes a *set of characters*
# from both ends (it also ate the leading "." of "./"), not the ".csv" suffix,
# and only produced a usable path by accident.
DIFF_BED = os.path.splitext(DIFF)[0] + ".bed"

# peak information
PEAK_SIZE = 270

# genome information
HG38 = "/wynton/group/ahituv/data/dna/hg38/hg38.chrom.sizes"
FA_HG38 = "/wynton/group/ahituv/data/dna/hg38/hg38.fa"

# legnet OUTPUT files
FULL = f"{CL}.centered.coor.mean.reads.full.tsv"
HELD_OUT = f"{CL}.heldoutchromosomes.tsv"
HELD_OUT_FA = f"{CL}.heldoutchromosomes.fa"
TRAINING = f"{CL}.training.tsv"

# functions

## clean bed

In [3]:
def cleanBed(file):
    """Run the external bed-cleaning script on `file` unless its output already exists.

    inputs
        file (str) - path to the file to clean

    method
        1. build the expected output name: everything before the last "." of
           `file`, followed by ".cleaned.bed"
        2. if that output does not exist yet, run bed_clean.py on `file`
           (custom script that removes bad coordinates: start>end, start<0);
           presumably the script writes to the same output name - confirm
        3. otherwise print that the output is already there

    return
        outfile (str) - expected path of the cleaned bed file
    """
    import subprocess  # local import: only this function needs it

    # name of the outfile
    outfile = ".".join(file.split('.')[:-1]) + ".cleaned.bed"

    # Argument list instead of a shell string: the path is handed to the
    # interpreter as-is, so spaces or shell metacharacters in `file` are safe.
    cmd = [
        "python",
        "/wynton/home/ahituv/fongsl/tools/genome/bed_clean.py",
        file,
    ]

    if not os.path.exists(outfile):  # not already cleaned
        try:
            completed = subprocess.run(cmd)
        except OSError as err:
            # stay best-effort like os.system did, but say what went wrong
            print("could not run bed_clean.py:", err)
        else:
            if completed.returncode != 0:
                print("bed_clean.py exited with status",
                      completed.returncode, "for", file)
    else:
        print("already made?", outfile)

    return outfile

## compute replicate mean, peak centers

In [4]:
def openDfComputeMean(file, cl, treatment):
    """ compute mean of normalized read counts per cell line, treatment
        inputs
            file (str) - path to a headerless, tab-separated file to read:
                chr, start, end, then one column per replicate (up to 3)
            cl (str) - name of cell line
            treatment(bool) - treated or not treated readcount values

        method
            1. open dataframe, name columns
            2. find the replicate columns included in the dataframe
            3. drop rows with a zero replicate read count or any missing value
            4. compute the mean and std of the normalized read counts of the
                replicates, print correlation
            5. annotate with cl and treatment metadata, length, midpoints
            6. write file (input name with a trailing ".bed" replaced
                by ".sum.bed")

        return
            (pd dataframe, str) - annotated dataframe and the path it was written to

        raises
            ValueError - when the file has no replicate column

    """
    # Replace the ".bed" suffix explicitly. str.strip(".bed") removes a set of
    # characters, so "x.cleaned.bed" used to become "x.clean.sum.bed";
    # it is now "x.cleaned.sum.bed" (update any reader of the old name).
    suffix = ".bed"
    stem = file[:-len(suffix)] if file.endswith(suffix) else file
    outfile = stem + ".sum.bed"

    # 1 load and name columns (read the `file` argument, not a notebook global)
    col_names = ["#chr", "start", "end", "rep1", "rep2", "rep3"]
    df = pd.read_csv(file, sep='\t',
                     header=None,
                     )
    df.columns = col_names[:df.shape[1]]

    # 2 replicate columns actually present (rep1, rep2 and optionally rep3)
    rep_cols = [col for col in df.columns if col.startswith("rep")]
    if not rep_cols:
        raise ValueError(f"no replicate columns found in {file}")

    # 3 drop zeros: only replicate counts are tested, so a region that
    # starts at coordinate 0 is kept; rows with missing values are still dropped
    before = df.shape[0]
    nonzero = (df[rep_cols] != 0).all(axis=1)
    df = df.loc[nonzero].dropna().copy()  # copy: columns are added below
    print("before zero drop", before, "after dropping zeros", df.shape)

    # 4 correlation between replicates, then mean and std across them
    print("rep normalized ATAC readcount correlation", cl,
          'treated', treatment, "\n\n", df[rep_cols].corr())
    df[rep_cols] = df[rep_cols].astype(float)

    df["norm.readcount.mean"] = df[rep_cols].mean(axis=1)
    df["norm.readcount.std"] = df[rep_cols].std(axis=1)

    # 5 annotate metadata
    df["cl"] = cl
    df["treatment"] = treatment

    df['len'] = df["end"] - df["start"]  # length of element
    df["start_midpoint"] = df["start"] + df["len"]/2  # element midpoint
    # element end midpoint (remember, zero-start, half open)
    df["end_midpoint"] = df["start_midpoint"] + 1
    # cast after the +1 so both midpoints are truncated the same way
    df[["start_midpoint", "end_midpoint"]] = df[[
        "start_midpoint", "end_midpoint"]].astype(int)

    # 6 write
    df.to_csv(outfile, sep='\t', index=False)

    return df, outfile

## count pbt obj lines

In [5]:
def pbtCounter(pbt_obj):
    """Iterate once over `pbt_obj` (a pybedtools object), print and return the item count."""

    n_lines = sum(1 for _ in pbt_obj)
    print(n_lines)

    return n_lines

## label elements w/ names

In [6]:
def labelDf(df, label_name, label_value):
    """Add a constant "label" column and an index-numbered label column, in place.

    inputs
        df (pd dataframe) - dataframe to annotate (modified in place)
        label_name (str) - name of the column that receives "<label_value>.<index>"
        label_value (str) - label text

    return
        the same dataframe object, with the two columns added
    """
    # constant label for every row
    df["label"] = label_value

    # per-row label: label text, a dot, then the row's index value as a string
    index_as_text = df.index.map(str)
    df[label_name] = label_value + "." + index_as_text

    return df

## make fa into pd.df

In [7]:
def fastaDf(file):
    """ turn fasta file into pd dataframe

        inputs
            file (str) - path to the fasta file

        return
            pd dataframe with columns "seq.id" (fasta title line) and "seq",
            one row per distinct sequence id; an empty fasta gives an empty
            dataframe that still has both columns
    """
    print(file)

    # parse open fasta file; keyed on sequence id, so a repeated id keeps
    # only its last sequence
    with open(file, "r") as reader:
        id_to_seq = dict(SimpleFastaParser(reader))

    # Name the columns in the constructor: assigning df.columns afterwards
    # raised a length-mismatch ValueError when the fasta had no records.
    return pd.DataFrame(list(id_to_seq.items()), columns=["seq.id", "seq"])

## make chromosome list

In [8]:
def chrList():
    """return list of chromosome names: chr1..chr22, then chrX and chrY"""

    autosomes = [f"chr{number}" for number in range(1, 23)]
    sex_chromosomes = ["chrX", "chrY"]

    return autosomes + sex_chromosomes

# Main

## clean bed files

In [9]:
# clean both files
cleaned = []
for file in [CONTROL, TREATED]:
    cleaned.append(cleanBed(file))

# make variables from cleaned list
CLEAN_CONTROL, CLEAN_TREATED = cleaned

already made? hepg2c_counts.cleaned.bed
already made? hepg2t_counts.cleaned.bed


In [10]:
results = {}       # cleaned file path -> annotated dataframe
summary_beds = []  # paths written by openDfComputeMean

# Pair each cleaned file with its treatment flag explicitly; guessing it from a
# "t_" substring breaks as soon as a directory or file name contains "t_".
# NOTE(review): keep this loop variable named `file_clean` unless
# openDfComputeMean is confirmed to read only its `file` argument.
for file_clean, treatment in [(CLEAN_CONTROL, False), (CLEAN_TREATED, True)]:
    df, outfile = openDfComputeMean(file_clean, CL, treatment)
    results[file_clean] = df
    summary_beds.append(outfile)

# stack control and treated rows, then preview them in genomic order
df = pd.concat(results.values())
df.sort_values(by=['#chr', "start", "end"]).head()

before zero drop 67447 after dropping zeros (67131, 5)
rep normalized ATAC readcount correlation hepg2 treated False 

           rep1      rep2
rep1  1.000000  0.968764
rep2  0.968764  1.000000
before zero drop 57397 after dropping zeros (57101, 5)
rep normalized ATAC readcount correlation hepg2 treated True 

           rep1      rep2
rep1  1.000000  0.977277
rep2  0.977277  1.000000


Unnamed: 0,#chr,start,end,rep1,rep2,norm.readcount.mean,norm.readcount.std,cl,treatment,len,start_midpoint,end_midpoint
0,chr1,9902,10302,17.945055,20.89936,19.422208,2.08901,hepg2,True,400,10102,10103
0,chr1,9906,10306,15.40702,37.819709,26.613365,15.848164,hepg2,False,400,10106,10107
1,chr1,180689,181089,34.86852,35.211454,35.039987,0.242491,hepg2,False,400,180889,180890
2,chr1,180693,181093,16.889463,20.89936,18.894412,2.835425,hepg2,True,400,180893,180894
2,chr1,598813,599213,3.243583,3.912384,3.577983,0.472913,hepg2,False,400,599013,599014


In [11]:
# Load the tab-separated table named by FULL. This rebinds `df`, replacing the
# per-treatment table concatenated in the previous cell.
df = pd.read_csv(FULL, sep='\t')

In [12]:
# preview the first rows of `df`
df.head()

Unnamed: 0,#chr,start_trim,end_trim,type,ctrl,US,seq.id,seq
0,chr1,598878,599149,ctrl-only.0,3.577983,0.0,chr1:598878-599149,gtgtgtgatgttctctctgattacattggaactgtgcgtttgcgga...
1,chr1,610594,610865,ctrl-only.1,2.273856,0.0,chr1:610594-610865,TCTGTGGCCAGCAGGCGGCGCTGCAGGAGAGGAGATGCCCAGGCCT...
2,chr1,1185478,1185749,ctrl-only.10,6.098455,0.0,chr1:1185478-1185749,GTCCTGGAGCAGCAGCAGCTGCCCGTGCAGGCCCGGACTCTCCCTA...
3,chr1,5652318,5652589,ctrl-only.100,7.561415,0.0,chr1:5652318-5652589,AGTCCACAAGAAGGCAGGAAAACTGAGCTCCTTCCTCCTGGAGCGC...
4,chr1,59422999,59423270,ctrl-only.1000,7.966863,0.0,chr1:59422999-59423270,GTTTGGATACTATTGTGCCAGGCAGCCTTTCCGAACACTCCAGCCT...


In [13]:
# (rows, columns) of `df`
df.shape

(82515, 8)

# data normalization

In [14]:
def zscore(df, col):
    """Add a "<col>.zscore" column to `df` in place and return `df`.

    The score is (value - column mean) / column standard deviation, using
    pandas' default `Series.std()`.

    Note: this definition shadows the `zscore` imported from scipy.stats at
    the top of the notebook.
    """
    center = df[col].mean()
    spread = df[col].std()
    df[f'{col}.zscore'] = (df[col] - center)/spread

    return df


## keep only the interquartile range of the data (between the 25th and 75th percentiles)

In [15]:
# quantile 
print(df["ctrl"].quantile(0.75))
df = df.loc[(df["ctrl"]< df["ctrl"].quantile(0.75) )&
           (df["ctrl"]>df["ctrl"].quantile(0.25))]

# log2
df["ctrl.log2"] = np.log2(df["ctrl"])

df = zscore(df, "ctrl")
df = zscore(df, "ctrl.log2")

27.63740371983405


# data transformations

## no transformation 
- plain read counts

In [None]:
# histogram of the untransformed "ctrl" column of `df`
sns.histplot(x="ctrl", data=df)

## log2 

In [None]:
# NOTE(review): `us` and its "rep1.log2" column are not created anywhere in the
# visible part of this notebook, so this cell fails on a fresh kernel - confirm
# where they are supposed to come from.
sns.histplot(x="rep1.log2", data=us)

## z score of read norm counts

In [None]:
# NOTE(review): `us` and its "rep1.zscore" column are not created anywhere in
# the visible part of this notebook - confirm where they are supposed to come from.
sns.histplot(x="rep1.zscore", data=us)

## log2 zscore

In [None]:
# NOTE(review): `us` and its "rep1.log2.zscore" column are not created anywhere
# in the visible part of this notebook - confirm where they are supposed to come from.
sns.histplot(x="rep1.log2.zscore", data=us)

## box cox

In [None]:
# Box-Cox transform of "rep1" with a fixed lambda of 0.5.
# NOTE(review): `us` is not defined anywhere in the visible part of this
# notebook - confirm where it is supposed to come from.
us['rep1.boxcox'] = boxcox(us["rep1"], 0.5)  # 0 =log transformation, 0.5=sqrt transformation

sns.histplot(x="rep1.boxcox", data=us)

In [None]:
# plain square-root transform of "rep1", plotted for comparison
# NOTE(review): `us` is not defined anywhere in the visible part of this notebook.
us['rep1.sqrt'] = np.sqrt(us["rep1"])

sns.histplot(x="rep1.sqrt", data=us)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Join control and US replicate columns on the shared "coor" key; overlapping
# column names get pandas' default "_x" (ctrl) / "_y" (us) suffixes.
# NOTE(review): `ctrl` and `us` are not defined anywhere in the visible part of
# this notebook - confirm where they are supposed to come from.
values = ctrl[["coor", "rep1", "rep2"]].merge(us[["rep1", "rep2", "coor"]], on="coor")
values

In [None]:
# feature matrix for the PCA: the four replicate columns of the merged table
# ("_x" columns come from the left frame of the merge, "_y" from the right)
X = values[["rep1_x", "rep2_x", "rep1_y", "rep2_y"]]#.pivot(columns = 'coor', values=['rep1', 'rep2'])

In [None]:
# fit a 4-component PCA on X (as many components as X has columns) and
# project X onto the components
pca = PCA(n_components=4)
X_ = pca.fit_transform(X)

# fraction of the total variance explained by each component
print(pca.explained_variance_ratio_) 

In [None]:
# PCA-projected values returned by fit_transform
# NOTE(review): this displays the whole array; consider showing only a few rows
X_