<a href="https://colab.research.google.com/github/semenko/liquid-cell-atlas/blob/main/Correlation_Matrix_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyBigWig pybedtools gunzip bedparse deeptools pyGenomeTracks
!apt install bedtools

In [None]:
import pandas as pd
import io
import itertools
import numpy as np
from tqdm import tqdm
import csv
import os as os
import urllib
import pickle
import json
import pyBigWig
import pybedtools
import sys

In [None]:
! wget 'http://dcc.blueprint-epigenome.eu/data/blueprint_files.tsv' -N 
data_tsv = pd.read_csv('blueprint_files.tsv', sep='\t')

noDisease_bw_data = data_tsv[(data_tsv['Disease'] == 'None') & 
                             (data_tsv['Format'] == 'bigWig') & 
                             (data_tsv['Experiment'] == 'Bisulfite-Seq')]

In [None]:
! wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/gencode.v40.annotation.gtf.gz
! gunzip gencode.v40.annotation.gtf.gz
! bedparse gtf2bed <gencode.v40.annotation.gtf> output.bed --extraFields gene_id,gene_name
output_bed = pybedtools.BedTool("output.bed")

os.remove("gencode.v40.annotation.gtf")

In [None]:
# Load the table written by bedparse: twelve BED columns followed by the
# gene_id / gene_name extra fields requested from gtf2bed.
gene_loc = pd.read_csv(
    "output.bed",
    sep="\t",
    names=["chrom", "start", "end", "name", "e1", "strand", "e2", "e3", "e4",
           "e5", "e6", "e7", "gene_id", "gene_name"],
)
gene_loc = gene_loc[["chrom", "start", "end", "strand", "gene_id", "gene_name", "name"]]
# Records on chrM are excluded from the promoter set.
gene_loc = gene_loc[gene_loc["chrom"] != "chrM"]

gene_locs = {}

# Sorted so the list is identical from run to run (set iteration order is not).
gene_names = sorted(set(gene_loc["gene_name"]))

# Collapse the rows to one span per gene in a single pass. Grouping uses gene_id
# rather than gene_name: names are not guaranteed to be unique, and merging two
# same-named genes from different loci would mix their chromosomes/coordinates.
gene_spans = gene_loc.groupby("gene_id", sort=True).agg(
    chrom=("chrom", "first"),
    strand=("strand", "first"),
    start=("start", "min"),
    end=("end", "max"),
)

# Promoter window = the 1000 bases directly upstream of the gene span:
#   '+' strand: [min(start) - 1000, min(start)]
#   otherwise : [max(end),          max(end) + 1000]
on_plus = gene_spans["strand"] == "+"
window_start = (gene_spans["start"] - 1000).where(on_plus, gene_spans["end"])
window_end = gene_spans["start"].where(on_plus, gene_spans["end"] + 1000)
# BED coordinates cannot be negative; clamp windows that run off the chromosome start.
window_start = window_start.clip(lower=0)

# A .bed is just a .tsv with ['chrom', 'chromStart', 'chromEnd'].
# newline="" plus lineterminator="\n" keeps csv's default "\r\n" out of the file.
with open("promoter.bed", "w", newline="") as outfile:
    bed_writer = csv.writer(outfile, delimiter="\t", lineterminator="\n")
    bed_writer.writerows(zip(gene_spans["chrom"], window_start, window_end))

In [None]:
cell_types = noDisease_bw_data['Cell type'].unique()

for cell_type in cell_types:
    track = 0
    for url in noDisease_bw_data[noDisease_bw_data['Cell type'] == cell_type]["URL"]:
        if track == 1:
            break
        ! wget "$url" -q -N
        file_name = url.split("/")[-1]
        new_file_name = cell_type + str(track) + ".bw"
        os.rename(file_name, new_file_name)
        track += 1

In [None]:
for cell_type in cell_types:
    urls = noDisease_bw_data[noDisease_bw_data['Cell type'] == cell_type]["URL"]
    if len(urls) > 2:
        url = urls.iloc[2]
        ! wget "$url" -q -N
        file_name = url.split("/")[-1]
        new_file_name = cell_type + str(1) + ".bw"
        os.rename(file_name, new_file_name)

In [None]:
# Run deepTools' multiBigwigSummary in BED-file mode over every .bw file in the
# working directory (shell glob), using the regions listed in promoter.bed, and
# write the result to double_results.npz.
! multiBigwigSummary BED-file -b *.bw -o double_results.npz --BED promoter.bed

In [None]:
# Draw a Pearson-correlation heatmap from double_results.npz and save it as
# double_PearsonCorr_bigwigScores.png. --skipZeros and --removeOutliers are
# passed through to plotCorrelation; see the deepTools documentation for the
# exact filtering each flag applies.
! plotCorrelation -in double_results.npz --corMethod pearson --skipZeros --whatToPlot heatmap -o double_PearsonCorr_bigwigScores.png --removeOutliers