<a href="https://colab.research.google.com/github/semenko/liquid-cell-atlas/blob/main/data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Necessary Libraries

In [26]:
!pip install pyBigWig pybedtools
!apt install bedtools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
bedtools is already the newest version (2.26.0+dfsg-5).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [27]:
import pandas as pd
from google.colab import files
import io
import json
import itertools
import numpy as np
import altair as alt
from tqdm.notebook import tqdm
import csv
import os
import urllib

import pyBigWig
import pybedtools

# Get TSV of all data from Blueprint, Filter and Download it

In [28]:
# Download the TSV file from http://dcc.blueprint-epigenome.eu/#/files, and upload it here
# file = files.upload()
! wget 'http://dcc.blueprint-epigenome.eu/data/blueprint_files.tsv'
data_tsv = pd.read_csv('blueprint_files.tsv', sep='\t')

--2022-06-30 21:13:46--  http://dcc.blueprint-epigenome.eu/data/blueprint_files.tsv
Resolving dcc.blueprint-epigenome.eu (dcc.blueprint-epigenome.eu)... 193.62.193.83, 193.62.192.83
Connecting to dcc.blueprint-epigenome.eu (dcc.blueprint-epigenome.eu)|193.62.193.83|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4341342 (4.1M) [text/tab-separated-values]
Saving to: ‘blueprint_files.tsv.3’


2022-06-30 21:13:50 (1.15 MB/s) - ‘blueprint_files.tsv.3’ saved [4341342/4341342]



Filtering the tsv file to get rid of individuals with diseases, and only keeping the bigWig file format. We also only keep bisulfite sequencing data.

In [29]:
noDisease_bw_data = data_tsv[(data_tsv['Disease'] == 'None') & 
                             (data_tsv['Format'] == 'bigWig') & 
                             (data_tsv['Experiment'] == 'Bisulfite-Seq')]

Types of cells present in the dataset. For now, I'll choose a macrophage sample, and a plasma cell

In [30]:
cell_types = noDisease_bw_data['Cell type'].unique()
cell_types

array(['band form neutrophil', 'neutrophilic metamyelocyte',
       'neutrophilic myelocyte', 'segmented neutrophil of bone marrow',
       'hematopoietic multipotent progenitor cell', 'precursor B cell',
       'precursor lymphocyte of B lineage', 'plasma cell',
       'mature neutrophil', 'CD38-negative naive B cell',
       'CD14-positive, CD16-negative classical monocyte',
       'CD8-positive, alpha-beta T cell',
       'cytotoxic CD56-dim natural killer cell',
       'inflammatory macrophage', 'erythroblast',
       'CD34-negative, CD41-positive, CD42-positive megakaryocyte cell',
       'macrophage', 'endothelial cell of umbilical vein (proliferating)',
       'endothelial cell of umbilical vein (resting)',
       'alternatively activated macrophage',
       'conventional dendritic cell',
       'CD3-negative, CD4-positive, CD8-positive, double positive thymocyte',
       'CD3-positive, CD4-positive, CD8-positive, double positive thymocyte',
       'CD4-positive, alpha-beta thym

Get example of macrophage and plasma data, and extract its url.

In [31]:
cell_type_1 = 'macrophage'
cell_type_2 = 'plasma cell'

assert cell_type_1 in cell_types, "{} not a valid cell type".format(cell_type_1)
assert cell_type_2 in cell_types, "{} not a valid cell type".format(cell_type_2)

cell1_data = noDisease_bw_data[noDisease_bw_data['Cell type'] == cell_type_1]
cell2_data = noDisease_bw_data[noDisease_bw_data['Cell type'] == cell_type_2]

cell1_call = cell1_data.iloc[0]
cell1_cov = cell1_data.iloc[1]
cell2_call = cell2_data.iloc[0]
cell2_cov = cell2_data.iloc[1]

cell1_call_url = cell1_call['URL']
cell1_cov_url = cell1_cov['URL']
cell2_call_url = cell2_call['URL']
cell2_cov_url = cell2_cov['URL']

cell1_call_filename = cell1_call_url.split("/")[-1]
cell1_cov_filename = cell1_cov_url.split("/")[-1]
cell2_call_filename = cell2_call_url.split("/")[-1]
cell2_cov_filename = cell2_cov_url.split("/")[-1]

Downloading the data at the four URLs! This will take around 10 minutes.

In [32]:
!wget '$cell1_call_url'
!wget '$cell1_cov_url'
!wget '$cell2_call_url'
!wget '$cell2_cov_url'

--2022-06-30 21:13:51--  http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/cord_blood/S00BHQ/macrophage/Bisulfite-Seq/CNAG/S00BHQ51.CPG_methylation_calls.bs_call.GRCh38.20160531.bw
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.138|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 125016699 (119M) [application/octet-stream]
Saving to: ‘S00BHQ51.CPG_methylation_calls.bs_call.GRCh38.20160531.bw.1’


2022-06-30 21:15:26 (1.26 MB/s) - ‘S00BHQ51.CPG_methylation_calls.bs_call.GRCh38.20160531.bw.1’ saved [125016699/125016699]

--2022-06-30 21:15:26--  http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/cord_blood/S00BHQ/macrophage/Bisulfite-Seq/CNAG/S00BHQ51.CPG_methylation_calls.bs_cov.GRCh38.20160531.bw
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.138|:80... connected.
HTTP request sent, awaiting re

In [33]:
# Annotations & bs_cov / bs_call names
CELL_TYPE_TO_FILE_ID = {
    "Macrophage": ["S00BHQ51"],
    "Plasma_cell": ["G202"]
}

# Reverse mapping of file id -> cell type
# e.g.  'S01BHIA1': 'Monocyte'
FILE_ID_TO_CELL_TYPE = {sample:cell_type for cell_type, sample_list in CELL_TYPE_TO_FILE_ID.items() for sample in sample_list}

# If the blueprint dict changes, we need to replace our cache files
# This is a tiny checksum of the dictionary state, which we incorporate into
# our cache filenames below.
CELL_TYPE_DICT_SIG = str(hex(abs(hash(json.dumps(CELL_TYPE_TO_FILE_ID, sort_keys=True))))[2:10])
print(f"Dictionary signature for cache files: {CELL_TYPE_DICT_SIG}\n")


BLUEPRINT_FILEKEYS = list(itertools.chain.from_iterable(CELL_TYPE_TO_FILE_ID.values()))

# Validity testing
# assert all(len(vals) > 1 for vals in CELL_TYPE_TO_FILE_ID.values()), "We need more than one example per cell type."
assert len(BLUEPRINT_FILEKEYS) == len(set(BLUEPRINT_FILEKEYS)), "One filename is duplicated in the cell types"

print(f"Number of Blueprint cell types: {len(CELL_TYPE_TO_FILE_ID.keys())}")
print(f"Number of Blueprint raw files: {len(BLUEPRINT_FILEKEYS)}")

Dictionary signature for cache files: 52eba6c5

Number of Blueprint cell types: 2
Number of Blueprint raw files: 2


# Create shared **cov** map

Load coverage (if >minimum below) across all files, then determine the set() shared across **all** samples.

IF a locus (e.g. "chr1:123") is missing from **one** single sample, it will be **excluded** from our entire analysis.

IF a locus has \<10 reads in **one** single sample, it will be **excluded** from our entire analysis.

In [34]:
CHROMOSOMES = ["chr" + str(i) for i in range(1, 23)] + ["chrX"]
CHROMOSOMES = ["chr1"]
IGNORE_CACHE = False

BLUEPRINT_CPG_COV_MINIMUM = 10

RUN_SIGNATURE = f"{BLUEPRINT_CPG_COV_MINIMUM}_{CELL_TYPE_DICT_SIG}"

assert type(BLUEPRINT_CPG_COV_MINIMUM) is int
assert BLUEPRINT_CPG_COV_MINIMUM > 0

print(f"Minimum Blueprint coverage limit: {BLUEPRINT_CPG_COV_MINIMUM}")
print(f" (CpGs with fewer than {BLUEPRINT_CPG_COV_MINIMUM} reads in *any* sample will be ignored.)\n")

# Our output / save file
INTERSECTED_COVERAGE_BED = f"intersected_bs_cov_min_{RUN_SIGNATURE}.bed"

print(f"Coverage BED: {INTERSECTED_COVERAGE_BED}")

if os.path.exists(INTERSECTED_COVERAGE_BED) and not IGNORE_CACHE:
    print("\tPost-processed cov .bed already exists. (Skipping raw Blueprint bs_cov parsing.)")
else:
    print("\t.bed does not exist yet -- parsing bs_cov .bw files.")
    INTERSECTED_BS_COV_POSITIONS = {}

    for file_key in tqdm(BLUEPRINT_FILEKEYS):
        with pyBigWig.open(file_key + ".CPG_methylation_calls.bs_cov.GRCh38.20160531.bw") as bw_object:
            bw_header = bw_object.header()
            if bw_header['nBasesCovered'] < 1e7 or bw_header['sumData'] < 1e8:
                print('\t\t*** WARNING: Input .bw has few reads or low coverage. This may cause unexpected results, consider removing this file.')
                print(f"\t\tnBasesCovered: {bw_header['nBasesCovered']}, sumData: {bw_header['sumData']}")
            for chrom in CHROMOSOMES:
                current_loop_values = set([start for start, _, cov in bw_object.intervals("chr1") if cov >= BLUEPRINT_CPG_COV_MINIMUM])
                existing_values = INTERSECTED_BS_COV_POSITIONS.get(chrom, current_loop_values)
                INTERSECTED_BS_COV_POSITIONS[chrom] = existing_values.intersection(current_loop_values)
            # print(track)

    # Save this hard work as a .bed for later recovery if needed
    with open(INTERSECTED_COVERAGE_BED, 'w') as outfile:
        # A .bed is just a .tsv with ['chrom', 'chromStart', 'chromEnd']
        bed_writer = csv.writer(outfile, delimiter='\t')
        for chr in CHROMOSOMES:
            for entry in INTERSECTED_BS_COV_POSITIONS[chr]:
                bed_writer.writerow([chr, entry, entry+1])

    print(f"\nWrote data to: {INTERSECTED_COVERAGE_BED}") # Unsorted

Minimum Blueprint coverage limit: 10
 (CpGs with fewer than 10 reads in *any* sample will be ignored.)

Coverage BED: intersected_bs_cov_min_10_52eba6c5.bed
	Post-processed cov .bed already exists. (Skipping raw Blueprint bs_cov parsing.)


In [35]:
! sort -k 1,1 -k2,2n intersected_bs_cov_min_10_52eba6c5.bed

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	21311844	21311845
chr1	21311893	21311894
chr1	21311978	21311979
chr1	21312053	21312054
chr1	21312100	21312101
chr1	21312181	21312182
chr1	21312207	21312208
chr1	21312216	21312217
chr1	21313109	21313110
chr1	21313247	21313248
chr1	21313255	21313256
chr1	21313377	21313378
chr1	21313387	21313388
chr1	21313520	21313521
chr1	21313795	21313796
chr1	21313886	21313887
chr1	21313893	21313894
chr1	21313906	21313907
chr1	21314006	21314007
chr1	21314018	21314019
chr1	21314087	21314088
chr1	21314122	21314123
chr1	21314223	21314224
chr1	21314257	21314258
chr1	21314279	21314280
chr1	21314308	21314309
chr1	21314334	21314335
chr1	21314355	21314356
chr1	21314417	21314418
chr1	21314485	21314486
chr1	21314696	21314697
chr1	21314715	21314716
chr1	21314871	21314872
chr1	21315309	21315310
chr1	21315444	21315445
chr1	21315514	21315515
chr1	21315661	21315662
chr1	21315687	21315688
chr1	21315884	21315885
chr1	21315928	21315929
chr1	21316023	2

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	36527614	36527615
chr1	36527646	36527647
chr1	36527707	36527708
chr1	36527747	36527748
chr1	36527751	36527752
chr1	36527772	36527773
chr1	36527931	36527932
chr1	36527936	36527937
chr1	36527955	36527956
chr1	36528053	36528054
chr1	36528070	36528071
chr1	36528123	36528124
chr1	36528215	36528216
chr1	36528241	36528242
chr1	36528286	36528287
chr1	36528303	36528304
chr1	36528389	36528390
chr1	36528395	36528396
chr1	36528507	36528508
chr1	36528588	36528589
chr1	36529047	36529048
chr1	36529638	36529639
chr1	36529874	36529875
chr1	36529929	36529930
chr1	36529987	36529988
chr1	36530261	36530262
chr1	36530286	36530287
chr1	36530292	36530293
chr1	36530370	36530371
chr1	36530376	36530377
chr1	36530438	36530439
chr1	36530478	36530479
chr1	36530809	36530810
chr1	36530852	36530853
chr1	36530930	36530931
chr1	36530944	36530945
chr1	36530966	36530967
chr1	36531097	36531098
chr1	36531297	36531298
chr1	36531319	36531320
chr1	36531327	3

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	100059378	100059379
chr1	100059406	100059407
chr1	100059494	100059495
chr1	100059902	100059903
chr1	100059934	100059935
chr1	100059956	100059957
chr1	100061900	100061901
chr1	100062354	100062355
chr1	100062456	100062457
chr1	100062619	100062620
chr1	100062715	100062716
chr1	100062849	100062850
chr1	100062874	100062875
chr1	100063743	100063744
chr1	100063749	100063750
chr1	100063759	100063760
chr1	100063781	100063782
chr1	100063816	100063817
chr1	100064163	100064164
chr1	100064171	100064172
chr1	100064319	100064320
chr1	100064370	100064371
chr1	100064379	100064380
chr1	100064407	100064408
chr1	100064419	100064420
chr1	100064825	100064826
chr1	100064948	100064949
chr1	100065020	100065021
chr1	100065539	100065540
chr1	100066309	100066310
chr1	100066710	100066711
chr1	100066726	100066727
chr1	100066754	100066755
chr1	100066759	100066760
chr1	100066770	100066771
chr1	100066784	100066785
chr1	100066804	100066805
chr1	10006

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	150586443	150586444
chr1	150586477	150586478
chr1	150586485	150586486
chr1	150586683	150586684
chr1	150586687	150586688
chr1	150586693	150586694
chr1	150586731	150586732
chr1	150586755	150586756
chr1	150586763	150586764
chr1	150587188	150587189
chr1	150587244	150587245
chr1	150587291	150587292
chr1	150587347	150587348
chr1	150587560	150587561
chr1	150587607	150587608
chr1	150587931	150587932
chr1	150587935	150587936
chr1	150587937	150587938
chr1	150587947	150587948
chr1	150587975	150587976
chr1	150588005	150588006
chr1	150588036	150588037
chr1	150588065	150588066
chr1	150588069	150588070
chr1	150588077	150588078
chr1	150588081	150588082
chr1	150588101	150588102
chr1	150588125	150588126
chr1	150588140	150588141
chr1	150588157	150588158
chr1	150588165	150588166
chr1	150588173	150588174
chr1	150588178	150588179
chr1	150588185	150588186
chr1	150588194	150588195
chr1	150588202	150588203
chr1	150588210	150588211
chr1	15058

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	167511744	167511745
chr1	167511777	167511778
chr1	167511786	167511787
chr1	167511802	167511803
chr1	167511817	167511818
chr1	167511839	167511840
chr1	167511861	167511862
chr1	167511982	167511983
chr1	167511985	167511986
chr1	167512115	167512116
chr1	167512150	167512151
chr1	167512192	167512193
chr1	167512205	167512206
chr1	167512227	167512228
chr1	167512230	167512231
chr1	167512412	167512413
chr1	167512723	167512724
chr1	167512772	167512773
chr1	167514067	167514068
chr1	167514228	167514229
chr1	167514384	167514385
chr1	167514389	167514390
chr1	167514484	167514485
chr1	167514495	167514496
chr1	167514534	167514535
chr1	167514556	167514557
chr1	167514568	167514569
chr1	167514602	167514603
chr1	167514649	167514650
chr1	167515025	167515026
chr1	167515085	167515086
chr1	167515133	167515134
chr1	167515144	167515145
chr1	167515279	167515280
chr1	167515328	167515329
chr1	167515659	167515660
chr1	167516035	167516036
chr1	16751

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	176228449	176228450
chr1	176228527	176228528
chr1	176228621	176228622
chr1	176229193	176229194
chr1	176229750	176229751
chr1	176229806	176229807
chr1	176230048	176230049
chr1	176230093	176230094
chr1	176230169	176230170
chr1	176230420	176230421
chr1	176230731	176230732
chr1	176231152	176231153
chr1	176231291	176231292
chr1	176231293	176231294
chr1	176231484	176231485
chr1	176231523	176231524
chr1	176231609	176231610
chr1	176231679	176231680
chr1	176231696	176231697
chr1	176231723	176231724
chr1	176231729	176231730
chr1	176231800	176231801
chr1	176232681	176232682
chr1	176232947	176232948
chr1	176233486	176233487
chr1	176233737	176233738
chr1	176233800	176233801
chr1	176234061	176234062
chr1	176234192	176234193
chr1	176234219	176234220
chr1	176234243	176234244
chr1	176234252	176234253
chr1	176234827	176234828
chr1	176234851	176234852
chr1	176234859	176234860
chr1	176234897	176234898
chr1	176234921	176234922
chr1	17623

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	202337102	202337103
chr1	202337175	202337176
chr1	202337219	202337220
chr1	202337229	202337230
chr1	202337231	202337232
chr1	202338796	202338797
chr1	202339555	202339556
chr1	202339577	202339578
chr1	202339628	202339629
chr1	202339638	202339639
chr1	202340163	202340164
chr1	202340165	202340166
chr1	202340187	202340188
chr1	202340385	202340386
chr1	202340444	202340445
chr1	202340467	202340468
chr1	202340469	202340470
chr1	202340499	202340500
chr1	202340942	202340943
chr1	202341110	202341111
chr1	202341124	202341125
chr1	202341143	202341144
chr1	202341252	202341253
chr1	202341257	202341258
chr1	202341299	202341300
chr1	202341303	202341304
chr1	202341305	202341306
chr1	202341315	202341316
chr1	202341343	202341344
chr1	202341348	202341349
chr1	202341352	202341353
chr1	202341359	202341360
chr1	202341373	202341374
chr1	202341384	202341385
chr1	202341393	202341394
chr1	202341404	202341405
chr1	202341533	202341534
chr1	20234

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	220338523	220338524
chr1	220338587	220338588
chr1	220338618	220338619
chr1	220338645	220338646
chr1	220338687	220338688
chr1	220338781	220338782
chr1	220339536	220339537
chr1	220339564	220339565
chr1	220339922	220339923
chr1	220340056	220340057
chr1	220340180	220340181
chr1	220340505	220340506
chr1	220340802	220340803
chr1	220340804	220340805
chr1	220340810	220340811
chr1	220340827	220340828
chr1	220340849	220340850
chr1	220342021	220342022
chr1	220342234	220342235
chr1	220342684	220342685
chr1	220342688	220342689
chr1	220342720	220342721
chr1	220342759	220342760
chr1	220342940	220342941
chr1	220342968	220342969
chr1	220342999	220343000
chr1	220343150	220343151
chr1	220343220	220343221
chr1	220343384	220343385
chr1	220343401	220343402
chr1	220343432	220343433
chr1	220343703	220343704
chr1	220343717	220343718
chr1	220343728	220343729
chr1	220343733	220343734
chr1	220343846	220343847
chr1	220344060	220344061
chr1	22034

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	236737081	236737082
chr1	236737084	236737085
chr1	236737093	236737094
chr1	236737123	236737124
chr1	236737131	236737132
chr1	236737206	236737207
chr1	236737253	236737254
chr1	236737289	236737290
chr1	236737435	236737436
chr1	236737485	236737486
chr1	236737531	236737532
chr1	236737662	236737663
chr1	236737679	236737680
chr1	236737827	236737828
chr1	236737845	236737846
chr1	236737850	236737851
chr1	236737856	236737857
chr1	236737867	236737868
chr1	236738094	236738095
chr1	236738153	236738154
chr1	236738207	236738208
chr1	236738254	236738255
chr1	236738275	236738276
chr1	236738287	236738288
chr1	236738297	236738298
chr1	236738618	236738619
chr1	236738751	236738752
chr1	236738755	236738756
chr1	236738789	236738790
chr1	236739043	236739044
chr1	236739071	236739072
chr1	236739090	236739091
chr1	236739107	236739108
chr1	236739123	236739124
chr1	236739125	236739126
chr1	236739135	236739136
chr1	236739151	236739152
chr1	23673

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chr1	247881798	247881799
chr1	247881847	247881848
chr1	247881900	247881901
chr1	247882067	247882068
chr1	247882201	247882202
chr1	247882213	247882214
chr1	247882413	247882414
chr1	247882570	247882571
chr1	247882613	247882614
chr1	247882617	247882618
chr1	247882705	247882706
chr1	247883029	247883030
chr1	247883105	247883106
chr1	247883272	247883273
chr1	247883299	247883300
chr1	247883301	247883302
chr1	247883356	247883357
chr1	247883396	247883397
chr1	247883416	247883417
chr1	247883432	247883433
chr1	247883525	247883526
chr1	247883548	247883549
chr1	247883572	247883573
chr1	247883697	247883698
chr1	247883806	247883807
chr1	247883810	247883811
chr1	247883822	247883823
chr1	247883850	247883851
chr1	247883855	247883856
chr1	247883866	247883867
chr1	247883880	247883881
chr1	247883900	247883901
chr1	247883911	247883912
chr1	247884009	247884010
chr1	247884033	247884034
chr1	247884039	247884040
chr1	247884041	247884042
chr1	24788

# Removing ENCODE Regions (Doesn't Work)

Encode defines a standard list of "bad" regions -- very low complexity / information content, etc. that show up in some studies but are not informative. We remove them and do some other standard data cleaning.

In [36]:
# Load the .bed from above
raw_bs_cov_bed = pybedtools.BedTool(INTERSECTED_COVERAGE_BED)
print(f"Number of entries in bs_cov (raw): {len(raw_bs_cov_bed):,}")

if not os.path.exists("ENCFF356LFX.bed.gz"):
    print("Downloading ENCODE DAC Exclusion List")
    urllib.request.urlretrieve("https://www.encodeproject.org/files/ENCFF356LFX/@@download/ENCFF356LFX.bed.gz", "ENCFF356LFX.bed.gz")

excluded_regions = pybedtools.BedTool("ENCFF356LFX.bed.gz")
print(f"Number of entries in excluded_regions: {len(excluded_regions):,}")
assert len(excluded_regions) > 900 # 910 as of 1/2022

# .saveas forces this to render, otherwise may be a generator
# TODO: Fix path to be more specific
cleaned_sorted_bs_cov = raw_bs_cov_bed.subtract(excluded_regions).sort().saveas('bs_cov_cleaned_sorted.bed')
print(f"Number of remaining bs_cov entries: {len(cleaned_sorted_bs_cov):,}")
# assert len(cleaned_sorted_bs_cov) > 90000 # We expect about 90k entries


# Convert `bs_cov_cleaned_sorted` to a dict too:
#  key: chr
#  val: [sorted list of bs_cov positions]
# NOTE: This works because dicts are insertion ordered as of Python>3.7
BS_COV_POSITIONS = {}
for bed_entry in cleaned_sorted_bs_cov:
    BS_COV_POSITIONS.setdefault(bed_entry.chrom, []).append(bed_entry.start)

Number of entries in bs_cov (raw): 1,603,942
Number of entries in excluded_regions: 910


chr1	8388609	8388610

chr1	8388609	8388610



Number of remaining bs_cov entries: 0


# Load bs_call data

In [37]:
FILE_ID_TO_CPG_CALLS = { }

# cache_file = "bs_call_min_" + str(BLUEPRINT_CPG_COV_MINIMUM) + ".json"
print("Parsing bs_call files.")
for file_key in tqdm(BLUEPRINT_FILEKEYS):
    print(f"{file_key}")
    FILE_ID_TO_CPG_CALLS[file_key] = {}
    with pyBigWig.open(file_key + ".CPG_methylation_calls.bs_call.GRCh38.20160531.bw") as bw_object:
        for chrom in CHROMOSOMES:
            # This is more nuanced than the bs_cov data, since we only want to look at the 
            # CpGs that were covered across all samples. (The intervals now in BS_COV_POSITIONS).

            # Each .bw interval is a nested tuple of: ((start, end, value))
            # We extract all the values that overlap our bs_cov set.
            # FILE_ID_TO_CPG_CALLS[file_key][chrom] = [bw_object.intervals(chrom, pos, pos+1)[0][2] for pos in INTERSECTED_BS_COV_POSITIONS[chrom]]
            temp = []
            for pos in INTERSECTED_BS_COV_POSITIONS[chrom]:
                try:
                  temp.append(bw_object.intervals(chrom, pos, pos+1)[0][2])
                  FILE_ID_TO_CPG_CALLS[file_key][chrom] = temp
                except:
                  print(bw_object.intervals(chrom, pos, pos+1))
                  break

Parsing bs_call files. [This should take ~10 minutes.]


  0%|          | 0/2 [00:00<?, ?it/s]

S00BHQ51
G202
