<a href="https://colab.research.google.com/github/semenko/liquid-cell-atlas/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Packages

In [30]:
! pip install pyBigWig
!apt install bedtools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
bedtools is already the newest version (2.26.0+dfsg-5).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [36]:
import pandas as pd
from google.colab import files
import io
import itertools
import numpy as np
from tqdm.notebook import tqdm
import csv
import os
import urllib
import pickle
import json
import pyBigWig
import pybedtools

# Data Processing

In [2]:
# Download the TSV file from http://dcc.blueprint-epigenome.eu/#/files, and upload it here
! wget 'http://dcc.blueprint-epigenome.eu/data/blueprint_files.tsv'
data_tsv = pd.read_csv('blueprint_files.tsv', sep='\t')

noDisease_bw_data = data_tsv[(data_tsv['Disease'] == 'None') & 
                             (data_tsv['Format'] == 'bigWig') & 
                             (data_tsv['Experiment'] == 'Bisulfite-Seq')]

--2022-08-02 14:10:25--  http://dcc.blueprint-epigenome.eu/data/blueprint_files.tsv
Resolving dcc.blueprint-epigenome.eu (dcc.blueprint-epigenome.eu)... 193.62.193.83, 193.62.192.83
Connecting to dcc.blueprint-epigenome.eu (dcc.blueprint-epigenome.eu)|193.62.193.83|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4341342 (4.1M) [text/tab-separated-values]
Saving to: ‘blueprint_files.tsv’


2022-08-02 14:10:31 (774 KB/s) - ‘blueprint_files.tsv’ saved [4341342/4341342]



In [3]:
cov_files = []
call_files = []

cell_types = noDisease_bw_data['Cell type'].unique()
for cell_type in cell_types:
    cell_data = noDisease_bw_data[noDisease_bw_data['Cell type'] == cell_type]

    cell_call = cell_data.iloc[0]
    cell_cov = cell_data.iloc[1]

    cell_call_url = cell_call['URL']
    cell_cov_url = cell_cov['URL']
    
    cov_files.append(cell_cov_url)
    call_files.append(cell_call_url)

In [None]:
for url in cov_files:
    ! wget "$url"

for url in call_files:
    ! wget "$url"

In [18]:
CELL_TYPE_TO_FILE_ID = {}

for i in range(len(cell_types)):
    url = cov_files[i]
    file_name = url.split("/")[-1]
    ID = file_name.split(".")[0]
    CELL_TYPE_TO_FILE_ID[cell_types[i]] = ID

FILE_ID_TO_CELL_TYPE = {sample:cell_type for cell_type, sample_list in CELL_TYPE_TO_FILE_ID.items() for sample in sample_list}

# If the blueprint dict changes, we need to replace our cache files
# This is a tiny checksum of the dictionary state, which we incorporate into
# our cache filenames below.
CELL_TYPE_DICT_SIG = str(hex(abs(hash(json.dumps(CELL_TYPE_TO_FILE_ID, sort_keys=True))))[2:10])
print(f"Dictionary signature for cache files: {CELL_TYPE_DICT_SIG}\n")


BLUEPRINT_FILEKEYS = list((CELL_TYPE_TO_FILE_ID.values()))

# Validity testing
assert all(len(vals) > 1 for vals in CELL_TYPE_TO_FILE_ID.values()), "We need more than one example per cell type."
assert len(BLUEPRINT_FILEKEYS) == len(set(BLUEPRINT_FILEKEYS)), "One filename is duplicated in the cell types"

print(f"Number of Blueprint cell types: {len(CELL_TYPE_TO_FILE_ID.keys())}")
print(f"Number of Blueprint raw files: {len(BLUEPRINT_FILEKEYS)}")

Dictionary signature for cache files: 77a40a58

Number of Blueprint cell types: 44
Number of Blueprint raw files: 44


## Create Shared COV Files

In [23]:
# Get common coverage between the files, remove ENCODE regions, and find DMRs.
CHROMOSOMES = ["chr" + str(i) for i in range(1, 23)] + ["chrX"]

IGNORE_CACHE = False

BLUEPRINT_CPG_COV_MINIMUM = 10

RUN_SIGNATURE = f"{BLUEPRINT_CPG_COV_MINIMUM}_{CELL_TYPE_DICT_SIG}"

assert type(BLUEPRINT_CPG_COV_MINIMUM) is int
assert BLUEPRINT_CPG_COV_MINIMUM > 0

print(f"Minimum Blueprint coverage limit: {BLUEPRINT_CPG_COV_MINIMUM}")
print(f" (CpGs with fewer than {BLUEPRINT_CPG_COV_MINIMUM} reads in *any* sample will be ignored.)\n")

# Our output / save file
INTERSECTED_COVERAGE_BED = f"intersected_bs_cov_min_{RUN_SIGNATURE}.bed"
CLEAN_INTERSECTED_COVERAGE_BED = f"intersected_bs_cov_min_{RUN_SIGNATURE}_clean.bed"

print(f"Coverage BED: {INTERSECTED_COVERAGE_BED}")

if os.path.exists(INTERSECTED_COVERAGE_BED) and not IGNORE_CACHE:
    print("\tPost-processed cov .bed already exists. (Skipping raw Blueprint bs_cov parsing.)")
else:
    print("\t.bed does not exist yet -- parsing bs_cov .bw files.")
    INTERSECTED_BS_COV_POSITIONS = {}
    MAX_CHROMOSOME_SIZE = {}

    for file_key in tqdm(BLUEPRINT_FILEKEYS):
        with pyBigWig.open(file_key + ".CPG_methylation_calls.bs_cov.GRCh38.20160531.bw") as bw_object:
            bw_header = bw_object.header()
            if bw_header['nBasesCovered'] < 1e7 or bw_header['sumData'] < 1e8:
                print('\t\t*** WARNING: Input .bw has few reads or low coverage. This may cause unexpected results, consider removing this file.')
                print(f"\t\tnBasesCovered: {bw_header['nBasesCovered']}, sumData: {bw_header['sumData']}")
            for chrom in CHROMOSOMES:
                current_loop_values = set([start for start, _, cov in bw_object.intervals(chrom) if cov >= BLUEPRINT_CPG_COV_MINIMUM])
                MAX_CHROMOSOME_SIZE[chrom] = np.max(list(current_loop_values))
                existing_values = INTERSECTED_BS_COV_POSITIONS.get(chrom, current_loop_values)
                INTERSECTED_BS_COV_POSITIONS[chrom] = existing_values.intersection(current_loop_values)
            # print(track)

    # Save this hard work as a .bed for later recovery if needed
    with open(INTERSECTED_COVERAGE_BED, 'w') as outfile:
        # A .bed is just a .tsv with ['chrom', 'chromStart', 'chromEnd']
        bed_writer = csv.writer(outfile, delimiter='\t')
        for chr in CHROMOSOMES:
            for entry in INTERSECTED_BS_COV_POSITIONS[chr]:
                bed_writer.writerow([chr, entry, entry+1])

    print(f"\nWrote data to: {INTERSECTED_COVERAGE_BED}") # Unsorted

! cat {INTERSECTED_COVERAGE_BED} | tr -d '\r' > {CLEAN_INTERSECTED_COVERAGE_BED}

Minimum Blueprint coverage limit: 10
 (CpGs with fewer than 10 reads in *any* sample will be ignored.)

Coverage BED: intersected_bs_cov_min_10_77a40a58.bed
	.bed does not exist yet -- parsing bs_cov .bw files.


  0%|          | 0/44 [00:00<?, ?it/s]


Wrote data to: intersected_bs_cov_min_10_77a40a58.bed


In [37]:
# Load the .bed from above
raw_bs_cov_bed = pybedtools.BedTool(CLEAN_INTERSECTED_COVERAGE_BED)
print(f"Number of entries in bs_cov (raw): {len(raw_bs_cov_bed):,}")

if not os.path.exists("ENCFF356LFX.bed.gz"):
    print("Downloading ENCODE DAC Exclusion List")
    urllib.request.urlretrieve("https://www.encodeproject.org/files/ENCFF356LFX/@@download/ENCFF356LFX.bed.gz", "ENCFF356LFX.bed.gz")

! gunzip ENCFF356LFX.bed.gz
excluded_regions = pybedtools.BedTool("ENCFF356LFX.bed")
print(f"Number of entries in excluded_regions: {len(excluded_regions):,}")
assert len(excluded_regions) > 900 # 910 as of 1/2022

# .saveas forces this to render, otherwise may be a generator
# TODO: Fix path to be more specific
cleaned_sorted_bs_cov = raw_bs_cov_bed.subtract(excluded_regions).sort().saveas('bs_cov_cleaned_sorted.bed')
print(f"Number of remaining bs_cov entries: {len(cleaned_sorted_bs_cov):,}")
# assert len(cleaned_sorted_bs_cov) > 90000 # We expect about 90k entries


# Convert `bs_cov_cleaned_sorted` to a dict too:
#  key: chr
#  val: [sorted list of bs_cov positions]
# NOTE: This works because dicts are insertion ordered as of Python>3.7
BS_COV_POSITIONS = {}
for bed_entry in cleaned_sorted_bs_cov:
    BS_COV_POSITIONS.setdefault(bed_entry.chrom, []).append(bed_entry.start)

Number of entries in bs_cov (raw): 1,409,602
Downloading ENCODE DAC Exclusion List
gzip: ENCFF356LFX.bed already exists; do you wish to overwrite (y or n)? y
Number of entries in excluded_regions: 910


NotImplementedError: ignored

In [None]:
BS_COV_POSITIONS = {}
for bed_entry in raw_bs_cov_bed:
    BS_COV_POSITIONS.setdefault(bed_entry.chrom, []).append(bed_entry.start)

FILE_ID_TO_CPG_CALLS = { }

# cache_file = "bs_call_min_" + str(BLUEPRINT_CPG_COV_MINIMUM) + ".json"
print("Parsing bs_call files.")
for file_key in tqdm(BLUEPRINT_FILEKEYS):
    print(f"{file_key}")
    FILE_ID_TO_CPG_CALLS[file_key] = {}
    with pyBigWig.open(file_key + ".CPG_methylation_calls.bs_call.GRCh38.20160531.bw") as bw_object:
        for chrom in CHROMOSOMES:
            # This is more nuanced than the bs_cov data, since we only want to look at the 
            # CpGs that were covered across all samples. (The intervals now in BS_COV_POSITIONS).

            # Each .bw interval is a nested tuple of: ((start, end, value))
            # We extract all the values that overlap our bs_cov set.
            # FILE_ID_TO_CPG_CALLS[file_key][chrom] = [bw_object.intervals(chrom, pos, pos+1)[0][2] for pos in INTERSECTED_BS_COV_POSITIONS[chrom]]
            temp = []
            for pos in BS_COV_POSITIONS[chrom]:
                temp.append(bw_object.intervals(chrom, pos, pos+1)[0][2])
                FILE_ID_TO_CPG_CALLS[file_key][chrom] = temp

SCALED_BS_COV_POSITIONS = {}
for chrom in CHROMOSOMES:
    SCALED_BS_COV_POSITIONS[chrom] = np.array(BS_COV_POSITIONS[chrom]) / MAX_CHROMOSOME_SIZE[chrom]

Parsing bs_call files.


  0%|          | 0/44 [00:00<?, ?it/s]

S00JGXA1
S00JFZA1
S00JE0A1
S00JHVA1
G199
PreB2C-V152
G200


In [None]:
# Create the dataset
# Nx1 input with methylation array at proposed locations, and 1x1 string output?


In [None]:
!git clone https://github.com/hussius/tabnet_fork.git

In [None]:
os.chdir('tabnet_fork')

In [None]:
!pip install -r requirements.txt

In [None]:
! python opt_tabnet.py \
       --csv-path PATH_TO_CSV \
       --target-name "cell_type" \
       --categorical-features methylation