<a href="https://colab.research.google.com/github/semenko/liquid-cell-atlas/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
import hashlib
import io
import itertools
import json
import os
import pickle
import urllib

import numpy as np
import pandas as pd
from google.colab import files
from tqdm.notebook import tqdm

In [2]:
# Download the TSV file from http://dcc.blueprint-epigenome.eu/#/files, and upload it here
! wget 'http://dcc.blueprint-epigenome.eu/data/blueprint_files.tsv'
data_tsv = pd.read_csv('blueprint_files.tsv', sep='\t')

noDisease_bw_data = data_tsv[(data_tsv['Disease'] == 'None') & 
                             (data_tsv['Format'] == 'bigWig') & 
                             (data_tsv['Experiment'] == 'Bisulfite-Seq')]

--2022-08-01 18:38:10--  http://dcc.blueprint-epigenome.eu/data/blueprint_files.tsv
Resolving dcc.blueprint-epigenome.eu (dcc.blueprint-epigenome.eu)... 193.62.192.83, 193.62.193.83
Connecting to dcc.blueprint-epigenome.eu (dcc.blueprint-epigenome.eu)|193.62.192.83|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4341342 (4.1M) [text/tab-separated-values]
Saving to: ‘blueprint_files.tsv’


2022-08-01 18:38:14 (1.39 MB/s) - ‘blueprint_files.tsv’ saved [4341342/4341342]



In [4]:
# Collect, for every cell type, the URL of one methylation-call track and one
# coverage track from the filtered Blueprint index.
cell_types = noDisease_bw_data['Cell type'].unique()

cov_files, call_files = [], []
for cell_type in cell_types:
    type_rows = noDisease_bw_data.loc[noDisease_bw_data['Cell type'] == cell_type]
    # NOTE(review): selection is purely positional — the first row is taken as the
    # call track and the second as the coverage track. Confirm the index ordering
    # guarantees this; a cell type with a single row raises IndexError here.
    call_url, cov_url = type_rows['URL'].iloc[0], type_rows['URL'].iloc[1]
    cov_files.append(cov_url)
    call_files.append(call_url)

In [None]:
for url in cov_files:
    ! wget "$url"

for url in call_files:
    ! wget "$url"

--2022-08-01 18:39:27--  http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/bone_marrow/BM030613/band_form_neutrophil/Bisulfite-Seq/CNAG/S00JGXA1.CPG_methylation_calls.bs_cov.GRCh38.20160531.bw
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.138|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118280227 (113M) [application/octet-stream]
Saving to: ‘S00JGXA1.CPG_methylation_calls.bs_cov.GRCh38.20160531.bw’


2022-08-01 18:39:37 (11.0 MB/s) - ‘S00JGXA1.CPG_methylation_calls.bs_cov.GRCh38.20160531.bw’ saved [118280227/118280227]

--2022-08-01 18:39:37--  http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/bone_marrow/BM030613/neutrophilic_metamyelocyte/Bisulfite-Seq/CNAG/S00JFZA1.CPG_methylation_calls.bs_cov.GRCh38.20160531.bw
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.138|:80... connected.
HTTP 

In [None]:
# Map every cell type to the list of Blueprint sample IDs that represent it.
# cell_types and cov_files are parallel sequences built by an earlier cell.
assert len(cell_types) == len(cov_files), "Expected exactly one coverage file per cell type."

CELL_TYPE_TO_FILE_ID = {}

for cell_type, url in zip(cell_types, cov_files):
    file_name = url.split("/")[-1]
    # The sample ID is the leading dot-separated token of the file name, e.g.
    # 'S00JGXA1' in 'S00JGXA1.CPG_methylation_calls.bs_cov.GRCh38.20160531.bw'
    # (the trailing token is just the 'bw' extension).
    ID = file_name.split(".")[0]
    # Values are lists so that the inversion, flattening and validity checks below
    # operate on whole sample IDs rather than on the characters of one string.
    CELL_TYPE_TO_FILE_ID.setdefault(cell_type, []).append(ID)

# Reverse lookup: sample ID -> cell type.
FILE_ID_TO_CELL_TYPE = {sample:cell_type for cell_type, sample_list in CELL_TYPE_TO_FILE_ID.items() for sample in sample_list}

# If the blueprint dict changes, we need to replace our cache files
# This is a tiny checksum of the dictionary state, which we incorporate into
# our cache filenames below.
# A real digest is used instead of the built-in hash(): str hashes are salted per
# interpreter process, so hash() would yield a new signature on every kernel restart.
CELL_TYPE_DICT_SIG = hashlib.sha256(
    json.dumps(CELL_TYPE_TO_FILE_ID, sort_keys=True).encode("utf-8")
).hexdigest()[:8]
print(f"Dictionary signature for cache files: {CELL_TYPE_DICT_SIG}\n")


# Flat list of every sample ID across all cell types.
BLUEPRINT_FILEKEYS = list(itertools.chain.from_iterable(CELL_TYPE_TO_FILE_ID.values()))

# Validity testing
# NOTE(review): the cell that builds cov_files records a single coverage file per
# cell type, so the first check fails until more samples per type are collected.
assert all(len(vals) > 1 for vals in CELL_TYPE_TO_FILE_ID.values()), "We need more than one example per cell type."
assert len(BLUEPRINT_FILEKEYS) == len(set(BLUEPRINT_FILEKEYS)), "One filename is duplicated in the cell types"

print(f"Number of Blueprint cell types: {len(CELL_TYPE_TO_FILE_ID.keys())}")
print(f"Number of Blueprint raw files: {len(BLUEPRINT_FILEKEYS)}")

In [None]:
# TODO: Get the common coverage between the files, remove ENCODE regions, and find differentially methylated regions (DMRs).


In [None]:
# TODO: Create the dataset
# Open question: Nx1 input with the methylation array at the proposed locations, and a 1x1 string output?


In [None]:
# Clone the TabNet fork into ./tabnet_fork (relative to the current working directory).
# Presumably this repository provides the opt_tabnet.py script run further below — verify.
!git clone https://github.com/hussius/tabnet_fork.git

In [None]:
# Enter the cloned repository so the relative paths used by the following cells resolve.
# Guarded so that re-running this cell after the directory change has already
# happened does not raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'tabnet_fork':
    os.chdir('tabnet_fork')

In [None]:
# Install the packages listed in requirements.txt, resolved relative to the current
# working directory (the preceding cell changes into tabnet_fork).
# NOTE(review): installed versions are whatever that file specifies — confirm they are pinned.
!pip install -r requirements.txt

In [None]:
# Run opt_tabnet.py from the current working directory in a shell subprocess.
# NOTE(review): PATH_TO_CSV is passed literally (there is no $ or {} interpolation),
# so it must be replaced with the real dataset path before this cell can succeed.
# Presumably --target-name is the label column and --categorical-features the
# categorical input column(s); verify against the script's argument parser.
! python opt_tabnet.py \
       --csv-path PATH_TO_CSV \
       --target-name "cell_type" \
       --categorical-features methylation