In [1]:
import os
import sys
import re
import time
import yaml
import shutil
import argparse
import tarfile
import tempfile
import random
import subprocess
import copy
import functools

import re

import numpy as np
import pandas as pd

from tqdm import tqdm

import torch

import boda
from boda.generator.parameters import BasicParameters
from boda.common import utils
from boda.common.utils import set_best, save_model, unpack_artifact, model_fn

import hypertune

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.graphics.tsaplots import plot_acf

from scipy import spatial
from scipy.cluster import hierarchy

In [2]:
def cloud_load(func):
    """Decorator that lets a file loader accept ``gs://`` locations.

    The wrapped function must take the file location as its first positional
    argument. When that argument starts with ``gs://`` the object is copied
    with ``gsutil cp`` into a temporary directory and ``func`` is called with
    the local copy in place of the URI; any other location is passed through
    unchanged. Progress messages are written to stderr.

    The temporary directory is deleted as soon as ``func`` returns, so
    ``func`` has to finish reading the file before it returns.

    Args:
        func: callable whose first positional argument is a file location.

    Returns:
        The wrapping callable; it returns whatever ``func`` returns.

    Raises:
        IndexError: if the wrapped callable is invoked without positional
            arguments (the location cannot be passed by keyword).
        subprocess.CalledProcessError: if ``gsutil cp`` exits non-zero.
    """
    @functools.wraps(func)
    def wrapper_decorator(*args, **kwargs):
        print(f'loading file from {args[0]}', file=sys.stderr)
        # str() keeps pathlib.Path and other non-sliceable locations working.
        if str(args[0]).startswith('gs://'):
            with tempfile.TemporaryDirectory() as tempdir:
                temp_loc = os.path.join(tempdir, os.path.basename(args[0]))
                subprocess.check_call(['gsutil','cp',args[0], temp_loc])
                # Only the location slot is swapped; the remaining arguments
                # are forwarded as-is rather than deep-copied.
                value = func(temp_loc, *args[1:], **kwargs)
        else:
            value = func(*args, **kwargs)
        print('Success', file=sys.stderr)
        return value
    return wrapper_decorator

def cloud_dump(func):
    """Decorator that lets a file writer target ``gs://`` locations.

    The wrapped function must take the output location as its first
    positional argument. When that argument starts with ``gs://`` the function
    is pointed at a file inside a temporary directory instead, and the result
    is uploaded to the requested URI with ``gsutil cp`` once ``func`` returns.
    Any other location is passed through unchanged. Progress messages are
    written to stderr.

    Args:
        func: callable whose first positional argument is the output location.

    Returns:
        The wrapping callable; it returns whatever ``func`` returns.

    Raises:
        IndexError: if the wrapped callable is invoked without positional
            arguments (the location cannot be passed by keyword).
        subprocess.CalledProcessError: if ``gsutil cp`` exits non-zero.
    """
    @functools.wraps(func)
    def wrapper_decorator(*args, **kwargs):
        print(f'dumping file to {args[0]}', file=sys.stderr)
        # str() keeps pathlib.Path and other non-sliceable locations working.
        if str(args[0]).startswith('gs://'):
            remote_loc = args[0]
            with tempfile.TemporaryDirectory() as tempdir:
                temp_loc = os.path.join(tempdir, os.path.basename(remote_loc))
                # Only the location slot is swapped. The payload is forwarded
                # as-is: deep-copying it would duplicate the data being
                # written and fails for objects that cannot be copied.
                value = func(temp_loc, *args[1:], **kwargs)
                # Upload while the temporary directory still exists.
                subprocess.check_call(['gsutil','cp',temp_loc,remote_loc])
        else:
            value = func(*args, **kwargs)
        print('Success', file=sys.stderr)
        return value
    return wrapper_decorator

@cloud_load
def load_fasta(fasta_fn):
    """Read a FASTA file into a two-column DataFrame.

    A record starts at a line beginning with ``>``; every following non-blank
    line up to the next header is concatenated into that record's sequence.
    Blank lines are ignored, so wrapped sequences and a trailing empty line
    are handled. For files with exactly one sequence line per header the
    result is the same as reading the lines in alternating pairs.

    Args:
        fasta_fn: location of the FASTA file (``gs://`` locations are staged
            locally by the ``cloud_load`` decorator).

    Returns:
        pd.DataFrame with columns ``'id'`` (the header line with trailing
        whitespace stripped, leading ``>`` kept) and ``'sequence'`` (sequence
        text, case preserved).

    Raises:
        ValueError: if sequence text appears before the first header line.
    """
    data = {'id': [], 'sequence': []}
    chunks = None  # pieces of the record being read; None until a header is seen
    with open(fasta_fn, 'r') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                # Close the previous record before opening the next one.
                if chunks is not None:
                    data['sequence'].append(''.join(chunks))
                data['id'].append(line)
                chunks = []
            elif chunks is None:
                raise ValueError(
                    f'{fasta_fn}: sequence data on line {line_number} precedes the first ">" header'
                )
            else:
                chunks.append(line)
    # Flush the final record; nothing to flush for an empty file.
    if chunks is not None:
        data['sequence'].append(''.join(chunks))
    return pd.DataFrame(data)

@cloud_dump
def pandas_to_fasta(fasta_fn, data, carat_field='id', sequence_field='sequence'):
    """Write a DataFrame out as two-line FASTA records.

    Each row produces two lines: the value of ``carat_field`` followed by the
    value of ``sequence_field``. The header value is written verbatim, so it
    must already carry its leading ``>``.

    Args:
        fasta_fn: output location (``gs://`` locations are uploaded by the
            ``cloud_dump`` decorator).
        data: DataFrame holding one record per row.
        carat_field: column holding the header line.
        sequence_field: column holding the sequence text.

    Returns:
        None
    """
    with open(fasta_fn, 'w') as handle:
        for _, record in data.iterrows():
            handle.write(str(record[carat_field]) + '\n')
            handle.write(str(record[sequence_field]) + '\n')

In [3]:
def check_asiSI(sequence):
    """Report whether ``sequence`` is flagged for the site ``GCGATCGC``.

    The comparison is case-insensitive. A sequence is flagged when any of the
    following holds:

    * it starts with the site minus its first base (``CGATCGC``),
    * it ends with the site minus its last base (``GCGATCG``),
    * it contains the complete site anywhere.

    The two edge checks presumably cover a site that would be completed by a
    flanking base once the sequence is embedded in a construct - confirm
    against the library design.

    Args:
        sequence: nucleotide string.

    Returns:
        bool: True when the sequence is flagged, False otherwise.
    """
    site = 'GCGATCGC'
    leading = sequence[:7].upper()
    trailing = sequence[-7:].upper()
    return (
        leading == site[1:]
        or trailing == site[:-1]
        or site in sequence.upper()
    )


# DHS FASTAs

In [4]:
# Replacement records for the control DHS sets, one list per cell line
# (K562, HEPG2, SKNSH). Each entry is a dict with the same 'id' / 'sequence'
# keys as the table returned by load_fasta; the cells below append them after
# dropping the records flagged by check_asiSI. In the recorded run the number
# of dropped records (1, 0, 3) equals the length of the matching list.
# NOTE(review): the coordinate ranges in these ids span 180, 28, 241 and
# 136 bp, unlike the 200 bp ranges in the ids of the loaded records shown in
# the cell outputs - confirm the ids describe the sequences they label.
K562_update = [
    {'id': '>control_DHS_K_11088::chr11:65508340-65508520', 'sequence': 'TCTCTACTACACACACACACACACACACACACACACACACACACACACACACAATTTGCTGGGCGTGGTGGAGGGCGCCTGTAGTCCTAGCTACTCGTGAGGCTGAGGCAGGAGAATGGCGTGAACCTGGGAGGCGGAGCTTGCCGTGAGGTGAGATCGGGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT'}
]

# No HEPG2 record was flagged in the recorded run, so nothing is replaced.
HEPG2_update=[]

SKNSH_update=[
    {'id': '>control_DHS_S_42234::chrX:13377400-13377428', 'sequence': 'CCTTCGTATCACGCCTTTCCAGTGAGTAAGATCATCTAGATAGATTCTCCAAAAGAAATAACCCAGTAAGGGAGGGGCAACCAGCATGACCACAATGCTAGTTGTTGTGTGCGAATGGTAAACAGAATGAAATTATGAATGTGTGTGCGGGTCATTATATGCATGCAGACACATACATATGCATATGTAGTATAAATCTC'},
    {'id': '>control_DHS_S_17598::chr17:50038379-50038620', 'sequence': 'GCATCCCCTCCTGGAGAGCTTCTGCCTGCTGTCACCCTGTCCAAGGGCCCAACCCTGGTGGATATCCCTAAACTTCTTGGGGTCCCCTTCCCTGCCTTAGCAGAAGCCTGGAGCAGGCATGGCCTCTCTACCTTTCCTGACATTTCCCCCTCTCCTGTGGCCTTAGATTGAGACAAGGATGTCGTTTGAAAGAAGTCTCT'},
    {'id': '>control_DHS_S_9972::chr12:102120782-102120918', 'sequence': 'TGTTTCACTAACGTCTCATTCTGTGAGGTAGGAATTCCCTTCCTGGTTTTGCAGGTGAAGAAACTTATGCTCATTAGAGATTTTAAATACCGAAGGACTACAGGTCATAAAGGGGGAGCCTGGAAGTGATATCTGACTCGGAAGTGTAGTTCTTTCAACCATTTCACACTGCCTGCCCAGTGACAATTTTCTGTCTCACA'}
]

In [5]:
# Load the K562 control DHS records, drop every record flagged by
# check_asiSI, then append the replacement records from K562_update.
k562_fasta = load_fasta('gs://syrgoth/boda_library_design_202112/fastas/control_DHS_K562.fa')

# Run the site check once per record; the mask feeds both the report and the filter.
k562_has_site = [check_asiSI(seq) for seq in k562_fasta['sequence']]

print(f"dropping {sum(k562_has_site)} lines", file=sys.stderr)
print(f"replacing with {len(K562_update)} lines", file=sys.stderr)

k562_fasta = k562_fasta.loc[[not flagged for flagged in k562_has_site]].reset_index(drop=True)
# DataFrame.append was removed in pandas 2.0; add all replacements with a
# single concat. The guard skips the concat when there is nothing to add.
if K562_update:
    k562_fasta = pd.concat([k562_fasta, pd.DataFrame(K562_update)], ignore_index=True)

loading file from gs://syrgoth/boda_library_design_202112/fastas/control_DHS_K562.fa
Success
dropping 1 lines
replacing with 1 lines


In [6]:
# Show the filtered K562 table as the cell output for a visual check.
k562_fasta

Unnamed: 0,id,sequence
0,>control_DHS_K_3832::chr1:146934470-146934670,ttttctcctcttcctggggctgcctaatctctacccaccatgtgcc...
1,>control_DHS_K_27285::chr18:77990220-77990420,aacagctcactacaaggtgccgcccgccccatgcggcttaggtgag...
2,>control_DHS_K_27286::chr18:77993720-77993920,aagagcctatgcagcgaaactcccgtttttataaccatcagatctc...
3,>control_DHS_K_45124::chr5:43038298-43038498,CCGCTTAAGTCCGAACTTGCTATGCAGAGAAACCACAATCAGGGCT...
4,>control_DHS_K_55475::chr8:41798719-41798919,AAGGTGGGGGCTGTCTCCCTCCATGGTTTCCTCTAGGGGTGGGTGT...
...,...,...
3995,>control_DHS_K_53715::chr7:106862701-106862901,acgtatgggaagcacACACTTGTCTGGCCATGTCTCTCCCTGTCTC...
3996,>control_DHS_K_31402::chr2:16646208-16646408,AGTTGGTAGAATCAAGATGTCTAGAGTTCATCCTCCCccaccaggc...
3997,>control_DHS_K_3547::chr1:113659150-113659350,GATCATATCTCTGCAGTGAATGGGCAGTCCATTGTTGAACTGTCTC...
3998,>control_DHS_K_55299::chr8:30811759-30811959,ccgagtagctgtggccacaggcgtgagccaccacgcccggcTTGAG...


In [7]:
# Write the filtered K562 records under the fastas_2/ prefix; the source file under fastas/ is left untouched.
pandas_to_fasta('gs://syrgoth/boda_library_design_202112/fastas_2/control_DHS_K562__asiSI_filtered.fa', k562_fasta)

dumping file to gs://syrgoth/boda_library_design_202112/fastas_2/control_DHS_K562__asiSI_filtered.fa
Success


In [8]:
# Load the HEPG2 control DHS records, drop every record flagged by
# check_asiSI, then append the replacement records from HEPG2_update.
hepg2_fasta = load_fasta('gs://syrgoth/boda_library_design_202112/fastas/control_DHS_HEPG2.fa')

# Run the site check once per record; the mask feeds both the report and the filter.
hepg2_has_site = [check_asiSI(seq) for seq in hepg2_fasta['sequence']]

print(f"dropping {sum(hepg2_has_site)} lines", file=sys.stderr)
print(f"replacing with {len(HEPG2_update)} lines", file=sys.stderr)

hepg2_fasta = hepg2_fasta.loc[[not flagged for flagged in hepg2_has_site]].reset_index(drop=True)
# DataFrame.append was removed in pandas 2.0; add all replacements with a
# single concat. The guard skips the concat when there is nothing to add.
if HEPG2_update:
    hepg2_fasta = pd.concat([hepg2_fasta, pd.DataFrame(HEPG2_update)], ignore_index=True)

loading file from gs://syrgoth/boda_library_design_202112/fastas/control_DHS_HEPG2.fa
Success
dropping 0 lines
replacing with 0 lines


In [9]:
# Show the filtered HEPG2 table as the cell output for a visual check.
hepg2_fasta

Unnamed: 0,id,sequence
0,>control_DHS_H_34251::chr3:194496600-194496800,aaagtccaagctcttagggccccacaccaaagctagctgcctgttt...
1,>control_DHS_H_34252::chr3:194497831-194498031,GGCAGCTCCTCCATGTGCCTGTTTCAGATGCCAAGAAGAGAAGCTC...
2,>control_DHS_H_49049::chrY:2824580-2824780,aagtttacgaagcccctcatatcgtgatggaatgcagccctccgga...
3,>control_DHS_H_49051::chrY:2841470-2841670,TGCTAAGAAATTCAGTTCCAGGATATGAACTCTACAGCGGAAGAAT...
4,>control_DHS_H_49054::chrY:6996030-6996230,TTTTTCCATGTGATGGGACAGGGGAATGCTGTGTACATGGCCGCTG...
...,...,...
3995,>control_DHS_H_13826::chr14:73996010-73996210,TTCCCAGAACGTCACACTCTGGCTTCCCTTCTTCCATGTCCCCGGA...
3996,>control_DHS_H_42716::chr7:76304235-76304435,TGTTCTTTTGATACATTTATCTTCTGTTTTTCTCAAATAAAGTTCA...
3997,>control_DHS_H_22806::chr19:18478620-18478820,GCCTGGTAGGGCCGCCACAGCTGGCCTGACACAGTCAGCAGGTCAG...
3998,>control_DHS_H_14485::chr15:25773340-25773540,TCTGCGGAGGTGCTCTGACAGGTCTTGCATTGTGCTCTGTGGACAC...


In [10]:
# Write the filtered HEPG2 records under the fastas_2/ prefix; the source file under fastas/ is left untouched.
pandas_to_fasta('gs://syrgoth/boda_library_design_202112/fastas_2/control_DHS_HEPG2__asiSI_filtered.fa', hepg2_fasta)

dumping file to gs://syrgoth/boda_library_design_202112/fastas_2/control_DHS_HEPG2__asiSI_filtered.fa
Success


In [11]:
# Load the SKNSH control DHS records, drop every record flagged by
# check_asiSI, then append the replacement records from SKNSH_update.
sknsh_fasta = load_fasta('gs://syrgoth/boda_library_design_202112/fastas/control_DHS_SKNSH.fa')

# Run the site check once per record; the mask feeds both the report and the filter.
sknsh_has_site = [check_asiSI(seq) for seq in sknsh_fasta['sequence']]

print(f"dropping {sum(sknsh_has_site)} lines", file=sys.stderr)
print(f"replacing with {len(SKNSH_update)} lines", file=sys.stderr)

sknsh_fasta = sknsh_fasta.loc[[not flagged for flagged in sknsh_has_site]].reset_index(drop=True)
# DataFrame.append was removed in pandas 2.0; add all replacements with a
# single concat. The guard skips the concat when there is nothing to add.
if SKNSH_update:
    sknsh_fasta = pd.concat([sknsh_fasta, pd.DataFrame(SKNSH_update)], ignore_index=True)

loading file from gs://syrgoth/boda_library_design_202112/fastas/control_DHS_SKNSH.fa
Success
dropping 3 lines
replacing with 3 lines


In [12]:
# Show the filtered SKNSH table as the cell output for a visual check.
sknsh_fasta

Unnamed: 0,id,sequence
0,>control_DHS_S_7829::chr11:96341553-96341753,CGCAGGAGGGGCAGCAGGGGGCGGTGGAGGGGCTGTAGTGGTGGCA...
1,>control_DHS_S_18572::chr17:78317157-78317357,GCAGGGAAGTATAACTGATCTGAAGGGTGGGGTGGGGGTTCTGCTC...
2,>control_DHS_S_42077::chr9:134709683-134709883,CAGGTTTGTTCCATGGAAAATGTGAGCCACAGACCCATCTCCCAGC...
3,>control_DHS_S_42554::chrX:49073970-49074170,AGCCGCACCTCCGACATCCTGCCGGTTAATGTGGCTGGACCAGCCA...
4,>control_DHS_S_13632::chr15:60661160-60661360,TCATTAGCTACCTTTATATAAAAAAATCACTGTAATTTGTACTAAC...
...,...,...
3995,>control_DHS_S_26860::chr22:26512753-26512953,tgtaaatctgtaaaatagagatgtaggattagcccatacggtagtt...
3996,>control_DHS_S_26893::chr22:28329310-28329510,ACAAGTACAGTTGAAATACACTGTTTCtctgtcagtgtacatccca...
3997,>control_DHS_S_42234::chrX:13377400-13377428,CCTTCGTATCACGCCTTTCCAGTGAGTAAGATCATCTAGATAGATT...
3998,>control_DHS_S_17598::chr17:50038379-50038620,GCATCCCCTCCTGGAGAGCTTCTGCCTGCTGTCACCCTGTCCAAGG...


In [13]:
# Write the filtered SKNSH records under the fastas_2/ prefix; the source file under fastas/ is left untouched.
pandas_to_fasta('gs://syrgoth/boda_library_design_202112/fastas_2/control_DHS_SKNSH__asiSI_filtered.fa', sknsh_fasta)

dumping file to gs://syrgoth/boda_library_design_202112/fastas_2/control_DHS_SKNSH__asiSI_filtered.fa
Success


# GW FASTA

In [14]:
# Replacement records for the K562 genome-wide ("gw") FASTA, in the same
# 'id' / 'sequence' dict layout as the table returned by load_fasta. The cell
# below appends them after dropping the records flagged by check_asiSI; the
# recorded run dropped 2 records, matching the 2 entries here.
GW_update=[
    {'id': '>chr3:84759100-84759300', 'sequence': 'GCAGCTGGAAGAGCCTGATAAGCATTCCCAGTCTCCAATGTGAACCCAGGAAAGTCATCCCTGACTGTGTCTCACAGGGGCCCTCAGGAAAGGCCGCCAGCAAACTCAAGGAGGGGTCAAAGGGTGAAAGAAGGTCCCGGTTAAAATTTGTGATATAATTTTGAGTGGGCACCGACTCCCTTGAACAGCATCTGGTGGGT'},
    {'id': '>chr2:112069700-112069900', 'sequence': 'CTGTGTCAAACGGTAGTTCTCTTTTCAGCTCTTTGAGGAATTGCCACACCGCTTTCCACAATGGGTGAACCAATTTGTACTCCCACCAGCAGTGTATAAGCATTCCCTTTTCTCCACAACCTTGCCAGCATCTGTTATTTTTTGACTTTTTAATAATAGCCATTCTGACTGGTGTGAGATGGTATCTCATTGTGGTTTTG'},
]

In [15]:
# Load the K562 genome-wide records, drop every record flagged by
# check_asiSI, then append the replacement records from GW_update.
gw_fasta = load_fasta('gs://syrgoth/boda_library_design_202112/fastas/rc__k562__gw__track_000__round_0__20211210_203441__47591334.fa')

# Run the site check once per record; the mask feeds both the report and the filter.
gw_has_site = [check_asiSI(seq) for seq in gw_fasta['sequence']]

print(f"dropping {sum(gw_has_site)} lines", file=sys.stderr)
print(f"replacing with {len(GW_update)} lines", file=sys.stderr)

gw_fasta = gw_fasta.loc[[not flagged for flagged in gw_has_site]].reset_index(drop=True)
# DataFrame.append was removed in pandas 2.0; add all replacements with a
# single concat. The guard skips the concat when there is nothing to add.
if GW_update:
    gw_fasta = pd.concat([gw_fasta, pd.DataFrame(GW_update)], ignore_index=True)

loading file from gs://syrgoth/boda_library_design_202112/fastas/rc__k562__gw__track_000__round_0__20211210_203441__47591334.fa
Success
dropping 2 lines
replacing with 2 lines


In [16]:
# Write the filtered K562 genome-wide records under the fastas_2/ prefix, keeping the original file name.
pandas_to_fasta('gs://syrgoth/boda_library_design_202112/fastas_2/rc__k562__gw__track_000__round_0__20211210_203441__47591334.fa', gw_fasta)

dumping file to gs://syrgoth/boda_library_design_202112/fastas_2/rc__k562__gw__track_000__round_0__20211210_203441__47591334.fa
Success


In [18]:
# The HEPG2 and SKNSH genome-wide FASTAs are copied to fastas_2/ unchanged so
# that prefix holds the complete file set.
# NOTE(review): these two files are never run through check_asiSI in this
# notebook - presumably they were checked elsewhere; confirm.
hepg2_gw_src = 'gs://syrgoth/boda_library_design_202112/fastas/rc__hepg2__gw__track_000__round_0__20211210_203441__37997823.fa'
hepg2_gw_dst = 'gs://syrgoth/boda_library_design_202112/fastas_2/rc__hepg2__gw__track_000__round_0__20211210_203441__37997823.fa'
sknsh_gw_src = 'gs://syrgoth/boda_library_design_202112/fastas/rc__sknsh__gw__track_000__round_0__20211210_203441__82237643.fa'
sknsh_gw_dst = 'gs://syrgoth/boda_library_design_202112/fastas_2/rc__sknsh__gw__track_000__round_0__20211210_203441__82237643.fa'

# check_call raises CalledProcessError if a copy fails, so a 0 shown as the
# cell output means both copies succeeded.
subprocess.check_call(['gsutil', 'cp', hepg2_gw_src, hepg2_gw_dst])
subprocess.check_call(['gsutil', 'cp', sknsh_gw_src, sknsh_gw_dst])

0