In [1]:
import os
import sys
import re
import time
import yaml
import shutil
import argparse
import tarfile
import tempfile
import random
import subprocess
import copy
import functools

import numpy as np
import pandas as pd

from tqdm import tqdm

import torch

import boda
from boda.generator.parameters import BasicParameters
from boda.common import utils
from boda.common.utils import set_best, save_model, unpack_artifact, model_fn

import hypertune

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.graphics.tsaplots import plot_acf

from scipy import spatial
from scipy.cluster import hierarchy

In [2]:
# Cloud Storage URI of the FASTA file that the cell below loads with
# load_fasta() and checks for duplicated sequences.
fasta_fn = 'gs://syrgoth/boda_library_design_202112/test_design__20211215.fa'

In [3]:
def cloud_load(func):
    """Let a file-reading function accept a ``gs://`` URI as its first argument.

    The wrapped function must take the path to read as its first positional
    argument. When that path is a string starting with ``gs://`` the object is
    copied with ``gsutil cp`` into a temporary directory, ``func`` is called
    with the temporary local path, and the directory is removed afterwards.
    Any other path (local string, ``pathlib.Path``, ...) is handed to ``func``
    unchanged.

    Only the path is substituted; every other positional and keyword argument
    is forwarded as-is (no copies are made).

    Args:
        func: callable whose first positional argument is the file to read.

    Returns:
        The wrapped callable; it returns whatever ``func`` returns.

    Raises:
        subprocess.CalledProcessError: if ``gsutil cp`` exits with an error.
        IndexError: if the wrapped function is called without positional
            arguments (the path must be passed positionally).
    """
    @functools.wraps(func)
    def wrapper_decorator(*args, **kwargs):
        source = args[0]
        print(f'loading file from {source}', file=sys.stderr)
        # Only plain strings can be gs:// URIs; path-like objects are local.
        if isinstance(source, str) and source.startswith('gs://'):
            with tempfile.TemporaryDirectory() as tempdir:
                temp_loc = os.path.join(tempdir, os.path.basename(source))
                subprocess.check_call(['gsutil', 'cp', source, temp_loc])
                # func must run inside the ``with`` block: the downloaded copy
                # is deleted as soon as the temporary directory is closed.
                value = func(temp_loc, *args[1:], **kwargs)
        else:
            value = func(*args, **kwargs)
        print('Success', file=sys.stderr)
        return value
    return wrapper_decorator

def cloud_dump(func):
    """Let a file-writing function accept a ``gs://`` URI as its first argument.

    The wrapped function must take the path to write as its first positional
    argument. When that path is a string starting with ``gs://``, ``func`` is
    called with a path inside a temporary directory and the file it wrote is
    then uploaded to the requested URI with ``gsutil cp``. Any other path
    (local string, ``pathlib.Path``, ...) is handed to ``func`` unchanged.

    Only the path is substituted; every other positional and keyword argument
    (for example the data being written) is forwarded as-is, without copying.

    Args:
        func: callable whose first positional argument is the file to write.

    Returns:
        The wrapped callable; it returns whatever ``func`` returns.

    Raises:
        subprocess.CalledProcessError: if ``gsutil cp`` exits with an error.
        IndexError: if the wrapped function is called without positional
            arguments (the path must be passed positionally).
    """
    @functools.wraps(func)
    def wrapper_decorator(*args, **kwargs):
        destination = args[0]
        print(f'dumping file to {destination}', file=sys.stderr)
        # Only plain strings can be gs:// URIs; path-like objects are local.
        if isinstance(destination, str) and destination.startswith('gs://'):
            with tempfile.TemporaryDirectory() as tempdir:
                temp_loc = os.path.join(tempdir, os.path.basename(destination))
                value = func(temp_loc, *args[1:], **kwargs)
                # Upload before leaving the ``with`` block, while the
                # temporary file still exists.
                subprocess.check_call(['gsutil', 'cp', temp_loc, destination])
        else:
            value = func(*args, **kwargs)
        print('Success', file=sys.stderr)
        return value
    return wrapper_decorator

@cloud_load
def load_fasta(fasta_fn):
    """Read a FASTA file into a two-column DataFrame.

    A record starts at a line beginning with ``>``; all following non-header
    lines up to the next header are concatenated into that record's sequence,
    so both one-line and wrapped (multi-line) sequences are supported. Blank
    lines are ignored. Trailing whitespace is stripped from every line.

    The header line is stored verbatim, *including* the leading ``>``, so the
    ``id`` column can be written back unchanged by ``pandas_to_fasta``.

    In this file the function is decorated with ``cloud_load``, so
    ``fasta_fn`` may also be a ``gs://`` URI.

    Args:
        fasta_fn: path of the FASTA file to read.

    Returns:
        pandas.DataFrame with string columns ``id`` and ``sequence``, one row
        per record, in file order.

    Raises:
        ValueError: if sequence text appears before the first ``>`` header.
    """
    ids = []
    sequences = []
    chunks = None  # pieces of the record being read; None before any header
    with open(fasta_fn, 'r') as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                if chunks is not None:
                    sequences.append(''.join(chunks))
                ids.append(line)
                chunks = []
            elif chunks is None:
                raise ValueError(
                    f'{fasta_fn}: line {line_no} has sequence data before any ">" header'
                )
            else:
                chunks.append(line)
    # Flush the final record, which has no following header to close it.
    if chunks is not None:
        sequences.append(''.join(chunks))
    return pd.DataFrame({'id': ids, 'sequence': sequences})

@cloud_dump
def pandas_to_fasta(fasta_fn, data, carat_field='id', sequence_field='sequence'):
    """Write the rows of a DataFrame to a text file as two-line records.

    For each row, the value of ``carat_field`` is written on one line and the
    value of ``sequence_field`` on the next. Values are written exactly as
    ``str()`` renders them; nothing (such as a leading ``>``) is added.

    In this file the function is decorated with ``cloud_dump``, so
    ``fasta_fn`` may also be a ``gs://`` URI.

    Args:
        fasta_fn: path of the file to create (an existing file is overwritten).
        data: DataFrame holding the records.
        carat_field: column written as the first line of each record.
        sequence_field: column written as the second line of each record.

    Returns:
        None.
    """
    def _record_lines():
        # Two output lines per row: header value first, then the sequence.
        for _, row in data.iterrows():
            yield str(row[carat_field]) + '\n'
            yield str(row[sequence_field]) + '\n'

    with open(fasta_fn, 'w') as f:
        f.writelines(_record_lines())

In [4]:
# Load the test design and report how many rows repeat a sequence that
# already appeared earlier in the file.
check_seqs = load_fasta(fasta_fn)
print(
    f"found {check_seqs['sequence'].duplicated().sum()}/{len(check_seqs)} duplicated sequences"
)


loading file from gs://syrgoth/boda_library_design_202112/test_design__20211215.fa


found 57/117900 duplicated sequences


Success


In [5]:
# Show the table returned by load_fasta (columns 'id' and 'sequence') as the
# cell output.
check_seqs

Unnamed: 0,id,sequence
0,>20211210_200256__612928__61::fsp_uc__hepg2__0,GTCGCTATAGAAGTAGGAGGCCTCACCGTTCGATTGCGTGCACAAG...
1,>20211210_200256__612928__83::fsp_uc__hepg2__0,ACGTGGATTGTCTATCACCTACTCGGGAAGACCTCGGCTTGGGACC...
2,>20211210_200256__612928__126::fsp_uc__hepg2__0,CGCTCTACAACAACCTATTGTTGGGGGGTTGTTGTCAGTGTTCACC...
3,>20211210_200256__612928__190::fsp_uc__hepg2__0,AGAGGGCACATTGGATAACTGGTTAATTATTAACCAGTGCCCCAGT...
4,>20211210_200256__612928__195::fsp_uc__hepg2__0,CGGTTGGTATCCACCGCTGGTTACCCAAGCTCCACTGGTTAATCAT...
...,...,...
117895,>control_DHS_S_26860::chr22:26512753-26512953,TGTAAATCTGTAAAATAGAGATGTAGGATTAGCCCATACGGTAGTT...
117896,>control_DHS_S_26893::chr22:28329310-28329510,ACAAGTACAGTTGAAATACACTGTTTCTCTGTCAGTGTACATCCCA...
117897,>peakS_42234,CCTTCGTATCACGCCTTTCCAGTGAGTAAGATCATCTAGATAGATT...
117898,>peakS_17598,GCATCCCCTCCTGGAGAGCTTCTGCCTGCTGTCACCCTGTCCAAGG...


In [4]:
# Load two versions of the round-1 design (dated 20211213 and 20211214) and
# report the number of entries in each column. Plain loops are used instead of
# a list comprehension bound to `_`, which built a throwaway list of None and
# shadowed IPython's last-output variable.
old_seqs = load_fasta('gs://syrgoth/boda_library_design_202112/boda_design__round_1__20211213.fa')
for key in old_seqs.keys():
    print(f'key: {key} has {len(old_seqs[key])} items')

new_seqs = load_fasta('gs://syrgoth/boda_library_design_202112/boda_design__round_1__20211214.fa')
for key in new_seqs.keys():
    print(f'key: {key} has {len(new_seqs[key])} items')


loading file from gs://syrgoth/boda_library_design_202112/boda_design__round_1__20211213.fa
Success
loading file from gs://syrgoth/boda_library_design_202112/boda_design__round_1__20211214.fa


key: id has 117900 items
key: sequence has 117900 items
key: id has 117900 items
key: sequence has 117900 items


Success


In [5]:
# Show the 'sequence' column of the design loaded from the 20211213 file.
old_seqs['sequence']

0         CAATCTGTTCCCACCATCAGGTTACTGGTTAATCATTAACCAGGGC...
1         GGAGCTCTTTCAATGCCAGGCAGGCCCGTGGTGCAGTAAAGGCCTG...
2         GGTTCAAAGTCCAGTCACAGTCCAAAGTCCACTCTTTGTCCAAGAG...
3         TAATAGTATGTCATCGTAATATGCGGACTTTGAACCGTGTACCAGG...
4         AACACGGGCACGGGGGGGCGCTGTTTGAGTTTAAAAACGATCTAGG...
                                ...                        
117895    CACTTAGTTTCATTTGCTGTGGGACACATGTACACAGGAAGAGGCC...
117896    AACTGAGCCCGACAGATTCCATCAGCGACTGGGGTCATCAGCCCGC...
117897    GAAAATACCCTCTCACTCCTTTTACAGTGGGTCATGCACTAAGATC...
117898    TGTAAATCTGTAAAATAGAGATGTAGGATTAGCCCATACGGTAGTT...
117899    ACAAGTACAGTTGAAATACACTGTTTCTCTGTCAGTGTACATCCCA...
Name: sequence, Length: 117900, dtype: object

In [6]:
# Number of rows whose sequence already occurred earlier in the same design,
# reported for the 20211213 design first and the 20211214 design second.
print(
    f"found {old_seqs['sequence'].duplicated().sum()}/{len(old_seqs)} duplicated sequences"
)
print(
    f"found {new_seqs['sequence'].duplicated().sum()}/{len(new_seqs)} duplicated sequences"
)



found 57/117900 duplicated sequences
found 57/117900 duplicated sequences


In [7]:
# Distinct sequences that occur in both designs; the order of the resulting
# Series follows set iteration order and carries no meaning.
pd.Series(
    list(
        set(new_seqs['sequence']).intersection(set(old_seqs['sequence']))
    )
)

0         ATCCACAGACCGGATTCGTCAACTAAATGGACTGAAGCCTTCGGTG...
1         TCGTGCTCCGCTGTGATTTCCCGCGACGCAAGATTCTGAGGAACTT...
2         CCTGGATACGCCATACAGCGCCTGCTGTGAAGGTAGCCTTATTTCA...
3         CGATGCTCCGCATCCTTGGCATCATTCGAGCCGGAACCTTAAGAGC...
4         TCTATCCGCCCGGGGTCTGAGGAAGTTGTTGAAGGGGCACTCGATA...
                                ...                        
105838    ATTCTACTACTGCGCTAAGTTCTCACCGGCAGGGCGACTTAGAATC...
105839    GCACTGGGCAAGTTAACCATTAACCAGGCATTGCGCAAATCATAGC...
105840    GAGCCGGGTATCGGGACGGGTACTTAGTTTTTTGAGCATTCCTGGC...
105841    TTTAGCGGCTGGATGCCCAGTTAGGGGTTGGTGGCGGAAGATGCTG...
105842    GTGCTCGGATTGTTTTTGTTGAGGAGACGCGCGTCATGCATCACCA...
Length: 105843, dtype: object