In [1]:
import sys
import os
import subprocess
import tarfile
import shutil
import math
import time
import random
import tempfile

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d
from Bio import motifs
import pickle
from datetime import datetime
from tqdm import tqdm

import boda
from boda.generator.parameters import StraightThroughParameters
from boda.generator import AdaLead
from boda.generator.energy import OverMaxEnergy, StremePenalty
from boda.common import constants, utils
from boda.generator.plot_tools import matrix_to_dms, ppm_to_IC, ppm_to_pwm

boda_src = os.path.join( os.path.dirname( os.path.dirname( os.getcwd() ) ), 'src' )
sys.path.insert(0, boda_src)

from main import unpack_artifact, model_fn
from pymeme import streme, parse_streme_output

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
def fasta_to_tensor(file_name):
    fasta_dict = {}
    with open(file_name, 'r') as f:
        for line in f:
            line_str = str(line)
            if line_str[0] == '>':
                my_id = line_str.lstrip('>').rstrip('\n')
                fasta_dict[my_id] = ''
            else:
                fasta_dict[my_id] += line_str.rstrip('\n')
    seq_tensors = []
    for sequence in list(fasta_dict.values()):
        seq_tensors.append(utils.dna2tensor(sequence))
    return torch.stack(seq_tensors, dim=0)

In [15]:
rootdir = 'top_sequences_files'
save_path = 'gs://syrgoth/boda_library_design_202112/pickles'
args = {'model': 'gs://syrgoth/aip_ui_test/model_artifacts__20211113_021200__287348.tar.gz',
       'step_size': 50}

for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if file.endswith(".fa"):
            file_name = os.path.basename(file).rstrip('.fa') + '.pt'
            filepath = os.path.join(rootdir, file)
            save_dict = {}
            proposals_dict = {}
            name_parts = file_name.split('__') 
            save_dict['args'] = args
            save_dict['timestamp'] = name_parts[-2]
            save_dict['random_tag'] = name_parts[-1]
            proposals_dict['proposals'] = fasta_to_tensor(filepath)
            save_dict['proposals'] = [proposals_dict]
            with tempfile.TemporaryDirectory() as tmpdir:
                print(file_name)
                temp_loc = os.path.join(tmpdir, file_name)                           
                torch.save(save_dict, temp_loc)
                cloud_target = os.path.join(save_path, file_name)
                subprocess.check_call(
                    ['gsutil', 'cp', temp_loc, cloud_target]
                )

rc__sknsh__gw__track_000__round_0__20211210_203441__82237643.pt
rc__hepg2__gw__track_000__round_0__20211210_203441__37997823.pt
rc__k562__gw__track_000__round_0__20211210_203441__47591334.pt


In [13]:
save_dict

{'args': {'model': 'gs://syrgoth/aip_ui_test/model_artifacts__20211113_021200__287348.tar.gz',
  'step_size': 50},
 'timestamp': '20211210_203441',
 'random_tag': '47591334.pt',
 'proposals': [{'proposals': tensor([[[0., 0., 0.,  ..., 0., 1., 0.],
            [0., 0., 0.,  ..., 0., 0., 0.],
            [1., 1., 0.,  ..., 0., 0., 0.],
            [0., 0., 1.,  ..., 1., 0., 1.]],
   
           [[0., 0., 0.,  ..., 0., 0., 0.],
            [0., 1., 1.,  ..., 0., 0., 1.],
            [1., 0., 0.,  ..., 0., 0., 0.],
            [0., 0., 0.,  ..., 1., 1., 0.]],
   
           [[1., 0., 1.,  ..., 0., 1., 1.],
            [0., 0., 0.,  ..., 0., 0., 0.],
            [0., 0., 0.,  ..., 0., 0., 0.],
            [0., 1., 0.,  ..., 1., 0., 0.]],
   
           ...,
   
           [[0., 1., 0.,  ..., 0., 1., 0.],
            [0., 0., 1.,  ..., 0., 0., 0.],
            [0., 0., 0.,  ..., 0., 0., 1.],
            [1., 0., 0.,  ..., 1., 0., 0.]],
   
           [[0., 0., 0.,  ..., 1., 1., 0.],
        

### Copy fasta files to bucket

In [5]:
root_dir = 'top_sequences_files'
target_dir = 'gs://syrgoth/chunked_genome/top_sequences'

for subdir, dirs, files in os.walk(root_dir):
    for file in tqdm(files):
        origin_path = os.path.join(root_dir, file)
        target_path = os.path.join(target_dir, file)
        subprocess.check_call(['gsutil', 'cp', origin_path, target_path])

100%|██████████| 6/6 [00:07<00:00,  1.32s/it]
0it [00:00, ?it/s]
