In [1]:
import sys
import os
import subprocess
import tarfile
import shutil
import math
import time
import random
import tempfile

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d
from Bio import motifs
import pickle
from datetime import datetime
from tqdm import tqdm

import boda
from boda.generator.parameters import StraightThroughParameters
from boda.generator import AdaLead
from boda.generator.energy import OverMaxEnergy, StremePenalty
from boda.common import constants, utils
from boda.generator.plot_tools import matrix_to_dms, ppm_to_IC, ppm_to_pwm

boda_src = os.path.join( os.path.dirname( os.path.dirname( os.getcwd() ) ), 'src' )
sys.path.insert(0, boda_src)

from main import unpack_artifact, model_fn
from pymeme import streme, parse_streme_output

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
def fasta_to_tensor(file_name):
    fasta_dict = {}
    with open(file_name, 'r') as f:
        for line in f:
            line_str = str(line)
            if line_str[0] == '>':
                my_id = line_str.lstrip('>').rstrip('\n')
                fasta_dict[my_id] = ''
            else:
                fasta_dict[my_id] += line_str.rstrip('\n')
    seq_tensors = []
    for sequence in list(fasta_dict.values()):
        seq_tensors.append(utils.dna2tensor(sequence))
    return torch.stack(seq_tensors, dim=0)

In [6]:
logs_to_pickle = ['010', '012', '013', '014']#['001', '002', '003', '005', '006', '007', '008', '009', '010']
rootdir = 'run_logs/'
save_path = 'gs://syrgoth/boda_library_design_202112/pickles'

print(f'Saving as pickles in {save_path}')
for f in os.scandir(rootdir):
    if f.is_dir() and '.ipynb_checkpoints' not in f.path:
        args_path = f.path + '/settings.txt'
        args = {}
        with open(args_path) as temp_file:
            lines = [line.rstrip('\n') for line in temp_file]
            for line in lines:
                key, value = line.split(': ')
                args[key] = value
        for subdir, dirs, files in os.walk(f.path):
            for file in files:
                filepath = subdir + os.sep + file
                if filepath.endswith(".fa"):
                    file_name = file.rstrip('.fa') + '.pt'
                    name_parts = file_name.split('__')     
                    track_num = name_parts[3].split('_')[1]
                    if track_num in logs_to_pickle:
                        save_dict = {}
                        proposals_dict = {}
                        save_dict['args'] = args
                        save_dict['timestamp'] = name_parts[-2]
                        save_dict['random_tag'] = name_parts[-1]
                        proposals_dict['proposals'] = fasta_to_tensor(filepath)
                        save_dict['proposals'] = [proposals_dict]
                        with tempfile.TemporaryDirectory() as tmpdir:
                            print(file_name)
                            temp_loc = os.path.join(tmpdir, file_name)                           
                            torch.save(save_dict, temp_loc)
                            cloud_target = os.path.join(save_path, file_name)
                            subprocess.check_call(
                                ['gsutil', 'cp', temp_loc, cloud_target]
                            )

Saving as pickles in gs://syrgoth/boda_library_design_202112/pickles
rc__sknsh__al__track_010__round_0__20211208_041545__46700778.pt
rc__sknsh__al__track_010__round_3__20211208_050948__55024735.pt
rc__sknsh__al__track_010__round_2__20211208_045216__32596306.pt
rc__sknsh__al__track_010__round_1__20211208_043442__96006237.pt
rc__sknsh__al__track_010__round_5__20211208_054338__84059769.pt
rc__sknsh__al__track_010__round_4__20211208_052650__84057002.pt
rc__hepg2__al_uc__track_013__round_0__20211210_215555__90642008.pt
rc__k562__al_uc__track_012__round_0__20211210_213457__76910589.pt
rc__sknsh__al_uc__track_014__round_0__20211210_222517__22805080.pt


In [7]:
!gsutil ls $'gs://syrgoth/boda_library_design_202112/pickles'

gs://syrgoth/boda_library_design_202112/pickles/
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_002__round_0__20211206_025443__72806280.pt
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_002__round_1__20211206_025528__41332638.pt
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_002__round_2__20211206_025555__20655304.pt
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_002__round_3__20211206_025633__84440783.pt
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_002__round_4__20211206_025708__20020621.pt
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_002__round_5__20211206_025737__87493515.pt
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_005__round_0__20211206_024503__33072627.pt
gs://syrgoth/boda_library_design_202112/pickles/rc__hepg2__al__track_009__round_0__20211208_015040__72147769.pt
gs://syrgoth/boda_library_design_202112/pickle

In [73]:
save_dict

{'args': {'bias_cell': 'sknsh',
  'num_proposed_sequences': '[100, 100, 100, 100, 100, 100]',
  'num_generations': '30',
  'score_pct': '0.1',
  'model': 'gs://syrgoth/aip_ui_test/model_artifacts__20211113_021200__287348.tar.gz',
  'model_queries_per_batch': '800',
  'eval_batch_size': '20',
  'sequences_batch_size': '20',
  'rho': '2',
  'threshold': '0.25',
  'recomb_rate': '0.1',
  'mu': '1'},
 'timestamp': '20211206_025338',
 'random_tag': '24126851.pt',
 'proposals': [{'proposals': tensor([[[1., 1., 0.,  ..., 0., 0., 0.],
            [0., 0., 0.,  ..., 1., 1., 0.],
            [0., 0., 0.,  ..., 0., 0., 1.],
            [0., 0., 1.,  ..., 0., 0., 0.]],
   
           [[0., 1., 0.,  ..., 1., 0., 0.],
            [1., 0., 0.,  ..., 0., 0., 0.],
            [0., 0., 0.,  ..., 0., 1., 1.],
            [0., 0., 1.,  ..., 0., 0., 0.]],
   
           [[0., 0., 0.,  ..., 0., 1., 0.],
            [1., 1., 0.,  ..., 1., 0., 1.],
            [0., 0., 0.,  ..., 0., 0., 0.],
            [0., 