In [1]:
import sys
import os
import subprocess
import tarfile
import shutil
import math
import time
import random
import tempfile
from functools import partial
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (random_split, DataLoader, TensorDataset, ConcatDataset)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d
from Bio import motifs

import boda
from boda.generator.parameters import StraightThroughParameters
from boda.generator import FastSeqProp
from boda.generator.plot_tools import matrix_to_dms, ppm_to_IC, ppm_to_pwm
from boda.model.mpra_basset import MPRA_Basset
from boda.common import constants, utils

# Build the path "<two directories above the current working directory>/src"
# and put it first on the module search path.
boda_src = os.path.join( os.path.dirname( os.path.dirname( os.getcwd() ) ), 'src' )
sys.path.insert(0, boda_src)

# `display` and `HTML` belong to the public `IPython.display` API; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS so elements with the `.container` class use 98% of the width.
display(HTML("<style>.container { width:98% !important; }</style>"))

# NOTE(review): `main` and `pymeme` are presumably found through the
# `boda_src` entry added to `sys.path` above -- confirm against the repo layout.
from main import unpack_artifact, model_fn
from pymeme import streme, parse_streme_output

In [7]:
def get_specific_sequences(in_df, fitness_threshold=4.0, bias_cell=0):
    """Return the rows whose cell-specific score column reaches a threshold.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Table holding the ``OverMaxB_*`` column for the requested cell.
    fitness_threshold : float, default 4.0
        Inclusive lower bound applied to the selected score column.
    bias_cell : int or str, default 0
        Cell selector: 0 -> ``OverMaxB_K562``, 1 -> ``OverMaxB_HepG2``,
        2 -> ``OverMaxB_SKNSH``. The value is converted with ``str`` before
        the lookup.

    Returns
    -------
    pandas.DataFrame
        The matching rows, re-indexed from 0.

    Raises
    ------
    KeyError
        If ``bias_cell`` is not one of the three selectors, or the selected
        column is missing from ``in_df``.
    """
    score_columns = {
        '0': 'OverMaxB_K562',
        '1': 'OverMaxB_HepG2',
        '2': 'OverMaxB_SKNSH',
    }
    score_column = score_columns[str(bias_cell)]
    passes_threshold = in_df[score_column] >= fitness_threshold
    selected_rows = in_df.loc[passes_threshold]
    return selected_rows.reset_index(drop=True)

def contains_restriction(sequence, site='GCGATCGC'):
    """Look for a restriction site inside or overhanging the ends of a sequence.

    Three cases are checked, in this order:

    1. ``site`` occurs in full inside ``sequence``.
    2. ``sequence`` starts with ``site`` minus its first base, so a full site
       would begin one base before the sequence.
    3. ``sequence`` ends with ``site`` minus its last base, so a full site
       would end one base after the sequence.

    Whether the flanking base actually completes the site in cases 2 and 3 is
    not checked here.

    Parameters
    ----------
    sequence : str
        Sequence to scan. The comparison is case-sensitive.
    site : str, default 'GCGATCGC'
        Recognition site to search for.

    Returns
    -------
    tuple of (int, bool)
        ``(index, True)`` when a site is found, where ``index`` is the position
        of the first full occurrence (case 1), ``-1`` (case 2), or
        ``len(sequence) - len(site) + 1`` (case 3). ``(len(sequence), False)``
        when nothing is found. For 200-nt input and the default site these are
        193 and 200, the values that were previously hard-coded.
    """
    # A single search gives both the presence test and the position.
    idx = sequence.find(site)
    if idx != -1:
        return idx, True
    if sequence.startswith(site[1:]):
        return -1, True
    if sequence.endswith(site[:-1]):
        # Start of the partial site, computed from the real sequence length.
        return len(sequence) - len(site) + 1, True
    return len(sequence), False

In [3]:
# One list per cell type; each entry is the filtered table from one chunk file.
k562_specific_dfs = []
hepg2_specific_dfs = []
sknsh_specific_dfs = []

root_dir = 'pred_chunks'
for subdir, dirs, files in os.walk(root_dir):
    for file in tqdm(files):
        # Join with the directory currently being walked, not `root_dir`, so
        # files found in nested sub-directories resolve to their real path.
        file_path = os.path.join(subdir, file)
        in_df = pd.read_csv(file_path, sep='\t')
        k562_specific_dfs.append(get_specific_sequences(in_df, bias_cell=0))
        hepg2_specific_dfs.append(get_specific_sequences(in_df, bias_cell=1))
        # SKNSH uses a lower threshold (3.0) than the 4.0 default used above.
        sknsh_specific_dfs.append(get_specific_sequences(in_df, fitness_threshold=3.0, bias_cell=2))

  0%|          | 0/62 [00:00<?, ?it/s]

In [4]:
# Copy the two input files from the GCS bucket into the current working
# directory with the `gsutil` command-line tool (IPython `!` shell escapes).
! gsutil cp gs://syrgoth/boda_library_design_202112/BODA_asiSI_sequences.txt ./
! gsutil cp gs://syrgoth/boda_library_design_202112/boda_design__round_1__20211214.fa ./

# Read the ID listing; the second whitespace-separated field of each line is
# taken as a sequence ID.
with open('BODA_asiSI_sequences.txt', 'r') as id_handle:
    lines = id_handle.readlines()

# Keep only the IDs that contain the substring 'gw'.
sequence_IDs = [
    candidate_id
    for candidate_id in (entry.split()[1] for entry in lines)
    if 'gw' in candidate_id
]

# Parse the design FASTA into {header without '>': concatenated sequence}.
fasta_file_name = 'boda_design__round_1__20211214.fa'
fasta_dict = {}
with open(fasta_file_name, 'r') as fasta_handle:
    for record_line in fasta_handle:
        stripped = record_line.rstrip('\n')
        if record_line.startswith('>'):
            # Header line: start a new, empty record.
            my_id = stripped.lstrip('>')
            fasta_dict[my_id] = ''
            continue
        # Sequence line: extend the record opened by the latest header.
        fasta_dict[my_id] = fasta_dict[my_id] + stripped

# Sequences for the selected IDs, in the same order as `sequence_IDs`.
sequences = [fasta_dict[selected_id] for selected_id in sequence_IDs]

Copying gs://syrgoth/boda_library_design_202112/BODA_asiSI_sequences.txt...
/ [1 files][  8.7 KiB/  8.7 KiB]                                                
Operation completed over 1 objects/8.7 KiB.                                      
Copying gs://syrgoth/boda_library_design_202112/boda_design__round_1__20211214.fa...
- [1 files][ 27.7 MiB/ 27.7 MiB]                                                
Operation completed over 1 objects/27.7 MiB.                                     


In [8]:
# Print the (index, found) result of the restriction-site check per sequence.
for restriction_hit in map(contains_restriction, sequences):
    print(restriction_hit)

(65, True)
(15, True)


In [22]:
# Build the K562 candidate table in one chain:
#   1. stack the per-chunk tables,
#   2. order by `OverMaxB_K562`, highest first,
#   3. drop sequences containing "N",
#   4. keep rows with `K562_pred` above 1,
#   5. renumber the index from 0.
# The equivalent HepG2 and SKNSH pipelines (columns `OverMaxB_HepG2` /
# `HepG2_pred` and `OverMaxB_SKNSH` / `SKNSH_pred`, top 4000 rows each) were
# commented out in this cell and are not run.
k562_specific_df = (
    pd.concat(k562_specific_dfs, ignore_index=True)
    .sort_values(by='OverMaxB_K562', ascending=False, ignore_index=True)
    .loc[lambda table: ~table['nt_sequence'].str.contains("N")]
    .reset_index(drop=True)
    .loc[lambda table: table['K562_pred'] > 1]
    .reset_index(drop=True)
)

# Keep the first 4010 rows of the ordered table.
k562_out_df = k562_specific_df.iloc[:4010]

In [11]:
rootdir = 'top_sequences_files'
fasta_file_names = [
    'rc__hepg2__gw__track_000__round_0__20211210_203441__37997823.fa',
    'rc__k562__gw__track_000__round_0__20211210_203441__47591334.fa',
    'rc__sknsh__gw__track_000__round_0__20211210_203441__82237643.fa',
]

# Merge all three FASTA files into one {header without '>': sequence} mapping.
# A header that repeats, within or across files, restarts its record.
fasta_dict = {}
for fasta_file_name in fasta_file_names:
    with open(os.path.join(rootdir, fasta_file_name), 'r') as fasta_handle:
        for record_line in fasta_handle:
            body = record_line.rstrip('\n')
            if not record_line.startswith('>'):
                # Sequence line: extend the record opened by the latest header.
                fasta_dict[my_id] += body
            else:
                my_id = body.lstrip('>')
                fasta_dict[my_id] = ''

In [18]:
# Reverse lookup: for each sequence, find the first `fasta_dict` key whose
# value equals it. The key and value lists are built once, outside the loop,
# instead of being rebuilt from the whole dict for every sequence.
# `list.index` raises ValueError if a sequence is not among the values.
fasta_keys = list(fasta_dict.keys())
fasta_values = list(fasta_dict.values())

sequence_coords = []
for sequence in sequences:
    sequence_coords.append(fasta_keys[fasta_values.index(sequence)])

In [19]:
# Print the design IDs and their matched keys, one list per line.
print(sequence_IDs, sequence_coords, sep='\n')

['20211210_203441__47591334__2418::gw__k562__0', '20211210_203441__47591334__3234::gw__k562__0']
['chr16:88455900-88456100', 'chr16:88455950-88456150']


In [20]:
# All keys currently in `fasta_dict`, in insertion order.
already_proposed = [fasta_id for fasta_id in fasta_dict]

In [25]:
# Count how many IDs in `k562_out_df` are in `already_proposed` and how many
# are not. A set is built once so each membership test is a hash lookup
# rather than a linear scan of the list.
proposed_lookup = set(already_proposed)
count = sum(1 for ID in k562_out_df['ID'] if ID in proposed_lookup)
not_count = len(k562_out_df['ID']) - count
print(count)
print(not_count)

4000
10


In [26]:
# Show the rows from position 4000 onward.
k562_out_df.iloc[4000:]

Unnamed: 0,ID,nt_sequence,seq_len,K562_pred,HepG2_pred,SKNSH_pred,OverMaxB_K562,OverMaxB_HepG2,OverMaxB_SKNSH
4000,chr10:82311000-82311200,ATTACTGATGAGGTATTACTCTCTGAGTTGCTCTAAGATTGAGCCA...,200,4.816442,0.682662,0.713479,4.584804,-4.630955,-4.584804
4001,chr3:84759100-84759300,GCAGCTGGAAGAGCCTGATAAGCATTCCCAGTCTCCAATGTGAACC...,200,3.381837,-0.115028,-0.146358,4.584784,-4.584784,-4.651821
4002,chr2:112069700-112069900,CTGTGTCAAACGGTAGTTCTCTTTTCAGCTCTTTGAGGAATTGCCA...,200,3.298926,-0.15504,-0.322825,4.584747,-4.584747,-4.96585
4003,chr6:36960600-36960800,GCCCTTTCCCTCTGGATCCTGCCACCCCTCCCTCTCCCTCCACCCC...,200,7.363471,2.836771,2.551322,4.584681,-4.584681,-4.889494
4004,chr9:34975700-34975900,TTTTTTTTTAACTAAAATACATTTTTAGCACAGCCTAACACAGAAG...,200,3.433438,-0.144171,-0.089544,4.584382,-4.700415,-4.584382
4005,chr17:54436400-54436600,AATGGGTGGGTGTGGTTACAGCTTCAGCAGACTTAAACATCCCTGC...,200,4.725089,0.652705,0.447912,4.58415,-4.58415,-4.907268
4006,chr8:105704600-105704800,GATAGAAAGAAAATAAGTAGCGGGTGTGTTGCATTTAATTATGGAA...,200,3.623678,0.006606,-0.136017,4.583804,-4.583804,-4.878712
4007,chrX:71450750-71450950,CTGGCACACAAAAAGTATTAAGAAAATGTCTTATGAATAAATAAGT...,200,4.207224,0.233571,0.328591,4.583682,-4.750466,-4.583682
4008,chr6:156854850-156855050,ATAATTAAAACAAAAATATGATCATAAGAATAAATGAACAAAATGT...,200,4.363891,0.422825,0.265513,4.58353,-4.58353,-4.852461
4009,chr6:168235700-168235900,AGGGGAGGAGGAGCACATGTGATAACTAGGGGAGGAGGAGCATGGT...,200,7.458227,2.927886,2.583333,4.583274,-4.583274,-4.94984


In [38]:
# Rows at positions 4001 and 4002 supply the replacement sequences.
# NOTE(review): position 4000 is skipped; the reason is not visible here.
replacement_rows = k562_out_df.iloc[4001:4003]
new_proposals = replacement_rows['nt_sequence']
new_proposals_IDs = replacement_rows['ID']

# Map each proposed ID to its nucleotide sequence.
new_proposals_dict = {
    proposal_id: proposal_sequence
    for proposal_id, proposal_sequence in zip(new_proposals_IDs, new_proposals)
}

In [41]:
# Print each proposed ID, then its restriction-site check result.
for proposal_ID in new_proposals_dict:
    print(proposal_ID)
    print(contains_restriction(new_proposals_dict[proposal_ID]))

chr3:84759100-84759300
(200, False)
chr2:112069700-112069900
(200, False)


In [46]:
# Write the proposals as FASTA records under their own IDs.
asiSI_file_name = 'BODA_asiSI_gw_sequence_proposals.fa'
with open(asiSI_file_name, 'w') as ofile:
    for ID in new_proposals_dict:
        fasta_record = "".join((">", ID, "\n", new_proposals_dict[ID], "\n"))
        ofile.write(fasta_record)

In [45]:
# Write the proposed sequences as FASTA records, but labelled with the IDs in
# `sequence_IDs`: the n-th proposal gets the n-th entry of `sequence_IDs`.
asiSI_file_name = 'BODA_asiSI_gw_sequence_replacements.fa'
replacement_sequences = list(new_proposals_dict.values())
with open(asiSI_file_name, 'w') as ofile:
    for seq_idx in range(len(replacement_sequences)):
        ID = sequence_IDs[seq_idx]
        fasta_record = "".join((">", ID, "\n", replacement_sequences[seq_idx], "\n"))
        ofile.write(fasta_record)