# Correct mismatch between ordered NDB adaptors and library

Steps:

1. try to map the NDBs in the library to the NDBs that were actually ordered
2. for ordered but unmatched NDBs, change their names in the library to match the NDB adaptors
3. for un-ordered NDBs, rename them to make them consecutive
4. generate adaptors

In [1]:
%run "E:\Users\puzheng\Documents\Startup_py3.py"
sys.path.append(r"E:\Users\puzheng\Documents")

import ImageAnalysis3
from ImageAnalysis3 import get_img_info, visual_tools, corrections, library_tools

from ImageAnalysis3.library_tools import LibraryDesigner as ld
from ImageAnalysis3.library_tools import LibraryTools as lt

%matplotlib notebook
print(os.getpid())

15744


In [2]:
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

# 1. map NDB in library

In [270]:
# Read every probe record of the library FASTA into memory.
library_folder = r'X:\Libraries\CTP-07\chr2'
probe_filename = os.path.join(library_folder, 'final_probes', 'CTP-07.fasta')

pb_records = []
with open(probe_filename, 'r') as handle:
    # consume the parser while the file is still open
    pb_records.extend(SeqIO.parse(handle, "fasta"))

In [271]:
# Map each probe's readout target site (seq[19:39]) to the NDB name found in its id.
library_readout_dict = {}
for _pb in pb_records:
    # text between the first '[' and the following ']' lists the readout names
    _bracket_content = _pb.id.split('[')[1].split(']')[0]
    # keep the first listed name, cut at its first '_u'
    readout_name = _bracket_content.split(',')[0].split('_u')[0]
    readout_site = _pb.seq[19:39]
    if 'NDB' not in readout_name:
        continue
    # the first probe seen for a site wins; later ones never overwrite it
    library_readout_dict.setdefault(str(readout_site), readout_name)
print(len(library_readout_dict))

1033


In [272]:
# Gather the designed readouts from the three filtered FASTA files.
designed_readouts = []
for _i in range(3):
    _readout_file = os.path.join(r'X:\Libraries\Readouts',f"filtered_readouts_{_i}.fasta")
    with open(_readout_file, 'r') as handle:
        designed_readouts.extend(SeqIO.parse(handle, "fasta"))
# order by the integer that follows the last underscore of each id
designed_readouts = sorted(designed_readouts, key=lambda v:int(v.id.split('_')[-1]))
print(len(designed_readouts))
# readout id -> sequence as a plain string
designed_seq_dict = {}
for _r in designed_readouts:
    designed_seq_dict[_r.id] = str(_r.seq)

1216


In [273]:
# load ordered NDB
import csv
ordered_filename = r'X:\Libraries\CTP-07\chr2\ordered_NDB.csv'

ordered_readout_dict = {}  # column 2 (site in probe) -> column 0 (name)
readout_seq_dict = {}      # column 0 (name) -> column 1 (sequence)
readout_site_dict = {}     # column 0 (name) -> column 2 (site in probe)
# newline='' is what the csv module expects so that it handles line endings itself
with open(ordered_filename, 'r', newline='') as _handle:
    _reader = csv.reader(_handle)
    # first row holds the column titles, not data
    _header = next(_reader)

    print("- header:", _header)
    for _content in _reader:
        # csv.reader yields an empty list for a blank line; skip it instead of
        # failing with IndexError on e.g. a trailing empty row
        if not _content:
            continue
        _rd_name, _rd_seq, _rd_site = _content[0], _content[1], _content[2]
        ordered_readout_dict[_rd_site] = _rd_name
        readout_seq_dict[_rd_name] = _rd_seq
        readout_site_dict[_rd_name] = _rd_site

- header: ['Name(New DNA Barcode)', 'Sequence', 'Rev-com last-20 (site in probe)']


In [277]:
# Sort every library readout into one of three groups:
#   matched   - its site was ordered under the same name
#   unmatched - its site was ordered under a different name
#   unordered - its site does not appear among the ordered sites
matched_designed_readouts = {}
unmatched_designed_readouts = {}
designed_to_ordered = {}
unordered_readouts = {}
for _site, _designed_name in library_readout_dict.items():
    _designed_seq = designed_seq_dict[_designed_name]
    if _site not in ordered_readout_dict:
        unordered_readouts[_designed_name] = _designed_seq
        continue
    _ordered_name = ordered_readout_dict[_site]
    if _designed_name == _ordered_name:
        matched_designed_readouts[_designed_name] = _designed_seq
    else:
        unmatched_designed_readouts[_designed_name] = _designed_seq
        # remember which ordered name this designed name must be renamed to
        designed_to_ordered[_designed_name] = _ordered_name

In [276]:
# Check that the site recorded for NDB_1206 is a key of ordered_readout_dict.
readout_site_dict['NDB_1206'] in ordered_readout_dict

True

In [281]:
# Show the designed sequence stored for NDB_1218 among the unmatched readouts.
unmatched_designed_readouts['NDB_1218']

'GAAATTCAGCTAGGCGATAAGCATATCCTC'

In [278]:
# Sizes of the matched, unmatched and un-ordered groups.
len(matched_designed_readouts), len(unmatched_designed_readouts), len(unordered_readouts)

(900, 79, 54)

In [282]:
# Display the designed-name -> ordered-name renaming table.
designed_to_ordered

{'NDB_1139': 'NDB_1138',
 'NDB_1142': 'NDB_1141',
 'NDB_1145': 'NDB_1143',
 'NDB_1148': 'NDB_1146',
 'NDB_1151': 'NDB_1148',
 'NDB_1154': 'NDB_1150',
 'NDB_1138': 'NDB_1137',
 'NDB_1160': 'NDB_1154',
 'NDB_1141': 'NDB_1140',
 'NDB_1163': 'NDB_1156',
 'NDB_1144': 'NDB_1142',
 'NDB_1166': 'NDB_1159',
 'NDB_1147': 'NDB_1145',
 'NDB_1169': 'NDB_1162',
 'NDB_1172': 'NDB_1165',
 'NDB_1175': 'NDB_1167',
 'NDB_1156': 'NDB_1152',
 'NDB_1178': 'NDB_1170',
 'NDB_1159': 'NDB_1153',
 'NDB_1181': 'NDB_1173',
 'NDB_1162': 'NDB_1155',
 'NDB_1184': 'NDB_1175',
 'NDB_1140': 'NDB_1139',
 'NDB_1165': 'NDB_1158',
 'NDB_1187': 'NDB_1178',
 'NDB_1168': 'NDB_1161',
 'NDB_1190': 'NDB_1181',
 'NDB_1146': 'NDB_1144',
 'NDB_1171': 'NDB_1164',
 'NDB_1193': 'NDB_1184',
 'NDB_1149': 'NDB_1147',
 'NDB_1196': 'NDB_1187',
 'NDB_1152': 'NDB_1149',
 'NDB_1177': 'NDB_1169',
 'NDB_1199': 'NDB_1190',
 'NDB_1155': 'NDB_1151',
 'NDB_1180': 'NDB_1172',
 'NDB_1202': 'NDB_1193',
 'NDB_1205': 'NDB_1195',
 'NDB_1186': 'NDB_1177',


# 2. Rename the unmatched designed readouts using the designed_to_ordered dict

In [286]:
from copy import copy

# Shallow-copy each record so that assigning a new id below is done on the copy,
# not on the object stored in pb_records.
updated_pb_records = [copy(_pb) for _pb in pb_records]
updated_ndbs = []
for _pb in updated_pb_records:
    _id_parts = _pb.id.split('[')
    # readout names listed inside the brackets, each cut at its first '_u'
    _names = [_n.split('_u')[0] for _n in _id_parts[1].split(']')[0].split(',')]
    if _names[0] in designed_to_ordered:
        _first_new = designed_to_ordered[_names[0]]
        _second_new = designed_to_ordered[_names[1]]
        # the '_new' tail marks records whose readout names were rewritten
        _pb.id = _id_parts[0] + f'[{_first_new}_u,{_second_new}_u]_new'

        # record each renamed designed name once, in order of first appearance
        if _names[0] not in updated_ndbs:
            updated_ndbs.append(_names[0])

    # name and description are blanked for every record, renamed or not
    _pb.name = ''
    _pb.description = ''

print(len(updated_ndbs))

79


In [288]:
# Count the records whose id contains both 'NDB_1206' and 'new'.
ct = sum(
    1 for _pb in updated_pb_records
    if 'NDB_1206' in _pb.id and 'new' in _pb.id
)
print(ct)

250


# 3. unordered

In [289]:
# List, in order of first appearance, the NDB readout names of the updated
# library that are not in unordered_readouts.
new_library_readouts = []

for _pb in updated_pb_records:
    readout_name = _pb.id.split('[')[1].split(']')[0].split(',')[0].split('_u')[0]
    if 'NDB' not in readout_name:
        continue
    if readout_name in unordered_readouts or readout_name in new_library_readouts:
        continue
    new_library_readouts.append(readout_name)

In [290]:
# Split the readouts into three lists by (number - 1) % 3, then sort each list
# by readout number.
new_library_lists = [[], [], []]
for _rd in new_library_readouts:
    _group = (int(_rd.split('_')[-1]) - 1) % 3
    new_library_lists[_group].append(_rd)
for _lst in new_library_lists:
    _lst.sort(key=lambda v:int(v.split('_')[-1]))

In [291]:
# Show how many readouts landed in each of the three lists.
for _lst in new_library_lists:
    print(len(_lst))

322
318
328


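Distribute the 54 un-ordered readouts over the 3 colors so that the three lists end up balanced (322+19, 318+23, 328+12):
19, 23, 12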

In [292]:
# For un-ordered regions, only change names
unordered_names = sorted(unordered_readouts, key=lambda v:int(v.split('_')[-1]))

# Slice boundaries over unordered_names: the first 19 names go to color 1,
# the next 23 to color 2 and everything that remains to color 3.
_color_bounds = [0, 19, 19+23, None]

unordered_convert_dict = {}
for _color in range(3):
    _chunk = unordered_names[_color_bounds[_color]:_color_bounds[_color+1]]
    for _i, _rd in enumerate(_chunk):
        # new names start at NDB_1216 (+1 per color) and advance in steps of 3,
        # so every name of one color has the same (number - 1) % 3
        unordered_convert_dict[_rd] = f'NDB_{1216+_color+_i*3}'

In [293]:
# Re-key the un-ordered readout sequences by their new names.
unordered_update_seq_dict = {
    unordered_convert_dict[_rd]: _seq
    for _rd, _seq in unordered_readouts.items()
}
print(unordered_update_seq_dict)

{'NDB_1228': 'CCACAGCTATGGCTCGTGAAGTGAAACGAA', 'NDB_1222': 'CCGATCCTTATGAGAGCTCGTTGTTCGGTG', 'NDB_1225': 'AGTGATTATTCCGTGGAAATCCGCATTACC', 'NDB_1216': 'GCCAAGGTACCTAGTCTCGTAATCATAGGA', 'NDB_1219': 'CATTATTGACATGTACGCCATTTGGGTCGC', 'NDB_1237': 'ACTGAGAAGCGATCGTGCTAGTAGTACCGC', 'NDB_1231': 'ACCGAGGCGATATAGGAGAGTCCGCGCTAA', 'NDB_1240': 'CTTCAGAACTCTGGTGTCGAAGTCGCTAAT', 'NDB_1234': 'CGGGTTCCGTGATTCCTCGTCATGATGAGT', 'NDB_1258': 'AGAATAGGGTACCAGTATCGAGCCTAACGC', 'NDB_1261': 'GTTATGCCTATGCTCTCTTAGCGACCGATG', 'NDB_1264': 'TGAGATACCTTCACGGCATTGACCAACGTT', 'NDB_1217': 'ACCTAATTGCGTTGGTTCCTGTATGCACCG', 'NDB_1249': 'GCTGAACGCTATTGGTTGCAATCTTACGCG', 'NDB_1226': 'TGCAAAGTCGTGGCTTCGTATATACTCAAC', 'NDB_1252': 'TGGAATATTCCCGAACAGATATAGGTCACC', 'NDB_1235': 'GCGTTCACTACCAGTCATTGCTCGTAATGG', 'NDB_1244': 'CTTGTATAGTTCGATGGCTCAGTACACATC', 'NDB_1253': 'GCCAAGCAGCACGGTTTAACTGTCCATTAT', 'NDB_1243': 'GACGTAGCATTATGCCCGTCAGACAGAGGC', 'NDB_1259': 'GACCTTGTTACATCGACGTTCTCCAATGTA', 'NDB_1246': 'TCGTTTAGGAATTTAGCGAC

In [294]:
# Rename the probes of un-ordered readouts in place and track which names changed.
updated_unordered_ndbs = []

for _pb in updated_pb_records:
    _id_parts = _pb.id.split('[')
    _names = [_n.split('_u')[0] for _n in _id_parts[1].split(']')[0].split(',')]
    # leave alone probes that need no conversion or whose id already contains 'new'
    if _names[0] not in unordered_convert_dict or 'new' in _pb.id:
        continue
    _first_new = unordered_convert_dict[_names[0]]
    _second_new = unordered_convert_dict[_names[1]]
    _pb.id = _id_parts[0] + f'[{_first_new}_u,{_second_new}_u]_new'

    # record each converted name once, in order of first appearance
    if _names[0] not in updated_unordered_ndbs:
        updated_unordered_ndbs.append(_names[0])

print(len(updated_unordered_ndbs))

54


## save fasta

In [295]:
# Write the updated probe records to a new FASTA file in the final_probes folder.
_updated_fasta = os.path.join(library_folder, 'final_probes', 'CTP-07_updated.fasta')
with open(_updated_fasta, 'w') as output_handle:
    SeqIO.write(updated_pb_records, output_handle, "fasta")

## all ndbs

In [296]:
# Start from the ordered readouts, then add each renamed un-ordered readout
# whose sequence is not present yet; names of duplicates are printed instead.
all_ndbs = dict(readout_seq_dict)
for _new_name, _new_seq in unordered_update_seq_dict.items():
    if _new_seq in all_ndbs.values():
        print(_new_name)
    else:
        all_ndbs[_new_name] = _new_seq

print(len(all_ndbs))

1269


## fill in blank NDBs

In [297]:
# Give every unused name NDB_1 .. NDB_<max_num> a designed NDB sequence that is
# not used by any entry of all_ndbs yet.
max_num = max(int(_rd.split('_')[1]) for _rd in all_ndbs)
# Track used sequences in a set: membership tests are O(1) instead of rebuilding
# and scanning a list of all values for every candidate.
_used_seqs = set(all_ndbs.values())
for _i in range(max_num):
    _name = f'NDB_{_i+1}'
    if _name in all_ndbs:
        continue
    print(_name)
    for _k, _seq in designed_seq_dict.items():
        if 'NDB' in _k and _seq not in _used_seqs:
            print(_k, _name)
            all_ndbs[_name] = _seq
            _used_seqs.add(_seq)
            break
    else:
        # every designed NDB sequence is already in use: report the gap rather
        # than leaving it unfilled without notice
        print(f'no unused designed NDB sequence left for {_name}')

NDB_1254
NDB_1248 NDB_1254
NDB_1257
NDB_1251 NDB_1257
NDB_1260
NDB_1254 NDB_1260
NDB_1263
NDB_1257 NDB_1263
NDB_1266
NDB_1260 NDB_1266
NDB_1269
NDB_1263 NDB_1269
NDB_1272
NDB_1266 NDB_1272
NDB_1273
NDB_1269 NDB_1273
NDB_1275
NDB_1272 NDB_1275
NDB_1276
NDB_1273 NDB_1276
NDB_1278
NDB_1275 NDB_1278
NDB_1279
NDB_1276 NDB_1279
NDB_1281
NDB_1278 NDB_1281
NDB_1282
NDB_1279 NDB_1282


In [298]:
# Total number of entries currently in all_ndbs.
len(all_ndbs)

1283

In [299]:
# Wrap every (name, sequence) pair of all_ndbs in a SeqRecord with empty
# name and description.
all_ndbs_records = []
for _ndb_name, _ndb_seq in all_ndbs.items():
    all_ndbs_records.append(
        SeqRecord(seq=Seq(_ndb_seq), id=_ndb_name, name='', description='')
    )

In [300]:
# Keep the designed readouts whose id contains 'Stv'.
all_stvs_records = list(filter(lambda _rec: 'Stv' in _rec.id, designed_readouts))

In [301]:
# Write the NDB and the Stv readout records to separate FASTA files.
readout_folder = r'X:\Libraries\Readouts'

_ndb_fasta = os.path.join(readout_folder, 'updated_NDBs.fasta')
with open(_ndb_fasta, 'w') as output_handle:
    SeqIO.write(all_ndbs_records, output_handle, "fasta")

_stv_fasta = os.path.join(readout_folder, 'updated_Stvs.fasta')
with open(_stv_fasta, 'w') as output_handle:
    SeqIO.write(all_stvs_records, output_handle, "fasta")

## Check if match new library

In [302]:
# Read the updated probe FASTA back from disk.
probe_filename = os.path.join(library_folder, 'final_probes', 'CTP-07_updated.fasta')
new_pb_records = []
with open(probe_filename, 'r') as handle:
    new_pb_records.extend(SeqIO.parse(handle, "fasta"))

In [303]:
# Rebuild the site -> NDB name map, this time from the re-loaded updated probes.
library_readout_dict = {}
for _pb in new_pb_records:
    _bracket_content = _pb.id.split('[')[1].split(']')[0]
    readout_name = _bracket_content.split(',')[0].split('_u')[0]
    readout_site = _pb.seq[19:39]
    if 'NDB' in readout_name:
        # only the first name seen for a site is kept
        library_readout_dict.setdefault(str(readout_site), readout_name)
print(len(library_readout_dict))

1033


In [304]:
# Print every NDB whose name differs from the library name found at the same
# site (reverse complement of the record's last 20 bases).
for r in all_ndbs_records:
    _site = str(r.seq[-20:].reverse_complement())
    if _site not in library_readout_dict:
        continue
    if r.id == library_readout_dict[_site]:
        continue
    print(r.id, library_readout_dict[_site], _site, r.seq[-20:])

## generate a lookup table for this library

In [348]:
# Build the lookup table "<last character of the readout entry><region id>" -> readout name.
library_readout_to_region_dict = {}
for _pb in new_pb_records:
    # region id is the integer that follows 'gene_' in the probe id
    region_id = int(_pb.id.split('gene_')[1].split('_')[0])

    _names = _pb.id.split('[')[1].split(']')[0].split(',')
    if _names[0] != _names[1]:
        # the two readout entries disagree, so this probe has no single readout;
        # skip it rather than reuse a name left over from an earlier probe
        continue
    # drop the trailing two characters of the entry (e.g. '_u')
    readout_name = _names[0][:-2]

    # test membership with the same key that is stored; comparing the bare int
    # against the string keys could never match
    _region_key = f'{_names[0][-1]}{region_id}'
    if _region_key not in library_readout_to_region_dict:
        library_readout_to_region_dict[_region_key] = readout_name

In [349]:
# Display the region -> readout lookup table.
library_readout_to_region_dict

{'u1': 'Stv_3',
 'u6': 'Stv_32',
 'u11': 'Stv_91',
 'u16': 'Stv_4',
 'u21': 'Stv_33',
 'u26': 'Stv_92',
 'u31': 'Stv_5',
 'u36': 'Stv_35',
 'u41': 'Stv_94',
 'u46': 'Stv_6',
 'u51': 'Stv_36',
 'u56': 'Stv_95',
 'u61': 'Stv_7',
 'u66': 'Stv_37',
 'u71': 'Stv_99',
 'u76': 'Stv_8',
 'u81': 'Stv_39',
 'u86': 'Stv_100',
 'u91': 'Stv_9',
 'u96': 'Stv_40',
 'u101': 'Stv_101',
 'u106': 'Stv_10',
 'u111': 'Stv_42',
 'u116': 'Stv_104',
 'u121': 'Stv_11',
 'u126': 'Stv_44',
 'u131': 'Stv_105',
 'u136': 'Stv_12',
 'u141': 'Stv_45',
 'u146': 'Stv_106',
 'u151': 'Stv_13',
 'u156': 'Stv_46',
 'u161': 'Stv_107',
 'u166': 'Stv_14',
 'u171': 'Stv_48',
 'u176': 'Stv_109',
 'u181': 'Stv_16',
 'u186': 'Stv_50',
 'u191': 'Stv_118',
 'u196': 'Stv_19',
 'u201': 'Stv_53',
 'u206': 'Stv_119',
 'u211': 'Stv_20',
 'u216': 'Stv_54',
 'u221': 'Stv_120',
 'u226': 'Stv_21',
 'u231': 'Stv_59',
 'u236': 'Stv_121',
 'u241': 'Stv_22',
 'u246': 'Stv_60',
 'u251': 'Stv_125',
 'u256': 'Stv_23',
 'u261': 'Stv_61',
 'u266': '

In [352]:
import csv

# Dump the region -> readout lookup table as a two-column CSV with a header row.
_ref_csv_path = os.path.join(library_folder, 'ref_dict.csv')
with open(_ref_csv_path, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['region', 'readout'])
    # each (key, value) pair becomes one row
    csvwriter.writerows(library_readout_to_region_dict.items())


# Generate adaptors

In [310]:
# Load the adaptor readout-site records and show them.
adaptor_folder = r'X:\Libraries\Adaptors'
adaptor_readout_sites = []
_sites_fasta = os.path.join(adaptor_folder, 'Readout_sites.fasta')
with open(_sites_fasta, 'r') as handle:
    adaptor_readout_sites.extend(SeqIO.parse(handle, "fasta"))
print(adaptor_readout_sites)

[SeqRecord(seq=Seq('TTTGCACTGCCGTCCTTGAC', SingleLetterAlphabet()), id='Stv_82', name='Stv_82', description='Stv_82 cy7 rev-com_last20', dbxrefs=[]), SeqRecord(seq=Seq('GATCCGATTGGAACCGTCCC', SingleLetterAlphabet()), id='Stv_1', name='Stv_1', description='Stv_1 cy5 rev-com_last20', dbxrefs=[]), SeqRecord(seq=Seq('TGCGAACTGTCCGGCTTTCA', SingleLetterAlphabet()), id='Stv_79', name='Stv_79', description='Stv_79 cy3 rev-com_last20', dbxrefs=[])]


In [311]:
# Re-load the saved NDB and Stv readout records from their FASTA files.
readout_folder = r'X:\Libraries\Readouts'
with open(os.path.join(readout_folder, 'updated_NDBs.fasta'), 'r') as handle:
    all_ndbs_records = []
    all_ndbs_records.extend(SeqIO.parse(handle, "fasta"))

with open(os.path.join(readout_folder, 'updated_Stvs.fasta'), 'r') as handle:
    all_stvs_records = []
    all_stvs_records.extend(SeqIO.parse(handle, "fasta"))

## NDB adaptors

In [329]:
# Build one adaptor per NDB: its last 20 bases followed by two copies of the
# adaptor readout site selected by (NDB number - 1) % 3.
ndb_adaptors = []
for _r in all_ndbs_records:
    _id = int(_r.id.split('NDB_')[-1])
    _site_record = adaptor_readout_sites[(_id-1)%3]
    adaptor_seq = _r[-20:] + _site_record + _site_record
    adaptor_seq.id = f"{_r.id}-2x{_site_record.id}-adaptor"
    adaptor_seq.name = ''
    adaptor_seq.description=''
    ndb_adaptors.append(adaptor_seq)
print(len(ndb_adaptors))

1283


In [330]:
# Write the NDB adaptors to a FASTA file in the adaptor folder.
_ndb_adaptor_fasta = os.path.join(adaptor_folder, 'NDB_adaptors.fasta')
with open(_ndb_adaptor_fasta, 'w') as output_handle:
    SeqIO.write(ndb_adaptors, output_handle, "fasta")

## STV adaptors

In [333]:
# Build one adaptor per Stv readout: its last 20 bases followed by two copies of
# the adaptor readout site of its color.
stv_adaptors = []
# split by colors: each consecutive run of 25 Stv records shares one readout site
_stvs_per_color = 25
for _i in range(3):
    for _r in all_stvs_records[_i*_stvs_per_color:(_i+1)*_stvs_per_color]:
        _site_record = adaptor_readout_sites[_i]
        adaptor_seq = _r[-20:] + _site_record + _site_record
        adaptor_seq.id = f"{_r.id}-2x{_site_record.id}-adaptor"
        adaptor_seq.name = ''
        adaptor_seq.description=''
        stv_adaptors.append(adaptor_seq)

In [335]:
# Write the Stv adaptors to a FASTA file in the adaptor folder.
_stv_adaptor_fasta = os.path.join(adaptor_folder, 'Stv_adaptors.fasta')
with open(_stv_adaptor_fasta, 'w') as output_handle:
    SeqIO.write(stv_adaptors, output_handle, "fasta")