In [522]:
import os
import pandas as pd
import re
import random

In [523]:
outdir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20260128_r3_hs_lib'

r3_all_ordered_seqs_list = []

In [524]:
#control seq functions

####Heptad a and d to Ala####
def heptad_to_upper(input_string, split_char = 'r '):
    split_1 = input_string.split(split_char)[0]
    split_2 = input_string.split(split_char)[1]
    split_2 = split_2.upper()
    
    new_string = split_1+split_char+split_2

    return new_string


def heptad_a_d_to_ala(reg_string, seq_string):

    og_seq = re.split(r'\s+', seq_string)[1]
    
    for index, char in enumerate(reg_string):
        if char.isupper():
            if char in ['A', 'D']:
                seq_string = seq_string[:index] + 'A' + seq_string[index+1:]
            else:
                continue
    
    new_seq = re.split(r'\s+', seq_string)[1]

    return og_seq, new_seq


def a_d_to_ala(input_df, state='ssd', mode='old'):

    new_seqs_dict = {}

    if state == 'ssd':
        h1_reg_colname = 'h1_reg'
        h2_reg_colname = 'h2_reg'

        h1_seq_colname = 'h1_seq'
        h2_seq_colname = 'h2_seq'
    elif state == 'msd':
        h1_reg_colname = 'h1_reg_min2'
        h2_reg_colname = 'h2_reg_min2'

        h1_seq_colname = 'h1_seq_min2'
        h2_seq_colname = 'h2_seq_min2'

    for index, row in input_df.iterrows():

        h1_reg = row[h1_reg_colname]
        new_h1_reg = heptad_to_upper(h1_reg)

        h1_seq = row[h1_seq_colname]

        h1_og, h1_new = heptad_a_d_to_ala(new_h1_reg, h1_seq)

        h2_reg = row[h2_reg_colname]
        new_h2_reg = heptad_to_upper(h2_reg)

        h2_seq = row[h2_seq_colname]

        h2_og, h2_new = heptad_a_d_to_ala(new_h2_reg, h2_seq)

        #replace with ala seq in sequence
        if mode == 'old':
            sequence = row['final_sequence_for_ad']
            new_sequence = sequence.replace(h1_og, h1_new)
            new_sequence = new_sequence.replace(h2_og, h2_new)
        if mode == 'new':
            sequence = row['final_sequence_for_ad']
            #get index of alfa tag
            alfa_index = sequence.index('SRLEEELRRRLTE')
            conserved_region = sequence[alfa_index-3:]

            new_sequence = sequence.replace(h1_og, h1_new)
            new_sequence = new_sequence.replace(h2_og, h2_new)
            new_sequence = new_sequence[:alfa_index-3] + conserved_region

        new_seqs_dict[sequence] = new_sequence

    new_df = input_df.copy()
    new_df['a_d_to_ala_sequence'] = new_df['final_sequence_for_ad'].map(new_seqs_dict)

    return new_df

####

####SCRAMBLE CONTROLS####
def full_scrambler(sequence_list):
    '''function for full scrambles use random.sample
    Takes a list of original design sequences
    Outputs a dictionary with original seq and fully scrambled seq as key:value pair'''

    full_scramble_dict = {}
    for seq in sequence_list:
        residues = list(seq)
        sc_residues = random.sample(residues, k=len(residues))
        sc_seq = ''.join(sc_residues)
        full_scramble_dict[seq] = sc_seq
    return full_scramble_dict

def patterned_scrambler(sequence_list):
    '''Function for creating patterned scrambled designs
    In which hydrophobic or polar residues are shuffled to different positions with the same h or p character
    prolines and glycines are kept in original positions
    based on criteria described in in Rocklin, et al. 2017
    Takes a list of original design sequences
    Outputs a data frame with the original sequence, pattern of H/P/Pro/Gly, and the patterned scramble sequence'''

    patterned_scramble_dict = {}
    #Rocklin term these polar if they occur in a helix: D, E, H, K, N, Q, R, S, T, or Y
    #A, F, I, L, M, V, W, and Y used to count number of hydrophobic aa
    #I just used hydrophobic and polar definitions from default resfile categories
    hydrophobics_list = ['A', 'F', 'I', 'L', 'M', 'V', 'W', 'Y']
    polar_list = ['D', 'E', 'H', 'K', 'N', 'Q', 'R', 'S', 'T']
    seq_pattern_dict = {}

    for seq in sequence_list:
        residues = list(seq)
        patterned_res = []
        hydrophobic_residues = []
        polar_residues = []
        patterned_scramble_res = []
        for res in residues:
            if res == 'P':
                patterned_res.append('Pro')

            elif res == 'G':
                patterned_res.append('G')

            elif res in hydrophobics_list:
                patterned_res.append('H')
                hydrophobic_residues.append(res)

            elif res in polar_list:
                patterned_res.append('P')
                polar_residues.append(res)

            else:
                print('Cys found!')

        seq_pattern = ''.join(patterned_res)
        seq_pattern_dict[seq] = seq_pattern

        for res in patterned_res:

            if res == 'Pro':
                patterned_scramble_res.append('P')
            elif res == 'G':
                patterned_scramble_res.append('G')
            elif res == 'H':
                l = random.randint(0, len(hydrophobic_residues)-1)
                r = hydrophobic_residues.pop(l)
                patterned_scramble_res.append(r)
            elif res == 'P':
                l = random.randint(0, len(polar_residues)-1)
                r = polar_residues.pop(l)
                patterned_scramble_res.append(r)
        pattern_scramble_seq = ''.join(patterned_scramble_res)
        patterned_scramble_dict[seq] = pattern_scramble_seq

    df_name_pattern = pd.DataFrame.from_dict(seq_pattern_dict, orient='index', columns=['Seq HP pattern'])
    df_name_pattern.index.name = 'Insert protein sequence'
    df_name_pattern.reset_index(inplace=True)

    df_name_pattern_sc = pd.DataFrame.from_dict(patterned_scramble_dict, orient='index', columns=['patterned_scramble_sequence'])
    df_name_pattern_sc.index.name = 'Insert protein sequence'
    df_name_pattern_sc.reset_index(inplace=True)

    df = pd.merge(df_name_pattern, df_name_pattern_sc, on='Insert protein sequence')

    return df

####



###Candidate switch WT sequences

In [525]:
ad_method_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20260108_switch_candidates_analysis/r2-hs_ad_scramble_vs_design_msd_only.csv')
residuals_method_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20260108_switch_candidates_analysis/r2-hs_ts_prot_MSD_stable_residuals_zscore_above2.csv')

ad_method_df_short = ad_method_df[['Name', 'mean_kd', 'mean_ss']].copy()
ad_method_df_short['mutant_type'] = 'wt_ad_method'

residuals_method_df_short = residuals_method_df[['Name', 'mean_kd', 'mean_ss']].copy()
residuals_method_df_short['mutant_type'] = 'wt_residuals_method'

all_wt_switch_seqs_df = pd.concat([ad_method_df_short, residuals_method_df_short], ignore_index=True)

#get original full sequence for creating scrambles
r2_msd_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/final_order/r2_msd_designs_final.csv')
r2_msd_df_short = r2_msd_df[['shortname', 'sequence', 'final_sequence']].copy()

#get relevant designs for r3 lib
r2_msd_df_short = r2_msd_df_short.query('shortname in @all_wt_switch_seqs_df.Name', engine='python').copy()

all_wt_switch_seqs_df = pd.merge(all_wt_switch_seqs_df, r2_msd_df_short, left_on='Name', right_on='shortname', how='left')
all_wt_switch_seqs_df.drop(columns=['shortname'], inplace=True)
all_wt_switch_seqs_df.rename(columns={'sequence':'og_full_sequence'}, inplace=True)

#create truncated sequence without residues C-terminal to alfa tag
all_wt_switch_seqs_df['final_sequence'] = all_wt_switch_seqs_df['og_full_sequence'].str.split('SRLEEELRRRLTE').str[0] + 'SRLEEELRRRLTE'

#replace N-term residues with 'GA'
all_wt_switch_seqs_df['final_sequence'] = 'GA' + all_wt_switch_seqs_df['final_sequence'].str[2:]

#rename col
all_wt_switch_seqs_df.rename(columns={'final_sequence':'Sequence'}, inplace=True)

print(f'Total number of WT candidate switch sequences for R3 library: {all_wt_switch_seqs_df.shape[0]}')
print(f'Number of switch candidates from ad method: {all_wt_switch_seqs_df[all_wt_switch_seqs_df["mutant_type"]=="wt_ad_method"].shape[0]}')
print(f'Number of switch candidates from residuals method: {all_wt_switch_seqs_df[all_wt_switch_seqs_df["mutant_type"]=="wt_residuals_method"].shape[0]}')

r3_all_ordered_seqs_list.append(all_wt_switch_seqs_df[['Name', 'Sequence', 'mutant_type']])

Total number of WT candidate switch sequences for R3 library: 53
Number of switch candidates from ad method: 29
Number of switch candidates from residuals method: 24


####Single mutant sequences

In [526]:
single_mutants_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20260122_CB_point_mutants/20260120_msd_bbs/all_msd_switch_candidates_single_mutants_state1_state2.csv')

#create truncated sequence without residues C-terminal to alfa tag
single_mutants_df['final_sequence'] = single_mutants_df['sequence'].str.split('SRLEEELRRRLTE').str[0] + 'SRLEEELRRRLTE'

#replace N-term residues with 'GA'
single_mutants_df['final_sequence'] = 'GA' + single_mutants_df['final_sequence'].str[2:]

#rename shortname column
single_mutants_df.rename(columns={'name':'Name', 'final_sequence':'Sequence'}, inplace=True)

print(f'Total number of single mutant candidate switch sequences for R3 library: {single_mutants_df.shape[0]}')
#TODO: fix counts to not include divergent ones
print('Number of single mutant state 1: ', single_mutants_df[single_mutants_df["mutant_type"].str.contains(r"[\w-]+state1")].shape[0])
print('Number of single mutant state 2: ', single_mutants_df[single_mutants_df["mutant_type"].str.contains(r"single[\w-]+state2")].shape[0])
print('Number of single mutant divergent states: ', single_mutants_df[single_mutants_df["mutant_type"].str.contains("state\d+_")].shape[0])

r3_all_ordered_seqs_list.append(single_mutants_df[['Name', 'Sequence', 'mutant_type']])

Total number of single mutant candidate switch sequences for R3 library: 2883
Number of single mutant state 1:  1736
Number of single mutant state 2:  0
Number of single mutant divergent states:  16


####Double mutant sequences

In [527]:
double_mutants_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20260122_CB_point_mutants/20260120_msd_bbs/all_msd_switch_candidates_double_mutants.csv')

#create truncated sequence without residues C-terminal to alfa tag
double_mutants_df['final_sequence'] = double_mutants_df['sequence'].str.split('SRLEEELRRRLTE').str[0] + 'SRLEEELRRRLTE'

#replace N-term residues with 'GA'
double_mutants_df['final_sequence'] = 'GA' + double_mutants_df['final_sequence'].str[2:]

#rename col
double_mutants_df.rename(columns={'final_sequence':'Sequence'}, inplace=True)

print(f'Total number of double mutant candidate switch sequences for R3 library: {double_mutants_df.shape[0]}')
print(f'Number of double mutant state 1: {double_mutants_df[double_mutants_df["mutant_type"].str.contains("state1")].shape[0]}')
print(f'Number of double mutant state 2: {double_mutants_df[double_mutants_df["mutant_type"].str.contains("state2")].shape[0]}')

r3_all_ordered_seqs_list.append(double_mutants_df[['Name', 'Sequence', 'mutant_type']])

Total number of double mutant candidate switch sequences for R3 library: 2408
Number of double mutant state 1: 342
Number of double mutant state 2: 2066


In [528]:
#create double mutants for 375 without N-term GA
#for comparison to previous libraries

double_mutants_375_df = double_mutants_df.query('Name.str.contains("375")', engine='python').copy()

#revert N-term GA to original residues
double_mutants_375_df['Sequence'] = 'EE' + double_mutants_375_df['Sequence'].str[2:]
double_mutants_375_df['Name'] = double_mutants_375_df['Name'] + '-og-N-term'

print(f'Total number of double mutant candidate switch sequences for 375 without N-term GA for R3 library: {double_mutants_375_df.shape[0]}')

r3_all_ordered_seqs_list.append(double_mutants_375_df[['Name', 'Sequence', 'mutant_type']])

Total number of double mutant candidate switch sequences for 375 without N-term GA for R3 library: 80


####Control sequences

In [529]:
scramble_seq_dfs = []

scrambles_df = all_wt_switch_seqs_df.copy()
scrambles_df['duplicated_sequence'] = scrambles_df['Sequence'].str.split('SRLEEELRRRLTE').str[0]

#full scrambles
full_scramble_dict = full_scrambler(scrambles_df['duplicated_sequence'].tolist())
scrambles_df['full_scramble_sequence'] = scrambles_df['duplicated_sequence'].map(full_scramble_dict)
scrambles_df['full_scramble_sequence'] = scrambles_df['full_scramble_sequence'] + 'SRLEEELRRRLTE'

#update name
scrambles_df['shortname_full'] = scrambles_df['Name'] + '_full_scramble'
full_scrambles_df_subset = scrambles_df[['shortname_full', 'full_scramble_sequence']]
full_scrambles_df_subset = full_scrambles_df_subset.rename(columns={'shortname_full': 'Name', 'full_scramble_sequence': 'Sequence'})
full_scrambles_df_subset['mutant_type'] = 'full_scramble'
scramble_seq_dfs.append(full_scrambles_df_subset)


#patterned scrambles
patterned_scramble_df = patterned_scrambler(scrambles_df['duplicated_sequence'].tolist())
scrambles_df = pd.merge(scrambles_df, patterned_scramble_df, left_on='duplicated_sequence', right_on='Insert protein sequence', how='left')
scrambles_df['patterned_scramble_sequence'] = scrambles_df['patterned_scramble_sequence'] + 'SRLEEELRRRLTE'

#update name
scrambles_df['shortname_pat'] = scrambles_df['Name'] + '_patterned_scramble'
pat_scrambles_df_subset = scrambles_df[['shortname_pat', 'patterned_scramble_sequence']]
pat_scrambles_df_subset = pat_scrambles_df_subset.rename(columns={'shortname_pat': 'Name', 'patterned_scramble_sequence': 'Sequence'})
pat_scrambles_df_subset['mutant_type'] = 'patterned_scramble'
scramble_seq_dfs.append(pat_scrambles_df_subset)


#ad scrambles old way
ad_scrambles_df = r2_msd_df.query('shortname in @all_wt_switch_seqs_df.Name', engine='python').copy()

#make second final_sequence column that will match the h1 and h2 helices without N-term GA
ad_scrambles_df['final_sequence_for_ad'] = ad_scrambles_df['sequence']

#make msd ad to ala scrambles
msd_scrambles_df = a_d_to_ala(ad_scrambles_df, state='msd', mode='old')
msd_scrambles_df_new = a_d_to_ala(ad_scrambles_df, state='msd', mode='new')

#make ALFA tag C-terminal
msd_scrambles_df['a_d_to_ala_sequence'] = msd_scrambles_df['a_d_to_ala_sequence'].str.split('SRLEEELRRRLTE').str[0] + 'SRLEEELRRRLTE'
msd_scrambles_df_new['a_d_to_ala_sequence'] = msd_scrambles_df_new['a_d_to_ala_sequence'].str.split('SRLEEELRRRLTE').str[0] + 'SRLEEELRRRLTE'

#replace N-term residues with 'GA'
msd_scrambles_df['a_d_to_ala_sequence'] = 'GA' + msd_scrambles_df['a_d_to_ala_sequence'].str[2:]
msd_scrambles_df_new['a_d_to_ala_sequence'] = 'GA' + msd_scrambles_df_new['a_d_to_ala_sequence'].str[2:]

msd_scrambles_df['shortname'] = msd_scrambles_df['shortname'] + '_ad_scramble'
msd_scrambles_df_new['shortname'] = msd_scrambles_df_new['shortname'] + '_ad_scramble_new'

msd_scrambles_df_subset = msd_scrambles_df[['shortname', 'a_d_to_ala_sequence']]
msd_scrambles_df_subset = msd_scrambles_df_subset.rename(columns={'shortname':'Name', 'a_d_to_ala_sequence': 'Sequence'})
msd_scrambles_df_subset['mutant_type'] = 'ad_scramble_old_method'
scramble_seq_dfs.append(msd_scrambles_df_subset)

msd_scrambles_df_new_subset = msd_scrambles_df_new[['shortname', 'a_d_to_ala_sequence']]
msd_scrambles_df_new_subset = msd_scrambles_df_new_subset.rename(columns={'shortname':'Name', 'a_d_to_ala_sequence': 'Sequence'})
msd_scrambles_df_new_subset['mutant_type'] = 'ad_scramble_new_method'
scramble_seq_dfs.append(msd_scrambles_df_new_subset)

all_scramble_seqs_df = pd.concat(scramble_seq_dfs, ignore_index=True)

In [530]:
#import 384 scrambles from previous r2 lib, find their corresponding design as well
r2_scramble_seqs_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/final_order/scramble_seqs_final.csv')
#TODO: will need to fill in mutant type for r2 scrambles
r2_scramble_seqs_df['mutant_type'] = 'r2_existing_scramble'

concat_scrambles_df = pd.concat([r2_scramble_seqs_df, all_scramble_seqs_df], ignore_index=True)
print(f'Total number of scramble sequences before removing duplicates: {concat_scrambles_df.shape[0]}')

#if scramble already exists from r2 lib, drop the new one
concat_scrambles_df = concat_scrambles_df.drop_duplicates(subset=['Name'], keep='first')
print(f'Total number of scramble sequences after removing duplicates: {concat_scrambles_df.shape[0]}')

r3_all_ordered_seqs_list.append(concat_scrambles_df)

Total number of scramble sequences before removing duplicates: 596
Total number of scramble sequences after removing duplicates: 497


In [531]:
#make df with additional control seqs from Bram and 375 and point mutants
bespoke_ctrls_dict = {'bmp38' : 'GARLAQLKQERAALKQRLAALDQEIAALEWQIQSDPRKKQLYQRLLALISDRFALEQRIAALDQEIAALEAG', 
                 'bmp53' : 'GARLEELIAERLRLVGDLVDLDREIAALEQQIQSDPRKKQLEQRLAALKQERAALEQRIAALDWEIADLDDG', 
                 'bmp42' : 'GARLAQLKQERAALKQRLDALDQEIAALEWQIQSDPRKKQLYQRLLELIGERLALMGDIFELDVEIAALEAG', 
                 'bmp45' : 'GARLAQLKQERAALKQRLEALDQEIAALEWQIQSDPRKKQLLQRLYELFGERLALFGDIFDLDVEIAALEAG', 
                 'bmp51' : 'GARLAQLKQERAALKQRLDALEQEIAALEWQIQSDPRKKQLQQRLRQLYGERLRLESDIFDLDVEIAALEAG', 
                 'bmp57' : 'GARLAQLKQERAALKQRLEALDQEIAALEWQIQSDPRKKQLLQRLRRLYGERLALFGDIFDLDVEIAALEAG', 
                 'bmp23' : 'GARLAQLKQERAALKQRLAALDQEIAALEWQIQSPDRKKQLEQRLAALKQERAALEQRIAALDQEIAALEAG',
                 '375' : 'EELKRIEEEIAAIEREIARAEEKLKAQESDPRKKGLQAEREKLLEKLAELRKERERLSRLEEELRRRLTELRRRLE',
                 '375-GA-cdel' : 'GALKRIEEEIAAIEREIARAEEKLKAQESDPRKKGLQAEREKLLEKLAELRKERERLSRLEEELRRRLTE',
                '375-ad-to-A-GA-cdel' : 'GAAKRAEEEAAAAEREAARAEEKAKAQESDPRKKGLQAEAEKALEKAAEARKEAERASRLEEELRRRLTE',
                 '375-ad-to-A-cdel' : 'EEAKRAEEEAAAAEREAARAEEKAKAQESDPRKKGLQAEAEKALEKAAEARKEAERASRLEEELRRRLTE', 
                 '375-cdel' : 'EELKRIEEEIAAIEREIARAEEKLKAQESDPRKKGLQAEREKLLEKLAELRKERERLSRLEEELRRRLTE', 
                 '375-G35Q-cdel' : 'EELKRIEEEIAAIEREIARAEEKLKAQESDPRKKQLQAEREKLLEKLAELRKERERLSRLEEELRRRLTE', 
                 '375-G35Q-GA-cdel' : 'GALKRIEEEIAAIEREIARAEEKLKAQESDPRKKQLQAEREKLLEKLAELRKERERLSRLEEELRRRLTE',
                 '375-L36K-cdel' : 'EELKRIEEEIAAIEREIARAEEKLKAQESDPRKKGKQAEREKLLEKLAELRKERERLSRLEEELRRRLTE',
                 '375-L36Q-cdel' : 'EELKRIEEEIAAIEREIARAEEKLKAQESDPRKKGQQAEREKLLEKLAELRKERERLSRLEEELRRRLTE'}

#make df with columns 'Name and 'sequence' from dict
bespoke_ctrls_df = pd.DataFrame.from_dict(bespoke_ctrls_dict, orient='index', columns=['Sequence']).reset_index().rename(columns={'index': 'Name'})
bespoke_ctrls_df['mutant_type'] = 'bespoke_control'
print(f'Total number of bespoke control sequences for R3 library: {bespoke_ctrls_df.shape[0]}')

r3_all_ordered_seqs_list.append(bespoke_ctrls_df)

Total number of bespoke control sequences for R3 library: 16


In [532]:
#for all ssd seqs in scramble controls, get the wt seq
r2_ssd_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/final_order/r2_ssd_designs_final.csv')

#filter concat scrambles df to get ssd only
ssd_scramble_seqs_df = concat_scrambles_df[concat_scrambles_df['Name'].str.contains('SSD')].copy()
ssd_scramble_seqs_df['Name'] = ssd_scramble_seqs_df['Name'].str.replace('_full_scramble', '', regex=False)
ssd_scramble_seqs_df['Name'] = ssd_scramble_seqs_df['Name'].str.replace('_ad_scramble', '', regex=False)
ssd_scramble_seqs_df['Name'] = ssd_scramble_seqs_df['Name'].str.replace('_patterned_scramble', '', regex=False)

ssd_scramble_seqs_df.drop_duplicates(subset=['Name'], keep='first', inplace=True)

#filter r2_ssd_df to only those in ssd_scramble_seqs_df
ssd_wt_seqs_df = r2_ssd_df[r2_ssd_df['shortname'].isin(ssd_scramble_seqs_df['Name'])].copy()
ssd_wt_seqs_df = ssd_wt_seqs_df[['shortname', 'final_sequence']].rename(columns={'shortname':'Name', 'final_sequence':'Sequence'})
ssd_wt_seqs_df['mutant_type'] = 'ssd_r2'

print(f'Total number of SSD WT sequences corresponding to SSD scramble controls for R3 library: {ssd_wt_seqs_df.shape[0]}')

r3_all_ordered_seqs_list.append(ssd_wt_seqs_df)

Total number of SSD WT sequences corresponding to SSD scramble controls for R3 library: 64


####Generate final dfs

In [533]:
r3_all_df = pd.concat(r3_all_ordered_seqs_list, ignore_index=True)

r3_all_df.drop_duplicates(subset=['Sequence'], keep='first', inplace=True)
    #above returns one duplicate
    #design MSD-exp_bbs-mpnn-205_ad_scramble with new and old method produce same sequence
    #keep this duplicate

r3_all_df.to_csv(f'{outdir}/r3_hs_all_protein_seqs_final.csv', index=False)

print(f'Numbr of total sequences in r3 library: {r3_all_df.shape[0]}')

Numbr of total sequences in r3 library: 6000


In [534]:
#get all nucleotide sequences that were submitted for r2 hs final submission
r2_hs_final_submitted_seqs = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/20250716_r2_hs_final_submission/20250716_r2_hs_final_submission.csv')

#get nucleotide sequences for r3 designs that were already codon optimized in r2 lib
r3_codon_opt_seqs_df = r2_hs_final_submitted_seqs[r2_hs_final_submitted_seqs['Name'].isin(r3_all_df['Name'])].copy()
print(f'Number of R3 sequences already codon optimized in R2 lib: {r3_codon_opt_seqs_df.shape[0]}')

r3_codon_opt_seqs_df.to_csv(f'{outdir}/r3_hs_codon_optimized_seqs_from_r2_lib.csv', index=False)

#for all others, export as new df for codon opt
r3_NEEDS_codon_opt_seqs_df = r3_all_df[~r3_all_df['Name'].isin(r3_codon_opt_seqs_df['Name'])].copy()
print(f'Number of R3 sequences NEEDING codon optimization: {r3_NEEDS_codon_opt_seqs_df.shape[0]}')

r3_NEEDS_codon_opt_seqs_df.to_csv(f'{outdir}/r3_hs_seqs_needing_codon_optimization.csv', index=False)

#504 seqs already codon optimized in r2
#if add up r2 scranbles controls (384),
#bespoke controls (16),
#and ssd wt for scrambles (64)
#and wt switch candidates (53), get 517
#missing 13 are newly added bespoke controls (375-GA-cdel, 375-ad-to-A-GA-cdel, 375-G35Q-GA-cdel)
#and 10 SSD designs from r2 that were not ordered for some reason

Number of R3 sequences already codon optimized in R2 lib: 504
Number of R3 sequences NEEDING codon optimization: 5496


In [535]:
#why are all ssd seqs 71 residues long
#not all are
#there are 31 ssd seqs that are alfa thread 53, bm01 loop
#each of these has a scramble, so total is 124
# r2_all_seqs_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/final_order/r2_hs_all_protein_seqs_final.csv')

# r3_all_df['length'] = r3_all_df['Sequence'].str.len()
# r3_all_df_71 = r3_all_df[r3_all_df['length'] == 71].copy()
# print(r3_all_df_71.shape)

# r2_ssd_df = pd.read_csv('/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/final_order/r2_ssd_designs_final.csv')

# merge_df = pd.merge(r3_all_df_71, r2_ssd_df, left_on='Sequence', right_on='final_sequence', how='inner', indicator=True)
# merge_df.to_csv('/Users/stephaniecrilly/downloads/r3_71mer_vs_r2_ssd_merge.csv', index=False)

# r3_remaining_71mer_df = r3_all_df_71[~r3_all_df_71['Sequence'].isin(merge_df['Sequence'])].copy()
# r3_remaining_71mer_df.to_csv('/Users/stephaniecrilly/Downloads/r3_71mer_not_in_r2_ssd.csv', index=False)
# print(merge_df.shape)
# print(r3_remaining_71mer_df.shape)