#### 1. First set up amino acid fasta
#### 2. Codon-optimise the sequences with IDT and get the TSV file back (order_optimised_codons.tsv)
#### 3. Add any necessary DNA prefix or postfix.
#### 4. Get 96-well plate positions file (xlsx)

In [39]:
import pandas as pd


# Read the codon-optimised order: a headerless, tab-separated file in which
# column 0 is taken as the construct name and column 1 as its DNA sequence.
opt_codons_df = pd.read_csv(
    "/home/tadas/code/single_chain_dp_bristol/order_optimised_codons.tsv",
    sep="\t",
    header=None,
).rename(columns={0: "Name", 1: "Sequence"})

# Every sequence must be a whole number of codons (length divisible by 3).
assert not any(len(dna) % 3 for dna in opt_codons_df["Sequence"])
display(opt_codons_df.head())

# Recognition sites that are reported when found in a sequence.
disalowed_sites = ["CTCGAG", "GGATCC"]

def report_disallowed_sites(opt_codons_df, disalowed_sites, tolerance=0):
    """Print every sequence that contains a site more often than tolerated.

    Parameters:
        opt_codons_df: Table indexable by "Name" and "Sequence", giving
            parallel iterables of construct names and DNA strings.
        disalowed_sites: Iterable of DNA substrings to look for.
        tolerance (int): Maximum number of occurrences of a single site that
            is still accepted; a sequence is reported only when its count
            for a site is strictly greater than this value.

    Returns:
        None. For each offending (sequence, site) pair three lines are
        printed: a message, the name with the full sequence, and a blank line.
    """
    records = zip(opt_codons_df["Name"], opt_codons_df["Sequence"])
    for label, dna in records:
        # str.count tallies non-overlapping occurrences of the site.
        offending = (
            motif for motif in disalowed_sites if dna.count(motif) > tolerance
        )
        for motif in offending:
            print(f"{label} contains disallowed site: {motif}")
            print(label, dna)
            print()

# Print every optimised sequence that contains one of the listed sites; with
# the default tolerance of 0 a single occurrence is already reported.
report_disallowed_sites(opt_codons_df, disalowed_sites)




Unnamed: 0,Name,Sequence
0,no_disulfide_b4inin,TCTCTGGAGGAACTGATCGAAAAAATTGAAAAATTTAACAACCTCG...
1,no_disulfide_b4innn,CCTCTTGATCCAGAGGCCGTTCGCGCTTGGTTACGCCGTGCTTTGG...
2,no_disulfide_l5innn,GAGGAGTTGAAAGAGAAACAAGAGGAACTCAAGAAGAAAGTAGACA...
3,no_disulfide_b4nnnnx,TGGGAGGAGTTGGAGGAGGCTGTCCGTTGGCTCCGGGAGCGCCTTC...
4,no_disulfide_b4nnnny,ATGAGCCGGGAGGAAAAAATTAAACTTCTGAAGGAGAAGGGCTATA...


no_disulfide_b4inin contains disallowed site: CTCGAG
no_disulfide_b4inin TCTCTGGAGGAACTGATCGAAAAAATTGAAAAATTTAACAACCTCGAGAATCCTACCGAGGAAGAGCTGGAGGAGCTCATTGAGCTCTTAAAAGAATGGTGTGAGGAACTTGGTGTAGAAGAAGAGAAGAAAGAGGAGCTGTTAGAACTGCTCTACGAGTGGTATAATAGCGAAGAAAATCGCGAAGAGAAGACAAAGGAGCTCATCGAAAAGCTCAAGGAAGTTTACGAGGAGGCG

no_disulfide_l4inn contains disallowed site: GGATCC
no_disulfide_l4inn GACGAACTGGAAGAAGGGATCCGTTATCTGTACGAGAAGTGGAAGGAAGAGAAGGGGCTGAAATCTCAAGAGGAACTTTTAGAATTACTCCTGGACGAGTCGAATGCTGAAACGGAAGAGGAGAAGGAGATCTATAAGTTGATCAAAGAGTACAAGAAGAAGGGTATGACGGACAAGGATTTATATGAAATTCTGTCGAAAGTTGAAGAGGAG

no_disulfide_b6iiniin contains disallowed site: CTCGAG
no_disulfide_b6iiniin TCCGAGGAACTCAAAAAATATGCAGAACGGGTAGCGAAGGCGCTTGCCGAGGGGAAGCTCGAGGGTTATTTGAAACTGGTAGAGGAATGGTCCAAAGTAGACAAACCTTTTTCCAAGGAGAACTTCAAAGAGATGCTGGAGAAGGCGCTTTCTACGGAGGAAGCAAAAGAGGATAAAGAACTCAAGGAGGCCCTGGAGTATATCCTCGAAAATCTGGACGAATGGTCCTTAGATGAGATTAAAGAGTACCTGGAATATGCGCTCAAACTGGCGAAGGAATACGGGGTGGAAGAGTACACAAAGTATTTGGAAAAGTTATTATCAA

#### Remove the disallowed cut sites using Benchling! 
##### then export it back to order_optimised_codons_no_cuts.csv

In [45]:
# Load the sequences exported after the disallowed cut sites were removed.
opt_codons_no_cuts_df = pd.read_csv("/home/tadas/code/single_chain_dp_bristol/order_optimised_codons_no_cuts.csv")
# A bare `.head()` in the middle of a cell is not rendered, so display it explicitly.
display(opt_codons_no_cuts_df.head())
# Validate the freshly loaded, edited sequences (not the pre-edit
# `opt_codons_df`): every one must still be a whole number of codons.
assert all(len(x) % 3 == 0 for x in opt_codons_no_cuts_df["Sequence"])
# With the default tolerance of 0, any remaining site occurrence is printed.
report_disallowed_sites(opt_codons_no_cuts_df, disalowed_sites)


#### Make sure there are no cut sites in this file
#### Remove any unnecessary names


In [46]:

# Constant 5' flank prepended to every construct.
# It encodes MGSSHHHHHHSSGENLYFQSGS:
# start codon, His-tag, flexible linkers, TEV cut site, BamHI site (GGATCC).
prefix = "ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCGAGAACTTATACTTTCAGAGCGGATCC"
# The prefix must be a whole number of codons so the insert stays in frame.
assert len(prefix) % 3 == 0

# Constant 3' flank appended to every construct:
# stop codon (TAA) followed by the XhoI site (CTCGAG).
postfix = "TAACTCGAG"
assert postfix.startswith("TAA")

# Add prefix and postfix to the opt_codons_no_cuts_df Sequence column values.
# NOTE: the column is overwritten, so running this cell twice without
# reloading the CSV adds the flanks a second time.
opt_codons_no_cuts_df["Sequence"] = opt_codons_no_cuts_df["Sequence"].apply(
    lambda insert: prefix + insert + postfix
)

# Check that all full-length sequences are still multiples of 3.
assert all(len(x) % 3 == 0 for x in opt_codons_no_cuts_df["Sequence"])

display(opt_codons_no_cuts_df.head())

# Assert that all sequences are between 125 and 500 bp (inclusive).
# NOTE(review): presumably the length limits of the synthesis order — confirm.
assert all(125 <= len(x) <= 500 for x in opt_codons_no_cuts_df["Sequence"])

Unnamed: 0,Name,Author,Sequence
0,no_disulfide_b3iii,Tadas,ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCGAGAACT...
1,no_disulfide_b3nnn,Tadas,ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCGAGAACT...
2,no_disulfide_b4iiiix,Tadas,ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCGAGAACT...
3,no_disulfide_b4iiiiy,Tadas,ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCGAGAACT...
4,no_disulfide_b4iiin,Tadas,ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCGAGAACT...


### Now assign 96-well plate positions for IDT

In [51]:



def assign_well_positions(df, manual_breaks):
    """
    Lay the samples of `df` out on a 96-well plate (rows A-H, columns 1-12).

    Samples are placed in DataFrame order, filling each plate row from left
    to right and wrapping to the next row after column 12. A sample whose
    zero-based position is listed in `manual_breaks` is pushed to column 1 of
    the next plate row, unless it would land on column 1 anyway. Position 0
    is never treated as a break.

    Parameters:
        df (pd.DataFrame): Must contain at least the columns "Name" and
                           "Sequence". It is not modified.
        manual_breaks (list of int): Zero-based sample positions that should
                                     begin a new plate row (e.g. [15, 21]).

    Returns:
        pd.DataFrame: A re-indexed copy restricted to the columns
                      "Well Position", "Name" and "Sequence", in that order.

    Raises:
        ValueError: If the samples do not fit into the eight plate rows.
    """
    row_labels = "ABCDEFGH"
    wells_per_row = 12

    # reset_index returns a new frame, so the caller's DataFrame is untouched.
    table = df.reset_index(drop=True)

    wells = []
    row_idx = 0   # position in row_labels
    col_idx = 0   # zero-based column cursor; printed as col_idx + 1
    for sample_idx in range(len(table)):
        forced_break = sample_idx in manual_breaks and sample_idx != 0
        row_is_full = col_idx >= wells_per_row

        # A forced break is a no-op when the cursor already sits on the first
        # column; this also prevents skipping a row when a break coincides
        # with an automatic wrap.
        if row_is_full or (forced_break and col_idx != 0):
            row_idx += 1
            col_idx = 0

        if row_idx >= len(row_labels):
            raise ValueError("Not enough plate rows available for the given samples.")

        wells.append(f"{row_labels[row_idx]}{col_idx + 1}")
        col_idx += 1

    table["Well Position"] = wells
    return table[["Well Position", "Name", "Sequence"]]





# Zero-based sample positions that must start a new plate row: with [30] the
# first 30 samples fill the plate as usual and the 31st starts the next row.
break_after_entries = [30]

# Rebind the DataFrame to the layout returned by assign_well_positions
# (columns "Well Position", "Name", "Sequence").
opt_codons_no_cuts_df = assign_well_positions(opt_codons_no_cuts_df, break_after_entries)
# tolerance = 1: a sequence is reported only if a site occurs more than once.
# The prefix ends with GGATCC and the postfix with CTCGAG, so one copy of each
# site is expected in every full-length sequence.
report_disallowed_sites(opt_codons_no_cuts_df, disalowed_sites, tolerance = 1)
# Write the same plate layout twice, as Excel and as CSV, without the index column.
opt_codons_no_cuts_df.to_excel("/home/tadas/code/single_chain_dp_bristol/order_optimised_codons_96_wp.xlsx", index=False)
opt_codons_no_cuts_df.to_csv("/home/tadas/code/single_chain_dp_bristol/order_optimised_codons_96_wp.csv", index=False)