# Primer Generator

This is an interactive notebook, with cells requiring user input and verification. Instructions on using each cell are given inline.

## Aim:
To generate list of new primers 


# Part I - Wild type

### Necessary package imports
Simply run the cell below by clicking inside the cell and press CTRL + ENTER

In [1]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.Alphabet import Reduced
import re
import pandas as pd

### User Inputs required below

In [2]:
#Enter the bases of your wild type in the cell below removing everything between the "" and replacing with your own
base_str = "cataatgggacaaatgctacgatgatgcaatactttgaatggcacttgcctaatgatgggaatcactggaatagattaagagatgatgctagtaatctaagaaatagaggtataaccgctatttggattccgcctgcctggaaagggacttcgcaaaatgatgtggggtatggagcctatgatctttacgatttaggggaatttaatcaaaaggggacggttcgtactaagtatgggacacgtagtcaattggagtctgccatccatgctttaaagaataatggcgttcaagtttatggggatgtagtgatgaaccataaagctggagctgatgctacagaaaacgttcttgctgtcgaggtgaatccaaataaccggaatcaagaaatatctggggactacacaattgaggcttatactaagtttgattttccagggaggggtaatacatactcagactttaaatggcgttggtatcatttcgatggtgtagattgggatcaatcacgacaattccaaaatcgtatctacaaattccgaggtaaagcttgggattgggaagtagattcggaatttggaaattatgattatttaatgtatgcagattacgatatggatcatccggaggtagtaaatgagcttagaagatggggagaatggtatacaaatacattaaatcttgatggatttaggatcgatgcggtgaagcatattaaattcagctttacacgtgattggttgacccatgtaagaaacgcaacgggaaaaggcatgtttgctgttgctgaattttggaaaaatgatttaggtgccttggagaactatttatcaaaaacaaactggaatcattctgtctttgatgtcccccttcattataatctttataacgcgtcaaatagtcggggcaactatgacatggcaaaacttcttaatggaacggttgttcaaaagcatccaatgcatgccgtaacttttgtggataatcacgattctcaacctggggaatcattagaatcatttgtacaagaatggtttaagccacttgcttatgcgcttattttaacaagagaacaaggctatccctctgtcttctatggtgactactatggaattccaacacatagtgtcccagcaatgaaagccaagattgatccaatcttagctgcgcgtcaaaattttgcatatggaacacaacatgattattttgaccatcataatataatcggatggacacgtgaaggaaataccacgcatcccaattcaggacttgcgactatcatgtcggatgggccagggggagagaaatggatgtacgtagggcaaaataaagcaggtcaagtttggcatgacataactggaaataaaccaggaacagttacgatcaatgcagatggatgggctaatttttcagtaaataagggatctgtttccatttgggtgaaacgataa"

#Enter the desired length of the primer and the length of overlap
length_of_primer = 15
overlap = 5

#Enter the numbering gaps separated by commas. DO NOT DELETE THE BRACKETS
numbering_gaps = [1, 182, 183]

### Run cell below and verify the AA sequence of your wild type

In [3]:
base_seq = Seq(base_str, IUPAC.unambiguous_dna)
wt_aa_str = str(base_seq.translate())
wt_aa_str

'HNGTNATMMQYFEWHLPNDGNHWNRLRDDASNLRNRGITAIWIPPAWKGTSQNDVGYGAYDLYDLGEFNQKGTVRTKYGTRSQLESAIHALKNNGVQVYGDVVMNHKAGADATENVLAVEVNPNNRNQEISGDYTIEAYTKFDFPGRGNTYSDFKWRWYHFDGVDWDQSRQFQNRIYKFRGKAWDWEVDSEFGNYDYLMYADYDMDHPEVVNELRRWGEWYTNTLNLDGFRIDAVKHIKFSFTRDWLTHVRNATGKGMFAVAEFWKNDLGALENYLSKTNWNHSVFDVPLHYNLYNASNSRGNYDMAKLLNGTVVQKHPMHAVTFVDNHDSQPGESLESFVQEWFKPLAYALILTREQGYPSVFYGDYYGIPTHSVPAMKAKIDPILAARQNFAYGTQHDYFDHHNIIGWTREGNTTHPNSGLATIMSDGPGGEKWMYVGQNKAGQVWHDITGNKPGTVTINADGWANFSVNKGSVSIWVKR*'

### The necessary functions
Run cell below without any changes

In [4]:
### Function create_primers 
## Splits given amino acid sequence into equal sized primers based on user defined length
## User defined overlap included in the split
## Returns a list
def create_primers(amino_acid):
    if len(amino_acid) <= length_of_primer:
        return [amino_acid]
    else:
        return [amino_acid[:length_of_primer]] + create_primers(amino_acid[length_of_primer - overlap:])

### Function generate_primer_df
## Creates pandas dataframe 
## Column1 is index for primary key, col2 is auto incremented prefix based primer name
## Returns data frame
def generate_primer_df(primer_list, prefix):
    primer_df = pd.DataFrame([[str(primer_list.index(each)+1), prefix + "_primer" + str(primer_list.index(each)+1), each] 
                              for each in primer_list], 
                             columns=['Index', prefix +'_Primer_name', prefix+'_Primer_seq'])
    return primer_df

### Function variant_dna
## Takes the mutations of a variant eg 'Q11E, F13Y', and wt mutations (as a list, gaps indicated as '-')
## Replaces mutations in the wt 
## Returns variant amino acid as string incorporating mutation changes
def variant_dna(var_mut, wt_with_gap_list):
    var_mut_list = str.split(var_mut, ",")
    mutations_in_var = [re.split(r'(\d+)', s) for s in var_mut_list]
    var_aa_list = wt_with_gap_list
    
    for each in mutations_in_var:
        org = each[0]
        pos = each[1]
        sub = each[2]
        if(var_aa_list[int(pos)-1] == org):
            var_aa_list[int(pos)-1] = sub
            
    var_aa_list = [y for y in var_aa_list if y != '-']
    var_aa_str = ''.join(var_aa_list)
    
    return var_aa_str

### Function compare_primers
## Takes 3 dataframes - wt_primers, variant primers and new_primers
## Checks if variant primer is diff from wt primer
## if yes, checks if it's already recorded as new primer
## if not, appends to the list of new primers for that index
## returns back the new primer dataframe
def compare_primers(wt_primer_df, var_primer_df, new_primer_df):
    #Iterate over the wt_primer_df
    for i in range(wt_primer_df.shape[0]):
        #If wt_primer_seq is diff from variant_primer
        if wt_primer_df.loc[i][2] != var_primer_df.loc[i][2]: 
            #Check if that primer has already been recorded
            if wt_primer_df.loc[i][0] in new_primer_df['Primer_no'].values: 
                #if its been recorded get the matching index of that record
                match_index = new_primer_df[new_primer_df['Primer_no'] == wt_primer_df.loc[i][0]].index[0]
                #Now check if it is there in the list of new primers in that index
                if var_primer_df.loc[i][2] not in new_primer_df.loc[match_index][2]:
                    #Append to the list of new primers in that index, since it's not there
                    new_primer_df.loc[match_index][2].append(var_primer_df.loc[i][2])
                    new_primer_df.loc[match_index][3].append(var_primer_df.loc[i][1])
                
            #If the primer has not been recorded, create a new record for this
            else:
                new_primer_df.loc[len(new_primer_df)] = [wt_primer_df.loc[i][0], wt_primer_df.loc[i][2],  [var_primer_df.loc[i][2]], [var_primer_df.loc[i][1]], []]
                
    return new_primer_df

### Prepare the wild type primers for reference

In [5]:
wt_primer_list = create_primers(wt_aa_str) ## Creates a list of primers from wild type sequence
wt_primer_df = generate_primer_df(wt_primer_list, "WT") ## Generates the primer data frame for WT
wt_primer_df


Unnamed: 0,Index,WT_Primer_name,WT_Primer_seq
0,1,WT_primer1,HNGTNATMMQYFEWH
1,2,WT_primer2,YFEWHLPNDGNHWNR
2,3,WT_primer3,NHWNRLRDDASNLRN
3,4,WT_primer4,SNLRNRGITAIWIPP
4,5,WT_primer5,IWIPPAWKGTSQNDV
5,6,WT_primer6,SQNDVGYGAYDLYDL
6,7,WT_primer7,DLYDLGEFNQKGTVR
7,8,WT_primer8,KGTVRTKYGTRSQLE
8,9,WT_primer9,RSQLESAIHALKNNG
9,10,WT_primer10,LKNNGVQVYGDVVMN


In [6]:
#Account for numbering gaps in WT
wt_aa_list = list(wt_aa_str)

wt_with_gap = wt_aa_list[:]
for i in numbering_gaps:
    wt_with_gap.insert(i-1, '-')

# Part II - Variants

### First get the variants from sharepoint

In [7]:
# Excel input
## Read from sharepoint the file of interest - Variants 
## Temp store it in a file
inpath = 'http://promanweb/sites/2718/Working%20Documents/Fuiji/Molecule/Diversity%20(wild%20types,%20PE,%20and%20mutagenesis)/combi_var.xlsx'
outpath = 'test.xlsx'

!curl -u : --negotiate "{inpath}" --output {outpath}
    
# Load spreadsheet: xl
xl = pd.ExcelFile(outpath)

# Load Variants sheet into a dataframe and verify if headings and data looks ok
variants_df = pd.read_excel(xl, 'Sheet1') 
variants_df.head()

## Remove outpath now that we have the df
!rm {outpath}

variants_df.head()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    16  100    16    0     0    716      0 --:--:-- --:--:-- --:--:--   727
100 18935  100 18935    0     0   238k      0 --:--:-- --:--:-- --:--:--  238k


Unnamed: 0,Sequoia_Org_code,Variant_code,Mutation
0,,VAR001,"Q11E,F13Y,W43L,N331S,H332Q,F367Y"
1,,VAR002,"Q11E,F13Y,P45L"
2,,VAR003,"Q11E,W15M,N331S,H332Q,D333E"
3,,VAR004,"Q11E,W43L,R234P"
4,,VAR005,"Q11E,E266Q,F328L,N331S,H332Q,D333E,S334T"


### Now process the variants and find the new primers

In [8]:
### Create an empty data frane for new primers
new_primers_df = pd.DataFrame(columns=['Primer_no', 'WT_Primer_seq', 'New_Primer_seq', 'Var_Primer_names', 'Named_primers'])

### Iterate through all the variants and get the new primers
for index, rows in variants_df.iterrows(): #Iterate through each row of the variant file
    mutation_str = rows['Mutation'] #Take each mutation
    var_str = variant_dna(mutation_str, wt_with_gap) #Create it's AA sequence replacing mutations on WT
    var_primer_list = create_primers(var_str)
    var_primer_df = generate_primer_df(var_primer_list, rows['Variant_code'])
    new_primers_df = compare_primers(wt_primer_df, var_primer_df, new_primers_df)

new_primers_df  


Unnamed: 0,Primer_no,WT_Primer_seq,New_Primer_seq,Var_Primer_names,Named_primers
0,1,HNGTNATMMQYFEWH,"[HNGTNATMMEYYEWH, HNGTNATMMEYYEMH, HNGTNATMMEY...","[VAR001_primer1, VAR003_primer1, VAR008_primer1]",[]
1,2,YFEWHLPNDGNHWNR,"[YYEWHLPNDGNHWNR, YYEMHLPNDGNHWNR, YYTMYLPNDGN...","[VAR001_primer2, VAR003_primer2, VAR008_primer2]",[]
2,4,SNLRNRGITAIWIPP,"[SNLRNRGITAILIPP, SNLRNRGITAILILP, SNLRNRGITAI...","[VAR001_primer4, VAR002_primer4, VAR012_primer4]",[]
3,5,IWIPPAWKGTSQNDV,"[ILIPPAWKGTSQNDV, ILILPAWKGTSQNDV, ILILPAWKGTS...","[VAR001_primer5, VAR002_primer5, VAR010_primer...",[]
4,33,HAVTFVDNHDSQPGE,"[HAVTFVDSQDSQPGE, HAVTFVDSQESQPGE, HAVTLVDSQET...","[VAR001_primer33, VAR003_primer33, VAR005_prim...",[]
5,36,ALILTREQGYPSVFY,[ALILTREQGYPSVYY],[VAR001_primer36],[]
6,37,PSVFYGDYYGIPTHS,[PSVYYGDYYGIPTHS],[VAR001_primer37],[]
7,23,YTNTLNLDGFRIDAV,"[YTNTLNLDGFPIDAV, YTNTLNLDGFPIEAV, YTNTLNLDGFP...","[VAR004_primer23, VAR014_primer23, VAR033_prim...",[]
8,24,RIDAVKHIKFSFTRD,"[PIDAVKHIKFSFTRD, PIEAVKHIKFSFTRD, PIEGVKHIKFS...","[VAR004_primer24, VAR014_primer24, VAR033_prim...",[]
9,26,RNATGKGMFAVAEFW,"[RNATGKGMFAVAQFW, RNATGKGMFAVGQFW, RNATGKGMFAV...","[VAR005_primer26, VAR041_primer26, VAR082_prim...",[]


In [None]:
#a = Seq('HNGTNATMMQYFEWH', DNAAlphabet())
#transcribe("HNGTNATMMQYFEWH")
base_seq = Seq('HNGTNATMMQYFEWH', IUPAC.IUPACProtein())
base_seq.alphabet
#wt_aa_str = str(base_seq.translate())
#wt_aa_str

In [None]:
#new_primers_df['Named_seqs'] = []
#tmp_df = pd.DataFrame()
for index, rows in new_primers_df.iterrows():
    counter = len(rows['New_Primer_seq'])
    concat_seqs = ''
    for i in range(counter):
        named_seq = rows['Var_Primer_names'][i] + " - " + rows['New_Primer_seq'][i]
        #print(named_seq)
        concat_seqs = concat_seqs + ", " + named_seq
        #new_primers_df['Named_seqs'].append(named_seq)
        #tmp_df['Named_seqs'] 
    new_primers_df['Named_seqs'] = concat_seqs
new_primers_df

In [10]:
for i in range(new_primers_df.shape[0]):
        #If wt_primer_seq is diff from variant_primer
        #if wt_primer_df.loc[i][2] != var_primer_df.loc[i][2]:
        counter = len(new_primers_df.loc[i][3])
        for j in range(counter):
            named_seq = new_primers_df.loc[i][3][j] + " - " + new_primers_df.loc[i][2][j]
            new_primers_df.loc[i][4].append(named_seq)
      #  new_primer_df.loc[match_index][3].append(var_primer_df.loc[i][1])
    
new_primers_df

Unnamed: 0,Primer_no,WT_Primer_seq,New_Primer_seq,Var_Primer_names,Named_primers
0,1,HNGTNATMMQYFEWH,"[HNGTNATMMEYYEWH, HNGTNATMMEYYEMH, HNGTNATMMEY...","[VAR001_primer1, VAR003_primer1, VAR008_primer1]","[VAR008_primer1 - HNGTNATMMEYYTMY, VAR001_prim..."
1,2,YFEWHLPNDGNHWNR,"[YYEWHLPNDGNHWNR, YYEMHLPNDGNHWNR, YYTMYLPNDGN...","[VAR001_primer2, VAR003_primer2, VAR008_primer2]","[VAR008_primer2 - YYTMYLPNDGNHWNR, VAR001_prim..."
2,4,SNLRNRGITAIWIPP,"[SNLRNRGITAILIPP, SNLRNRGITAILILP, SNLRNRGITAI...","[VAR001_primer4, VAR002_primer4, VAR012_primer4]","[VAR012_primer4 - SNLRNRGITAILILR, VAR001_prim..."
3,5,IWIPPAWKGTSQNDV,"[ILIPPAWKGTSQNDV, ILILPAWKGTSQNDV, ILILPAWKGTS...","[VAR001_primer5, VAR002_primer5, VAR010_primer...","[VAR028_primer5 - ILILRANKAMDQSGN, VAR001_prim..."
4,33,HAVTFVDNHDSQPGE,"[HAVTFVDSQDSQPGE, HAVTFVDSQESQPGE, HAVTLVDSQET...","[VAR001_primer33, VAR003_primer33, VAR005_prim...","[VAR013_primer33 - HAVTLVASQETVPSQ, VAR001_pri..."
5,36,ALILTREQGYPSVFY,[ALILTREQGYPSVYY],[VAR001_primer36],"[VAR001_primer36 - ALILTREQGYPSVYY, VAR001_pri..."
6,37,PSVFYGDYYGIPTHS,[PSVYYGDYYGIPTHS],[VAR001_primer37],"[VAR001_primer37 - PSVYYGDYYGIPTHS, VAR001_pri..."
7,23,YTNTLNLDGFRIDAV,"[YTNTLNLDGFPIDAV, YTNTLNLDGFPIEAV, YTNTLNLDGFP...","[VAR004_primer23, VAR014_primer23, VAR033_prim...","[VAR041_primer23 - YTNTLNLDGFPLEGI, VAR004_pri..."
8,24,RIDAVKHIKFSFTRD,"[PIDAVKHIKFSFTRD, PIEAVKHIKFSFTRD, PIEGVKHIKFS...","[VAR004_primer24, VAR014_primer24, VAR033_prim...","[VAR087_primer24 - PLEGIQNYKFSFTRD, VAR004_pri..."
9,26,RNATGKGMFAVAEFW,"[RNATGKGMFAVAQFW, RNATGKGMFAVGQFW, RNATGKGMFAV...","[VAR005_primer26, VAR041_primer26, VAR082_prim...","[VAR092_primer26 - RNATGKGMFAVGQYL, VAR005_pri..."


In [11]:
new_primers_df.to_excel('Test.xlsx', sheet_name='test')