In [None]:
import pandas as pd
# import seaborn as sns

In [None]:
# Load the table of fluorescent-protein names; later cells read its "Name"
# column to look up each protein's sequence.
df = pd.read_csv("fpbase_names.csv")

In [None]:
# Preview the first rows of the freshly loaded table.
df.head()

In [None]:
import urllib.request
import concurrent.futures

In [None]:
import tqdm
import json

def protein_name2url(protein_name):
    """Return the FPbase protein-page URL derived from a protein name.

    The slug is the text before the first space, lower-cased, with any
    parentheses removed.
    """
    first_token = protein_name.split(' ')[0]
    # Drop both kinds of parenthesis in one pass.
    slug = first_token.lower().translate(str.maketrans('', '', '()'))
    return f"https://www.fpbase.org/protein/{slug}/"

def get_seq(name, timeout=30):
    """Look up a protein's sequence through the FPbase REST API.

    Only the part of ``name`` before the first space is used for the
    case-insensitive ``name__iexact`` lookup.

    Args:
        name: Protein name as listed in the input table.
        timeout: Seconds to wait for the HTTP request (passed to ``urlopen``).

    Returns:
        The ``seq`` field of the first matching record, or ``None`` when the
        name is not a string, nothing matches, or the request/parsing fails.
        Failures are printed rather than raised so that one bad entry does
        not abort a long batch of lookups.
    """
    from urllib.parse import quote

    # Missing names (e.g. NaN read from the CSV) have no .split(); report them
    # like any other unavailable entry instead of crashing the whole batch.
    if not isinstance(name, str):
        print(f"{name}: seq not available")
        return None

    name = name.split(' ')[0]
    try:
        # Percent-encode the name so characters such as '&', '+', '#' or
        # non-ASCII letters cannot corrupt the query string.
        url = (
            "https://www.fpbase.org/api/proteins/"
            f"?name__iexact={quote(name, safe='')}&format=json"
        )
        with urllib.request.urlopen(url, timeout=timeout) as conn:
            response = conn.read()
        json_dict = json.loads(response.decode("utf-8"))
        if len(json_dict) < 1:
            print(f"{name}: seq not available")
            return None
        return json_dict[0]['seq']
    except Exception as ex:
        # Best-effort lookup: log the failure and carry on with the next name.
        print(f'|exception {ex}, name {name}')
        return None
      
# Fetch one sequence per row, sequentially and with a progress bar; names that
# could not be resolved yield None.
seqs = [get_seq(protein) for protein in tqdm.tqdm(df["Name"])]
df["Seq"] = seqs

# Parallel variant, left disabled:
# with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
#     df["Seq"] = list(executor.map(get_seq,df["Name"]))

In [None]:
# Preview the table now that the "Seq" column has been filled in.
df.head()

In [None]:
# Keep only the proteins whose sequence lookup succeeded, printing the table
# size before and after. The explicit .copy() makes the result an independent
# frame, so the columns assigned in later cells do not raise
# SettingWithCopyWarning.
print(df.shape)
df = df[df['Seq'].notnull()].copy()
print(df.shape)

In [None]:
# Keep only the columns used from here on. The columns to remove are passed
# via `columns=` because pandas 2.x no longer accepts the axis as a positional
# argument to drop().
keep_cols = ['Name','Brightness','Quantum Yield','Stokes Shift (nm)','Seq']
df = df.drop(columns=df.columns.difference(keep_cols))
df.head()

In [None]:
import re
# Codon -> amino-acid table written as "codon letter" pairs; '_' marks a stop
# codon. Pair order is significant because it fixes the dict insertion order.
_CODON_TABLE = """
    ATA I  ATC I  ATT I  ATG M
    ACA T  ACC T  ACG T  ACT T
    AAC N  AAT N  AAA K  AAG K
    AGC S  AGT S  AGA R  AGG R
    CTA L  CTC L  CTG L  CTT L
    CCA P  CCC P  CCG P  CCT P
    CAC H  CAT H  CAA Q  CAG Q
    CGA R  CGC R  CGG R  CGT R
    GTA V  GTC V  GTG V  GTT V
    GCA A  GCC A  GCG A  GCT A
    GAC D  GAT D  GAA E  GAG E
    GGA G  GGC G  GGG G  GGT G
    TCA S  TCC S  TCG S  TCT S
    TTC F  TTT F  TTA L  TTG L
    TAC Y  TAT Y  TAA _  TAG _
    TGC C  TGT C  TGA _  TGG W
""".split()
codon2amino = dict(zip(_CODON_TABLE[0::2], _CODON_TABLE[1::2]))

# Reverse lookup. Several codons encode the same amino acid, so the codon
# listed last in the table is the one that is kept for each letter.
amino2codon = dict(zip(codon2amino.values(), codon2amino.keys()))

# Regex alternation over the (escaped) amino-acid letters, plus the matching
# replacement table keyed by the escaped letter.
rep = {re.escape(amino): codon for amino, codon in amino2codon.items()}
pattern = re.compile("|".join(rep))


def translate2codon(string):
    """Replace every known amino-acid letter in `string` with its codon.

    Characters that are not keys of `amino2codon` are left unchanged.
    """
    def _to_codon(match):
        return rep[re.escape(match.group(0))]

    return pattern.sub(_to_codon, string)

In [None]:
# Back-translate every protein sequence into codons and store the result in a
# new column.
df['codonSeq'] = df['Seq'].map(translate2codon)
df.head()

In [None]:
# Move the Stokes shift values under a unit-less column name; the new column
# is appended as the last column of the frame.
df['Stokes Shift'] = df.pop('Stokes Shift (nm)')
df.head()

In [None]:
import editdistance
# Take avGFP as the reference and score every sequence by its edit distance to
# the avGFP sequence.
avGFP_seq = df.loc[df['Name'] == 'avGFP', 'Seq'].iloc[0]
df["num_mutations"] = df['Seq'].map(lambda seq: editdistance.eval(avGFP_seq, seq))
df.head()

In [None]:
# Save the final table. No `index=` argument is given, so pandas' default
# applies and the row index is written as the first, unnamed column.
df.to_csv("fpbase_sequences.csv")