In [1]:
import sys
sys.path.append("./dbaasp_api_helper_libraries/python")
sys.path.append("./dbaasp_api_helper_libraries/python/request")
%load_ext autoreload
%autoreload 2

In [2]:
import json
import pandas as pd
from pandas import Series, DataFrame
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
import APICaller,Complexity, FormatType, LookupType, MathOperationTypes
import random
import numpy as np
import os
folder = "/data/AIpep-clean/"

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Seed both global RNGs (NumPy's and the standard library's) once, up front.
# The scrambling / fragment helpers further down draw from `random`, and SEED is
# also passed explicitly as `random_state` to the DataFrame.sample calls.
SEED = 12456789
np.random.seed(SEED)
random.seed(SEED)

In [3]:
def print_terminus(T_list, T_acc):
    """Print, one per line and in input order, each item of ``T_list`` missing from ``T_acc``.

    Duplicates in ``T_list`` are printed once per occurrence. Nothing is returned.
    """
    not_accepted = (terminus for terminus in T_list if terminus not in T_acc)
    for terminus in not_accepted:
        print(terminus)

In [4]:
# One-letter codes of the 20 amino acids accepted as "natural" by is_natural().
aminoacids = ["A","C","D","E","F","G","H","I","L","M","N","P","K","Q","R","S","T","V","W","Y"]
def is_natural(seq):
    """Return True when ``seq`` is a string made only of the letters in ``aminoacids``.

    The comparison is case-insensitive (the sequence is upper-cased first).
    Anything that is not a string (e.g. the float NaN pandas uses for missing
    values, or None) yields False. An empty string yields True, because it
    contains no disallowed letter.

    The previous implementation relied on a bare ``except:`` to reject
    non-string input, which also swallowed KeyboardInterrupt/SystemExit; the
    type is now checked explicitly instead.
    """
    if not isinstance(seq, str):
        return False
    return all(aa in aminoacids for aa in seq.upper())

In [5]:
def float_ignore_plus_minus(mynumber):
    """Convert a concentration string such as ``"5"`` or ``"5±1"`` to a float.

    The string is split on ``"±"`` and the parts are summed, so ``"5±1"``
    evaluates to ``6.0``.
    NOTE(review): despite the function name, the ± term is *added* rather than
    ignored. This is kept as-is because the thresholds downstream were applied
    to these values; confirm whether the central value alone was intended.

    Returns ``float("inf")`` when the input is not a string or any part cannot
    be parsed as a number (including the empty string).
    """
    try:
        return sum(map(float, mynumber.split("±")))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: not a string; ValueError: non-numeric part.
        # Only these are treated as "unparseable"; KeyboardInterrupt and
        # other unrelated errors are no longer swallowed.
        return float("inf")

# Spellings that are treated as "no concentration reported".
_NA_STRINGS = ["NA", "na", "Na", "nA", "N/A", "n/a"]
# A target-activity record is only usable when every one of these keys is present.
_REQUIRED_ACTIVITY_KEYS = ("unit", "concentration", "targetSpecies", "activityMeasure")
# Cut-off per supported unit: below it a measurement counts as active, above it as inactive.
_ACTIVITY_THRESHOLDS = {"µM": 10, "nM": 10000, "µg/ml": 32}


def _fetch_target_activities(identifier):
    """Request the peptide card for ``identifier`` and return its ``targetActivities`` list.

    Returns an empty list when the response carries an ``errorCode`` or the
    card has no ``targetActivities`` entry. Request/JSON errors propagate to
    the caller.
    """
    peptideCardRequest = APICaller.PeptideCardRequest()
    peptideCardRequest.peptide_id = identifier
    peptideCardRequest.format = FormatType.FormatType.JSON
    dbaasp_peptide = json.loads(peptideCardRequest.request())
    if 'errorCode' in dbaasp_peptide:
        return []
    if "targetActivities" not in dbaasp_peptide["peptideCard"]:
        return []
    return dbaasp_peptide["peptideCard"]["targetActivities"]


def _clean_concentration(raw):
    """Apply the normalisation shared by is_active and is_inactive.

    Removes spaces and turns the en dash and the ``->`` arrow into a plain hyphen.
    """
    return raw.replace(" ", "").replace("–", "-").replace("->", "-")


def _strip_comparator(text):
    """Drop a leading one-character comparator and an optional ``=`` right after it."""
    # Slicing (instead of indexing) keeps a lone "<" or ">" from raising IndexError.
    return text[2:] if text[1:2] == "=" else text[1:]


def _parse_concentration(text, greater_than_is_inf):
    """Turn a normalised, non-empty concentration string into a float.

    * ``<x`` / ``<=x``  -> x
    * NA spellings      -> inf
    * ``>x`` / ``>=x``  -> inf when ``greater_than_is_inf`` else x
    * ``a-b``           -> mean of the first two hyphen-separated parts
    * anything else     -> float_ignore_plus_minus(text)

    Unparseable numbers become ``inf`` (behaviour of float_ignore_plus_minus).
    """
    if text.startswith("<"):
        return float_ignore_plus_minus(_strip_comparator(text))
    if text in _NA_STRINGS:
        return float("inf")
    if text.startswith(">"):
        if greater_than_is_inf:
            return float("inf")
        return float_ignore_plus_minus(_strip_comparator(text))
    if "-" in text:
        low, high = text.split("-")[:2]
        return (float_ignore_plus_minus(low) + float_ignore_plus_minus(high)) / 2
    return float_ignore_plus_minus(text)


def is_active(identifier):
    """Collect the measurements of peptide ``identifier`` that fall below the activity cut-offs.

    Returns a list of ``[concentration, unit, targetSpecies, activityMeasure]``
    for every record whose unit is µM, nM or µg/ml and whose parsed
    concentration is strictly below 10, 10000 or 32 respectively. ``>x`` and NA
    concentrations are treated as infinite, i.e. never active.

    Records missing any required key, or with an empty concentration, are
    skipped. (Previously only ``activityMeasure`` was actually checked, so a
    record without ``unit`` aborted the loop and dropped every later record.)

    Best effort: if the request or the parsing fails, whatever was collected so
    far is returned (an empty list when the request itself failed).
    """
    results = []
    try:
        for specie in _fetch_target_activities(identifier):
            if not all(key in specie for key in _REQUIRED_ACTIVITY_KEYS):
                continue
            unit = specie["unit"]

            concentration_str = _clean_concentration(specie["concentration"])
            # NOTE(review): here a comma is a decimal separator only after a 0
            # ("0,5" -> "0.5") and a thousands separator otherwise, whereas
            # is_inactive turns every comma into a dot. Kept as-is; confirm
            # which convention the data really uses.
            concentration_str = concentration_str.replace("0,", "0.").replace(",", "")
            if not concentration_str:
                continue

            concentration = _parse_concentration(concentration_str, greater_than_is_inf=True)
            threshold = _ACTIVITY_THRESHOLDS.get(unit)
            if threshold is not None and concentration < threshold:
                results.append([concentration, unit, specie["targetSpecies"], specie["activityMeasure"]])
            # Records in unsupported units, or not below the cut-off, are ignored.

        return results
    except Exception:
        return results


def is_inactive(identifier):
    """Collect the measurements of peptide ``identifier`` that lie above the activity cut-offs.

    Returns a list of ``[concentration, unit, targetSpecies, activityMeasure]``
    for every record in µM, nM or µg/ml whose parsed concentration is strictly
    above 10, 10000 or 32 respectively. ``>x`` / ``>=x`` is parsed as x.

    The peptide is rejected outright (empty list) as soon as one record in a
    supported unit is *not* above its cut-off, or a record in an unsupported
    unit has an NA concentration. Records in unsupported units with a real
    value are ignored.

    NOTE(review): NA or unparseable concentrations in a supported unit parse to
    ``inf`` and therefore count as inactive evidence. Kept from the original;
    confirm this is intended.

    Records missing any required key, or with an empty concentration, are
    skipped. (Previously only ``activityMeasure`` was actually checked.)

    Best effort: if the request or the parsing fails, whatever was collected so
    far is returned.
    """
    results = []
    try:
        for specie in _fetch_target_activities(identifier):
            if not all(key in specie for key in _REQUIRED_ACTIVITY_KEYS):
                continue
            unit = specie["unit"]

            concentration_str = _clean_concentration(specie["concentration"])
            # Every comma is read as a decimal separator here (differs from is_active).
            concentration_str = concentration_str.replace(",", ".")
            if not concentration_str:
                continue

            concentration = _parse_concentration(concentration_str, greater_than_is_inf=False)
            threshold = _ACTIVITY_THRESHOLDS.get(unit)
            if threshold is not None and concentration > threshold:
                results.append([concentration, unit, specie["targetSpecies"], specie["activityMeasure"]])
            elif threshold is None and concentration_str not in _NA_STRINGS:
                # Unsupported unit with a real value: neither evidence for nor against.
                continue
            else:
                # A measurement at or below the cut-off (or NA in an unknown unit)
                # disqualifies the peptide as a confirmed inactive.
                return []

        return results
    except Exception:
        return results
    

# Actives from DBAASP dataset

In [6]:
# Build (or load from cache) the table of DBAASP peptides with at least one measurement
# that is_active() accepts. The result has one row per (peptide, accepted measurement).
# The expensive part is one API request per peptide, hence the pickle cache at db_path.
db_path = folder+"pickles/daasp_with_activities.pkl"
if not os.path.exists(db_path):
    
    # Temporarily switch pandarallel to 64 workers without progress bar for the API calls.
    pandarallel.initialize(nb_workers=64, progress_bar=False)
    # read
    # NOTE(review): this path is relative to the working directory, unlike the other
    # inputs/outputs which are built from `folder` - confirm that is intended.
    df_dbaasp = pd.read_csv("data/DBAASP_nointrabond.csv", sep =';')
    
    # remove seq duplicates
    df_dbaasp = df_dbaasp.drop_duplicates("Sequence").copy()

    # check that N and C terminus are absent or ACT (acetyl) and AMD (amide) respectively
    # Column names get underscores so they can be referenced inside DataFrame.query;
    # the termini are cast to str so that a missing value becomes the literal 'nan'.
    cols = df_dbaasp.columns
    cols = cols.map(lambda x: x.replace(' ', '_'))
    df_dbaasp.columns = cols
    df_dbaasp["C_terminus"]= df_dbaasp["C_terminus"].map(str)
    df_dbaasp["N_terminus"]= df_dbaasp["N_terminus"].map(str)
    df_dbaasp = df_dbaasp.query(" ((N_terminus ==  'nan' or N_terminus ==  'ACT') and (C_terminus ==  'nan' or C_terminus == 'AMD'))")

    # only natural aminoacid sequences are kept
    df_dbaasp['isNatural'] = df_dbaasp.Sequence.map(is_natural)
    df_dbaasp = df_dbaasp.loc[df_dbaasp['isNatural'] == True].copy()
    del df_dbaasp["isNatural"]

    # check that entries have an associated target with activity below 10 µM, 10000 nM and 32 µg/ml
    # is_active returns, per peptide ID, a list of [concentration, unit, targetSpecies, activityMeasure].
    df_dbaasp['isActive'] = df_dbaasp.ID.parallel_map(is_active)
    # Explode that list: one column per hit, then stack so each hit becomes its own
    # entry, and drop the hit-number level so the index is the peptide's row label again.
    # NOTE(review): the positional `1` lands on Series.apply's second parameter
    # (convert_dtype in the pandas versions that have it) - confirm it is intended.
    s = df_dbaasp["isActive"].apply(Series,1).stack()
    s.index = s.index.droplevel(-1)
    s.name = "isActive"
    df_dbaasp = df_dbaasp.copy(deep=True)
    del df_dbaasp["isActive"]
    # Expand every 4-item hit into four columns and join them back on the row label;
    # peptides with several hits are repeated, peptides without hits get NaN.
    df_dbaasp = df_dbaasp.join(s.apply(lambda x: Series(x)))
    # Positional rename: assumes the CSV has exactly the five columns named first here,
    # in this order, followed by the four joined hit columns - TODO confirm.
    df_dbaasp.columns = ['ID', 'Name', 'N terminus', 'Sequence', 'C terminus', "concentration", "unit", "targetSpecies", "activityMeasure"]
    # Drops the peptides for which is_active returned no hit (their hit columns are NaN).
    df_dbaasp = df_dbaasp.dropna(subset = ['Sequence',"concentration", "unit", "targetSpecies", "activityMeasure"]).copy()

    df_dbaasp.to_pickle(db_path)
    # Restore the notebook-wide pandarallel settings (default workers, progress bar on).
    pandarallel.initialize(progress_bar=True)

else:
    df_dbaasp = pd.read_pickle(db_path)

In [None]:
# NOTE(review): this cell has no execution count and only re-reads the pickle that the
# previous cell has already loaded (or just written); it looks like a leftover.
df_dbaasp = pd.read_pickle(db_path)

In [7]:
# Collapse the per-measurement rows to one row per peptide; "targetSpecies" becomes the
# list of all species for which that peptide had an accepted (active) measurement.
# NOTE(review): groupby drops rows whose key columns contain NaN by default, so a
# peptide without a Name would be lost here - confirm against the data.
group = df_dbaasp.groupby(["ID","Name", "N terminus", "Sequence", "C terminus"], as_index=False)
actives = pd.DataFrame(group["targetSpecies"].aggregate(list))

In [8]:
def kills_bacteria(row, bacteria):
    """Tell whether ``bacteria`` occurs as a substring of any entry in ``row.targetSpecies``.

    ``row`` only needs a ``targetSpecies`` attribute holding an iterable of
    strings. Returns False for an empty iterable. The function itself performs
    no activity check; what a match means depends on how the rows were built.
    """
    return any(bacteria in species_name for species_name in row.targetSpecies)

In [9]:
# One boolean column per pathogen keyword: True when the keyword occurs in any of the
# peptide's target-species strings (row-wise, via pandarallel's parallel_apply).
for bacteria in ["baumannii", "aureus", "aeruginosa"]:
    actives[bacteria] =  actives.parallel_apply(lambda x: kills_bacteria(x, bacteria), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=597), Label(value='0 / 597'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=597), Label(value='0 / 597'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=597), Label(value='0 / 597'))), HB…

In [10]:
# Class label: every row of `actives` is a positive example (inactives get 0 later on).
actives["activity"] = 1

In [11]:
# Plain list of the active sequences. It is bound as the default `actives_list` argument
# of the sampling helpers defined below, so this cell has to run before those definitions.
actives_list = actives.Sequence.to_list()

# Inactives dataset

## Inactives from DBAASP dataset

In [12]:
# Build (or load from cache) the table of DBAASP peptides that is_inactive() accepts,
# one row per (peptide, accepted measurement). Same pipeline as for the actives, with
# is_inactive in place of is_active and a separate pickle cache.
# Note that this cell rebinds `db_path` and `df_dbaasp`, which the actives cells also use.
db_path = folder+"pickles/daasp_inactives_with_activities.pkl"
if not os.path.exists(db_path):
    
    # Temporarily switch pandarallel to 64 workers without progress bar for the API calls.
    pandarallel.initialize(nb_workers=64, progress_bar=False)
    # read
    # NOTE(review): this path is relative to the working directory, unlike the other
    # inputs/outputs which are built from `folder` - confirm that is intended.
    df_dbaasp = pd.read_csv("data/DBAASP_nointrabond.csv", sep =';')
    
    # remove seq duplicates
    df_dbaasp = df_dbaasp.drop_duplicates("Sequence").copy()

    # check that N and C terminus are absent or ACT (acetyl) and AMD (amide) respectively
    # Column names get underscores so they can be referenced inside DataFrame.query;
    # the termini are cast to str so that a missing value becomes the literal 'nan'.
    cols = df_dbaasp.columns
    cols = cols.map(lambda x: x.replace(' ', '_'))
    df_dbaasp.columns = cols
    df_dbaasp["C_terminus"]= df_dbaasp["C_terminus"].map(str)
    df_dbaasp["N_terminus"]= df_dbaasp["N_terminus"].map(str)
    df_dbaasp = df_dbaasp.query(" ((N_terminus ==  'nan' or N_terminus ==  'ACT') and (C_terminus ==  'nan' or C_terminus == 'AMD'))")

    # only natural aminoacid sequences are kept
    df_dbaasp['isNatural'] = df_dbaasp.Sequence.map(is_natural)
    df_dbaasp = df_dbaasp.loc[df_dbaasp['isNatural'] == True].copy()
    del df_dbaasp["isNatural"]

    # check that entries have associated targets with activity above 10 µM, 10000 nM and 32 µg/ml
    # (is_inactive returns an empty list as soon as one supported-unit measurement is not above its cut-off)
    df_dbaasp['isInactive'] = df_dbaasp.ID.parallel_map(is_inactive)
    # Explode the per-peptide list of [concentration, unit, targetSpecies, activityMeasure]:
    # one column per hit, stacked to one entry per hit, index reduced to the row label.
    # NOTE(review): the positional `1` lands on Series.apply's second parameter
    # (convert_dtype in the pandas versions that have it) - confirm it is intended.
    s = df_dbaasp["isInactive"].apply(Series,1).stack()
    s.index = s.index.droplevel(-1)
    s.name = "isInactive"
    df_dbaasp = df_dbaasp.copy(deep=True)
    del df_dbaasp["isInactive"]
    # Expand every 4-item hit into four columns and join them back on the row label;
    # peptides with several hits are repeated, peptides without hits get NaN.
    df_dbaasp = df_dbaasp.join(s.apply(lambda x: Series(x)))
    # Positional rename: assumes the CSV has exactly the five columns named first here,
    # in this order, followed by the four joined hit columns - TODO confirm.
    df_dbaasp.columns = ['ID', 'Name', 'N terminus', 'Sequence', 'C terminus', "concentration", "unit", "targetSpecies", "activityMeasure"]
    # Drops the peptides for which is_inactive returned no hit (their hit columns are NaN).
    df_dbaasp = df_dbaasp.dropna(subset = ['Sequence',"concentration", "unit", "targetSpecies", "activityMeasure"]).copy()

    df_dbaasp.to_pickle(db_path)
    # Restore the notebook-wide pandarallel settings (default workers, progress bar on).
    pandarallel.initialize(progress_bar=True)

else:
    df_dbaasp = pd.read_pickle(db_path)

In [13]:
# One row per confirmed-inactive peptide; "targetSpecies" becomes the list of species
# with an accepted (inactive) measurement. `group` is rebound here (also used for actives).
# NOTE(review): groupby drops rows whose key columns contain NaN by default, so a
# peptide without a Name would be lost here - confirm against the data.
group = df_dbaasp.groupby(["ID","Name", "N terminus", "Sequence", "C terminus"], as_index=False)
inactives_confirmed = pd.DataFrame(group["targetSpecies"].aggregate(list))

In [14]:
# Same keyword columns as for the actives. Despite the helper's name, for these rows
# True means an *inactive* measurement against a species containing the keyword was listed.
for bacteria in ["baumannii", "aureus", "aeruginosa"]:
    inactives_confirmed[bacteria] =  inactives_confirmed.parallel_apply(lambda x: kills_bacteria(x, bacteria), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=234), Label(value='0 / 234'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=234), Label(value='0 / 234'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=234), Label(value='0 / 234'))), HB…

In [15]:
# Size the synthetic inactive sets so that confirmed + scrambled + swissprot inactives
# add up exactly to the number of actives: the gap is split in half, with the odd
# remainder (if any) going to the Swiss-Prot fragments.
# NOTE(review): if there were more confirmed inactives than actives both counts would
# become negative; nothing here guards against that.
number_confirmed_actives = len(actives)
number_confirmed_inactives = len(inactives_confirmed)
number_scrambled_inactives = (number_confirmed_actives - number_confirmed_inactives) // 2
number_swissprot_inactives = number_confirmed_actives - (number_confirmed_inactives + number_scrambled_inactives)

## Inactives from scrambled actives dataset

In [16]:
def scramble(seq):
    """Return the characters of ``seq`` in random order (a full shuffle).

    Characters are drawn one at a time from the remaining pool with
    ``random.choice`` (one draw per character), so the result is a permutation
    of ``seq`` and may coincide with ``seq`` itself.
    """
    pool = list(seq)
    drawn = []
    for _ in range(len(seq)):
        residue = random.choice(pool)
        pool.remove(residue)
        drawn.append(residue)
    return ''.join(drawn)

def scramble_less(seq):
    """Shuffle ``seq`` in chunks of one or two characters.

    A chunk size of 1 or 2 is picked at random, ``seq`` is cut into consecutive
    chunks of that size (the last one may be shorter), and the chunks are
    re-joined in random order. The result is a permutation of ``seq`` and can
    be identical to it (always so when there is only a single chunk).
    """
    chunk_size = random.choice([1,2])
    chunks = [seq[start:start+chunk_size] for start in range(0, len(seq), chunk_size)]
    reordered = []
    for _ in range(len(chunks)):
        chunk = random.choice(chunks)
        chunks.remove(chunk)
        reordered.append(chunk)
    return ''.join(reordered)

def new_inactive_scrambled(row, actives_list = actives_list, max_attempts = 1000):
    """Make one scrambled decoy from an active peptide.

    Parameters
    ----------
    row : mapping with "Sequence" and "ID" entries (e.g. a DataFrame row).
    actives_list : container of sequences the decoy must not be equal to.
        The default is bound to the notebook-level list when this function is defined.
    max_attempts : how many scrambles to try before giving up.

    Returns
    -------
    A one-row DataFrame with columns "ID" (``scr_<original ID>``) and "Sequence".
    If no scramble outside ``actives_list`` is found within ``max_attempts``
    tries, an empty DataFrame with the same two columns is returned.

    The attempt limit fixes an endless loop: for a sequence whose every chunk
    shuffle is itself an active (a homopolymer such as "KKKK", or a sequence of
    length 1), the old ``while seq in actives_list`` never terminated.
    """
    for _ in range(max_attempts):
        seq = scramble_less(row["Sequence"])
        if seq not in actives_list:
            cid = "scr_{}".format(row["ID"])
            return pd.DataFrame({"ID":[cid], "Sequence":[seq]})
    # Nothing usable: contribute zero rows so pd.concat in the caller still works.
    return pd.DataFrame({"ID":[], "Sequence":[]})

In [17]:
# sample actives to make scrambled inactives until you have enough unique scrambled sequences

def get_scrambled_samples(df_actives, n, SEED=SEED):
    """Draw ``n`` actives (seeded by ``SEED``) and return one scrambled decoy frame for them.

    Each sampled row goes through ``new_inactive_scrambled``; the per-row frames
    are concatenated and re-indexed from 0. Duplicated sequences are not removed here.
    """
    sampled_actives = df_actives.sample(n, random_state=SEED).reset_index(drop = True)
    per_row_frames = sampled_actives.apply(new_inactive_scrambled, axis = 1).tolist()
    return pd.concat(per_row_frames).reset_index(drop = True)

# Keep drawing scrambled decoys, with a fresh seed per round, until there are
# number_scrambled_inactives of them with pairwise distinct sequences.
SEED_counter = SEED
inactives_scrambled = get_scrambled_samples(actives, number_scrambled_inactives).drop_duplicates("Sequence")
while len(inactives_scrambled) < number_scrambled_inactives:
    SEED_counter += 1
    new_samples = get_scrambled_samples(actives, number_scrambled_inactives-len(inactives_scrambled) , SEED=SEED_counter).drop_duplicates("Sequence")
    # The de-duplicated frame has to be assigned back: previously its result was thrown
    # away, so sequences repeated across rounds stayed in and still counted towards the
    # target. pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    inactives_scrambled = (
        pd.concat([inactives_scrambled, new_samples])
        .drop_duplicates("Sequence")
        .reset_index(drop = True)
    )

## Inactives from Swissprot dataset

In [18]:
def random_fragment(longseq, length):
    """Cut a random contiguous fragment of ``length`` characters out of ``longseq``.

    The start position is drawn uniformly among those that leave room for a
    full-length fragment. Previously it was drawn over the whole sequence, so a
    start near the end silently produced a fragment shorter than requested
    (down to a single residue).

    If ``longseq`` is shorter than ``length`` the whole sequence is returned.
    A ``length`` of zero or less yields an empty string.

    Raises ValueError when ``longseq`` is empty (as before, where it came from
    ``random.randrange(0)``).
    """
    if not longseq:
        raise ValueError("cannot take a fragment of an empty sequence")
    span = max(0, min(length, len(longseq)))
    index1 = random.randrange(len(longseq) - span + 1)
    return longseq[index1:index1 + span]

def new_inactive_swissprot(row, df_actives, df_inactives, actives_list = actives_list):
    """Make one decoy by cutting a fragment out of a Swiss-Prot protein.

    ``row`` is an integer position: the active at that position in
    ``df_actives`` supplies the fragment length and the ID suffix, the protein
    at the same position in ``df_inactives`` supplies the sequence to cut from.
    Fragments are redrawn until one is found that is not in ``actives_list``
    (default bound to the notebook-level list at definition time).

    Returns a one-row DataFrame with "ID" (``frag_<protein length>_<active ID>``)
    and "Sequence".
    """
    active_row = df_actives.iloc[row]
    protein_row = df_inactives.iloc[row]
    protein_seq = protein_row["Sequence"]
    wanted_length = len(active_row["Sequence"])

    fragment = random_fragment(protein_seq, wanted_length)
    while fragment in actives_list:
        fragment = random_fragment(protein_seq, wanted_length)

    fragment_id = "frag_{}_{}".format(len(protein_seq), active_row["ID"])
    return pd.DataFrame({"ID":[fragment_id], "Sequence":[fragment]})

# Parse the Swiss-Prot FASTA file into {header line without '>': full sequence}.
newvalues_dictionary = {}
cid = None   # header of the record currently being read (None before the first header)
seq = ''     # sequence lines of that record, concatenated

with open(folder+"data/uniprot_sprot.fasta") as inFile:
    for line in inFile:
        line = line.strip()
        if not line:
            # Blank lines carry no data; indexing line[0] on them used to raise IndexError.
            continue
        if line[0] == ">":
            # A new header closes the previous record, if there was one.
            if cid is not None:
                newvalues_dictionary[cid] = seq
            cid = line.replace(">", "")
            seq = ""
        else:
            seq+=line

# The last record has no following header to close it; store it explicitly.
# (Previously it was silently dropped.)
if cid is not None:
    newvalues_dictionary[cid] = seq

# Keep only proteins made of the 20 natural amino acids; `length` is the sequence length.
swissprot = pd.DataFrame(list(newvalues_dictionary.items()), columns=['ID', 'Sequence']) 
swissprot['length'] = swissprot.Sequence.map(len)
swissprot["isNatural"] = swissprot["Sequence"].map(is_natural)
swissprot = swissprot[swissprot['isNatural'] == True].reset_index(drop = True)

In [19]:
# generate inactives picking fragments from uniprot
# sample actives to make uniprot inactives until you have enough unique uniprot sequences

def get_uniprot_samples(df_actives, n, SEED=SEED):
    """Draw ``n`` actives and ``n`` Swiss-Prot proteins (both seeded by ``SEED``) and pair them up.

    For each position i, ``new_inactive_swissprot`` cuts a fragment from the i-th
    sampled protein with the length of the i-th sampled active. The one-row
    frames are concatenated and re-indexed from 0. Duplicates are not removed here.
    """
    sampled_actives = df_actives.sample(n, random_state=SEED).reset_index(drop = True)
    sampled_proteins = swissprot.sample(n, random_state=SEED).reset_index(drop = True)
    fragment_frames = []
    for position in range(n):
        fragment_frames.append(new_inactive_swissprot(position, sampled_actives, sampled_proteins))
    return pd.concat(fragment_frames).reset_index(drop = True)

# Keep drawing Swiss-Prot fragments, with a fresh seed per round, until enough of them
# are new. n_of_samples is the number of distinct sequences across actives, scrambled
# decoys and fragments, minus the row counts of the first two, i.e. the number of
# fragment sequences that collide with nothing already present.
SEED_counter = SEED
inactives_swissprot = get_uniprot_samples(actives, number_swissprot_inactives).drop_duplicates("Sequence")
n_of_samples = (len(pd.concat([actives, inactives_scrambled, inactives_swissprot]).Sequence.unique()) - len(actives) - len(inactives_scrambled))
while n_of_samples < number_swissprot_inactives:
    SEED_counter += 1
    new_samples = get_uniprot_samples(actives, number_swissprot_inactives - n_of_samples, SEED=SEED_counter).drop_duplicates("Sequence")
    # Assign the de-duplicated frame back: previously the result of drop_duplicates was
    # discarded, so fragments repeated across rounds stayed in the final set.
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    inactives_swissprot = (
        pd.concat([inactives_swissprot, new_samples])
        .drop_duplicates("Sequence")
        .reset_index(drop = True)
    )
    n_of_samples = (len(pd.concat([actives, inactives_scrambled, inactives_swissprot]).Sequence.unique()) - len(actives) - len(inactives_scrambled))

# Finalize and save dataset

In [20]:
# all inactives together (confirmed, scrambled and Swiss-Prot fragments); columns that
# only the confirmed set has are left as NaN for the synthetic rows
inactives = pd.concat([inactives_confirmed, inactives_scrambled, inactives_swissprot]).reset_index(drop=True)

inactives["activity"]=0

# assign to training or test set inactives: everything starts as "test",
# then a seeded 75% sample is relabelled "training"
inactives["Set"] = "test"
training_inactives = inactives.sample(frac=0.75, random_state=SEED)
inactives.loc[training_inactives.index, 'Set'] = "training"

# assign to training or test set actives (same 75/25 split, done separately per class)
actives["Set"] = "test"
training_actives = actives.sample(frac=0.75, random_state=SEED)
actives.loc[training_actives.index, 'Set'] = "training"

# actives and inactives together and saved
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x;
# the row order (inactives first, then actives) is unchanged.
dataset_actives_inactives = pd.concat([inactives, actives]).reset_index(drop=True)
dataset_actives_inactives.to_csv(folder + "data/DAASP_RNN_dataset.csv", index=False)
# NOTE(review): the ".plk" extension looks like a typo for ".pkl"; left untouched
# because other code may read the file under this exact name.
dataset_actives_inactives.to_pickle(folder + "pickles/DAASP_RNN_dataset.plk")