In [1]:
import pandas as pd
from pandas import Series, DataFrame
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
import sys
import json
sys.path.append("./dbaasp_api_helper_libraries/python")
sys.path.append("./dbaasp_api_helper_libraries/python/request")
%load_ext autoreload
%autoreload 2
import APICaller,Complexity, FormatType, LookupType, MathOperationTypes
import random
import numpy as np
import os
# Root directory that the pickle paths below are built from (folder + "pickles/...").
# NOTE(review): hard-coded absolute path; adjust it when running on another machine.
folder = "/data/AIpep-clean/"

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Seed both the standard-library and the NumPy global random generators with
# the same value so that reruns of the notebook draw identical numbers.
SEED = 12456789
random.seed(SEED)
np.random.seed(SEED)

In [89]:
def float_ignore_plus_minus(mynumber):
    """Convert a numeric string that may contain a "±" term into a float.

    The string is split on "±", every piece is converted with ``float`` and
    the pieces are summed, so ``"5±1"`` gives ``6.0`` and ``"12.5"`` gives
    ``12.5``.

    NOTE(review): the name suggests the "±" term is dropped, but the code adds
    it to the central value. Behaviour is kept as is; confirm it is intended.

    Args:
        mynumber: text such as ``"12.5"`` or ``"5±1"``.

    Returns:
        The summed value, or ``float("inf")`` when the input cannot be parsed
        (non-numeric text, empty string, or an object without ``split``).
    """
    try:
        return sum(map(float, mynumber.split("±")))
    except Exception:
        # Only ordinary errors mean "unparseable". A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit and make long runs unstoppable.
        return float("inf")

def _parse_concentration(raw):
    """Reduce a raw concentration string to a single float.

    Spaces are removed, "–" and "->" become "-", and "," becomes "." before
    parsing. Handled forms:

    * an NA spelling                  -> ``float("inf")``
    * ``<x``, ``<=x``, ``>x``, ``>=x`` -> ``x`` (the bound itself)
    * ``a-b``                         -> mean of ``a`` and ``b``
    * anything else                   -> ``float_ignore_plus_minus(text)``

    Text that cannot be parsed (including an empty string or a lone "<" / ">")
    yields ``float("inf")`` because ``float_ignore_plus_minus`` does so.
    """
    na_spellings = ("NA", "na", "Na", "nA", "N/A", "n/a")

    text = raw.replace(" ", "")
    text = text.replace("–", "-")
    text = text.replace("->", "-")
    text = text.replace(",", ".")

    if text in na_spellings:
        return float("inf")

    # Slices instead of indexing so that "" and one-character strings do not
    # raise IndexError; a tuple is used because `"" in "<>"` would be True.
    if text[:1] in ("<", ">"):
        bound = text[2:] if text[1:2] == "=" else text[1:]
        return float_ignore_plus_minus(bound)

    if "-" in text:
        # Only the first two pieces of the range are used.
        pieces = text.split("-")
        return (float_ignore_plus_minus(pieces[0]) + float_ignore_plus_minus(pieces[1])) / 2

    return float_ignore_plus_minus(text)


def hemolytic_data(identifier):
    """Fetch the hemolysis measurements on human erythrocytes for one peptide.

    A ``PeptideCardRequest`` is issued for ``identifier`` in JSON format and
    the ``peptideCard["hemoliticCytotoxicActivities"]`` list of the response is
    scanned. Only entries whose ``targetCell`` is ``'Human erythrocytes'`` are
    kept.

    Args:
        identifier: peptide ID assigned to the request's ``peptide_id``.

    Returns:
        A list of ``[concentration, unit, lysis]`` lists, where
        ``concentration`` is a float (``inf`` when missing or unparseable),
        ``unit`` is the unit string taken unchanged from the response and
        ``lysis`` is the number in front of "%" in the ``lysis`` field.
        The list is empty when the response carries an ``errorCode`` or has no
        activity section. If an unexpected error occurs, the entries collected
        up to that point are returned.
    """
    results = []
    required_keys = ("unit", "concentration", "targetCell", "lysis")

    try:
        peptideCardRequest = APICaller.PeptideCardRequest()
        peptideCardRequest.peptide_id = identifier
        peptideCardRequest.format = FormatType.FormatType.JSON
        dbaasp_peptide = json.loads(peptideCardRequest.request())

        if 'errorCode' in dbaasp_peptide:
            return []
        # The key spelling ("hemolitic") is the one the response uses.
        if "hemoliticCytotoxicActivities" not in dbaasp_peptide["peptideCard"]:
            return []

        activities = dbaasp_peptide["peptideCard"]["hemoliticCytotoxicActivities"]
        for activity in activities:
            # Every field must be present. The former chained `and` expression
            # only tested "lysis", so an entry lacking e.g. "unit" raised
            # KeyError and silently discarded all following entries.
            if not all(key in activity for key in required_keys):
                print(activities)
                continue

            if activity["targetCell"] != 'Human erythrocytes':
                continue

            lysis = float(activity["lysis"].split("%")[0])
            unit = activity["unit"]
            concentration = _parse_concentration(activity["concentration"])

            results.append([concentration, unit, lysis])

        return results
    except Exception:
        # Best effort: keep what was parsed so far. `Exception` rather than a
        # bare except, so KeyboardInterrupt/SystemExit still stop the run.
        return results

In [20]:
# Load the hemolysis-annotated dataset from its pickle when that file already
# exists. Otherwise read the base dataset, look up every ID with
# hemolytic_data in parallel, and write the annotated frame to that pickle.
if os.path.exists(folder+"pickles/DAASP_RNN_dataset_with_hemolysis.plk"):
    dataset_actives_inactives = pd.read_pickle(folder + "pickles/DAASP_RNN_dataset_with_hemolysis.plk")
else:
    dataset_actives_inactives = pd.read_pickle(folder + "pickles/DAASP_RNN_dataset.plk")
    dataset_actives_inactives["hemolysis"] = dataset_actives_inactives.ID.parallel_map(hemolytic_data)
    dataset_actives_inactives.to_pickle(folder + "pickles/DAASP_RNN_dataset_with_hemolysis.plk")

In [88]:
# Spot-check: show the [concentration, unit, lysis] entries returned for ID 11.
hemolytic_data(11)

[[6.0, 'µM', 50.0],
 [4.0, 'µM', 15.6],
 [8.0, 'µM', 23.1],
 [16.0, 'µM', 33.1],
 [32.0, 'µM', 43.6],
 [64.0, 'µM', 74.3],
 [128.0, 'µM', 87.4]]

In [8]:
# Length of each row's "hemolysis" list, i.e. how many entries hemolytic_data kept.
dataset_actives_inactives["len_hemolysis"] = dataset_actives_inactives["hemolysis"].map(len)

In [9]:
# Number of rows that have at least one hemolysis entry.
len(dataset_actives_inactives[dataset_actives_inactives["len_hemolysis"]>0])

2571

In [10]:
# Total number of rows in the dataset, for comparison with the count above.
len(dataset_actives_inactives)

9548

In [92]:
from rdkit import Chem
from rdkit.Chem.rdmolfiles import MolFromFASTA, MolToSmiles, MolFromSmiles
from rdkit.Chem import Descriptors
import tmap as tm
from map4 import MAP4Calculator

def seq_to_smiles(seq):
    """Return the isomeric SMILES of the molecule RDKit builds from a FASTA string.

    NOTE(review): ``flavor`` is passed as ``True``; RDKit documents it as an
    integer selector, so confirm which value is intended.
    NOTE(review): a function with this name is defined a second time further
    down in the same cell.
    """
    return MolToSmiles(
        MolFromFASTA(seq, flavor=True, sanitize=True),
        isomericSmiles=True,
    )

def MW(smiles):
    """Return ``Descriptors.ExactMolWt`` of the molecule parsed from ``smiles``."""
    return Descriptors.ExactMolWt(Chem.MolFromSmiles(smiles))

def isnothemolytic(row):
    """Classify one dataset row from its hemolysis measurements.

    ``row.hemolysis`` is a list of ``[concentration, unit, lysis]`` entries.
    Each concentration is brought to µM before it is compared:

    * ``"µg/ml"`` is divided by ``MW / 1000`` (molecular weight from
      ``row.Sequence``),
    * ``"nM"`` is divided by 1000,
    * any other unit is used unchanged.

    The entries are scanned in order and the first decisive one wins:

    * lysis < 20 at a concentration >= 50  -> ``1`` (not hemolytic)
    * lysis >= 20                          -> ``0`` (hemolytic)

    Returns:
        ``1`` or ``0`` as above, ``-1`` when the row has no measurements, and
        ``-2`` when no measurement is decisive.
    """
    measurements = row.hemolysis
    if len(measurements) == 0:
        return -1

    # The molecular weight requires building the molecule from the sequence,
    # so compute it only when a mass-based unit is actually met, and only once.
    mw = None

    for data in measurements:
        value = data[0]
        unit = data[1]
        hem = data[2]

        if unit == "µg/ml":
            if mw is None:
                mw = MW(seq_to_smiles(row.Sequence))
            conc = value / (mw / 1000)
        elif unit == "nM":
            # Previously compared against the µM threshold without conversion.
            conc = value / 1000
        else:
            conc = value

        if hem < 20 and conc >= 50:
            return 1
        elif hem >= 20:
            return 0
    return -2

# NOTE(review): this redefines seq_to_smiles, which is already declared earlier
# in this cell with the same behaviour; one of the two definitions can be dropped.
def seq_to_smiles(seq):
    """Build a molecule from the FASTA string ``seq`` and return its isomeric SMILES."""
    peptide = MolFromFASTA(seq, flavor=True, sanitize=True)
    return MolToSmiles(peptide, isomericSmiles=True)

# Single MAP4Calculator instance (dimensions=1024) reused by calc_map4 for every molecule.
MAP4 = MAP4Calculator(dimensions=1024)
def calc_map4(smiles):
    """Return the MAP4 fingerprint of ``smiles`` as a NumPy array.

    The SMILES string is parsed with ``Chem.MolFromSmiles`` and passed to the
    module-level ``MAP4`` calculator.
    """
    fingerprint = MAP4.calculate(Chem.MolFromSmiles(smiles))
    return np.array(fingerprint)

In [75]:
# Label every row with isnothemolytic (row-wise apply, axis=1).
dataset_actives_inactives["isNotHemolytic"] = dataset_actives_inactives.apply(isnothemolytic, axis=1) 

In [94]:
# Convert every Sequence to a SMILES string with seq_to_smiles, using parallel_map.
dataset_actives_inactives["smiles"] = dataset_actives_inactives.Sequence.parallel_map(seq_to_smiles)

In [95]:
# Compute the MAP4 fingerprint array for every SMILES string with calc_map4, using parallel_map.
dataset_actives_inactives["MAP4"] = dataset_actives_inactives.smiles.parallel_map(calc_map4)

In [76]:
# Write the frame back to the pickle path that the loading cell checks for.
# NOTE(review): the execution counts (In [76] here vs In [94]/In [95] for the
# smiles/MAP4 cells) suggest this ran before those columns existed; re-run
# this cell last so they are persisted.
dataset_actives_inactives.to_pickle(folder + "pickles/DAASP_RNN_dataset_with_hemolysis.plk")