In [1]:
import pandas as pd
import json
import sys
sys.path.append("../../../ddt/")
from utility import FeatureGenerator
import numpy as np
from tqdm import *
import multiprocessing as mp
from functools import partial

In [2]:
# Load the model information table (path is relative to the notebook's working directory).
df = pd.read_csv('../../model_info.csv')
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,id,name,tdl,fam,famext,uniprot_name,description,uniprot,seq,sym
0,23,3 beta-hydroxysteroid dehydrogenase/Delta 5-->...,Tchem,Enzyme,3-beta-HSD,3BHS1_HUMAN,3 beta-hydroxysteroid dehydrogenase/Delta 5-->...,P14060,MTGWSCLVTGAGGFLGQRIIRLLVKEKELKEIRVLDKAFGPELREE...,HSD3B1
1,26,5-hydroxytryptamine receptor 2B,Tclin,GPCR,GPCR,5HT2B_HUMAN,5-hydroxytryptamine receptor 2B,P41595,MALSYRVSELQSTIPEHILQSTFVHVISSNWSGLQTESIPEEMKQI...,HTR2B
2,27,5-hydroxytryptamine receptor 2C,Tclin,GPCR,GPCR,5HT2C_HUMAN,5-hydroxytryptamine receptor 2C,P28335,MVNLRNAVHSFLVHLIGLLVWQCDISVSPVAAIVTDIFNTSDGGRF...,HTR2C
3,30,5'-nucleotidase,Tchem,Enzyme,,5NTD_HUMAN,5'-nucleotidase,P21589,MCPRAARAPATLLLALGAVLWPAAGAWELTILHTNDVHSRLEQTSE...,NT5E
4,39,Adenosine receptor A3,Tchem,GPCR,GPCR,AA3R_HUMAN,Adenosine receptor A3,P0DMS8,MPNNSTALSLANVTYITMEIFIGLCAIVGNVLVICVVKLNPSLQTT...,ADORA3


In [3]:
# The number of rows in `df` is reported as the number of models.
print("Total model count: {}".format(len(df)))

Total model count: 864


In [4]:
# df.name.apply(lambda x: model_names[x])

In [2]:
# Load the two JSON lookup tables. The `with` blocks close each file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open('../../uniprot_to_protein_name.json', 'r') as json_file:
    uniprot_to_model = json.load(json_file)
with open('../../model_to_uniprot.json', 'r') as json_file:
    model_names = json.load(json_file)

In [136]:
# Read two columns (positions 1 and 6) of the Pharos activity export and discard
# rows with missing values in the same expression. The preview below shows
# `target_id` as the index and `smiles` as the remaining column.
pharos_drugs = pd.read_csv(
    "pharos_drug_activity.csv",
    usecols=[1, 6],
    index_col=[0],
).dropna()
pharos_drugs.head()

Unnamed: 0_level_0,smiles
target_id,Unnamed: 1_level_1
11838,CCCCC1(CCCC)CN(C2=CC=CC=C2)C2=C(C=C(OCC(=O)N[C...
3299,CC(C)(CO)C1=CC2=C(C=C(F)C(NC(=O)C3(CC3)C3=CC=C...
6977,[H]C(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)NC(=O)[C@@H...
13260,COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(...
13069,COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(...


In [16]:
def get_features(row):
    """Compute the features of one row via ``FeatureGenerator.extract_tpatf()``.

    Parameters
    ----------
    row : mapping
        Anything indexable with ``'smiles'`` (e.g. a DataFrame row); the value
        is passed to ``FeatureGenerator.load_smiles``.

    Returns
    -------
    The second element returned by ``extract_tpatf()``, or ``None`` when the
    extraction raises an ordinary exception (best-effort behaviour, so one bad
    molecule does not abort a whole batch).
    """
    smiles = row['smiles']
    generator = FeatureGenerator()
    generator.load_smiles(smiles)
    try:
        _, features = generator.extract_tpatf()
    except Exception:
        # Deliberately narrower than a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still stop a running worker.
        return None
    return features


def parallelize(data, func, num_of_processes=mp.cpu_count()):
    """Split ``data`` into chunks, map ``func`` over them in a process pool, and
    concatenate the per-chunk results.

    Parameters
    ----------
    data : array-like or pandas object
        Split with ``np.array_split`` into ``num_of_processes`` pieces.
    func : callable
        Called once per piece inside a worker; must return a pandas object
        accepted by ``pd.concat``.
    num_of_processes : int
        Number of worker processes and of chunks (defaults to the CPU count
        evaluated when this function was defined).

    Returns
    -------
    The ``pd.concat`` of all per-chunk results, in chunk order.

    Raises
    ------
    Whatever ``func`` raises in a worker is re-raised here by ``pool.map``.
    """
    chunks = np.array_split(data, num_of_processes)
    pool = mp.Pool(num_of_processes)
    try:
        results = pool.map(func, chunks)
    finally:
        # Release the worker processes even when a worker raised; previously
        # the pool was left open on failure.
        pool.close()
        pool.join()
    return pd.concat(results)

def run_on_subset(func, data_subset):
    """Apply ``func`` to each row of ``data_subset`` and return the per-row results."""
    row_results = data_subset.apply(func, axis=1)
    return row_results

def parallelize_on_rows(data, func, num_of_processes=mp.cpu_count()):
    """Run the row-wise ``func`` over ``data`` using ``parallelize``.

    ``func`` is bound into ``run_on_subset`` so that each worker applies it to
    every row of the chunk it receives.
    """
    row_worker = partial(run_on_subset, func)
    return parallelize(data, row_worker, num_of_processes)

In [17]:
# Compute features for every row across worker processes; get_features() returns
# None for rows whose extraction raised, so those rows end up with a missing value.
pharos_drugs['features'] = parallelize_on_rows(pharos_drugs, get_features)

In [18]:
# Drop rows with missing values -- in particular rows whose `features` is None
# because get_features() swallowed an extraction error. Mutates the frame in place.
pharos_drugs.dropna(inplace=True)

In [23]:
# Persist the frame, including the `features` column, as CSV.
# NOTE(review): CSV stores each `features` value as text, so the arrays presumably
# do not round-trip as numbers from this file -- confirm before reading it back.
pharos_drugs.to_csv("pharos_drugs_features.csv")

In [104]:
# Flatten every row's feature array to a plain list and stack the lists into
# one NumPy array (one entry per row of `pharos_drugs`).
f = np.array(list(map(lambda arr: arr.flatten().tolist(), pharos_drugs.features)))

In [105]:
# np.save appends the ".npy" suffix, so this writes "features.npy" -- the file
# that a later cell reads back with np.load('features.npy').
np.save('features', f)

In [137]:
# Write the current `pharos_drugs` frame (index included) to CSV.
pharos_drugs.to_csv("pharos_drugs.csv")

In [5]:
import random
import pandas as pd
import numpy as np
import sys
from sklearn.externals import joblib
import os
import json
import warnings
import time
warnings.filterwarnings("ignore")

# Directory containing the model index file read by get_models().
home_dir = '../..'
# Directory holding the serialized model files opened by get_models().
MODELS_DIR = '../../uniprot_models'
# Tag for Python-version-specific artifacts: '27' on Python 2, '35' otherwise.
# It selects the index file name and is embedded in the predictions CSV name.
PY_VERSION = '27' if sys.version_info[0] < 3 else '35'

In [7]:
# Load the models
def get_models():
    """Yield ``(model_name, model)`` for every model listed in the index file.

    The index file is ``<home_dir>/py<PY_VERSION>_uniprot_models.txt``. Only its
    first line is used: a comma-separated list of file names inside
    ``MODELS_DIR``. Models are loaded lazily, one per iteration, so only one
    model file is open at a time.

    Yields
    ------
    (str, object)
        The model file name (surrounding whitespace removed) and the object
        returned by ``joblib.load`` for that file.

    Raises
    ------
    IOError / OSError
        If the index file or a listed model file cannot be opened.
    """
    index_path = os.path.join(home_dir, 'py' + PY_VERSION + '_uniprot_models.txt')
    # Read just the first line and close the handle right away (it used to leak).
    with open(index_path, 'r') as index_file:
        first_line = index_file.readline()

    for raw_name in first_line.split(','):
        # Strip the trailing newline/spaces; otherwise the last entry of the line
        # would point at a non-existent file. Blank entries are skipped.
        model = raw_name.strip()
        if not model:
            continue
        with open(os.path.join(MODELS_DIR, model), 'rb') as f:
            # NOTE: joblib.load unpickles its input -- only load trusted model files.
            yield model, joblib.load(f)  # , mmap_mode='r+')

In [8]:
def get_prediction(features, confidence=0.9):
    """Return the names of all models whose score for one sample exceeds ``confidence``.

    Parameters
    ----------
    features : array-like
        Feature input for a single sample, passed unchanged to each model's
        ``predict`` / ``predict_proba``.
    confidence : float, optional
        Threshold a model's score must exceed to be reported (default 0.9,
        the value that used to be hard-coded).

    Returns
    -------
    list
        Model names, in the order produced by ``get_models()``, whose score is
        strictly greater than ``confidence``.

    Raises
    ------
    ValueError
        If a model returns scores for anything other than exactly one sample;
        the per-model decision is defined for a single sample only.

    Notes
    -----
    Every call iterates ``get_models()`` and therefore loads each model again.
    """
    actives = []
    for model_name, model in get_models():
        if type(model).__name__ == "SVC":
            # SVC models are scored with predict().
            # NOTE(review): predict() yields class labels, so the threshold
            # presumably relies on 0/1 labels -- confirm.
            scores = model.predict(features).reshape((-1, 1))[:, 0]
        else:
            # Other estimators: use the second column of predict_proba().
            scores = model.predict_proba(features).reshape((-1, 2))[:, 1]

        # Make the single-sample assumption explicit instead of relying on the
        # truth value of a one-element array.
        if scores.shape[0] != 1:
            raise ValueError(
                "get_prediction expects exactly one sample, got {}".format(scores.shape[0])
            )
        if scores[0] > confidence:
            actives.append(model_name)

    return actives

In [9]:
# Register tqdm's pandas integration, which provides the `progress_apply`
# method used on `pharos_drugs.features` further down. The bare
# tqdm_notebook() call creates a progress-bar widget (the HBox output below).
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))




In [114]:
# Reload the feature array that was written earlier with np.save('features', f).
features = np.load('features.npy')

In [110]:
# Score one sample at a time: feat[np.newaxis, :] turns each row into a
# (1, n_features) input, as get_prediction() handles a single sample per call.
# NOTE: every get_prediction() call iterates get_models(), which re-reads each
# model file from disk, so this loop performs (samples x models) model loads.
predictions = [get_prediction(feat[np.newaxis, :]) for feat in features]

In [113]:
# Attach the predictions positionally. Building the Series on the frame's own
# index avoids label alignment: a default 0..n-1 index would be matched against
# the frame's index labels instead of row positions. It also fails loudly
# (ValueError) if the number of predictions differs from the number of rows.
# pd.Series(...) replaces Series.from_array, which newer pandas no longer provides.
pharos_drugs['actives'] = pd.Series(predictions, index=pharos_drugs.index)

dtype('O')

In [10]:
# Alternative path: score each row's `features` value directly with a progress
# bar, then write the frame to a CSV whose name carries PY_VERSION.
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: could not convert string to float", i.e. the `features` values
# were strings at that point -- presumably after a CSV round trip; confirm that
# `features` holds numeric arrays before re-running.
pharos_drugs['actives'] = pharos_drugs.features.progress_apply(get_prediction)
pharos_drugs.to_csv('pharos_drugs_actives_'+PY_VERSION+'.csv')

HBox(children=(IntProgress(value=0, max=3669), HTML(value=u'')))

ValueError: could not convert string to float: [[24. 55. 10. ...  0.  0.  0.]]

In [142]:
# Reload the saved predictions for the '27' (Python 2) run; the file name is
# hard-coded rather than built from PY_VERSION.
# No index_col is passed, so previously saved index columns come back as
# ordinary "Unnamed: 0*" columns (visible in the preview below).
pharos_drugs =pd.read_csv("pharos_drugs_actives_27.csv")

In [143]:
# Preview the reloaded predictions; `actives` holds the text form of each list.
pharos_drugs.head()

Unnamed: 0.1,Unnamed: 0,target_id,smiles,actives
0,0,11838,CCCCC1(CCCC)CN(C2=CC=CC=C2)C2=C(C=C(OCC(=O)N[C...,['Q8WUI4']
1,1,3299,CC(C)(CO)C1=CC2=C(C=C(F)C(NC(=O)C3(CC3)C3=CC=C...,['P35916']
2,2,6977,[H]C(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)NC(=O)[C@@H...,[]
3,3,13260,COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(...,"['P42336', 'P42338']"
4,4,13069,COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(...,"['P42336', 'P42338']"
