In [141]:
import pandas as pd
import numpy as np
from collections import Counter

### 1. Load the activity data for each drug in the dataset and set the filtering thresholds

In [142]:

# Each activity table is read using its first CSV column as a throwaway index
# and is then re-indexed by compound id ('chembl_id').
IC50 = pd.read_csv('data_IC50.csv', index_col=0).set_index('chembl_id')
Ki = pd.read_csv('data_Ki.csv', index_col=0).set_index('chembl_id')
Inhibition = pd.read_csv('data_Inhibition.csv', index_col=0).set_index('chembl_id')

# Distinct compounds that have an IC50 or a Ki record (Inhibition is not counted).
total = set(IC50.index.unique()) | set(Ki.index.unique())
print(len(total))

# Upper bounds compared against the per-pair activity value in the filtering cells.
# NOTE(review): this value is far above the 500-5000 cut-offs listed in the
# notes further down, so it presumably disables filtering at this stage - confirm.
FILTRO_IC50 = FILTRO_Ki = 831763771102671

1999


In [143]:
# Collapse every (drug, target) pair to a single IC50: the largest reported
# standard_value, i.e. the "worst" experiment for that pair.
filas_ic50 = []
for droga, sub_df in IC50.groupby('chembl_id'):
    for blanco, actividad in sub_df.groupby('target_chembl_id'):
        # Use min() instead of max() here to keep the best experiment.
        valor_umbral = max(actividad['standard_value'].tolist())
        # Written as "not <=" so a NaN value is dropped, never kept.
        if not (valor_umbral <= FILTRO_IC50):
            continue
        filas_ic50.append([droga, blanco, valor_umbral])

# Build the table and index it by "<drug>_<target>" so it can later be aligned
# with the Ki table on that key.
IC50_filtrado = (
    pd.DataFrame(filas_ic50, columns=['ChEMBL ID', 'target_chembl_id', 'IC50'])
    .assign(identificador=lambda t: t["ChEMBL ID"] + '_' + t["target_chembl_id"])
    .set_index('identificador')
)

In [144]:
# Report how many distinct drugs and targets remain after the IC50 step.
for etiqueta, columna in (
    ('Drugs after filtering: ', 'ChEMBL ID'),
    ('Targets after filtering: ', 'target_chembl_id'),
):
    print(etiqueta, IC50_filtrado[columna].nunique())

Drugs after filtering:  1848
Targets after filtering:  1177


In [145]:
# Collapse every (drug, target) pair to a single Ki: the largest reported
# standard_value, i.e. the "worst" experiment for that pair.
filas_ki = []
for droga, sub_df in Ki.groupby('chembl_id'):
    for blanco, actividad in sub_df.groupby('target_chembl_id'):
        # Use min() instead of max() here to keep the best experiment.
        valor_umbral = max(actividad['standard_value'].tolist())
        # Written as "not <=" so a NaN value is dropped, never kept.
        if not (valor_umbral <= FILTRO_Ki):
            continue
        filas_ki.append([droga, blanco, valor_umbral])

# Build the table and index it by "<drug>_<target>" so it can later be aligned
# with the IC50 table on that key.
Ki_filtrado = (
    pd.DataFrame(filas_ki, columns=['ChEMBL ID', 'target_chembl_id', 'Ki'])
    .assign(identificador=lambda t: t["ChEMBL ID"] + '_' + t["target_chembl_id"])
    .set_index('identificador')
)

print(Ki_filtrado)



                               ChEMBL ID target_chembl_id           Ki
identificador                                                         
CHEMBL1002_CHEMBL210          CHEMBL1002        CHEMBL210    5623.4100
CHEMBL100259_CHEMBL1997     CHEMBL100259       CHEMBL1997   51000.0000
CHEMBL100259_CHEMBL3509606  CHEMBL100259    CHEMBL3509606  242000.0000
CHEMBL100259_CHEMBL4502     CHEMBL100259       CHEMBL4502       0.0012
CHEMBL100259_CHEMBL5551     CHEMBL100259       CHEMBL5551    5200.0000
...                                  ...              ...          ...
CHEMBL9967_CHEMBL216          CHEMBL9967        CHEMBL216      25.0000
CHEMBL9967_CHEMBL245          CHEMBL9967        CHEMBL245     158.0000
CHEMBL998_CHEMBL1833           CHEMBL998       CHEMBL1833     156.0000
CHEMBL998_CHEMBL231            CHEMBL998        CHEMBL231     414.0000
CHEMBL998_CHEMBL240            CHEMBL998        CHEMBL240   32022.8000

[9004 rows x 3 columns]


In [146]:
# Report how many distinct drugs and targets remain after the Ki step.
for etiqueta, columna in (
    ('Drugs after filtering: ', 'ChEMBL ID'),
    ('Targets after filtering: ', 'target_chembl_id'),
):
    print(etiqueta, Ki_filtrado[columna].nunique())

Drugs after filtering:  1051
Targets after filtering:  657


### 2. Concatenate the filtered activity values

In [147]:
# Place the IC50 and Ki tables side by side, aligned on their shared
# 'identificador' ("<drug>_<target>") index. A pair present in only one table
# ends up with NaN in the other table's columns.
tabla_final = pd.concat([IC50_filtrado, Ki_filtrado], axis=1)

In [148]:
#tabla_final = tabla_final.replace({np.nan: 'n.d.'})

# uncomment to filter both values
# tabla_final = tabla_final.dropna()

In [149]:
# Keep only the two measurement columns. The 'ChEMBL ID' / 'target_chembl_id'
# columns contributed by both input tables are dropped here; the pair is still
# encoded in the 'identificador' index.
tabla_final = tabla_final[['IC50', 'Ki']]

In [150]:
# Turn the 'identificador' index back into a column and split it into its drug
# and target parts. Only the first '_' is treated as the separator.
tabla_final = tabla_final.reset_index()
# `n` and `expand` are passed by keyword: recent pandas versions accept only
# `pat` positionally in Series.str.split.
tabla_final[['droga', 'target']] = tabla_final['identificador'].str.split('_', n=1, expand=True)
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
tabla_final = tabla_final.drop(columns='identificador')
tabla_final = tabla_final[['droga', 'target', 'IC50', 'Ki']]

#tabla_final

In [151]:
# Write the merged drug/target/IC50/Ki table as a tab-separated file, without
# the positional index.
tabla_final.to_csv("drogas_filtradas.tsv", sep="\t", index=False)

In [152]:
# Report how many distinct drugs and targets the merged table contains.
for etiqueta, columna in (
    ('Drugs after filtering: ', 'droga'),
    ('Targets after filtering: ', 'target'),
):
    print(etiqueta, tabla_final[columna].nunique())

Drugs after filtering:  1999
Targets after filtering:  1279



Parte 1.

IC50 <= 500
Ki   <= 500
. # worst experiment
Drugs after filtering:  1148
Targets after filtering:  712


IC50 <= 1000
Ki   <= 1000
. # worst experiment
Drugs after filtering:  1253
Targets after filtering:  790


IC50 <= 2500
Ki   <= 2500
. # worst experiment
Drugs after filtering:  1348
Targets after filtering:  872


IC50 <= 5000
Ki   <= 5000
. # worst experiment
Drugs after filtering:  1429
Targets after filtering:  927



Parte 2.


IC50 <= 500
Ki   <= 500
best experiment
Drugs after filtering:  1243
Targets after filtering:  760


IC50 <= 1000
Ki   <= 1000
best experiment
Drugs after filtering:  1326
Targets after filtering:  839


IC50 <= 2500
Ki   <= 2500
best experiment
Drugs after filtering:  1409
Targets after filtering:  912


IC50 <= 5000
Ki   <= 5000
best experiment
Drugs after filtering:  1475
Targets after filtering:  963


In [153]:
tabla_final

Unnamed: 0,droga,target,IC50,Ki
0,CHEMBL100259,CHEMBL1997,52000.0,51000.0
1,CHEMBL100259,CHEMBL3509606,234000.0,242000.0
2,CHEMBL100259,CHEMBL5551,5800.0,5200.0
3,CHEMBL100259,CHEMBL5707,5400.0,3400.0
4,CHEMBL100259,CHEMBL5780,32000.0,31000.0
...,...,...,...,...
17620,CHEMBL98,CHEMBL5023,,20000.0
17621,CHEMBL981,CHEMBL4879,,1000.0
17622,CHEMBL99,CHEMBL2163182,,450000.0
17623,CHEMBL991,CHEMBL2883,,615000.0


In [154]:
# Sanity check: number of distinct drugs present in the merged table.
revisar = set(tabla_final.droga.to_list())
print(len(revisar))

#tabla_final.to_csv("drogas_filtradas.tsv", sep="\t", index=False)

1999


In [155]:
tabla_final

Unnamed: 0,droga,target,IC50,Ki
0,CHEMBL100259,CHEMBL1997,52000.0,51000.0
1,CHEMBL100259,CHEMBL3509606,234000.0,242000.0
2,CHEMBL100259,CHEMBL5551,5800.0,5200.0
3,CHEMBL100259,CHEMBL5707,5400.0,3400.0
4,CHEMBL100259,CHEMBL5780,32000.0,31000.0
...,...,...,...,...
17620,CHEMBL98,CHEMBL5023,,20000.0
17621,CHEMBL981,CHEMBL4879,,1000.0
17622,CHEMBL99,CHEMBL2163182,,450000.0
17623,CHEMBL991,CHEMBL2883,,615000.0


In [156]:
# Write the merged table as a tab-separated file. The same path is written
# again further down, after the 'fase' column has been added.
tabla_final.to_csv("droga_target_actividad_sin_filtrar.tsv", sep="\t", index=False)

In [157]:
# Number of distinct targets in the merged table.
print(tabla_final.target.nunique())

1279


In [158]:
# Read the semicolon-separated raw ChEMBL export and build a lookup from
# 'ChEMBL ID' to 'Max Phase'.
fases_csv = pd.read_csv('chembl_covid_raw.csv', sep=';')
chembl_raw_data = dict(zip(fases_csv['ChEMBL ID'], fases_csv['Max Phase']))


In [159]:
# Annotate every row with the max phase of its drug, looked up by 'droga' in
# chembl_raw_data. dict.get returns None for a drug missing from the lookup,
# so such rows get no phase instead of raising a KeyError.
tabla_final['fase'] = tabla_final.apply(lambda x: chembl_raw_data.get(x['droga']), axis=1)


In [160]:
# Overwrite the TSV written earlier to this same path; the table now also
# carries the 'fase' column.
tabla_final.to_csv("droga_target_actividad_sin_filtrar.tsv", sep="\t", index=False)

In [161]:
# Count drugs per max phase. Each drug is reduced to the mean of its 'fase'
# values; the mean is taken directly rather than through describe(), which
# would also compute count/std/quantiles per group only to discard them.
# NOTE(review): 'fase' is looked up per drug, so it is presumably constant
# within a group and the mean equals that value - confirm.
fase_por_droga = tabla_final.groupby('droga')['fase'].mean()
print('Max phase distribution: ')
print(Counter(fase_por_droga.values))

Max phase distribution: 
Counter({4.0: 1325, 2.0: 294, 3.0: 255, 1.0: 125})


In [162]:
tabla_final

Unnamed: 0,droga,target,IC50,Ki,fase
0,CHEMBL100259,CHEMBL1997,52000.0,51000.0,3
1,CHEMBL100259,CHEMBL3509606,234000.0,242000.0,3
2,CHEMBL100259,CHEMBL5551,5800.0,5200.0,3
3,CHEMBL100259,CHEMBL5707,5400.0,3400.0,3
4,CHEMBL100259,CHEMBL5780,32000.0,31000.0,3
...,...,...,...,...,...
17620,CHEMBL98,CHEMBL5023,,20000.0,4
17621,CHEMBL981,CHEMBL4879,,1000.0,4
17622,CHEMBL99,CHEMBL2163182,,450000.0,1
17623,CHEMBL991,CHEMBL2883,,615000.0,4


In [163]:
# Keep only the rows whose drug has 'fase' equal to 4.
es_fase4 = tabla_final['fase'] == 4
tabla_final_fase4 = tabla_final[es_fase4]
tabla_final_fase4

Unnamed: 0,droga,target,IC50,Ki,fase
5,CHEMBL1003,CHEMBL6020,1000000.0,,4
6,CHEMBL1006,CHEMBL234,1929.0,655.0,4
7,CHEMBL1009,CHEMBL1743128,133000.0,,4
8,CHEMBL1009,CHEMBL1841,2146.0,,4
9,CHEMBL1009,CHEMBL1973,8400000.0,,4
...,...,...,...,...,...
17619,CHEMBL978,CHEMBL1821,,1600.0,4
17620,CHEMBL98,CHEMBL5023,,20000.0,4
17621,CHEMBL981,CHEMBL4879,,1000.0,4
17623,CHEMBL991,CHEMBL2883,,615000.0,4


In [164]:
tabla_final_fase4.target.nunique()

1012

In [165]:
# Keep a pair when at least one of its measurements is <= 5000. A missing
# (NaN) measurement never satisfies the comparison on its own.
cumple_ic50 = tabla_final_fase4['IC50'] <= 5000
cumple_ki = tabla_final_fase4['Ki'] <= 5000
drogas_filtradas = tabla_final_fase4[cumple_ic50 | cumple_ki]

In [166]:
drogas_filtradas

Unnamed: 0,droga,target,IC50,Ki,fase
6,CHEMBL1006,CHEMBL234,1929.00,655.0,4
8,CHEMBL1009,CHEMBL1841,2146.00,,4
11,CHEMBL1009,CHEMBL258,3729.00,,4
18,CHEMBL101,CHEMBL221,3000.00,,4
35,CHEMBL1014,CHEMBL1697668,724.44,400.0,4
...,...,...,...,...,...
17613,CHEMBL95,CHEMBL218,,1000.0,4
17614,CHEMBL95,CHEMBL220,,225.0,4
17615,CHEMBL95,CHEMBL253,,1000.0,4
17619,CHEMBL978,CHEMBL1821,,1600.0,4


In [169]:
# Report how many distinct drugs and targets pass the activity threshold.
for etiqueta, columna in (
    ('Drugs after filtering: ', 'droga'),
    ('Targets after filtering: ', 'target'),
):
    print(etiqueta, drogas_filtradas[columna].nunique())

Drugs after filtering:  859
Targets after filtering:  671


ChEMBL ID Name_Drug Type

CHEMBL1319139            BROMHEXINE HYDROCHLORIDE DAVID
--> CHEMBL219916             DOMPERIDONE DAVID
--> CHEMBL1292               CLOFAZIMINE DAVID
--> CHEMBL19215              METERGOLINE DAVID
--> CHEMBL1642               IMATINIB MESYLATE DAVID
--> CHEMBL941                IMATINIB DAVID
CHEMBL404849             SULOCTIDIL DAVID
CHEMBL1475252            TENATOPRAZOLE DAVID

In [170]:
# Save the filtered drug/target pairs as a tab-separated file, without the
# positional index.
drogas_filtradas.to_csv("droga_target_filtrada.tsv", sep="\t", index=False)


---> Agregar manualmente drogas de Tabata y David <---
---> Agregar manualmente drogas de Tabata y David <---
---> Agregar manualmente drogas de Tabata y David <---

In [197]:
# Reload the filtered table from disk, replacing the in-memory frame.
# NOTE(review): the notebook notes say drugs are added to this file by hand
# before it is re-read; the rows shown with empty IC50/Ki/fase are presumably
# those manual additions - confirm. A fresh "Run All" will not reproduce them.
drogas_filtradas = pd.read_csv('droga_target_filtrada.tsv', sep="\t")
drogas_filtradas

Unnamed: 0,droga,target,IC50,Ki,fase
0,CHEMBL1006,CHEMBL234,1929.00,655.0,4.0
1,CHEMBL1009,CHEMBL1841,2146.00,,4.0
2,CHEMBL1009,CHEMBL258,3729.00,,4.0
3,CHEMBL101,CHEMBL221,3000.00,,4.0
4,CHEMBL1014,CHEMBL1697668,724.44,400.0,4.0
...,...,...,...,...,...
5013,CHEMBL404849,CHEMBL340,,,
5014,CHEMBL404849,CHEMBL3577,,,
5015,CHEMBL404849,CHEMBL1075138,,,
5016,CHEMBL404849,CHEMBL2903,,,


### Generate attribute files for target-organism and drug-interaction (Human, Virus, HumanVirus)

In [198]:
# Label every target with its organism: the two listed ids are coronavirus
# proteins, every other target is labelled as human.
coronavirus = ['CHEMBL4295557', 'CHEMBL4523582']  # ids checked in UniProt
targets = list(drogas_filtradas.target.unique())
orgs = {
    blanco: ('Coronavirus' if blanco in coronavirus else 'Human')
    for blanco in targets
}

# Persist the target -> organism attribute table.
organismo = pd.DataFrame(orgs.items(), columns=['target', 'label'])
organismo.to_csv("organismo.tsv", sep="\t", index=False)

In [199]:
# Attach the organism label of each row's target. This requires `orgs` to be
# the target -> organism dict built in the previous cell.
drogas_filtradas['target_organism'] = drogas_filtradas.apply(lambda x: orgs.get(x['target']), axis=1)

In [200]:
drogas_filtradas

Unnamed: 0,droga,target,IC50,Ki,fase,target_organism
0,CHEMBL1006,CHEMBL234,1929.00,655.0,4.0,Human
1,CHEMBL1009,CHEMBL1841,2146.00,,4.0,Human
2,CHEMBL1009,CHEMBL258,3729.00,,4.0,Human
3,CHEMBL101,CHEMBL221,3000.00,,4.0,Human
4,CHEMBL1014,CHEMBL1697668,724.44,400.0,4.0,Human
...,...,...,...,...,...,...
5013,CHEMBL404849,CHEMBL340,,,,Human
5014,CHEMBL404849,CHEMBL3577,,,,Human
5015,CHEMBL404849,CHEMBL1075138,,,,Human
5016,CHEMBL404849,CHEMBL2903,,,,Human


In [201]:
# Group by drug and collect the distinct organisms of its targets; the
# resulting per-drug arrays are parsed in a later cell.
droga_organismo = (
    drogas_filtradas[['droga', 'target_organism']]
    .groupby(['droga'])['target_organism']
    .unique()
)

In [202]:
droga_organismo

droga
CHEMBL1006    [Human]
CHEMBL1009    [Human]
CHEMBL101     [Human]
CHEMBL1014    [Human]
CHEMBL1017    [Human]
               ...   
CHEMBL989     [Human]
CHEMBL990     [Human]
CHEMBL995     [Human]
CHEMBL9967    [Human]
CHEMBL998     [Human]
Name: target_organism, Length: 867, dtype: object

In [203]:
# Classify every drug by the organisms of its targets:
#   human and coronavirus targets -> 'DrugHumanVirus'
#   human targets only            -> 'Drug_Human'
#   no human target               -> 'Drug_Virus'
# NOTE(review): 'DrugHumanVirus' has no underscores, unlike the other two
# labels. It is left unchanged because consumers of interaccion.tsv may depend
# on the exact strings.
interaccion = {}
for droga in droga_organismo.index:
    organismos = droga_organismo.loc[droga]
    # Joined into a local name on purpose: the original rebound the global
    # `orgs` (the target -> organism dict) to this string, which broke any
    # re-run of the cell that calls orgs.get(...).
    organismos_txt = ' '.join(organismos)
    if 'Human' in organismos_txt:
        if 'Coronavirus' in organismos_txt:
            anotacion = 'DrugHumanVirus'
        else:
            anotacion = 'Drug_Human'
    else:
        anotacion = 'Drug_Virus'

    interaccion[droga] = anotacion

# Persist the drug -> interaction-type attribute table.
interaccion = pd.DataFrame(interaccion.items(), columns=['droga', 'label'])
interaccion.to_csv("interaccion.tsv", sep="\t", index=False)

In [204]:
organismo

Unnamed: 0,target,label
0,CHEMBL234,Human
1,CHEMBL1841,Human
2,CHEMBL258,Human
3,CHEMBL221,Human
4,CHEMBL1697668,Human
...,...,...
677,CHEMBL1977,Human
678,CHEMBL1293232,Human
679,CHEMBL2457,Human
680,CHEMBL4096,Human
