### 1. Load the activity files, which contain drug-target pairs and their bioactivity values, so that they can be merged

In [41]:
import pandas as pd
import numpy as np
from collections import Counter

In [54]:
# Load both activity tables and re-index each one by the drug's ChEMBL id.

## IC50
df = pd.read_csv('data_IC50.csv', index_col=0)
IC50 = df.set_index('chembl_id')

## Ki
df = pd.read_csv('data_Ki.csv', index_col=0)
Ki = df.set_index('chembl_id')

# Distinct drugs that appear in at least one of the two tables.
total = set(IC50.index.unique()).union(Ki.index.unique())

print('Total drugs: ', len(total))



Total drugs:  1999


### 2. Because different experiments were performed on the same target, we can select either the best or the worst activity value for each drug-target pair. Here, the worst experimental value (the maximum activity value) was selected for both IC50 and Ki.

In [55]:
# Collapse repeated IC50 experiments into one value per (drug, target) pair.
# 'chembl_id' is the index of IC50 and 'target_chembl_id' is a column; groupby
# accepts both kinds of key in the same list.
# The pandas aggregation skips NaN measurements, whereas the builtin max() on a
# Python list gives an order-dependent answer when a NaN is present.
IC50_selected = (
    IC50.groupby(['chembl_id', 'target_chembl_id'])['standard_value']
    .max()  # worst experiment; switch to .min() for the best experiment
    .reset_index()
    .rename(columns={'chembl_id': 'ChEMBL ID', 'standard_value': 'IC50'})
)

# Key every row by "<drug>_<target>" so the IC50 and Ki tables can be aligned later.
IC50_selected["drug_target"] = IC50_selected["ChEMBL ID"] + '_' + IC50_selected["target_chembl_id"]
IC50_selected = IC50_selected.set_index('drug_target')

In [56]:
# Coverage of the IC50 table: distinct drugs and distinct targets.
ic50_drug_count = IC50_selected['ChEMBL ID'].nunique()
ic50_target_count = IC50_selected['target_chembl_id'].nunique()
print('Drugs with IC50 data: ', ic50_drug_count)
print('Targets with IC50 data: ', ic50_target_count)

Drugs with IC50 data:  1848
Targets with IC50 data:  1177


In [57]:
# Collapse repeated Ki experiments into one value per (drug, target) pair.
# 'chembl_id' is the index of Ki and 'target_chembl_id' is a column; groupby
# accepts both kinds of key in the same list.
# The pandas aggregation skips NaN measurements, whereas the builtin max() on a
# Python list gives an order-dependent answer when a NaN is present.
Ki_selected = (
    Ki.groupby(['chembl_id', 'target_chembl_id'])['standard_value']
    .max()  # worst experiment; switch to .min() for the best experiment
    .reset_index()
    .rename(columns={'chembl_id': 'ChEMBL ID', 'standard_value': 'Ki'})
)

# Key every row by "<drug>_<target>" so the IC50 and Ki tables can be aligned later.
Ki_selected["drug_target"] = Ki_selected["ChEMBL ID"] + '_' + Ki_selected["target_chembl_id"]
Ki_selected = Ki_selected.set_index('drug_target')

print('Drugs with Ki data: ', Ki_selected['ChEMBL ID'].nunique())
print('Targets with Ki data: ', Ki_selected['target_chembl_id'].nunique())

Drugs with Ki data:  1053
Targets with Ki data:  657


### 3. Then, we merge the activity values for each drug-target pair to consolidate our dataset.

In [70]:
IC50_selected

Unnamed: 0_level_0,ChEMBL ID,target_chembl_id,IC50
drug_target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CHEMBL100259_CHEMBL1997,CHEMBL100259,CHEMBL1997,52000.0
CHEMBL100259_CHEMBL3509606,CHEMBL100259,CHEMBL3509606,234000.0
CHEMBL100259_CHEMBL5551,CHEMBL100259,CHEMBL5551,5800.0
CHEMBL100259_CHEMBL5707,CHEMBL100259,CHEMBL5707,5400.0
CHEMBL100259_CHEMBL5780,CHEMBL100259,CHEMBL5780,32000.0
...,...,...,...
CHEMBL998_CHEMBL3721,CHEMBL998,CHEMBL3721,2950.0
CHEMBL998_CHEMBL4302,CHEMBL998,CHEMBL4302,11400.0
CHEMBL998_CHEMBL5748,CHEMBL998,CHEMBL5748,133000.0
CHEMBL998_CHEMBL5918,CHEMBL998,CHEMBL5918,133000.0


In [86]:
Ki_selected

Unnamed: 0_level_0,ChEMBL ID,target_chembl_id,Ki
drug_target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CHEMBL1002_CHEMBL210,CHEMBL1002,CHEMBL210,5623.4100
CHEMBL100259_CHEMBL1997,CHEMBL100259,CHEMBL1997,51000.0000
CHEMBL100259_CHEMBL3509606,CHEMBL100259,CHEMBL3509606,242000.0000
CHEMBL100259_CHEMBL4502,CHEMBL100259,CHEMBL4502,0.0012
CHEMBL100259_CHEMBL5551,CHEMBL100259,CHEMBL5551,5200.0000
...,...,...,...
CHEMBL9967_CHEMBL216,CHEMBL9967,CHEMBL216,25.0000
CHEMBL9967_CHEMBL245,CHEMBL9967,CHEMBL245,158.0000
CHEMBL998_CHEMBL1833,CHEMBL998,CHEMBL1833,156.0000
CHEMBL998_CHEMBL231,CHEMBL998,CHEMBL231,414.0000


In [87]:
# Place the IC50 and Ki tables side by side, aligned on the shared
# 'drug_target' index. The id columns end up duplicated (one copy per table).
consolidated = pd.concat(objs=[IC50_selected, Ki_selected], axis='columns')

In [88]:
consolidated

Unnamed: 0_level_0,ChEMBL ID,target_chembl_id,IC50,ChEMBL ID,target_chembl_id,Ki
drug_target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CHEMBL100259_CHEMBL1997,CHEMBL100259,CHEMBL1997,52000.0,CHEMBL100259,CHEMBL1997,51000.0
CHEMBL100259_CHEMBL3509606,CHEMBL100259,CHEMBL3509606,234000.0,CHEMBL100259,CHEMBL3509606,242000.0
CHEMBL100259_CHEMBL5551,CHEMBL100259,CHEMBL5551,5800.0,CHEMBL100259,CHEMBL5551,5200.0
CHEMBL100259_CHEMBL5707,CHEMBL100259,CHEMBL5707,5400.0,CHEMBL100259,CHEMBL5707,3400.0
CHEMBL100259_CHEMBL5780,CHEMBL100259,CHEMBL5780,32000.0,CHEMBL100259,CHEMBL5780,31000.0
...,...,...,...,...,...,...
CHEMBL98_CHEMBL5023,,,,CHEMBL98,CHEMBL5023,20000.0
CHEMBL981_CHEMBL4879,,,,CHEMBL981,CHEMBL4879,1000.0
CHEMBL99_CHEMBL2163182,,,,CHEMBL99,CHEMBL2163182,450000.0
CHEMBL991_CHEMBL2883,,,,CHEMBL991,CHEMBL2883,615000.0


### 4. Clean up the data: keep only the IC50 and Ki columns and split the index back into separate drug and target columns. After that, we save the dataset to a tab-separated (TSV) file.

In [89]:
# Keep only the two activity columns; the duplicated id columns are dropped
# because the same information is still encoded in the 'drug_target' index.
consolidated = consolidated.loc[:, ['IC50', 'Ki']]

In [90]:
consolidated

Unnamed: 0_level_0,IC50,Ki
drug_target,Unnamed: 1_level_1,Unnamed: 2_level_1
CHEMBL100259_CHEMBL1997,52000.0,51000.0
CHEMBL100259_CHEMBL3509606,234000.0,242000.0
CHEMBL100259_CHEMBL5551,5800.0,5200.0
CHEMBL100259_CHEMBL5707,5400.0,3400.0
CHEMBL100259_CHEMBL5780,32000.0,31000.0
...,...,...
CHEMBL98_CHEMBL5023,,20000.0
CHEMBL981_CHEMBL4879,,1000.0
CHEMBL99_CHEMBL2163182,,450000.0
CHEMBL991_CHEMBL2883,,615000.0


In [91]:
# Turn the "<drug>_<target>" index back into two explicit columns.
consolidated = consolidated.reset_index()
# Split on the first underscore only. `n` and `expand` are passed by keyword
# because pandas 2.0 no longer accepts them positionally.
consolidated[['drug', 'target']] = consolidated['drug_target'].str.split('_', n=1, expand=True)
# `columns=` replaces the positional axis argument that pandas 2.0 removed.
consolidated = consolidated.drop(columns='drug_target')
consolidated = consolidated[['drug', 'target', 'IC50', 'Ki']]

In [92]:
consolidated

Unnamed: 0,drug,target,IC50,Ki
0,CHEMBL100259,CHEMBL1997,52000.0,51000.0
1,CHEMBL100259,CHEMBL3509606,234000.0,242000.0
2,CHEMBL100259,CHEMBL5551,5800.0,5200.0
3,CHEMBL100259,CHEMBL5707,5400.0,3400.0
4,CHEMBL100259,CHEMBL5780,32000.0,31000.0
...,...,...,...,...
17622,CHEMBL98,CHEMBL5023,,20000.0
17623,CHEMBL981,CHEMBL4879,,1000.0
17624,CHEMBL99,CHEMBL2163182,,450000.0
17625,CHEMBL991,CHEMBL2883,,615000.0


In [93]:
# Save the consolidated drug-target activities as a tab-separated file (no index column).
consolidated.to_csv("drugs_targets_activities.tsv", sep="\t", index=False)

In [94]:
# Size of the consolidated dataset: distinct drugs and distinct targets.
dataset_drug_count = consolidated['drug'].nunique()
dataset_target_count = consolidated['target'].nunique()
print('Total drugs on dataset: ', dataset_drug_count)
print('Total targets on dataset: ', dataset_target_count)

Total drugs on dataset:  1999
Total targets on dataset:  1279


### 5. Now, we load the previously obtained chembl_covid_raw.csv to add a new column with the maximum development phase of each drug.

In [95]:
# Read the raw ChEMBL export (semicolon-separated) and build a
# {ChEMBL ID -> Max Phase} lookup. If an id is repeated, the last row wins.
phase_table = pd.read_csv('chembl_covid_raw.csv', sep=';')
chembl_raw_data = {
    chembl_id: max_phase
    for chembl_id, max_phase in zip(phase_table['ChEMBL ID'], phase_table['Max Phase'])
}


In [96]:
# Look up each drug's development phase with a vectorised map instead of a
# row-wise apply. Drugs missing from the lookup get NaN.
consolidated['phase'] = consolidated['drug'].map(chembl_raw_data)


In [97]:
consolidated

Unnamed: 0,drug,target,IC50,Ki,phase
0,CHEMBL100259,CHEMBL1997,52000.0,51000.0,3
1,CHEMBL100259,CHEMBL3509606,234000.0,242000.0,3
2,CHEMBL100259,CHEMBL5551,5800.0,5200.0,3
3,CHEMBL100259,CHEMBL5707,5400.0,3400.0,3
4,CHEMBL100259,CHEMBL5780,32000.0,31000.0,3
...,...,...,...,...,...
17622,CHEMBL98,CHEMBL5023,,20000.0,4
17623,CHEMBL981,CHEMBL4879,,1000.0,4
17624,CHEMBL99,CHEMBL2163182,,450000.0,1
17625,CHEMBL991,CHEMBL2883,,615000.0,4


In [98]:
# Save the activities together with the new 'phase' column as a tab-separated file (no index column).
consolidated.to_csv("drugs_targets_activities_phase.tsv", sep="\t", index=False)

In [101]:
consolidated

Unnamed: 0,drug,target,IC50,Ki,phase
0,CHEMBL100259,CHEMBL1997,52000.0,51000.0,3
1,CHEMBL100259,CHEMBL3509606,234000.0,242000.0,3
2,CHEMBL100259,CHEMBL5551,5800.0,5200.0,3
3,CHEMBL100259,CHEMBL5707,5400.0,3400.0,3
4,CHEMBL100259,CHEMBL5780,32000.0,31000.0,3
...,...,...,...,...,...
17622,CHEMBL98,CHEMBL5023,,20000.0,4
17623,CHEMBL981,CHEMBL4879,,1000.0,4
17624,CHEMBL99,CHEMBL2163182,,450000.0,1
17625,CHEMBL991,CHEMBL2883,,615000.0,4


### 6. With our data ready, we keep only phase 4 drugs and then the drug-target pairs whose IC50 or Ki is less than or equal to 5000 nM.

In [103]:
# Keep only the rows whose drug is in development phase 4.
is_phase_4 = consolidated['phase'] == 4
consolidated_filtered = consolidated.loc[is_phase_4]
consolidated_filtered

Unnamed: 0,drug,target,IC50,Ki,phase
5,CHEMBL1003,CHEMBL6020,1000000.0,,4
6,CHEMBL1006,CHEMBL234,1929.0,655.0,4
7,CHEMBL1009,CHEMBL1743128,133000.0,,4
8,CHEMBL1009,CHEMBL1841,2146.0,,4
9,CHEMBL1009,CHEMBL1973,8400000.0,,4
...,...,...,...,...,...
17621,CHEMBL978,CHEMBL1821,,1600.0,4
17622,CHEMBL98,CHEMBL5023,,20000.0,4
17623,CHEMBL981,CHEMBL4879,,1000.0,4
17625,CHEMBL991,CHEMBL2883,,615000.0,4


In [105]:
# Distinct drugs and targets remaining after the phase 4 filter.
phase4_drug_count = consolidated_filtered['drug'].nunique()
phase4_target_count = consolidated_filtered['target'].nunique()
print("Total drugs after filtering by phase 4:", phase4_drug_count)
print("Total targets after filtering by phase 4:", phase4_target_count)

Total drugs after filtering by phase 4: 1325
Total targets after filtering by phase 4: 1012


In [106]:
# Keep the drug-target pairs where at least one measurement (IC50 or Ki) is
# <= 5000. Rows with a missing value on one side are judged on the other side only.
# The mask is built from consolidated_filtered itself; the earlier code used
# `tabla_final_fase4`, a name that no cell defines, so it failed on a fresh kernel.
is_active = (consolidated_filtered['IC50'] <= 5000) | (consolidated_filtered['Ki'] <= 5000)
consolidated_filtered = consolidated_filtered.loc[is_active]

In [107]:
consolidated_filtered

Unnamed: 0,drug,target,IC50,Ki,phase
6,CHEMBL1006,CHEMBL234,1929.00,655.0,4
8,CHEMBL1009,CHEMBL1841,2146.00,,4
11,CHEMBL1009,CHEMBL258,3729.00,,4
18,CHEMBL101,CHEMBL221,3000.00,,4
35,CHEMBL1014,CHEMBL1697668,724.44,400.0,4
...,...,...,...,...,...
17612,CHEMBL94454,CHEMBL231,,2.7,4
17616,CHEMBL95,CHEMBL218,,1000.0,4
17617,CHEMBL95,CHEMBL253,,1000.0,4
17621,CHEMBL978,CHEMBL1821,,1600.0,4


In [108]:
# Distinct drugs and targets remaining after the IC50/Ki filter.
active_drug_count = consolidated_filtered['drug'].nunique()
active_target_count = consolidated_filtered['target'].nunique()
print('Drugs after filtering by IC50 and Ki: ', active_drug_count)
print('Targets after filtering by IC50 and Ki: ', active_target_count)

Drugs after filtering by IC50 and Ki:  859
Targets after filtering by IC50 and Ki:  671


In [109]:
# Save the filtered drug-target pairs as a tab-separated file (no index column).
# NOTE(review): presumably the starting point for the manually extended
# 'drugs_targets_filtered_manual.tsv' loaded in the next step; confirm.
consolidated_filtered.to_csv("drugs_targets_filtered.tsv", sep="\t", index=False)


### 7. After that, we manually added drugs interacting with the SARS-CoV-2 Spike protein and 4 drugs with known antiviral activity. Then, we load this modified file.


Data to add manually.

drug    target

CHEMBL196	P0DTC2
CHEMBL313006	P0DTC2			
CHEMBL1366	P0DTC2			
CHEMBL1448	P0DTC2			
CHEMBL1401	P0DTC2			
CHEMBL496	P0DTC2			
CHEMBL939	P0DTC2			
CHEMBL422	P0DTC2			
CHEMBL1200633	P0DTC2			
CHEMBL1319139	CHEMBL1697668			
CHEMBL1319139	CHEMBL1743121			
CHEMBL1319139	CHEMBL4523582			
CHEMBL1319139	CHEMBL5514			
CHEMBL1319139	CHEMBL340			
CHEMBL1319139	CHEMBL1293235			
CHEMBL1319139	CHEMBL1293278			
CHEMBL1319139	CHEMBL1075138			
CHEMBL253376	CHEMBL1697668			
CHEMBL253376	CHEMBL1743121			
CHEMBL253376	CHEMBL4523582			
CHEMBL253376	CHEMBL5514			
CHEMBL253376	CHEMBL340			
CHEMBL253376	CHEMBL1293235			
CHEMBL253376	CHEMBL1293278			
CHEMBL253376	CHEMBL1075138			
CHEMBL1475252	CHEMBL4523582	
CHEMBL1475252	CHEMBL4523350				
CHEMBL1475252	CHEMBL4295557		
CHEMBL1475252	CHEMBL6036		
CHEMBL1475252	CHEMBL5172			
CHEMBL404849	CHEMBL4523582			
CHEMBL404849	CHEMBL1795087			
CHEMBL404849	CHEMBL4158			
CHEMBL404849	CHEMBL4040			
CHEMBL404849	CHEMBL1977			
CHEMBL404849	CHEMBL1293232			
CHEMBL404849	CHEMBL1293278			
CHEMBL404849	CHEMBL2457			
CHEMBL404849	CHEMBL4096			
CHEMBL404849	CHEMBL340			
CHEMBL404849	CHEMBL3577			
CHEMBL404849	CHEMBL1075138			
CHEMBL404849	CHEMBL2903			
CHEMBL404849	CHEMBL2760

In [129]:
# Reload the filtered table from the manually extended file; from here on
# `consolidated_filtered` reflects the file contents, not the in-memory filter.
manual_file = 'drugs_targets_filtered_manual.tsv'
consolidated_filtered = pd.read_csv(manual_file, sep="\t")
consolidated_filtered

Unnamed: 0,drug,target,IC50,Ki,fase
0,CHEMBL1006,CHEMBL234,1929.00,655.0,4.0
1,CHEMBL1009,CHEMBL1841,2146.00,,4.0
2,CHEMBL1009,CHEMBL258,3729.00,,4.0
3,CHEMBL101,CHEMBL221,3000.00,,4.0
4,CHEMBL1014,CHEMBL1697668,724.44,400.0,4.0
...,...,...,...,...,...
5020,CHEMBL404849,CHEMBL340,,,
5021,CHEMBL404849,CHEMBL3577,,,
5022,CHEMBL404849,CHEMBL1075138,,,
5023,CHEMBL404849,CHEMBL2903,,,


In [131]:
# Distinct drugs and targets that will become nodes of the network.
network_drug_count = consolidated_filtered['drug'].nunique()
network_target_count = consolidated_filtered['target'].nunique()
print('Total drugs for network: ', network_drug_count)
print('Total targets for network: ', network_target_count)

Total drugs for network:  868
Total targets for network:  683


### 8. Next, with our data already filtered, we generate the attribute files for Cytoscape:
#### a) target-organism 
#### b) drug-interaction (Human, Virus, HumanVirus)

In [132]:
# Label every target in the filtered table with the organism it belongs to.
coronavirus = ['CHEMBL4295557', 'CHEMBL4523582', 'P0DTC2']  # ids checked in UniProt
# Targets come from consolidated_filtered; the earlier code read
# `drogas_filtradas`, a name that no cell defines, so it failed on a fresh kernel.
targets = list(consolidated_filtered.target.unique())
# Any target not listed as coronavirus is labelled 'Human'.
orgs = {
    target: ('Coronavirus' if target in coronavirus else 'Human')
    for target in targets
}

# Node-attribute file for Cytoscape: one row per target with its organism label.
target_organism = pd.DataFrame(orgs.items(), columns=['target', 'label'])
target_organism.to_csv("target_organism.tsv", sep="\t", index=False)

In [133]:
# Attach the organism label of each row's target with a vectorised map instead
# of a row-wise apply. `orgs` is expected to hold an entry for every target.
consolidated_filtered['target_organism'] = consolidated_filtered['target'].map(orgs)

In [134]:
consolidated_filtered

Unnamed: 0,drug,target,IC50,Ki,fase,target_organism
0,CHEMBL1006,CHEMBL234,1929.00,655.0,4.0,Human
1,CHEMBL1009,CHEMBL1841,2146.00,,4.0,Human
2,CHEMBL1009,CHEMBL258,3729.00,,4.0,Human
3,CHEMBL101,CHEMBL221,3000.00,,4.0,Human
4,CHEMBL1014,CHEMBL1697668,724.44,400.0,4.0,Human
...,...,...,...,...,...,...
5020,CHEMBL404849,CHEMBL340,,,,Human
5021,CHEMBL404849,CHEMBL3577,,,,Human
5022,CHEMBL404849,CHEMBL1075138,,,,Human
5023,CHEMBL404849,CHEMBL2903,,,,Human


In [135]:
# For every drug, collect the distinct organisms of the targets it hits.
drug_organism = (
    consolidated_filtered[['drug', 'target_organism']]
    .groupby(['drug'])['target_organism']
    .unique()
)

In [138]:
# Classify each drug by the organisms of its targets:
#   human only -> 'Drug_Human', human and virus -> 'DrugHumanVirus',
#   anything without a human target -> 'Drug_Virus'.
interaction_type = {}
for drug in drug_organism.index:
    organism_text = ' '.join(drug_organism.loc[drug])
    hits_human = 'Human' in organism_text
    hits_virus = 'Coronavirus' in organism_text

    if not hits_human:
        interaction = 'Drug_Virus'
    elif hits_virus:
        interaction = 'DrugHumanVirus'
    else:
        interaction = 'Drug_Human'

    interaction_type[drug] = interaction

# Node-attribute file for Cytoscape: one row per drug with its interaction label.
interaction_type = pd.DataFrame(interaction_type.items(), columns=['droga', 'label'])
interaction_type.to_csv("interaction_type.tsv", sep="\t", index=False)