In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import os

In [2]:
os.listdir("../results")

['filtered_drug_profiles_123.csv',
 'filtered_drug_profiles_12.csv',
 'filtered_drug_profiles_13.csv',
 'filtered_drug_profiles_23.csv',
 '.ipynb_checkpoints',
 'merged_drug_profiles_sigmoid4_123.csv',
 'drug_features_pubchem_id.csv',
 'drug_features_with_pubchem_properties.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'merged_fitted_sigmoid4_123_with_drugs_properties.csv']

In [3]:
df_profiles = pd.read_csv("../results/merged_drug_profiles_sigmoid4_123.csv").drop("Unnamed: 0", axis=1)
df_drugs_0 = pd.read_csv("../results/drug_features_pubchem_id.csv").drop(["Unnamed: 0", "deriv_found"], axis=1)
df_drugs_1 = pd.read_csv("../results/drug_features_with_pubchem_properties.csv").drop("Unnamed: 0", axis=1)

In [4]:
df_profiles.head(2)

Unnamed: 0,COSMIC_ID,DRUG_ID,fd_num_0,fd_num_1,fd_num_2,fd_num_3,fd_num_4,fd_num_5,fd_num_6,fd_num_7,...,chr9:104248247-104249501(C9orf125)_HypMET,"chr9:115875199-115875738(C9orf109, C9orf110)_HypMET",chr9:123555399-123555899(FBXW2)_HypMET,chr9:140310894-140312457(EXD3)_HypMET,chr9:21974578-21975306(CDKN2A)_HypMET,chr9:35756948-35757339(MSMP)_HypMET,chr9:35791584-35791924(NPR2)_HypMET,chr9:4984543-4985630(JAK2)_HypMET,chr9:86571047-86572027(C9orf64)_HypMET,chr9:98783216-98784364(NCRNA00092)_HypMET
0,909704,308,0,0.111111,0.222222,0.333333,0.444444,0.555556,0.666667,0.777778,...,0,0,0,0,0,0,0,0,0,0
1,909704,133,0,0.111111,0.222222,0.333333,0.444444,0.555556,0.666667,0.777778,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_drugs_0.head(2)

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway,PubChem_ID
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling,176870
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling,5284616


### Merge data

In [6]:
column_not_to_use = ["Synonyms", "deriv_found", "PubChem_ID",
                     "elements", "inchi_key", "canonical_smiles", "inchi_string"]
columns_to_use_0 = set(df_drugs_0.columns) - set(column_not_to_use)
columns_to_use_1 = set(df_drugs_1.columns) - set(column_not_to_use)

merged_0 = pd.merge(left = df_profiles, right = df_drugs_0[columns_to_use_0], on = "DRUG_ID") 
merged_1 = pd.merge(left = df_profiles, right = df_drugs_1[columns_to_use_1], on = "DRUG_ID") 
merged_0.shape, merged_1.shape

((2579, 1102), (2579, 1154))

### Exclude drugs that don't have PubChem_id

In [7]:
print("All drugs: %d, With known PubChem_Id: %d" % (df_drugs_0.shape[0], df_drugs_0[df_drugs_0["PubChem_ID"]!=0].shape[0]))

All drugs: 265, With known PubChem_Id: 249


In [8]:
drugs_with_pubchem_id = df_drugs_0[df_drugs_0["PubChem_ID"]!=0]["DRUG_ID"].values
drugs_with_no_pubchem_id = df_drugs_0[df_drugs_0["PubChem_ID"]==0]["DRUG_ID"].values

In [9]:
filtered_drugs_with_pubchem_id = set(drugs_with_pubchem_id) & set(merged_0["DRUG_ID"].unique())

In [10]:
merged_0 = merged_0.set_index("DRUG_ID").loc[filtered_drugs_with_pubchem_id, :].reset_index()
merged_1 = merged_1.set_index("DRUG_ID").loc[filtered_drugs_with_pubchem_id, :].reset_index()
merged_0.shape, merged_1.shape

((2552, 1102), (2552, 1154))

### Save the data

In [11]:
merged_0.drop("Target", axis=1).to_csv("../results/merged_fitted_sigmoid4_123_with_drugs_description.csv")
merged_1.drop("Target", axis=1).to_csv("../results/merged_fitted_sigmoid4_123_with_drugs_properties.csv")

### Dealing with Target column

In [12]:
def SplitTargetColumn(df):
    elements = set(df["Target"].str.split(",", expand=True).fillna(0).values.flatten()) - set([0])
    targets = [target.strip(" ").strip("'") for target in elements]

    list_targets = list(set(targets))
    a = sorted(list_targets)
    a_3 = [x[:3] for x in a]
    target_groups = np.unique(a_3)
    exceptions = []
    for drug_index in df.index:
        compound_elements = df.loc[drug_index, "Target"]
        try:
            for target in target_groups:
                if target in target_groups:
                    df.loc[drug_index, target] = 1
                else:
                    df.loc[drug_index, target] = 0
        except:
            exceptions.append(drug_index)
            df.loc[drug_index, target] = 0
    return exceptions

In [15]:
%%time
exceptions_1 = SplitTargetColumn(merged_0)
exceptions_2 = SplitTargetColumn(merged_1)

CPU times: user 4min, sys: 111 ms, total: 4min
Wall time: 4min 8s


In [16]:
merged_0.drop("Target", axis=1).to_csv("../results/merged_fitted_sigmoid4_123_with_drugs_description_split_target.csv")
merged_1.drop("Target", axis=1).to_csv("../results/merged_fitted_sigmoid4_123_with_drugs_properties_split_target.csv")