In [2]:
#!pip install pubchempy



# Libraries

In [1]:
!pwd

/data/user/rsharma3/nbotw/Project-D2GNETs/Code


In [20]:
import os, sys
sys.path.insert(0, os.path.abspath("."))

In [14]:

from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, QuantileTransformer, PowerTransformer,Normalizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from rdkit import Chem

In [3]:
import pandas as pd
import numpy as np
import pubchempy as pcp

In [28]:
from io import StringIO
sio = sys.stderr = StringIO()

# Loading data

### Cell Line Gene Expression data from COSMIC

In [26]:
data = pd.read_csv('../../D2GNets/data/AllGeneExpression.csv')

### Drug REsponse and Drug Information data from GDSC

In [27]:
df_GDSC = pd.read_csv('../../D2GNets/data/GDSC2_fitted_dose_response_25Feb20_3.csv')
df_GDSC_DrugInfo = pd.read_csv('../../D2GNets/data/Drug_Desciption_GDSC.csv')

# Drug Data Preparation

In [None]:
##incomment is you want to extract features from Pubchem
# require #!pip install pubchempy
'''
lst = []

ctr =0
for x in df_GDSC_DrugInfo['drug_name'].unique().tolist():
  c = pcp.get_compounds(x, 'name')
  
  if len(c) != 0:
    tlst = []
    tlst.append(x)
    tlst.append(int(c[0].cid))
    tlst.append(c[0].isomeric_smiles)
    lst.append(tlst)
  else:
    ctr += 1
    tlst = []
    tlst.append(x)
    tlst.append(None)
    tlst.append(None)
    lst.append(tlst)

df = pd.DataFrame(lst, columns=['DRUG_NAME','PUBCHEM_ID','SMILES'])

df.dropna(inplace=True)
df.isna().sum()

df.to_csv('../../D2GNets/data/SMILES_FeatureEngineered.csv',index=False)

'''

# NLP-based Feature Extraction from Drug SMILES

In [29]:
df= pd.read_csv('../../D2GNets/data/SMILES_FeatureEngineered.csv')
df = df[['DRUG_NAME', 'PUBCHEM_ID', 'SMILES']].copy(deep=True)

In [30]:
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
  if sequence!=None:
    lst =[]
    X =None
    for i in [1]:
        for x in range(len(sequence) - i + 1):
            try:
                
                X=Chem.MolFromSmiles(sequence[x:x+i])
                sio = sys.stderr = StringIO()
            except SyntaxError:
                pass
            if X is None:
                continue
            else:
                lst.append(sequence[x:x+i])
    return lst
  else:
    return None

In [31]:
# Selecting single character-based legitimate molecule
df['WORDS'] = df.apply(lambda x: getKmers(x['SMILES']), axis=1)

In [66]:
setx

['F', 'N', 'B', 'S', 'P', 'C', 'I', 'O']

In [32]:

mols = [] # all legit single character molecule for all drug SMILES
for x in list(df['WORDS']):
    mols = mols + x
    
setx = list(set(mols)) # Unique Single Character Molecule set
molName = [f'Mol{i}' for i in range(1,len(setx)+1)] # aliases for all single character Molecules
molDict = {} # Dictionary to map the Single Character Molecule to their respective aliases

ctr=1
for i in setx:
    molDict.update({i:f'Mol{ctr}'})
    ctr+=1   
mol = pd.DataFrame(np.asarray([setx,molName]).T,columns=['Molecule','tokenName'])
mol.to_csv('../../D2GNets/data/Molecule_token_map.csv',index=False) 

In [38]:
def mapWords(lst=None):
    if lst!=None:
        tlst = []
        for x in lst:
            tlst.append(molDict[x])
        return tlst
    else:
        return None

In [39]:
# words based on sequence of aliases of the drug SMILE 
df['WordMap']=df.apply(lambda x: mapWords(x['WORDS']), axis=1)

In [40]:

df_texts = list(df['WordMap']) # all aliases for all drug SMILES
for item in range(len(df_texts)):
    df_texts[item] = ' '.join(df_texts[item])


In [41]:
n_gram_list = [1,2,3,4,5,6,7,8]
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
for ind in n_gram_list:  #looping to create 8 type of n-gram vectors
    #feature extraction based on Counts
    cv = CountVectorizer(ngram_range=(ind,ind))
    X = cv.fit_transform(df_texts)
    #Conveting the extracted features to thier term frequencies
    tf_transformer = TfidfTransformer(use_idf=False).fit(X)
    X = tf_transformer.transform(X)
    count_vect_df = pd.DataFrame(X.todense(), columns=cv.get_feature_names())
    if ind ==1:
        dff = pd.concat([df.reset_index(), count_vect_df], axis =1, ignore_index= False)
    else:
        dff = pd.concat([dff, count_vect_df], axis =1, ignore_index= False)

In [42]:
print('number of drug selected: %d'%dff.shape[0])
print('number of features created: %d'%dff.shape[1])

number of drug selected: 173
number of features created: 2992


In [43]:
#saving the dataFrame for future reference
dff.to_csv('../../D2GNets/data/SMILES_FeatureEngineered_new.csv',index=False)

In [44]:
#Z-transforamtion of each feature of NLP-extracted drug features
max_abs_scaler = StandardScaler()
d1_1 = max_abs_scaler.fit_transform(dff[dff.columns[6:]])
d1 = pd.DataFrame(d1_1,columns=dff.columns[6:].to_list())
dff1 = pd.concat([dff[dff.columns[:6]],d1],axis=1)
#dff1= dff.copy(deep=True)

In [45]:
#Saving the intemediate dataframe for future reference
dff1.to_csv('../../D2GNets/data/SMILES_FeatureEngineered_new_scaled.csv',index=False)

In [46]:
#saving the names of the drug features
dff1[dff.columns[6:]][:0].to_csv('../../D2GNets/data/moleculeNames_v1.csv',index=False)

# Extracting One-Hot coded Disease Features

In [48]:
s = pd.Series(list(df_GDSC['TCGA_DESC'].unique()))
s.dropna(inplace=True)
s1=pd.get_dummies(s)
disease = pd.concat([s,s1],axis=1)
disease.rename(columns={0:'TCGA_DESC'}, inplace=True)
disease[disease.columns[1:]][:0].to_csv('../../D2GNets/data/diseaseNames.csv',index=False)

# Data Integration

## Joining the GDSC Dataframe with Disease Features

In [None]:
df1 = pd.merge(df_GDSC,disease, on=['TCGA_DESC'], how='left')

## Joining the GDSC Dataframe with Disease Features to Drug Features

In [53]:
df1 = pd.merge(df1,dff1, on=['DRUG_NAME'], how='left')

(135242, 50)
(135242, 19)
(135242, 3041)


In [54]:
# Removing unnecessary columns
df1.drop(columns=['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID','PUTATIVE_TARGET', 'CELL_LINE_NAME', 'SANGER_MODEL_ID',
                         'PATHWAY_NAME', 'COMPANY_ID','WEBRELEASE', 'MIN_CONC', 'MAX_CONC','SMILES', 'WORDS', 'AUC', 'RMSE',
                         'Z_SCORE', 'index'],inplace=True)

In [55]:
# Removing all rows with null values from the remaining data
df1.drop(index=df1[df1.isna().any(axis=1)].index,inplace=True)

### Saving the merged DataFrame of Drug Respnse and Feature Engineered drugs

In [56]:

df1.to_csv('../../D2GNets/data/GDSC_Drug_Feature_Engineered_new_scaled.csv',index=False)

## Integrating Drug and Disease data to Gene Expression data

In [60]:

def dataPrep(ExpName, GE_df,  Drug_df):
    GE_df.dropna(inplace=True)

    Drug_df.rename(columns={'COSMIC_ID':'SAMPLE_ID'},inplace=True)
    Drug_df_GBM = Drug_df[Drug_df['TCGA_DESC']=='GBM'].copy(deep=True)

    Drug_df.drop(Drug_df_GBM.index,inplace=True)
    panCancer_train_df = pd.merge(Drug_df,GE_df, on=['SAMPLE_ID'], how='left')
    GBM_train_df = pd.merge(Drug_df_GBM,GE_df, on=['SAMPLE_ID'], how='left')
    
    Drug_df= None
    GE_df = None
    Drug_df_GBM = None
    GBM_known_drugs_test_df = None

    panCancer_train_df.to_csv(f'../../D2GNets/data/panCancer_train_{ExpName}_df.csv',index=False)
    panCancer_train_df=None

    GBM_train_df.dropna(inplace=True)
    GBM_train_df.to_csv(f'../../D2GNets/data/GBM_train_{ExpName}_df.csv',index=False)
    GBM_train_df = None

In [59]:
GE_df = data
Drug_df = df1
censusGN = pd.read_csv('../../D2GNets/data/Census_allFri_May_6_15_00_18_2022.csv')

### Knowledged-based Feature Engineering. Selecting 657 genes from 16248 COSMIC Gene Set based on Cancer Gene Census Gene lest

In [61]:
X_cols = list(set(GE_df.columns).intersection(set(censusGN['Gene Symbol'].values)))
GE_df = GE_df[['SAMPLE_ID']+X_cols]

In [62]:
dataPrep('CGC_657_DR_Drug_features_new', GE_df, Drug_df)

In [64]:
coldf = pd.DataFrame(columns=X_cols)
coldf.to_csv('../../D2GNets/data/657_Gene_name.csv',index=False)