In [2]:
#!pip install pubchempy



In [1]:
!pwd

/data/user/rsharma3/nbotw/Project-D2GNETs/Code


In [23]:
import sys
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, QuantileTransformer, PowerTransformer,Normalizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from rdkit import Chem

In [3]:
import pandas as pd
import numpy as np
import pubchempy as pcp

# Loading data

### Cell Line Gene Expression data from COSMIC

In [4]:
data = pd.read_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/AllGeneExpression.csv')

### Drug REsponse and Drug Information data from GDSC

In [5]:
df_GDSC = pd.read_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/GDSC2_fitted_dose_response_25Feb20_3.csv')
df_GDSC_DrugInfo = pd.read_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/Drug_Desciption_GDSC.csv')

# Drug Data Preparation

In [None]:
lst = []

ctr =0
for x in df_GDSC_DrugInfo['drug_name'].unique().tolist():
  c = pcp.get_compounds(x, 'name')
  
  if len(c) != 0:
    tlst = []
    tlst.append(x)
    tlst.append(int(c[0].cid))
    tlst.append(c[0].isomeric_smiles)
    lst.append(tlst)
  else:
    ctr += 1
    tlst = []
    tlst.append(x)
    tlst.append(None)
    tlst.append(None)
    lst.append(tlst)

df = pd.DataFrame(lst, columns=['DRUG_NAME','PUBCHEM_ID','SMILES'])

In [None]:
df.dropna(inplace=True)
df.isna().sum()

In [289]:
df= pd.read_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/SMILES_FeatureEngineered.csv')
df = df[['DRUG_NAME', 'PUBCHEM_ID', 'SMILES']].copy(deep=True)

In [200]:
### Using NLP to extract features from Drug SMILES

In [8]:
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
  if sequence!=None:
    
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
  else:
    return None

In [8]:
from io import StringIO
sio = sys.stderr = StringIO()

In [20]:
lst

['C',
 'C',
 'N',
 'C',
 'O',
 'C',
 'N',
 'O',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'N',
 'C',
 'C',
 'O',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'O',
 'O',
 'C',
 'C',
 'C']

In [290]:
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
  if sequence!=None:
    lst =[]
    X =None
    for i in [1]:
    #for i in range(1,int(len(sequence)*0.50)):
        for x in range(len(sequence) - i + 1):
            try:
                
                X=Chem.MolFromSmiles(sequence[x:x+i])
                sio = sys.stderr = StringIO()
            except SyntaxError:
                pass
            if X is None:
                continue
            else:
                lst.append(sequence[x:x+i])
    return lst
  else:
    return None

In [291]:
lst = getKmers('CCNC(=O)C1=NOC(=C1C2=CC=C(C=C2)CN3CCOCC3)C4=CC(=C(C=C4O)O)C(C)C')

In [292]:
df['WORDS'] = df.apply(lambda x: getKmers(x['SMILES']), axis=1)

In [293]:
mols = []
for x in list(df['WORDS']):
    mols = mols + x
    

In [294]:
setx = list(set(mols))

In [295]:
len(setx)

8

In [296]:
molName = [f'Mol{i}' for i in range(1,len(setx)+1)]

In [297]:
molDict = {}
ctr=1
for i in setx:
    molDict.update({i:f'Mol{ctr}'})
    ctr+=1
    

In [298]:
mol = pd.DataFrame(np.asarray([setx,molName]).T,columns=['Molecule','tokenName'])

In [299]:
mol.to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/Molecule_token_map.csv',index=False)

In [300]:
def mapWords(lst=None):
    if lst!=None:
        tlst = []
        for x in lst:
            tlst.append(molDict[x])
        return tlst
    else:
        return None

In [301]:

df['WordMap']=df.apply(lambda x: mapWords(x['WORDS']), axis=1)

In [302]:
df_texts = list(df['WordMap'])
for item in range(len(df_texts)):
    df_texts[item] = ' '.join(df_texts[item])


In [361]:
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
cv = CountVectorizer(ngram_range=(8,8))
X = cv.fit_transform(df_texts)

In [362]:
#from sklearn.feature_extraction.text import CountVectorizer
#tf_transformer = TfidfTransformer(use_idf=False).fit(X)
#cv = TfidfTransformer(ngram_range=(1,1),use_idf=False)
#X = tf_transformer.transform(X)

In [363]:
count_vect_df = pd.DataFrame(X.todense(), columns=cv.get_feature_names())

In [364]:
#dff = pd.concat([df.reset_index(), count_vect_df], axis =1, ignore_index= False)

In [365]:
dff = pd.concat([dff, count_vect_df], axis =1, ignore_index= False)

In [366]:
print('number of drug selected: %d'%dff.shape[0])
print('number of features created: %d'%dff.shape[1])

number of drug selected: 173
number of features created: 2992


In [367]:
dff[dff.columns[6:]].max().max()

62

In [368]:
dff[dff.columns[6:]]

Unnamed: 0,mol1,mol2,mol3,mol4,mol5,mol6,mol7,mol8,mol1 mol1,mol1 mol2,...,mol8 mol8 mol8 mol8 mol8 mol8 mol5 mol8,mol8 mol8 mol8 mol8 mol8 mol8 mol6 mol3,mol8 mol8 mol8 mol8 mol8 mol8 mol7 mol4,mol8 mol8 mol8 mol8 mol8 mol8 mol7 mol8,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol1,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol3,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol4,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol5,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol7,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol8
0,3,0,0,0,5,0,0,26,0,0,...,0,0,0,0,1,0,0,1,0,3
1,5,0,1,0,4,1,0,26,0,0,...,0,1,0,0,0,0,0,0,0,0
2,7,0,0,1,7,0,0,46,0,0,...,0,0,0,0,1,0,0,1,0,10
3,4,0,1,0,3,0,0,24,1,0,...,0,0,0,0,1,0,0,0,0,1
4,4,0,0,1,1,0,0,22,1,0,...,0,0,0,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,5,0,1,0,2,0,0,26,0,0,...,0,0,0,0,1,0,0,0,0,0
169,2,0,2,0,2,0,0,21,0,0,...,0,0,0,0,1,1,0,0,0,3
170,2,0,3,0,4,1,0,16,0,0,...,0,1,0,0,0,0,0,0,0,0
171,6,0,0,0,3,0,0,25,0,0,...,0,0,0,0,0,0,0,1,0,0


In [369]:
#saving the dataFrame for future reference
dff.to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/SMILES_FeatureEngineered_new.csv',index=False)

In [370]:
max_abs_scaler = StandardScaler()
d1_1 = max_abs_scaler.fit_transform(dff[dff.columns[6:]])
d1 = pd.DataFrame(d1_1,columns=dff.columns[6:].to_list())
dff1 = pd.concat([dff[dff.columns[:6]],d1],axis=1)
#dff1= dff.copy(deep=True)

In [371]:
dff1.to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/SMILES_FeatureEngineered_new_scaled.csv',index=False)

In [372]:
dff1[dff.columns[6:]][:0].to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/moleculeNames_v1.csv',index=False)

In [374]:
dff1[dff.columns[6:]][:0]

Unnamed: 0,mol1,mol2,mol3,mol4,mol5,mol6,mol7,mol8,mol1 mol1,mol1 mol2,...,mol8 mol8 mol8 mol8 mol8 mol8 mol5 mol8,mol8 mol8 mol8 mol8 mol8 mol8 mol6 mol3,mol8 mol8 mol8 mol8 mol8 mol8 mol7 mol4,mol8 mol8 mol8 mol8 mol8 mol8 mol7 mol8,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol1,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol3,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol4,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol5,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol7,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol8


In [375]:
s = pd.Series(list(df_GDSC['TCGA_DESC'].unique()))

In [376]:
s.dropna(inplace=True)

In [377]:
s1=pd.get_dummies(s)

In [378]:
disease = pd.concat([s,s1],axis=1)

In [379]:
disease.rename(columns={0:'TCGA_DESC'}, inplace=True)

In [380]:
disease[disease.columns[1:]][:0].to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/diseaseNames.csv',index=False)

In [381]:
#joining the GDSC Dataframe with Feature Engineered Drug Smiles
df1 = pd.merge(df_GDSC,disease, on=['TCGA_DESC'], how='left')
print(df1.shape)
df1 = pd.merge(df1,dff1, on=['DRUG_NAME'], how='left')
print(df_GDSC.shape)
print(df1.shape)

(135242, 50)
(135242, 19)
(135242, 3041)


In [382]:
df1.drop(columns=['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID','PUTATIVE_TARGET', 'CELL_LINE_NAME', 'SANGER_MODEL_ID',
                         'PATHWAY_NAME', 'COMPANY_ID','WEBRELEASE', 'MIN_CONC', 'MAX_CONC','SMILES', 'WORDS', 'AUC', 'RMSE',
                         'Z_SCORE', 'index'],inplace=True)

In [383]:
df1.drop(index=df1[df1.isna().any(axis=1)].index,inplace=True)

### Saving the merged DataFrame of Drug Respnse and Feature Engineered drugs

In [384]:

df1.to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/data/GDSC_Drug_Feature_Engineered_new_scaled.csv',index=False)

In [186]:
dfx = df1[(df1['TCGA_DESC']=='GBM')&(df1['DRUG_NAME']=='Carmustine')]

Unnamed: 0,COSMIC_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,LN_IC50,ACC,ALL,BLCA,BRCA,CESC,...,mol8 mol8 mol8 mol8 mol8 mol8 mol6 mol6,mol8 mol8 mol8 mol8 mol8 mol8 mol6 mol8,mol8 mol8 mol8 mol8 mol8 mol8 mol7 mol3,mol8 mol8 mol8 mol8 mol8 mol8 mol7 mol8,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol3,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol4,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol5,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol6,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol7,mol8 mol8 mol8 mol8 mol8 mol8 mol8 mol8


In [None]:
GDSC = pd.merge(df_GDSC, df_GDSC_DrugInfo, on=[])

In [None]:
GE_df = pd.read_csv('/content/drive/MyDrive/COSMIC/GE_dimension_reduced_RANS_657.csv')
Drug_df = pd.read_csv('/content/drive/MyDrive/GDSCdata/GDSC_Drug_Feature_Engineered_with_RANs.csv')


In [None]:
censusGN = pd.read_csv('/content/drive/MyDrive/GDSCdata/Census_allSun May 22 17_29_44 2022.csv')

In [None]:
censusGN.head()

In [None]:
GE_df = pd.read_csv('/content/drive/MyDrive/COSMIC/AllGeneExpression.csv')
Drug_df = pd.read_csv('/content/drive/MyDrive/GDSCdata/GDSC_Drug_Feature_Engineered_new.csv')

In [None]:
GE_df.columns

In [None]:
X_cols = list(set(GE_df.columns).intersection(set(censusGN['Gene Symbol'].values)))

In [None]:
GE_df = GE_df[['SAMPLE_ID']+X_cols]


In [None]:
#Drug_df = pd.read_csv('/content/drive/MyDrive/GDSCdata/GDSC_Drug_Feature_Engineered_with_RANs.csv')

In [None]:
GE_df.drop(columns=['Unnamed: 0'],inplace=True)

In [None]:

#GE_df = pd.read_csv('/content/drive/MyDrive/COSMIC/AllGeneExpression.csv')
#Drug_df = pd.read_csv('/content/drive/MyDrive/GDSCdata/GDSC_Drug_Feature_Engineered.csv')


GE_df.dropna(inplace=True)

Drug_df.rename(columns={'COSMIC_ID':'SAMPLE_ID'},inplace=True)
Drug_df.dropna(inplace=True)

GBM_DRUG_LIST = ['Crizotinib','Docetaxel','Temozolomide', 'Camptothecin','Carmustine', 'Irinotecan','Nimotuzumab','Temozolomide']
TARGET = 'LN_IC50'
IRRELEVANT_FETURES = ['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID',
       'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'TCGA_DESC', 'DRUG_ID',
       'DRUG_NAME', 'PUTATIVE_TARGET', 'PATHWAY_NAME', 'COMPANY_ID',
       'WEBRELEASE', 'MIN_CONC', 'MAX_CONC', 'AUC', 'RMSE',
       'Z_SCORE', 'index', 'PUBCHEM_ID', 'SMILES', 'WORDS', 'SAMPLE_ID',]

Drug_df_GBM = Drug_df[Drug_df['TCGA_DESC']=='GBM'].copy(deep=True)

Drug_df.drop(Drug_df_GBM.index,inplace=True)

GBM_known_drugs_test_df =  Drug_df_GBM[(Drug_df_GBM['DRUG_NAME'].isin(GBM_DRUG_LIST))].copy(deep=True)

Drug_df_GBM.drop(GBM_known_drugs_test_df.index, inplace=True)

panCancer_train_df = pd.merge(Drug_df,GE_df, on=['SAMPLE_ID'], how='left')
GBM_train_df = pd.merge(Drug_df_GBM,GE_df, on=['SAMPLE_ID'], how='left')
GBM_test_df = pd.merge(GBM_known_drugs_test_df,GE_df, on=['SAMPLE_ID'], how='left')

Drug_df= None
GE_df = None
Drug_df_GBM = None
GBM_known_drugs_test_df = None

In [None]:
panCancer_train_df.columns[:50]

In [None]:
panCancer_train_df.dropna(inplace=True)

In [None]:
panCancer_train_df.to_csv('/content/drive/MyDrive/COSMIC/panCancer_train_657_RANS_df.csv',index=False)

In [None]:
panCancer_train_df=None

In [None]:
GBM_train_df.dropna(inplace=True)
GBM_train_df.to_csv('/content/drive/MyDrive/COSMIC/GBM_train_657_RANS_df.csv',index=False)
#GBM_train_df = None

In [None]:
GBM_test_df.dropna(inplace=True)
GBM_test_df.to_csv('/content/drive/MyDrive/COSMIC/GBM_test_657_RANS_df.csv',index=False)
#GBM_test_df = None

In [None]:
len(X_cols)

In [5]:
genes = np.asarray(data.columns[1:])

In [12]:
genes.shape

(16248,)

In [17]:
gene1 = np.reshape(genes,(-1,24))

In [23]:
gene1 = pd.DataFrame(gene1)

In [25]:
gene1.to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/Supplementary_Material/genes16248.csv', index=False, header=False)

In [26]:
df_GDSC.columns

Index(['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID',
       'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'TCGA_DESC', 'DRUG_ID',
       'DRUG_NAME', 'PUTATIVE_TARGET', 'PATHWAY_NAME', 'COMPANY_ID',
       'WEBRELEASE', 'MIN_CONC', 'MAX_CONC', 'LN_IC50', 'AUC', 'RMSE',
       'Z_SCORE'],
      dtype='object')

In [30]:
((df_GDSC[['COSMIC_ID','CELL_LINE_NAME','TCGA_DESC']].drop_duplicates()).dropna()).to_csv('/data/user/rsharma3/nbotw/Project-D2GNETs/Supplementary_Material/TCGA_classes_of_Cell_lines.csv',index=False)