In [10]:
import json
import pandas as pd

# Data Loading


In [11]:
# Raw bioactivity table for GeneID 7297 (per the file name); later cells filter it down.
df = pd.read_csv(filepath_or_buffer='data/GeneID_7297_bioactivity_gene.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


# Data Filtering

In [12]:
# Keep confirmatory assays only, then rows whose outcome is explicitly
# 'Active' or 'Inactive'; the final expression shows the class balance.
df = df.loc[df['aidtype'].eq('Confirmatory')]
df = df.loc[df['activity'].isin(['Active', 'Inactive'])]
df[['activity']].value_counts()

activity
Active      2074
Inactive     231
dtype: int64

In [13]:
#df[list(map(lambda act: (act in ['Active', 'Inactive']), df['activity']))]['cid'].to_csv('data/tyk2_cid.csv', index=False, header=False)

In [14]:
# Parse the whitespace-separated "cid smiles" file into a two-column frame.
# Blank lines are skipped explicitly instead of slicing off the last element,
# so the final record survives even when the file has no trailing newline.
with open('data/tyk2_cid_smi.txt', 'r') as f:
    data = [line.split() for line in f if line.strip()]
cid_smi = pd.DataFrame(data, columns=['cid','smiles'])

In [15]:
# The ids were read as text; cast them to int64 so they can be joined on `cid` below.
cid_smi = cid_smi.assign(cid=cid_smi['cid'].astype('int64'))

In [16]:
# Attach a SMILES string to every (activity, cid) record. The join is
# many-to-many on `cid`, so a compound can appear several times (see output).
act_cid_smi = pd.merge(df[['activity', 'cid']], cid_smi, how='inner', on='cid')
# Encode the label on the `activity` column only: Active -> 1, Inactive -> 0.
# A frame-wide `replace` would also scan `cid`/`smiles` and depends on
# object-dtype downcasting that pandas 2.2 deprecates; `map` yields integers directly.
act_cid_smi['activity'] = act_cid_smi['activity'].map({'Active': 1, 'Inactive': 0})
act_cid_smi

Unnamed: 0,activity,cid,smiles
0,1,5494425,CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O
1,1,5494425,CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O
2,1,5494425,CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O
3,1,5494425,CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O
4,1,5494425,CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O
...,...,...,...
9562,1,50914062,C[C@H]1CN([C@]12CCN(C2)C3=NC=NC4=C3C=CN4)C(=O)...
9563,1,50914062,C[C@H]1CN([C@]12CCN(C2)C3=NC=NC4=C3C=CN4)C(=O)...
9564,1,50914062,C[C@H]1CN([C@]12CCN(C2)C3=NC=NC4=C3C=CN4)C(=O)...
9565,1,50914062,C[C@H]1CN([C@]12CCN(C2)C3=NC=NC4=C3C=CN4)C(=O)...


In [17]:
# One row per compound id (first occurrence wins), written as smiles/activity pairs.
# NOTE(review): if a cid carries conflicting labels across assays, only the first is kept — confirm this is intended.
(
    act_cid_smi
    .drop_duplicates(subset=['cid'])
    .loc[:, ['smiles', 'activity']]
    .to_csv('data/tyk2_smi_act.csv', index=False)
)

# Data Generation

In [18]:
from tqdm import tqdm
from rdkit import Chem
from feature import mol_to_feature
import numpy as np
import pickle
from mordred import Calculator, descriptors

In [19]:
# Configuration for the feature-generation stage.

# Presumably the length limit handed to mol_to_feature (the featurization cell
# passes the same value, 150) — TODO confirm against the `feature` module.
MAX_LEN = 150
# CSV of smiles/activity pairs written at the end of the "Data Filtering" section.
INPUT_SMILES = 'data/tyk2_smi_act.csv'
#OUTPUT = 'data/tyk2.pickle'
# Column names of the input CSV.
SMILE = 'smiles'
ACTIVE = 'activity'
# Name of the identifier column.
ID = 'id'
# When True, the loading cell keeps only the first 10 rows.
DEBUG = False

In [20]:
# Reload the smiles/activity table and report its size before and after
# removing repeated SMILES strings.
df = pd.read_csv(filepath_or_buffer=INPUT_SMILES)
print(df.head())
print('Shape before dropping duplicates: ', df.shape)
df = df.drop_duplicates(subset=SMILE)
print('Shape after dropping duplicates: ', df.shape)

# DEBUG mode works on the first ten rows only.
df = df.head(10) if DEBUG else df

                                              smiles  activity
0  CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O         1
1  CC(C)S(=O)(=O)C1=CC=CC=C1NC2=NC(=NC=C2Cl)NC3=C...         1
2  CC1=C(C(=CC=C1)C)N2C(=O)C3=CN=C(N=C3N4C2=NC5=C...         1
3  C1CC1NC(=O)NC2=C(NN=C2)C3=NC4=C(N3)C=C(C=C4)CN...         1
4  CC1=C(C=C(C=C1)C(=O)NC2=C(C=CC(=C2)C(F)(F)F)OC...         1
Shape before dropping duplicates:  (1545, 2)
Shape after dropping duplicates:  (1545, 2)


In [21]:
# Parse every SMILES string into an RDKit molecule. The column is addressed
# through the SMILE constant, like the other cells in this section.
mols = [Chem.MolFromSmiles(smi) for smi in df[SMILE]]
# RDKit returns None for SMILES it cannot parse; stop here with the offending
# strings rather than letting a None reach the descriptor/feature cells.
if any(mol is None for mol in mols):
    raise ValueError(
        'Unparsable SMILES: '
        + repr([smi for smi, mol in zip(df[SMILE], mols) if mol is None])
    )

In [138]:
# Build a mordred calculator from the `descriptors` package with ignore_3D=True
# and evaluate it on every molecule; `md` is the resulting descriptor table.
# NOTE: the recorded run needed roughly three minutes for 1545 molecules.
calc = Calculator(descriptors, ignore_3D=True)
md = calc.pandas(mols)

 72%|████████████████████████████████████████████████████████▏                     | 1113/1545 [02:04<03:32,  2.03it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|█████████████████████████████████████████████████████████████▎                | 1214/1545 [02:18<03:12,  1.72it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 81%|██████████████████████████████████████████████████████████████▊               | 1245/1545 [02:19<00:27, 10.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1545/1545 [02:58<00:00,  8.64it/s]


In [139]:
# Featurize each molecule with the project helper. The third argument now comes
# from the MAX_LEN config constant (150) instead of a repeated literal.
# NOTE(review): the meaning of the -1 argument is defined in `feature.mol_to_feature` — confirm there.
smile_ft = [mol_to_feature(mol, -1, MAX_LEN) for mol in mols]

In [140]:
# Attach labels, SMILES and features to the descriptor table.
# `md` was built from the list `mols`, so its rows follow the row order of
# `df`. Assigning a Series would align on index labels instead, and `df` keeps
# its original labels after drop_duplicates; .to_numpy() makes the assignment
# positional and raises if the lengths ever differ.
md['active'] = df[ACTIVE].to_numpy()
md['smiles'] = df[SMILE].to_numpy()
md['smile_ft'] = smile_ft
# Row position doubles as the record id.
md['id'] = list(md.index)
# Every record gets subset 10.
# NOTE(review): presumably chosen to match the EGFR reference file, where all rows show subset 10 — confirm.
md['subset'] = np.full(len(md),10)

In [141]:
md.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,active,smiles,smile_ft,id,subset
0,18.735621,14.924099,0,0,29.154095,2.588788,5.118091,29.154095,1.267569,4.106965,...,43,134.0,165.0,8.0625,4.694444,1,CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",0,10
1,32.956889,23.001891,0,2,54.329075,2.436897,4.873793,54.329075,1.293549,4.666985,...,72,224.0,265.0,13.756944,9.138889,1,CC(C)S(=O)(=O)C1=CC=CC=C1NC2=NC(=NC=C2Cl)NC3=C...,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",1,10
2,32.47919,22.13502,0,1,53.591166,2.612384,5.154666,53.591166,1.339779,4.653678,...,75,228.0,280.0,10.777778,8.5,1,CC1=C(C(=CC=C1)C)N2C(=O)C3=CN=C(N=C3N4C2=NC5=C...,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",2,10
3,22.615486,17.171952,0,1,37.534456,2.468234,4.768875,37.534456,1.340516,4.309899,...,39,154.0,182.0,6.5,6.0,1,C1CC1NC(=O)NC2=C(NN=C2)C3=NC4=C(N3)C=C(C=C4)CN...,"[0, 0, 1, 0, 0, 0.0, 0.25, 0.0, 0.25, 0, 0, 0,...",3,10
4,31.653814,22.422106,0,1,51.410761,2.442893,4.885786,51.410761,1.285269,4.622886,...,67,216.0,254.0,13.256944,8.555556,1,CC1=C(C=C(C=C1)C(=O)NC2=C(C=CC(=C2)C(F)(F)F)OC...,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",4,10


# Data Saving

In [142]:
from itertools import compress

In [143]:
# Read the EGFR reference file: one JSON document per line, decoded into a list.
with open('data/egfr_10_full_ft_pd_lines.json', 'r') as f:
    data = list(map(json.loads, f))

In [144]:
# Column layout of the EGFR reference records, and how many of those columns
# the TYK2 descriptor table does not have yet.
cols = pd.DataFrame(data).columns
print(f'{sum(col not in md.columns for col in cols)}')

2


In [145]:
# Reference columns that are absent from `md`; computed once, then shown.
obmit_cols = [col for col in cols if col not in md.columns]
print(obmit_cols)

['pyrimidine', 'quinazoline']


Omit columns(`pyrimidine`, `quinazoline`)은 어떻게 만들었는지 논문에 나와있지 않음<br>
추후 feature selection에서 제거하거나 해야 할 듯

In [146]:
# Give every missing reference column an all-zero integer placeholder.
for col in obmit_cols:
    md[col] = np.zeros(len(md), dtype=int)

In [147]:
# Re-check after filling: no reference column should be missing from `md` now.
# The membership test is `not in`, as in the first check; with `in`,
# `obmit_cols` was rebound to every reference column, contradicting the
# empty list printed here.
obmit_cols = [col for col in cols if col not in list(md.columns)]
print(obmit_cols)

[]


In [160]:
# Export the TYK2 table as JSON Lines, restricted to the reference columns
# and written in their order.
md.loc[:, cols].to_json(path_or_buf=r'data/tyk2.json', lines=True, orient='records')

# Data Check

In [153]:
def read_data(data_path):
    """Load a ``.json`` file into a DataFrame.

    The file is first parsed as JSON Lines (one record per line). If pandas
    raises ``ValueError`` for that layout, it is parsed again as a regular
    JSON document.

    Args:
        data_path: Path of the file to read.

    Returns:
        The parsed DataFrame, or ``None`` when ``data_path`` does not end
        with ``.json``.
    """
    if not data_path.endswith('.json'):
        return None
    try:
        return pd.read_json(data_path, lines=True)
    except ValueError:
        # Not line-delimited: fall back to a single JSON document.
        return pd.read_json(data_path)

In [154]:
# File to inspect: the TYK2 export written in the "Data Saving" section.
data_path = 'data/tyk2.json'
# Alternative input: the EGFR reference file.
#data_path = 'data/egfr_10_full_ft_pd_lines.json'

# NOTE: this rebinds `data`, which earlier held the parsed EGFR records.
data = read_data(data_path)

In [156]:
# Wrap the loaded records in a DataFrame and display them for a visual check.
temp = pd.DataFrame(data=data)
temp

Unnamed: 0,AATS0Z,AATS0are,AATS0d,AATS0dv,AATS0i,AATS0m,AATS0p,AATS0pe,AATS0se,AATS0v,...,piPC5,piPC6,piPC7,piPC8,piPC9,pyrimidine,quinazoline,smile_ft,smiles,subset
0,24.512821,6.340377,3.846154,8.974359,163.171956,97.910428,1.587114,6.407277,7.773880,237.026191,...,6.738041,7.355032,7.929636,8.497942,8.953173,0,0,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O,10
1,26.829268,6.070571,3.219512,6.342066,164.769028,107.248642,1.524555,6.164744,7.649270,212.219226,...,6.139750,6.327714,6.685197,7.026177,7.376469,0,0,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",CC(C)S(=O)(=O)C1=CC=CC=C1NC2=NC(=NC=C2Cl)NC3=C...,10
2,22.885714,6.094274,3.685714,7.800000,162.193234,90.402944,1.573107,6.179199,7.575353,231.777721,...,7.064893,7.630803,8.253616,8.865921,9.369196,0,0,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",CC1=C(C(=CC=C1)C)N2C(=O)C3=CN=C(N=C3N4C2=NC5=C...,10
3,23.098039,6.285182,3.470588,7.764706,166.897682,91.170512,1.430816,6.337763,7.762929,213.680942,...,5.971102,6.429694,6.921219,7.392900,7.665589,0,0,"[0, 0, 1, 0, 0, 0.0, 0.25, 0.0, 0.25, 0, 0, 0,...",C1CC1NC(=O)NC2=C(NN=C2)C3=NC4=C(N3)C=C(C=C4)CN...,10
4,24.657143,6.496493,3.514286,9.085714,166.987500,99.059140,1.503893,6.538163,7.906872,225.978626,...,6.467213,6.803540,7.235326,7.656696,8.102889,0,0,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,...",CC1=C(C=C(C=C1)C(=O)NC2=C(C=CC(=C2)C(F)(F)F)OC...,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,23.263158,6.225868,3.491228,7.964912,162.781755,91.870610,1.509273,6.294189,7.719251,224.688464,...,5.765583,5.930918,6.283434,6.683283,6.990832,0,0,"[0, 0, 0, 1, 0, 0.0, 0.25, 0.0, 0.375, 0, 0, 0...",C1CC1[C@@]2([C@H]3CN(C2=O)C4=CC(=NC=C4)NC5=CC=...,10
1541,23.263158,6.225868,3.491228,7.964912,162.781755,91.870610,1.509273,6.294189,7.719251,224.688464,...,5.765583,5.930918,6.283434,6.683283,6.990832,0,0,"[0, 0, 0, 1, 0, 0.0, 0.25, 0.0, 0.375, 0, 0, 0...",C1CC1[C@@]2([C@H]3CN(C2=O)C4=CC(=NC=C4)NC5=CC=...,10
1542,23.892857,6.307311,3.571429,8.357143,163.894430,94.420368,1.500090,6.369070,7.784642,224.927785,...,5.759296,5.930918,6.282500,6.686407,6.979611,0,0,"[0, 0, 0, 1, 0, 0.0, 0.25, 0.0, 0.375, 0, 0, 0...",C1CC1[C@@]2([C@H]3CN(C2=O)C4=CC(=NC=C4)NC5=CC=...,10
1543,22.508475,6.178890,3.406780,7.525424,163.532124,88.790795,1.473182,6.244895,7.685326,218.125592,...,5.749791,5.906723,6.252145,6.639140,6.938042,0,0,"[0, 0, 0, 1, 0, 0.0, 0.25, 0.0, 0.375, 0, 0, 0...",C1CCOCC2=NC(=CC=C2)NC3=NC=CC(=C3)N4C[C@@H](COC...,10


# Data Observation

In [108]:
# Load the EGFR reference records explicitly. By this point `data` holds the
# TYK2 frame read in the "Data Check" section, so reusing it only showed EGFR
# rows when the cells had been run out of order.
with open('data/egfr_10_full_ft_pd_lines.json', 'r') as f:
    egfr_df = pd.DataFrame([json.loads(line) for line in f])

In [117]:
# Dump the frame to a scratch CSV for inspection outside the notebook.
egfr_df.to_csv(path_or_buf='temp.csv')

In [113]:
# Show the id next to the two scaffold flag columns and the subset marker.
egfr_df.loc[:, ['id','quinazoline','pyrimidine','subset']]

Unnamed: 0,id,quinazoline,pyrimidine,subset
0,EGIN0000009,1,0,10
1,EGIN0000014,1,0,10
2,EGIN0000043,1,0,10
3,EGIN0000667,0,1,10
4,EGIN0000847,0,0,10
...,...,...,...,...
3487,EGIN0000842,0,0,10
3488,EGIN0000843,0,0,10
3489,EGIN0000844,0,0,10
3490,EGIN0000845,0,0,10


In [114]:
# Count how many records fall into each `subset` value.
egfr_df.loc[:, ['subset']].value_counts()

subset
10        3492
dtype: int64