In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
%matplotlib inline

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")
import time 

# MACCS

In [2]:
# Load the MACCS applicability-domain grep output.
# read_csv takes the file's first line as the header; the transpose /
# reset_index / transpose round trip relabels the columns 0, 1, 2, ... and
# drop([0]) then discards the row labelled 0 before everything is cast to str.
# NOTE(review): presumably this skips the first two lines of the file - confirm
# that neither of them is a data row.
grep_maccs = (
    pd.read_csv('domain_appl_maccs.grep', sep = ',')
    .T.reset_index(drop = True).T
    .drop([0])
    .astype(str)
)

# Whitespace-tokenise each entry of column 0 (one token list per line)
str_split_maccs = [entry.split() for entry in grep_maccs[0].tolist()]

# Build the table and give the first seven token positions readable names
data_maccs = pd.DataFrame(str_split_maccs).rename(
    columns = dict(enumerate(['Set', 'IDX', 'CID', 'Min Dist', 'Index at Min Dist',
                              'CID at Min Dist', 'Num_Neighbors']))
)

# One frame per data set: Train, Test, Ext1 (NCGC) and Ext2 (ChEMBL)
data_maccs_train = data_maccs[data_maccs['Set'] == 'Train']
data_maccs_test = data_maccs[data_maccs['Set'] == 'Test']
data_maccs_ncgc = data_maccs[data_maccs['Set'] == 'Ext1']
data_maccs_chembl = data_maccs[data_maccs['Set'] == 'Ext2']

In [3]:
# Applicability-domain filter for the MACCS test set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_maccs_test.shape)
print(data_maccs_test['Num_Neighbors'].value_counts())

# Recorded run: data_maccs_test has 568 compounds, 176 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_maccs_test = data_maccs_test[data_maccs_test['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_maccs_test = data_maccs_test['CID'].tolist()

# Original fingerprint input file
full_maccs_test = pd.read_csv('input_test_maccs.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_maccs_test = full_maccs_test.loc[full_maccs_test['Name'].astype(str).isin(cids_maccs_test)]

# Write the filtered input file and show its shape
appl_maccs_test.to_csv('input_test_maccs_appl.csv', index = False)
appl_maccs_test.shape

(568, 8)
0     176
1      87
2      47
3      27
4      18
7      15
5      15
19     11
11     10
6       9
12      9
22      9
13      8
14      8
16      7
9       7
18      7
23      7
25      6
21      6
10      6
24      6
51      5
27      5
17      5
28      4
8       4
20      4
38      4
43      4
32      4
29      3
34      3
31      3
42      2
26      2
67      2
30      2
15      2
33      2
62      1
36      1
58      1
46      1
73      1
57      1
48      1
Name: Num_Neighbors, dtype: int64


(392, 168)

NCGC

In [4]:
# Applicability-domain filter for the MACCS NCGC (Ext1) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_maccs_ncgc.shape)
print(data_maccs_ncgc['Num_Neighbors'].value_counts())

# Recorded run: data_maccs_ncgc has 328 compounds, 138 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_maccs_ncgc = data_maccs_ncgc[data_maccs_ncgc['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_maccs_ncgc = data_maccs_ncgc['CID'].tolist()

# Original fingerprint input file
full_maccs_ncgc = pd.read_csv('input_ncgc_maccs.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_maccs_ncgc = full_maccs_ncgc.loc[full_maccs_ncgc['Name'].astype(str).isin(cids_maccs_ncgc)]

# Write the filtered input file and show its shape
appl_maccs_ncgc.to_csv('input_ncgc_maccs_appl.csv', index = False)
appl_maccs_ncgc.shape

(328, 8)
0     138
1      37
3      19
2      17
4      15
5      11
6      11
17      8
7       8
11      7
21      6
20      4
14      4
10      4
8       4
12      3
19      3
18      3
9       3
24      3
23      3
13      2
29      2
22      2
15      2
31      2
43      1
28      1
34      1
35      1
41      1
26      1
27      1
Name: Num_Neighbors, dtype: int64


(190, 168)

ChEMBL

In [5]:
# Applicability-domain filter for the MACCS ChEMBL (Ext2) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_maccs_chembl.shape)
print(data_maccs_chembl['Num_Neighbors'].value_counts())

# Recorded run: data_maccs_chembl has 1456 compounds, 1266 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_maccs_chembl = data_maccs_chembl[data_maccs_chembl['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_maccs_chembl = data_maccs_chembl['CID'].tolist()

# Original fingerprint input file
full_maccs_chembl = pd.read_csv('input_chembl_maccs.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_maccs_chembl = full_maccs_chembl.loc[full_maccs_chembl['Name'].astype(str).isin(cids_maccs_chembl)]

# Write the filtered input file and show its shape
appl_maccs_chembl.to_csv('input_chembl_maccs_appl.csv', index = False)
appl_maccs_chembl.shape

(1456, 8)
0     1266
1       77
2        7
5        7
3        7
4        7
8        6
17       6
58       5
28       5
9        5
23       5
6        3
34       3
7        3
14       3
13       3
10       3
48       2
21       2
32       2
29       2
50       2
36       2
31       1
52       1
38       1
24       1
35       1
42       1
27       1
55       1
25       1
67       1
16       1
54       1
59       1
63       1
43       1
33       1
19       1
37       1
61       1
11       1
12       1
15       1
53       1
Name: Num_Neighbors, dtype: int64


(190, 168)

# TOP

In [6]:
# Load the TOP applicability-domain grep output.
# read_csv takes the file's first line as the header; the transpose /
# reset_index / transpose round trip relabels the columns 0, 1, 2, ... and
# drop([0]) then discards the row labelled 0 before everything is cast to str.
# NOTE(review): presumably this skips the first two lines of the file - confirm
# that neither of them is a data row.
grep_top = (
    pd.read_csv('domain_appl_top.grep', sep = ',')
    .T.reset_index(drop = True).T
    .drop([0])
    .astype(str)
)

# Whitespace-tokenise each entry of column 0 (one token list per line)
str_split_top = [entry.split() for entry in grep_top[0].tolist()]

# Build the table and give the first seven token positions readable names
data_top = pd.DataFrame(str_split_top).rename(
    columns = dict(enumerate(['Set', 'IDX', 'CID', 'Min Dist', 'Index at Min Dist',
                              'CID at Min Dist', 'Num_Neighbors']))
)

# One frame per data set: Train, Test, Ext1 (NCGC) and Ext2 (ChEMBL)
data_top_train = data_top[data_top['Set'] == 'Train']
data_top_test = data_top[data_top['Set'] == 'Test']
data_top_ncgc = data_top[data_top['Set'] == 'Ext1']
data_top_chembl = data_top[data_top['Set'] == 'Ext2']

In [7]:
# Applicability-domain filter for the TOP test set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_top_test.shape)
print(data_top_test['Num_Neighbors'].value_counts())

# Recorded run: data_top_test has 568 compounds, 99 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_top_test = data_top_test[data_top_test['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_top_test = data_top_test['CID'].tolist()

# Original fingerprint input file
full_top_test = pd.read_csv('input_test_top.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
# (.loc is used here to match the other filter cells in this notebook.)
appl_top_test = full_top_test.loc[full_top_test['Name'].astype(str).isin(cids_top_test)]

# Write the filtered input file and show its shape
appl_top_test.to_csv('input_test_top_appl.csv', index = False)
appl_top_test.shape

(568, 8)
0      99
1      63
2      17
225     9
221     8
209     8
202     7
175     7
6       7
3       7
226     6
228     6
217     6
203     6
10      6
234     5
163     5
224     5
227     5
8       5
214     4
208     4
188     4
138     4
162     4
128     4
216     4
7       4
182     4
239     4
       ..
172     1
48      1
140     1
116     1
236     1
40      1
206     1
13      1
51      1
235     1
107     1
219     1
118     1
44      1
46      1
91      1
117     1
232     1
53      1
134     1
193     1
71      1
150     1
192     1
176     1
87      1
212     1
49      1
95      1
11      1
Name: Num_Neighbors, Length: 172, dtype: int64


(469, 2050)

NCGC

In [8]:
# Applicability-domain filter for the TOP NCGC (Ext1) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_top_ncgc.shape)
print(data_top_ncgc['Num_Neighbors'].value_counts())

# Recorded run: data_top_ncgc has 328 compounds, 115 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_top_ncgc = data_top_ncgc[data_top_ncgc['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_top_ncgc = data_top_ncgc['CID'].tolist()

# Original fingerprint input file
full_top_ncgc = pd.read_csv('input_ncgc_top.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_top_ncgc = full_top_ncgc.loc[full_top_ncgc['Name'].astype(str).isin(cids_top_ncgc)]

# Write the filtered input file and show its shape
appl_top_ncgc.to_csv('input_ncgc_top_appl.csv', index = False)
appl_top_ncgc.shape

(328, 8)
0      115
1       22
225      6
165      5
2        5
6        4
187      4
177      4
3        4
176      4
219      4
168      3
209      3
231      3
167      3
198      3
179      3
156      3
180      3
196      3
183      3
117      3
189      2
210      2
202      2
160      2
74       2
188      2
169      2
191      2
      ... 
87       1
131      1
138      1
233      1
218      1
100      1
184      1
50       1
136      1
12       1
178      1
68       1
197      1
125      1
9        1
157      1
229      1
220      1
228      1
159      1
154      1
185      1
124      1
83       1
238      1
108      1
51       1
175      1
203      1
29       1
Name: Num_Neighbors, Length: 111, dtype: int64


(213, 2050)

ChEMBL

In [9]:
# Applicability-domain filter for the TOP ChEMBL (Ext2) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_top_chembl.shape)
print(data_top_chembl['Num_Neighbors'].value_counts())

# Recorded run: data_top_chembl has 1456 compounds, 1314 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_top_chembl = data_top_chembl[data_top_chembl['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_top_chembl = data_top_chembl['CID'].tolist()

# Original fingerprint input file
full_top_chembl = pd.read_csv('input_chembl_top.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_top_chembl = full_top_chembl.loc[full_top_chembl['Name'].astype(str).isin(cids_top_chembl)]

# Write the filtered input file and show its shape
appl_top_chembl.to_csv('input_chembl_top_appl.csv', index = False)
appl_top_chembl.shape

(1456, 8)
0      1314
1        50
2         9
10        8
40        7
4         7
38        5
7         4
36        4
3         4
30        3
5         3
6         2
163       2
28        2
27        2
31        2
25        2
8         2
9         2
12        2
56        2
39        1
29        1
11        1
37        1
64        1
170       1
69        1
26        1
17        1
87        1
18        1
66        1
169       1
21        1
57        1
123       1
178       1
60        1
Name: Num_Neighbors, dtype: int64


(142, 2050)

# ECFP

In [10]:
# Load the ECFP applicability-domain grep output.
# read_csv takes the file's first line as the header; the transpose /
# reset_index / transpose round trip relabels the columns 0, 1, 2, ... and
# drop([0]) then discards the row labelled 0 before everything is cast to str.
# NOTE(review): presumably this skips the first two lines of the file - confirm
# that neither of them is a data row.
grep_ecfp = (
    pd.read_csv('domain_appl_ecfp.grep', sep = ',')
    .T.reset_index(drop = True).T
    .drop([0])
    .astype(str)
)

# Whitespace-tokenise each entry of column 0 (one token list per line)
str_split_ecfp = [entry.split() for entry in grep_ecfp[0].tolist()]

# Build the table and give the first seven token positions readable names
data_ecfp = pd.DataFrame(str_split_ecfp).rename(
    columns = dict(enumerate(['Set', 'IDX', 'CID', 'Min Dist', 'Index at Min Dist',
                              'CID at Min Dist', 'Num_Neighbors']))
)

# One frame per data set: Train, Test, Ext1 (NCGC) and Ext2 (ChEMBL)
data_ecfp_train = data_ecfp[data_ecfp['Set'] == 'Train']
data_ecfp_test = data_ecfp[data_ecfp['Set'] == 'Test']
data_ecfp_ncgc = data_ecfp[data_ecfp['Set'] == 'Ext1']
data_ecfp_chembl = data_ecfp[data_ecfp['Set'] == 'Ext2']

In [11]:
# Applicability-domain filter for the ECFP test set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_ecfp_test.shape)
print(data_ecfp_test['Num_Neighbors'].value_counts())

# Recorded run: data_ecfp_test has 568 compounds, 44 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_ecfp_test = data_ecfp_test[data_ecfp_test['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_ecfp_test = data_ecfp_test['CID'].tolist()

# Original fingerprint input file
full_ecfp_test = pd.read_csv('input_test_ecfp.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_ecfp_test = full_ecfp_test.loc[full_ecfp_test['Name'].astype(str).isin(cids_ecfp_test)]

# Write the filtered input file and show its shape
appl_ecfp_test.to_csv('input_test_ecfp_appl.csv', index = False)
appl_ecfp_test.shape

(568, 8)
0      44
1      40
2      10
5       9
266     6
292     5
271     5
222     4
305     4
314     4
185     4
335     4
355     4
371     4
283     4
3       4
275     4
116     4
241     4
302     4
299     3
336     3
215     3
120     3
308     3
27      3
81      3
223     3
366     3
237     3
       ..
131     1
164     1
316     1
132     1
367     1
333     1
146     1
60      1
313     1
326     1
25      1
199     1
17      1
135     1
158     1
243     1
216     1
35      1
85      1
387     1
22      1
253     1
234     1
280     1
341     1
194     1
134     1
39      1
15      1
345     1
Name: Num_Neighbors, Length: 264, dtype: int64


(524, 1026)

NCGC

In [12]:
# Applicability-domain filter for the ECFP NCGC (Ext1) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_ecfp_ncgc.shape)
print(data_ecfp_ncgc['Num_Neighbors'].value_counts())

# Recorded run: data_ecfp_ncgc has 328 compounds, 51 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_ecfp_ncgc = data_ecfp_ncgc[data_ecfp_ncgc['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_ecfp_ncgc = data_ecfp_ncgc['CID'].tolist()

# Original fingerprint input file
full_ecfp_ncgc = pd.read_csv('input_ncgc_ecfp.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_ecfp_ncgc = full_ecfp_ncgc.loc[full_ecfp_ncgc['Name'].astype(str).isin(cids_ecfp_ncgc)]

# Write the filtered input file
appl_ecfp_ncgc.to_csv('input_ncgc_ecfp_appl.csv', index = False)

(328, 8)
0      51
1       8
3       7
13      4
332     4
314     4
294     3
66      3
225     3
144     3
146     3
201     3
216     3
163     3
321     3
322     3
245     3
389     3
264     2
204     2
17      2
211     2
355     2
4       2
297     2
129     2
348     2
309     2
315     2
255     2
       ..
237     1
289     1
118     1
32      1
329     1
180     1
175     1
176     1
174     1
29      1
251     1
23      1
367     1
218     1
318     1
384     1
126     1
266     1
186     1
82      1
57      1
354     1
136     1
241     1
353     1
43      1
63      1
205     1
331     1
396     1
Name: Num_Neighbors, Length: 185, dtype: int64


ChEMBL

In [13]:
# Applicability-domain filter for the ECFP ChEMBL (Ext2) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_ecfp_chembl.shape)
print(data_ecfp_chembl['Num_Neighbors'].value_counts())

# Recorded run: data_ecfp_chembl has 1456 compounds, 1279 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_ecfp_chembl = data_ecfp_chembl[data_ecfp_chembl['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_ecfp_chembl = data_ecfp_chembl['CID'].tolist()

# Original fingerprint input file
full_ecfp_chembl = pd.read_csv('input_chembl_ecfp.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_ecfp_chembl = full_ecfp_chembl.loc[full_ecfp_chembl['Name'].astype(str).isin(cids_ecfp_chembl)]

# Write the filtered input file
appl_ecfp_chembl.to_csv('input_chembl_ecfp_appl.csv', index = False)

(1456, 8)
0      1279
1        56
2        27
4        14
10       11
3        10
5         7
6         6
12        5
11        4
8         4
16        4
9         2
28        2
13        2
31        2
14        2
18        2
7         2
43        1
187       1
42        1
22        1
26        1
38        1
40        1
23        1
32        1
24        1
27        1
21        1
25        1
57        1
50        1
Name: Num_Neighbors, dtype: int64


# FCFP

In [14]:
# Load the FCFP applicability-domain grep output.
# read_csv takes the file's first line as the header; the transpose /
# reset_index / transpose round trip relabels the columns 0, 1, 2, ... and
# drop([0]) then discards the row labelled 0 before everything is cast to str.
# NOTE(review): presumably this skips the first two lines of the file - confirm
# that neither of them is a data row.
grep_fcfp = (
    pd.read_csv('domain_appl_fcfp.grep', sep = ',')
    .T.reset_index(drop = True).T
    .drop([0])
    .astype(str)
)

# Whitespace-tokenise each entry of column 0 (one token list per line)
str_split_fcfp = [entry.split() for entry in grep_fcfp[0].tolist()]

# Build the table and give the first seven token positions readable names
data_fcfp = pd.DataFrame(str_split_fcfp).rename(
    columns = dict(enumerate(['Set', 'IDX', 'CID', 'Min Dist', 'Index at Min Dist',
                              'CID at Min Dist', 'Num_Neighbors']))
)

# One frame per data set: Train, Test, Ext1 (NCGC) and Ext2 (ChEMBL)
data_fcfp_train = data_fcfp[data_fcfp['Set'] == 'Train']
data_fcfp_test = data_fcfp[data_fcfp['Set'] == 'Test']
data_fcfp_ncgc = data_fcfp[data_fcfp['Set'] == 'Ext1']
data_fcfp_chembl = data_fcfp[data_fcfp['Set'] == 'Ext2']

In [15]:
# Applicability-domain filter for the FCFP test set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_fcfp_test.shape)
print(data_fcfp_test['Num_Neighbors'].value_counts())

# Recorded run: data_fcfp_test has 568 compounds, 58 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_fcfp_test = data_fcfp_test[data_fcfp_test['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_fcfp_test = data_fcfp_test['CID'].tolist()

# Original fingerprint input file
full_fcfp_test = pd.read_csv('input_test_fcfp.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_fcfp_test = full_fcfp_test.loc[full_fcfp_test['Name'].astype(str).isin(cids_fcfp_test)]

# Write the filtered input file
appl_fcfp_test.to_csv('input_test_fcfp_appl.csv', index = False)

(568, 8)
0      58
1      30
2      13
256     9
3       8
4       8
223     6
353     5
282     5
311     5
271     4
10      4
7       4
274     4
187     4
213     4
247     4
254     4
316     4
8       4
6       4
261     4
313     4
364     3
36      3
128     3
301     3
197     3
66      3
184     3
       ..
117     1
37      1
200     1
103     1
217     1
258     1
348     1
253     1
317     1
129     1
163     1
161     1
321     1
283     1
77      1
349     1
352     1
328     1
42      1
226     1
234     1
242     1
322     1
332     1
246     1
105     1
237     1
389     1
276     1
61      1
Name: Num_Neighbors, Length: 274, dtype: int64


In [16]:
# Display the (rows, columns) shape of the filtered FCFP test-set frame
appl_fcfp_test.shape


(510, 1026)

NCGC

In [17]:
# Applicability-domain filter for the FCFP NCGC (Ext1) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_fcfp_ncgc.shape)
print(data_fcfp_ncgc['Num_Neighbors'].value_counts())

# Recorded run: data_fcfp_ncgc has 328 compounds, 50 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_fcfp_ncgc = data_fcfp_ncgc[data_fcfp_ncgc['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_fcfp_ncgc = data_fcfp_ncgc['CID'].tolist()

# Original fingerprint input file
full_fcfp_ncgc = pd.read_csv('input_ncgc_fcfp.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_fcfp_ncgc = full_fcfp_ncgc.loc[full_fcfp_ncgc['Name'].astype(str).isin(cids_fcfp_ncgc)]

# Write the filtered input file
appl_fcfp_ncgc.to_csv('input_ncgc_fcfp_appl.csv', index = False)

(328, 8)
0      50
1      11
3       7
207     6
4       4
298     3
5       3
198     3
133     3
243     3
24      3
37      3
2       3
282     3
60      2
132     2
173     2
208     2
216     2
257     2
74      2
194     2
367     2
333     2
129     2
327     2
300     2
21      2
219     2
366     2
       ..
44      1
118     1
51      1
353     1
95      1
184     1
23      1
238     1
301     1
277     1
144     1
47      1
274     1
192     1
130     1
307     1
332     1
136     1
241     1
145     1
340     1
228     1
281     1
205     1
127     1
48      1
161     1
150     1
320     1
213     1
Name: Num_Neighbors, Length: 196, dtype: int64


ChEMBL

In [18]:
# Applicability-domain filter for the FCFP ChEMBL (Ext2) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_fcfp_chembl.shape)
print(data_fcfp_chembl['Num_Neighbors'].value_counts())

# Recorded run: data_fcfp_chembl has 1456 compounds, 1257 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_fcfp_chembl = data_fcfp_chembl[data_fcfp_chembl['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_fcfp_chembl = data_fcfp_chembl['CID'].tolist()

# Original fingerprint input file
full_fcfp_chembl = pd.read_csv('input_chembl_fcfp.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_fcfp_chembl = full_fcfp_chembl.loc[full_fcfp_chembl['Name'].astype(str).isin(cids_fcfp_chembl)]

# Write the filtered input file
appl_fcfp_chembl.to_csv('input_chembl_fcfp_appl.csv', index = False)

(1456, 8)
0      1257
1        64
2        27
4        16
3        15
7        14
5        10
6         7
10        6
13        5
8         5
9         3
16        3
14        3
33        2
39        2
23        2
11        2
12        2
36        2
28        1
53        1
26        1
27        1
31        1
34        1
32        1
51        1
144       1
Name: Num_Neighbors, dtype: int64


# Pub

In [19]:
# Load the Pub applicability-domain grep output.
# read_csv takes the file's first line as the header; the transpose /
# reset_index / transpose round trip relabels the columns 0, 1, 2, ... and
# drop([0]) then discards the row labelled 0 before everything is cast to str.
# NOTE(review): presumably this skips the first two lines of the file - confirm
# that neither of them is a data row.
grep_pub = (
    pd.read_csv('domain_appl_pub.grep', sep = ',')
    .T.reset_index(drop = True).T
    .drop([0])
    .astype(str)
)

# Whitespace-tokenise each entry of column 0 (one token list per line)
str_split_pub = [entry.split() for entry in grep_pub[0].tolist()]

# Build the table and give the first seven token positions readable names
data_pub = pd.DataFrame(str_split_pub).rename(
    columns = dict(enumerate(['Set', 'IDX', 'CID', 'Min Dist', 'Index at Min Dist',
                              'CID at Min Dist', 'Num_Neighbors']))
)

# One frame per data set: Train, Test, Ext1 (NCGC) and Ext2 (ChEMBL)
data_pub_train = data_pub[data_pub['Set'] == 'Train']
data_pub_test = data_pub[data_pub['Set'] == 'Test']
data_pub_ncgc = data_pub[data_pub['Set'] == 'Ext1']
data_pub_chembl = data_pub[data_pub['Set'] == 'Ext2']

In [24]:
# Applicability-domain filter for the Pub test set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_pub_test.shape)
print(data_pub_test['Num_Neighbors'].value_counts())

# Recorded run: data_pub_test has 568 compounds, 128 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_pub_test = data_pub_test[data_pub_test['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_pub_test = data_pub_test['CID'].tolist()

# Original fingerprint input file
full_pub_test = pd.read_csv('input_test_pub.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_pub_test = full_pub_test.loc[full_pub_test['Name'].astype(str).isin(cids_pub_test)]

# Write the filtered input file
appl_pub_test.to_csv('input_test_pub_appl.csv', index = False)

(568, 8)
0     128
1      72
2      47
3      32
4      24
7      11
12     10
5      10
6      10
10      9
22      8
18      8
17      8
9       7
11      7
33      6
8       6
14      6
34      5
46      5
24      5
87      5
13      5
15      5
28      5
45      5
19      5
43      4
71      4
16      4
     ... 
50      2
30      2
44      2
79      2
37      2
72      2
80      2
40      2
25      2
27      2
88      1
23      1
49      1
26      1
36      1
83      1
47      1
58      1
86      1
84      1
59      1
52      1
62      1
81      1
73      1
63      1
48      1
32      1
85      1
75      1
Name: Num_Neighbors, Length: 81, dtype: int64


NCGC

In [22]:
# Applicability-domain filter for the Pub NCGC (Ext1) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_pub_ncgc.shape)
print(data_pub_ncgc['Num_Neighbors'].value_counts())

# Recorded run: data_pub_ncgc has 328 compounds, 96 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_pub_ncgc = data_pub_ncgc[data_pub_ncgc['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_pub_ncgc = data_pub_ncgc['CID'].tolist()

# Original fingerprint input file
full_pub_ncgc = pd.read_csv('input_ncgc_pub.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_pub_ncgc = full_pub_ncgc.loc[full_pub_ncgc['Name'].astype(str).isin(cids_pub_ncgc)]

# Write the filtered input file
appl_pub_ncgc.to_csv('input_ncgc_pub_appl.csv', index = False)

(328, 8)
0     96
1     32
2     28
3     16
6      9
22     9
7      8
18     7
4      6
28     5
15     5
19     5
34     5
37     4
27     4
25     4
12     4
39     3
10     3
9      3
44     3
36     3
42     3
11     3
5      3
23     3
43     3
14     2
24     2
68     2
8      2
38     2
21     2
48     2
26     2
67     2
31     2
41     2
16     2
45     2
61     2
46     2
30     2
80     2
29     2
71     1
81     1
83     1
47     1
77     1
70     1
75     1
32     1
89     1
35     1
50     1
33     1
17     1
20     1
13     1
Name: Num_Neighbors, dtype: int64


ChEMBL

In [25]:
# Applicability-domain filter for the Pub ChEMBL (Ext2) set.
# Show the size of the set and the distribution of neighbor counts before filtering.
print(data_pub_chembl.shape)
print(data_pub_chembl['Num_Neighbors'].value_counts())

# Recorded run: data_pub_chembl has 1456 compounds, 1272 of them with no neighbors.
# Num_Neighbors holds strings (the rows were built with str.split), hence the
# comparison against the string '0'.
data_pub_chembl = data_pub_chembl[data_pub_chembl['Num_Neighbors'] != '0']

# CIDs (as strings) of the compounds that remain
cids_pub_chembl = data_pub_chembl['CID'].tolist()

# Original fingerprint input file
full_pub_chembl = pd.read_csv('input_chembl_pub.csv')

# Keep the fingerprint rows of the remaining compounds. The CIDs above are
# strings, while read_csv may parse 'Name' as integers; isin() does not match
# 1 against '1', so the membership test is done on the string form of 'Name'.
# The selected rows themselves keep their original dtypes.
appl_pub_chembl = full_pub_chembl.loc[full_pub_chembl['Name'].astype(str).isin(cids_pub_chembl)]

# Write the filtered input file
appl_pub_chembl.to_csv('input_chembl_pub_appl.csv', index = False)

(1456, 8)
0     1272
1       96
8       11
2        8
3        7
79       5
22       5
6        4
23       4
7        3
24       3
36       3
20       3
49       3
43       3
26       3
21       2
32       2
39       2
17       1
42       1
12       1
38       1
89       1
19       1
4        1
27       1
51       1
81       1
31       1
76       1
68       1
50       1
78       1
59       1
45       1
Name: Num_Neighbors, dtype: int64
