In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv('drugs.csv')
print(dataset.info())
dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8820 entries, 0 to 8819
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   <DATABASE_ID>                   8820 non-null   object 
 1   <DATABASE_NAME>                 8820 non-null   object 
 2   <MOLECULAR_WEIGHT>              8820 non-null   float64
 3   <JCHEM_ACCEPTOR_COUNT>          8820 non-null   int64  
 4   <JCHEM_AVERAGE_POLARIZABILITY>  8819 non-null   float64
 5   <JCHEM_DONOR_COUNT>             8820 non-null   int64  
 6   <JCHEM_FORMAL_CHARGE>           8820 non-null   int64  
 7   <JCHEM_LOGP>                    8819 non-null   float64
 8   <JCHEM_NUMBER_OF_RINGS>         8820 non-null   int64  
 9   <JCHEM_PHYSIOLOGICAL_CHARGE>    8820 non-null   int64  
 10  <JCHEM_PKA>                     4726 non-null   float64
 11  <JCHEM_PKA_STRONGEST_ACIDIC>    8820 non-null   float64
 12  <JCHEM_PKA_STRONGEST_BASIC>     69

Unnamed: 0,<DATABASE_ID>,<DATABASE_NAME>,<MOLECULAR_WEIGHT>,<JCHEM_ACCEPTOR_COUNT>,<JCHEM_AVERAGE_POLARIZABILITY>,<JCHEM_DONOR_COUNT>,<JCHEM_FORMAL_CHARGE>,<JCHEM_LOGP>,<JCHEM_NUMBER_OF_RINGS>,<JCHEM_PHYSIOLOGICAL_CHARGE>,<JCHEM_PKA>,<JCHEM_PKA_STRONGEST_ACIDIC>,<JCHEM_PKA_STRONGEST_BASIC>,<JCHEM_POLAR_SURFACE_AREA>,<JCHEM_REFRACTIVITY>,<JCHEM_ROTATABLE_BOND_COUNT>,<DRUGBANK_ID>
0,DB00114,drugbank,247.1419,6,20.898887,3,0,-2.085609,1,-2,6.691277,1.680299,4.109012,116.95,54.7463,4,DB00114
1,DB00116,drugbank,445.4292,12,42.953129,8,0,-4.21263,3,-2,3.863701,3.509569,3.584683,207.27,121.3866,9,DB00116
2,DB00117,drugbank,155.1546,4,14.670259,3,0,-3.616022,1,0,12.941172,1.849973,9.437137,92.0,38.059,3,DB00117
3,DB00118,drugbank,398.44,10,39.260985,4,0,-5.318678,3,1,12.444251,1.70354,9.414366,185.46,107.072,7,DB00118
4,DB00119,drugbank,88.0621,3,7.310506,1,0,0.065874,0,-1,,2.930123,-9.58317,54.37,17.9897,1,DB00119


In [3]:
nan_not_accepted = ['<JCHEM_AVERAGE_POLARIZABILITY>','<JCHEM_LOGP>','<JCHEM_PKA>','<JCHEM_PKA_STRONGEST_ACIDIC>','<JCHEM_PKA_STRONGEST_BASIC>','<JCHEM_REFRACTIVITY>']

In [4]:
for column in nan_not_accepted:
    mean = dataset[column].mean(skipna=True)
    dataset[column]=dataset[column].replace(np.NaN,mean)

In [5]:
print(dataset.info())
dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8820 entries, 0 to 8819
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   <DATABASE_ID>                   8820 non-null   object 
 1   <DATABASE_NAME>                 8820 non-null   object 
 2   <MOLECULAR_WEIGHT>              8820 non-null   float64
 3   <JCHEM_ACCEPTOR_COUNT>          8820 non-null   int64  
 4   <JCHEM_AVERAGE_POLARIZABILITY>  8820 non-null   float64
 5   <JCHEM_DONOR_COUNT>             8820 non-null   int64  
 6   <JCHEM_FORMAL_CHARGE>           8820 non-null   int64  
 7   <JCHEM_LOGP>                    8820 non-null   float64
 8   <JCHEM_NUMBER_OF_RINGS>         8820 non-null   int64  
 9   <JCHEM_PHYSIOLOGICAL_CHARGE>    8820 non-null   int64  
 10  <JCHEM_PKA>                     8820 non-null   float64
 11  <JCHEM_PKA_STRONGEST_ACIDIC>    8820 non-null   float64
 12  <JCHEM_PKA_STRONGEST_BASIC>     88

Unnamed: 0,<DATABASE_ID>,<DATABASE_NAME>,<MOLECULAR_WEIGHT>,<JCHEM_ACCEPTOR_COUNT>,<JCHEM_AVERAGE_POLARIZABILITY>,<JCHEM_DONOR_COUNT>,<JCHEM_FORMAL_CHARGE>,<JCHEM_LOGP>,<JCHEM_NUMBER_OF_RINGS>,<JCHEM_PHYSIOLOGICAL_CHARGE>,<JCHEM_PKA>,<JCHEM_PKA_STRONGEST_ACIDIC>,<JCHEM_PKA_STRONGEST_BASIC>,<JCHEM_POLAR_SURFACE_AREA>,<JCHEM_REFRACTIVITY>,<JCHEM_ROTATABLE_BOND_COUNT>,<DRUGBANK_ID>
0,DB00114,drugbank,247.1419,6,20.898887,3,0,-2.085609,1,-2,6.691277,1.680299,4.109012,116.95,54.7463,4,DB00114
1,DB00116,drugbank,445.4292,12,42.953129,8,0,-4.21263,3,-2,3.863701,3.509569,3.584683,207.27,121.3866,9,DB00116
2,DB00117,drugbank,155.1546,4,14.670259,3,0,-3.616022,1,0,12.941172,1.849973,9.437137,92.0,38.059,3,DB00117
3,DB00118,drugbank,398.44,10,39.260985,4,0,-5.318678,3,1,12.444251,1.70354,9.414366,185.46,107.072,7,DB00118
4,DB00119,drugbank,88.0621,3,7.310506,1,0,0.065874,0,-1,11.917731,2.930123,-9.58317,54.37,17.9897,1,DB00119


In [6]:
dt = pd.read_csv('all.csv')
print(len(dt))
dt.head()

5220


Unnamed: 0,ID,Name,Gene Name,GenBank Protein ID,GenBank Gene ID,UniProt ID,Uniprot Title,PDB ID,GeneCard ID,GenAtlas ID,HGNC ID,Species,Drug IDs
0,1,Peptidoglycan synthase FtsI,ftsI,1574687.0,L42023,P45059,FTSI_HAEIN,,,,,Haemophilus influenzae (strain ATCC 51907 / DS...,DB00303
1,2,Histidine decarboxylase,HDC,32109.0,X54297,P19113,DCHS_HUMAN,4E1O,,HDC,HGNC:4855,Humans,DB00114; DB00117
2,3,"Glutaminase liver isoform, mitochondrial",GLS2,6650606.0,AF110330,Q9UI32,GLSL_HUMAN,4BQM,,GLS2,HGNC:29570,Humans,DB00142
3,4,Coagulation factor XIII A chain,F13A1,182309.0,M22001,P00488,F13A_HUMAN,1EVU; 1EX0; 1F13; 1FIE; 1GGT; 1GGU; 1GGY; 1QRK...,,F13A1,HGNC:3531,Humans,DB01839; DB02340; DB11300; DB11311; DB11571; D...
4,5,"Nitric oxide synthase, inducible",NOS2,292242.0,L09210,P35228,NOS2_HUMAN,1NSI; 2LL6; 2NSI; 3E7G; 3EJ8; 3HR4; 4CX7; 4NOS,,NOS2A,HGNC:7873,Humans,DB00125; DB00155; DB01017; DB01110; DB01234; D...


In [7]:
dt=dt[['UniProt ID','Drug IDs']]

In [8]:
print(len(dt))
dt.head()

5220


Unnamed: 0,UniProt ID,Drug IDs
0,P45059,DB00303
1,P19113,DB00114; DB00117
2,Q9UI32,DB00142
3,P00488,DB01839; DB02340; DB11300; DB11311; DB11571; D...
4,P35228,DB00125; DB00155; DB01017; DB01110; DB01234; D...


In [76]:
reaction = dt.loc[dt['UniProt ID']=='P21453']
val = reaction['Drug IDs'].values[0]
drugovi = "".join(val).split("; ")
dataset['<DATABASE_ID>'].values

array(['DB00114', 'DB00116', 'DB00117', ..., 'DB15672', 'DB15674',
       'DB15675'], dtype=object)

In [77]:
y=[]
for m in dataset['<DATABASE_ID>'].values:
    da=False
    for c in drugovi:
        if m==c:
            y.append(1)
            da=True
    if not da:
        y.append(0)

In [78]:
x=dataset.iloc[:,2:16]


In [79]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0,test_size=0.35)

In [80]:
clf = svm.SVC(kernel='poly',degree = 13)

In [81]:
clf.fit(x_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=13, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [82]:
y_pred=clf.predict(x_test)

In [83]:
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[3086    0]
 [   1    0]]


In [84]:
print(f1_score(y_test,y_pred))

0.0


In [85]:
print(accuracy_score(y_test,y_pred))

0.9996760609005507
