In [2]:
from __future__ import print_function
from Bio.PDB import *
import os
import numpy as np
import collections
import pandas as pd
from scipy.spatial import distance
from pygsp import graphs, features
import networkx as nx
import matplotlib.pyplot as plt
import subprocess
from pygsp import utils
from pygsp import graphs, filters
from sklearn.linear_model import LinearRegression

In [3]:
signals_and_cutoffs = {'molecular_weight': 0.42, 
                       'hydrophobicity': 0.42, 
                       'node_degree': 0.42, 
                       'node_weighted_degree': 0.42, 
                       'residue_count': 0.42, 
                       'clustering_coeff': 0.42,
                      'conservation_score': 0.42}

signals_wavelet = ['molecular_weight_1','molecular_weight_2','molecular_weight_3','molecular_weight_4', 'hydrophobicity_1','hydrophobicity_2','hydrophobicity_3','hydrophobicity_4','node_degree_1','node_degree_2','node_degree_3','node_degree_4', 'node_weighted_degree_1', 'node_weighted_degree_2','node_weighted_degree_3' , 'node_weighted_degree_4','residue_count_1','residue_count_2','residue_count_3','residue_count_4' ,'clustering_coeff_1','clustering_coeff_2','clustering_coeff_3','clustering_coeff_4','conservation_score_1','conservation_score_2','conservation_score_3','conservation_score_4']
signals = ['molecular_weight', 'hydrophobicity', 'node_degree', 'node_weighted_degree', 'residue_count', 'clustering_coeff','conservation_score']

In [4]:
amino_lookup = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
     'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
     'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
     'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M','CCS':'C','AC5':'L'}
amino_molecular_mass = {'A': 89.09404, 'R': 174.20274, 'N': 132.11904, 'D': 133.10384, 'C': 121.15404,
                        'Q': 146.14594, 'E': 147.13074, 'G': 75.06714, 'H': 155.15634, 'I': 131.17464,
                        'L': 131.17464, 'K': 146.18934, 'M': 149.20784, 'F': 165.19184, 'P': 115.13194,
                        'S': 105.09344, 'T': 119.12034, 'W': 204.22844, 'Y': 181.19124, 'V': 117.14784}
amino_hydrophobicity = {'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
                        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
                        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
                        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2}

In [5]:
def crawl_pdb(path):
    '''This function reads pdb files and stores their distance matrix and sequence'''
    parser = PDBParser()
    pdb_files = sorted(os.listdir(path))
    pdbinfo_dict = dict()
    for pdb in pdb_files:
        info = dict()
        info[id] = pdb
        structure = parser.get_structure('pdb_file', path  + pdb )
        coordinates = []
        labels = list()
        for model in structure:
            for chain in model:
                for residue in chain:
                    try:
                        if residue.get_resname() in amino_lookup:
                            coordinates.append(residue['CA'].get_coord())
                            labels.append(residue.get_resname())
                    except KeyError:
                        pass
                break  ## working on chain id A only
            break      ## Working on model id 0 only
        coords = np.asmatrix(coordinates)
        distance_matrix = distance.squareform(distance.pdist(coords))
        info['coords'] = coords
        info['distance_matrix'] = distance_matrix
#         print(np.unique(labels))
        info['sequence'] = ''.join([amino_lookup[s] for s in labels if s in amino_lookup])
        pdbinfo_dict[pdb] = info
    return pdbinfo_dict
 
def get_graph(distance_matrix, network_type, rig_cutoff=8, lin_cutoff=12):
    distance_matrix[distance_matrix >= rig_cutoff] = 0
    if network_type == 'rig-boolean':
        distance_matrix[distance_matrix > 0] = 1
    elif network_type == 'weighted-rig':
        for i in range(np.shape(distance_matrix)[0]):
            for j in range(np.shape(distance_matrix)[1]):
                if distance_matrix[i, j] > 0:
                    distance_matrix[i, j] = abs(j - i)
    elif network_type == 'weighted-lin':
        for i in range(np.shape(distance_matrix)[0]):
            for j in range(np.shape(distance_matrix)[1]):
                if distance_matrix[i, j] > 0:
                    if abs(i - j) >= lin_cutoff or abs(i - j) == 1:
                        distance_matrix[i, j] = abs(i - j)
                    else:
                        distance_matrix[i, j] = 0
    elif network_type == 'lin':
        for i in range(np.shape(distance_matrix)[0]):
            for j in range(np.shape(distance_matrix)[1]):
                if distance_matrix[i, j] > 0:
                    if abs(i - j) >= lin_cutoff or abs(i - j) == 1:
                        distance_matrix[i, j] = 1
                    else:
                        distance_matrix[i, j] = 0
    else:
        print('Invalid Choice! ' + network_type)
        return None
    G = graphs.Graph(distance_matrix, lap_type='normalized')
    G.compute_fourier_basis()
    return G
 
def get_signal(G, seq, pdb, signal):
    if signal == 'molecular_weight':
        s = np.asarray([amino_molecular_mass[aa] for aa in seq])
    elif signal == 'hydrophobicity':
        s = np.asarray([amino_hydrophobicity[aa] for aa in seq])
    elif signal == 'node_degree':
        s = G.d
    elif signal == 'node_weighted_degree':
        adj = G.W.todense()
        s = np.ravel(adj.sum(axis=0)) / 2
    elif signal == 'avg_adj_degree':
        s = features.compute_avg_adj_deg(G)
        s = np.ravel(s)
    elif signal == 'clustering_coeff':
        N = nx.from_scipy_sparse_matrix(G.W)
        s = nx.clustering(N)
        s = np.asarray(list(s.values()))
    elif signal == 'aaalpha_helix':
        s = eng.aaalpha_helixfasman(seq)
        s = np.array(s._data)
    elif signal == 'residue_count':
        residue_counts = collections.Counter(seq)
        s = np.asarray([residue_counts[s] for s in seq])
    elif signal == 'conservation_score':
        #https://compbio.cs.princeton.edu/conservation/
        filename = pdb.split('.')[0]
#         cmd = ['python3 ./pdb2fasta-master/pdb2fasta.py '+pdb_path+''+pdb+' > ./pdb2fasta-master/'+filename+'.fasta']
#         print(cmd)
#         process = subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        with open('./pdb2fasta-master/'+filename+'.fasta', 'w') as the_file:
            the_file.write('>'+filename+':A\n'+seq+"-")
        process = 0
        if process ==0:
            s = []
            cmd = ['python2 ./pdb2fasta-master/conservation_code/score_conservation.py -alignfile ./pdb2fasta-master/'+filename+'.fasta > ./pdb2fasta-master/'+filename+'.csv']
#             print(cmd)
            process = subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if process ==0:
                with open('./pdb2fasta-master/'+filename+'.csv') as f:
                    for i in range(5):
                        next(f)
                    for line in f:
#                         print(line.split("\t")[1])
                        s.append(float(line.split("\t")[1]))
        s = np.array(s)
    else:
        print ('Invalid Choice! ' + signal)
    return s
 
def get_filtered_signal(G, signal, cutoff,type_spatial):
    if type_spatial == 'fourier':
        gftsignal = G.gft(signal)
        signal_hat = gftsignal
        value = np.sum(abs(signal_hat[G.e < G.lmax*cutoff])) / np.sum(abs(signal_hat))
        return value
    elif type_spatial == 'wavelet':
        N_f=4
        scales = utils.compute_log_scales(1, len(signal), N_f-1)
        mex = filters.Abspline(G, Nf=N_f,scales=scales)
#         for i, signal in enumerate(exp):
        signal_filtered_hat = mex.filter(signal)
        signal_filtered_hat = np.abs(signal_filtered_hat)
        for j in range(signal_filtered_hat.shape[1]):
                p = np.percentile(signal_filtered_hat[:,j], 70) 
                signal_filtered_hat[np.where(signal_filtered_hat[:,j]<p),j] = 0
#         inv_fil = mex.filter(signal_filtered_hat)
#         print(inv_fil.shape)
        signal_filtered_hat = np.mean(np.abs(signal_filtered_hat),axis=0)
        return signal_filtered_hat

In [None]:
path = './pdb_files/ant_real/'
pdbinfo_dict = crawl_pdb(path)
lfc_cutoff = 0.42
model = 'weighted-rig'
# print (lfc_cutoff, end=' : ')
gsp_features = pd.DataFrame(columns=signals_wavelet+['class'])
 
path_files = './pdb_files/ant_real/'
for pdb in pdbinfo_dict.keys():
    row = []
    c=0
    G = get_graph(pdbinfo_dict[pdb]['distance_matrix'], network_type=model, rig_cutoff=7.3)
    for signal_name in signals:
        signal = get_signal(G, pdbinfo_dict[pdb]['sequence'],pdb,signal=signal_name)
        value = get_filtered_signal(G,signal,lfc_cutoff,type_spatial='wavelet')       
        row.extend(value)
    row.append(c)
    gsp_features.loc[pdb] = row

In [7]:
path = './pdb_files/ant_real/'

In [8]:
pdbinfo_dict = crawl_pdb(path)



















In [13]:
pdbinfo_dict

{'1ERR.pdb': {<function id(obj, /)>: '1ERR.pdb',
  'coords': matrix([[53.995, 62.069, 72.653],
          [50.596, 62.259, 70.953],
          [51.799, 65.546, 69.479],
          [54.212, 64.025, 66.973],
          [53.347, 64.005, 63.307],
          [53.629, 60.694, 61.475],
          [57.   , 61.809, 60.232],
          [58.238, 63.006, 63.637],
          [57.248, 59.628, 65.129],
          [59.184, 57.774, 62.491],
          [62.423, 59.721, 62.777],
          [62.259, 59.753, 66.577],
          [61.958, 55.967, 66.457],
          [64.721, 55.511, 63.886],
          [67.033, 57.589, 66.105],
          [66.295, 55.442, 69.123],
          [67.392, 52.34 , 67.287],
          [69.755, 50.328, 69.431],
          [73.174, 49.603, 67.97 ],
          [73.891, 46.094, 66.592],
          [76.223, 44.386, 69.099],
          [78.752, 41.596, 68.506],
          [79.214, 38.169, 70.008],
          [82.025, 37.611, 72.521],
          [84.918, 37.609, 70.032],
          [85.796, 34.214, 68.578],
     

In [9]:
lfc_cutoff = 0.42
model = 'weighted-rig'

In [14]:
gsp_features = pd.DataFrame(columns=signals_wavelet+['class'])

In [15]:
gsp_features

Unnamed: 0,molecular_weight_1,molecular_weight_2,molecular_weight_3,molecular_weight_4,hydrophobicity_1,hydrophobicity_2,hydrophobicity_3,hydrophobicity_4,node_degree_1,node_degree_2,...,residue_count_4,clustering_coeff_1,clustering_coeff_2,clustering_coeff_3,clustering_coeff_4,conservation_score_1,conservation_score_2,conservation_score_3,conservation_score_4,class


In [27]:
path_files = './pdb_files/ant_real/'
for pdb in pdbinfo_dict.keys():
    row = []
    c=0
    G = get_graph(pdbinfo_dict[pdb]['distance_matrix'], network_type=model, rig_cutoff=7.3)
    for signal_name in signals:
        signal = get_signal(G, pdbinfo_dict[pdb]['sequence'],pdb,signal=signal_name)
        value = get_filtered_signal(G,signal,lfc_cutoff,type_spatial='wavelet')       
        row.extend(value)
    row.append(c)
    gsp_features.loc[pdb] = row 

In [37]:
import pickle
with open('./files2/ago_dic.pkl', 'rb') as f:
    pdbinfo_dict_1 = pickle.load(f)

In [38]:
pdbinfo_dict_1

{'1EBP.pdb': {<function id(obj, /)>: '1EBP.pdb',
  'coords': matrix([[39.634, 30.133, 20.962],
          [36.868, 32.588, 22.026],
          [38.905, 34.036, 24.886],
          [41.903, 34.094, 22.571],
          [40.117, 35.912, 19.675],
          [38.375, 38.188, 22.194],
          [41.792, 38.796, 23.615],
          [43.007, 39.467, 20.042],
          [40.434, 42.21 , 19.443],
          [40.925, 43.941, 22.789],
          [42.247, 47.513, 23.274],
          [45.789, 48.669, 24.078],
          [47.739, 51.242, 26.045],
          [51.271, 52.542, 25.119],
          [54.731, 51.   , 25.689],
          [55.358, 49.952, 29.291],
          [58.147, 48.034, 31.001],
          [56.613, 44.996, 32.719],
          [58.208, 43.027, 35.554],
          [56.817, 40.403, 37.943],
          [58.021, 37.694, 40.463],
          [56.378, 34.219, 40.786],
          [58.006, 32.39 , 43.67 ],
          [60.084, 34.736, 45.716],
          [63.539, 34.21 , 44.144],
          [62.981, 35.244, 40.483],
     

In [39]:
model = 'weighted-rig'
# print (lfc_cutoff, end=' : ')
gsp_features_1 = pd.DataFrame(columns=signals_wavelet+['class'])

for pdb in pdbinfo_dict_1.keys():
#         print (pdb, end=', ')
    row = []
    c=1
    G = get_graph(pdbinfo_dict_1[pdb]['distance_matrix'], network_type=model, rig_cutoff=7.3)
    for signal_name, lfc_cutoff in signals_and_cutoffs.items():
        signal = get_signal(G, pdbinfo_dict_1[pdb]['sequence'],pdb, signal=signal_name)
        value = get_filtered_signal(G,signal,lfc_cutoff,type_spatial='wavelet')       
        row.extend(value)        
    row.append(c)
    gsp_features_1.loc[pdb] = row

  d = np.power(self.dw, -0.5)


In [40]:
gsp_f = pd.concat([gsp_features,gsp_features_1],axis=0)

X = gsp_f[gsp_f.columns.difference(['class'])]
y = gsp_f['class']

In [41]:
signal.shape

(253,)

In [35]:
len(pdbinfo_dict.keys())

156

In [42]:
gsp_features_1

Unnamed: 0,molecular_weight_1,molecular_weight_2,molecular_weight_3,molecular_weight_4,hydrophobicity_1,hydrophobicity_2,hydrophobicity_3,hydrophobicity_4,node_degree_1,node_degree_2,...,residue_count_4,clustering_coeff_1,clustering_coeff_2,clustering_coeff_3,clustering_coeff_4,conservation_score_1,conservation_score_2,conservation_score_3,conservation_score_4,class
1EBP.pdb,70.779496,26.425562,0.276106,0.000654,0.184844,1.121956,0.019290,0.000046,4.954575,0.861792,...,0.000118,0.280298,0.150925,0.001208,0.000003,0.453421,0.165444,0.001256,0.000003,1.0
1FCX.pdb,72.041204,24.616886,0.257090,0.000547,0.236775,1.176618,0.018181,0.000039,4.916267,1.067916,...,0.000153,0.282164,0.138746,0.001133,0.000002,0.465685,0.148402,0.001070,0.000002,1.0
1FCY.pdb,71.774507,24.813398,0.259079,0.000549,0.280846,1.163933,0.017831,0.000038,4.920244,1.021809,...,0.000150,0.284078,0.135504,0.001128,0.000002,0.465173,0.144941,0.001053,0.000002,1.0
1FCZ.pdb,71.641333,24.768340,0.259057,0.000551,0.261065,1.155173,0.018150,0.000039,4.920575,1.054250,...,0.000156,0.281990,0.132787,0.001143,0.000002,0.464861,0.144760,0.001061,0.000002,1.0
1FM6.pdb,71.937346,27.862871,0.222513,0.000480,0.233965,1.234521,0.016605,0.000036,5.062072,1.139770,...,0.000110,0.288484,0.158782,0.001003,0.000002,0.469322,0.166026,0.000983,0.000002,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6VIF.pdb,74.640365,26.523389,0.201263,0.000428,0.329931,1.266909,0.019409,0.000041,4.847094,0.997561,...,0.000157,0.306519,0.143404,0.001186,0.000003,0.473728,0.149727,0.000969,0.000002,1.0
6W9H.pdb,73.912375,27.576416,0.251568,0.000497,0.373730,1.213114,0.018944,0.000037,4.982009,1.111867,...,0.000128,0.285111,0.158221,0.001019,0.000002,0.464546,0.160485,0.001032,0.000002,1.0
6W9I.pdb,73.984356,27.752762,0.242819,0.000480,0.342279,1.227426,0.018661,0.000037,4.980641,1.120368,...,0.000128,0.285014,0.153838,0.001022,0.000002,0.465244,0.158763,0.000997,0.000002,1.0
6WWZ.pdb,68.924962,18.995717,0.177328,0.000262,0.338022,1.234982,0.010228,0.000015,4.987110,0.841872,...,0.000058,0.268356,0.094176,0.000904,0.000001,0.452957,0.105255,0.000828,0.000001,1.0


In [43]:
gsp_features

Unnamed: 0,molecular_weight_1,molecular_weight_2,molecular_weight_3,molecular_weight_4,hydrophobicity_1,hydrophobicity_2,hydrophobicity_3,hydrophobicity_4,node_degree_1,node_degree_2,...,residue_count_4,clustering_coeff_1,clustering_coeff_2,clustering_coeff_3,clustering_coeff_4,conservation_score_1,conservation_score_2,conservation_score_3,conservation_score_4,class
1ERR.pdb,64.113203,13.390872,0.167218,0.000372,0.293257,1.357203,0.015656,0.000035,3.510459,0.127208,...,0.000163,0.318637,0.087923,0.001029,2.287257e-06,0.406765,0.059607,0.000585,1.299280e-06,0.0
1FTL.pdb,70.173009,24.311121,0.222586,0.000433,0.401456,1.194749,0.014850,0.000029,5.181291,1.162616,...,0.000073,0.277791,0.136593,0.001198,2.330290e-06,0.456963,0.148663,0.001105,2.150493e-06,0.0
1LK2.pdb,73.642242,27.046798,0.214666,0.000392,0.265787,1.218032,0.013091,0.000024,5.061551,1.005204,...,0.000065,0.282789,0.145858,0.000939,1.712731e-06,0.469035,0.154199,0.000866,1.579929e-06,0.0
1NHZ.pdb,74.245257,24.223986,0.210313,0.000440,0.249947,1.223520,0.017202,0.000036,4.778333,0.880368,...,0.000121,0.308139,0.127541,0.001125,2.353066e-06,0.469533,0.131357,0.000908,1.899371e-06,0.0
1PBQ.pdb,70.463795,23.890169,0.217301,0.000397,0.261474,1.227107,0.013249,0.000024,5.112982,1.085919,...,0.000050,0.273369,0.132528,0.001039,1.895173e-06,0.451293,0.141308,0.000989,1.804235e-06,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6UZA.pdb,72.055373,24.290123,0.075449,0.000051,0.495355,1.246897,0.005413,0.000004,4.775746,0.942011,...,0.000026,0.292523,0.128112,0.000338,2.302004e-07,0.459543,0.133805,0.000287,1.951827e-07,0.0
6V4P.pdb,68.387123,24.385745,0.123315,0.000136,0.179745,1.132644,0.007578,0.000008,5.924166,1.008877,...,0.000050,0.259472,0.123318,0.000604,6.686185e-07,0.451771,0.137351,0.000608,6.725464e-07,0.0
6V9S.pdb,73.307653,23.366619,0.111742,0.000111,0.450447,1.148140,0.008392,0.000008,5.093060,1.016212,...,0.000043,0.287075,0.126058,0.000481,4.783200e-07,0.465629,0.136591,0.000459,4.565817e-07,0.0
6W25.pdb,70.903560,23.476416,0.122271,0.000131,0.623548,1.191057,0.008661,0.000009,5.311700,1.112669,...,0.000053,0.277043,0.124538,0.000539,5.765835e-07,0.461191,0.138514,0.000538,5.755512e-07,0.0


In [44]:
gsp_f

Unnamed: 0,molecular_weight_1,molecular_weight_2,molecular_weight_3,molecular_weight_4,hydrophobicity_1,hydrophobicity_2,hydrophobicity_3,hydrophobicity_4,node_degree_1,node_degree_2,...,residue_count_4,clustering_coeff_1,clustering_coeff_2,clustering_coeff_3,clustering_coeff_4,conservation_score_1,conservation_score_2,conservation_score_3,conservation_score_4,class
1ERR.pdb,64.113203,13.390872,0.167218,0.000372,0.293257,1.357203,0.015656,0.000035,3.510459,0.127208,...,0.000163,0.318637,0.087923,0.001029,0.000002,0.406765,0.059607,0.000585,0.000001,0.0
1FTL.pdb,70.173009,24.311121,0.222586,0.000433,0.401456,1.194749,0.014850,0.000029,5.181291,1.162616,...,0.000073,0.277791,0.136593,0.001198,0.000002,0.456963,0.148663,0.001105,0.000002,0.0
1LK2.pdb,73.642242,27.046798,0.214666,0.000392,0.265787,1.218032,0.013091,0.000024,5.061551,1.005204,...,0.000065,0.282789,0.145858,0.000939,0.000002,0.469035,0.154199,0.000866,0.000002,0.0
1NHZ.pdb,74.245257,24.223986,0.210313,0.000440,0.249947,1.223520,0.017202,0.000036,4.778333,0.880368,...,0.000121,0.308139,0.127541,0.001125,0.000002,0.469533,0.131357,0.000908,0.000002,0.0
1PBQ.pdb,70.463795,23.890169,0.217301,0.000397,0.261474,1.227107,0.013249,0.000024,5.112982,1.085919,...,0.000050,0.273369,0.132528,0.001039,0.000002,0.451293,0.141308,0.000989,0.000002,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6VIF.pdb,74.640365,26.523389,0.201263,0.000428,0.329931,1.266909,0.019409,0.000041,4.847094,0.997561,...,0.000157,0.306519,0.143404,0.001186,0.000003,0.473728,0.149727,0.000969,0.000002,1.0
6W9H.pdb,73.912375,27.576416,0.251568,0.000497,0.373730,1.213114,0.018944,0.000037,4.982009,1.111867,...,0.000128,0.285111,0.158221,0.001019,0.000002,0.464546,0.160485,0.001032,0.000002,1.0
6W9I.pdb,73.984356,27.752762,0.242819,0.000480,0.342279,1.227426,0.018661,0.000037,4.980641,1.120368,...,0.000128,0.285014,0.153838,0.001022,0.000002,0.465244,0.158763,0.000997,0.000002,1.0
6WWZ.pdb,68.924962,18.995717,0.177328,0.000262,0.338022,1.234982,0.010228,0.000015,4.987110,0.841872,...,0.000058,0.268356,0.094176,0.000904,0.000001,0.452957,0.105255,0.000828,0.000001,1.0


In [45]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Create a svm Classifier
clf = svm.SVC(kernel='rbf') # Linear Kernel

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=109)

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

from sklearn.metrics import classification_report, confusion_matrix 
print(classification_report(y_test, y_pred)) 

Accuracy: 0.7108433734939759
Precision: 0.7108433734939759
Recall: 1.0
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        48
         1.0       0.71      1.00      0.83       118

    accuracy                           0.71       166
   macro avg       0.36      0.50      0.42       166
weighted avg       0.51      0.71      0.59       166



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
gsp_f

Unnamed: 0,molecular_weight_1,molecular_weight_2,molecular_weight_3,molecular_weight_4,hydrophobicity_1,hydrophobicity_2,hydrophobicity_3,hydrophobicity_4,node_degree_1,node_degree_2,...,residue_count_4,clustering_coeff_1,clustering_coeff_2,clustering_coeff_3,clustering_coeff_4,conservation_score_1,conservation_score_2,conservation_score_3,conservation_score_4,class
1ERR.pdb,64.113203,13.390872,0.167218,0.000372,0.293257,1.357203,0.015656,0.000035,3.510459,0.127208,...,0.000163,0.318637,0.087923,0.001029,0.000002,0.406765,0.059607,0.000585,0.000001,0.0
1FTL.pdb,70.173009,24.311121,0.222586,0.000433,0.401456,1.194749,0.014850,0.000029,5.181291,1.162616,...,0.000073,0.277791,0.136593,0.001198,0.000002,0.456963,0.148663,0.001105,0.000002,0.0
1LK2.pdb,73.642242,27.046798,0.214666,0.000392,0.265787,1.218032,0.013091,0.000024,5.061551,1.005204,...,0.000065,0.282789,0.145858,0.000939,0.000002,0.469035,0.154199,0.000866,0.000002,0.0
1NHZ.pdb,74.245257,24.223986,0.210313,0.000440,0.249947,1.223520,0.017202,0.000036,4.778333,0.880368,...,0.000121,0.308139,0.127541,0.001125,0.000002,0.469533,0.131357,0.000908,0.000002,0.0
1PBQ.pdb,70.463795,23.890169,0.217301,0.000397,0.261474,1.227107,0.013249,0.000024,5.112982,1.085919,...,0.000050,0.273369,0.132528,0.001039,0.000002,0.451293,0.141308,0.000989,0.000002,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6VIF.pdb,74.640365,26.523389,0.201263,0.000428,0.329931,1.266909,0.019409,0.000041,4.847094,0.997561,...,0.000157,0.306519,0.143404,0.001186,0.000003,0.473728,0.149727,0.000969,0.000002,1.0
6W9H.pdb,73.912375,27.576416,0.251568,0.000497,0.373730,1.213114,0.018944,0.000037,4.982009,1.111867,...,0.000128,0.285111,0.158221,0.001019,0.000002,0.464546,0.160485,0.001032,0.000002,1.0
6W9I.pdb,73.984356,27.752762,0.242819,0.000480,0.342279,1.227426,0.018661,0.000037,4.980641,1.120368,...,0.000128,0.285014,0.153838,0.001022,0.000002,0.465244,0.158763,0.000997,0.000002,1.0
6WWZ.pdb,68.924962,18.995717,0.177328,0.000262,0.338022,1.234982,0.010228,0.000015,4.987110,0.841872,...,0.000058,0.268356,0.094176,0.000904,0.000001,0.452957,0.105255,0.000828,0.000001,1.0


In [47]:
path = './pdb_files/ant_real/'
pdbinfo_dict = crawl_pdb(path)
lfc_cutoff = 0.42
model = 'weighted-rig'
# print (lfc_cutoff, end=' : ')
gsp_features = pd.DataFrame(columns=signals_wavelet+['class'])
 
path_files = './pdb_files/ant_real/'
for pdb in pdbinfo_dict.keys():
    row = []
    c=0
    G = get_graph(pdbinfo_dict[pdb]['distance_matrix'], network_type=model, rig_cutoff=7.3)
    for signal_name in signals:
        signal = get_signal(G, pdbinfo_dict[pdb]['sequence'],pdb,signal=signal_name)
        value = get_filtered_signal(G,signal,lfc_cutoff,type_spatial='wavelet')       
        row.extend(value)
    row.append(c)
    gsp_features.loc[pdb] = row
    break



















In [48]:
pdb

'1ERR.pdb'

In [49]:
pdbinfo_dict

{'1ERR.pdb': {<function id(obj, /)>: '1ERR.pdb',
  'coords': matrix([[53.995, 62.069, 72.653],
          [50.596, 62.259, 70.953],
          [51.799, 65.546, 69.479],
          [54.212, 64.025, 66.973],
          [53.347, 64.005, 63.307],
          [53.629, 60.694, 61.475],
          [57.   , 61.809, 60.232],
          [58.238, 63.006, 63.637],
          [57.248, 59.628, 65.129],
          [59.184, 57.774, 62.491],
          [62.423, 59.721, 62.777],
          [62.259, 59.753, 66.577],
          [61.958, 55.967, 66.457],
          [64.721, 55.511, 63.886],
          [67.033, 57.589, 66.105],
          [66.295, 55.442, 69.123],
          [67.392, 52.34 , 67.287],
          [69.755, 50.328, 69.431],
          [73.174, 49.603, 67.97 ],
          [73.891, 46.094, 66.592],
          [76.223, 44.386, 69.099],
          [78.752, 41.596, 68.506],
          [79.214, 38.169, 70.008],
          [82.025, 37.611, 72.521],
          [84.918, 37.609, 70.032],
          [85.796, 34.214, 68.578],
     

In [50]:
gsp_features

Unnamed: 0,molecular_weight_1,molecular_weight_2,molecular_weight_3,molecular_weight_4,hydrophobicity_1,hydrophobicity_2,hydrophobicity_3,hydrophobicity_4,node_degree_1,node_degree_2,...,residue_count_4,clustering_coeff_1,clustering_coeff_2,clustering_coeff_3,clustering_coeff_4,conservation_score_1,conservation_score_2,conservation_score_3,conservation_score_4,class
1ERR.pdb,73.70339,22.779563,0.233207,0.000518,0.449012,1.19651,0.018989,4.2e-05,4.974301,0.923667,...,0.000185,0.296468,0.129666,0.001172,3e-06,0.471706,0.134558,0.001035,2e-06,0.0
