In [2]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
import os
import pickle
from collections import defaultdict
import pickle
from os import walk
from numpy import linalg as LA
from collections import defaultdict

# Scripts

In [9]:

def parse_PDB(pdb_path):
    without_chain = False

    with open(pdb_path, "r") as fi:
        id,connect = [],[]
        for ln in fi:
            if ln.startswith("ATOM") or ln.startswith("HETATM"):
                id.append(ln)
            if ln.startswith("CONECT"):
                connect.append(ln)

    count = 0
    seq = {}
    seq["type_atm"], seq["ind"], seq["coords"] = (
        [],
        [],
        [],
    )

    for element in id:
        type_atm = element[77:78].strip().split()[0]
        ind = int(element[6:12].strip().split()[0])
        atom = element[12:17].strip().split()[0]
        x_coord = float(element[30:38].strip().split()[0])
        y_coord = float(element[38:46].strip().split()[0])
        z_coord = float(element[46:54].strip().split()[0])

        coords = np.array([x_coord, y_coord, z_coord])

        seq["type_atm"].append(type_atm)
        seq["ind"].append(int(ind))
        seq["coords"].append(coords)

        count += 1

    return seq["ind"], seq["type_atm"], connect


In [15]:

from sympy import Q


def create_fingerprints_and_adj(connect,tmp_dict,atoms):
    num_residues = len(atoms)
    fingerprints = []
    retval = [[0 for i in range(0, num_residues)] for j in range(0, num_residues)]

    for element in connect:
        #index
        source = int(element[7:11].strip().split()[0])
        targets = element[12:].strip().split()
        targets = tuple([int(t) for t in targets])
        for target in targets:
            retval[source-1][target-1] += 1
            retval[target-1][source-1] += 1

        #change to atom
        source = tmp_dict[source]
        targets = tuple(sorted([tmp_dict[int(t)] for t in targets]))
        fingerprint = (source,targets)
        fingerprints.append(fingerprint_dict[fingerprint])
    retval = np.array(retval)
    n = retval.shape[0]
    adjacency = retval + np.eye(n)
    degree = sum(adjacency)
    d_half = np.sqrt(np.diag(degree))
    d_half_inv = np.linalg.inv(d_half)
    adjacency = np.matmul(d_half_inv, np.matmul(adjacency, d_half_inv))
        
    return np.array(fingerprints),adjacency

In [16]:

def dump_dictionary(dictionary, file_name):
    with open(file_name, "wb") as f:
        pickle.dump(dict(dictionary), f)

# loop

In [17]:
mod_pdb_folder = "mod_pdbs"
mod_fp_folder = "mod_fingerprints"
adj_folder = os.path.join(mod_fp_folder,"adj")
fp_folder = os.path.join(mod_fp_folder,"fingerprint")
mod_pdb_files = [file for file in os.listdir(mod_pdb_folder) if file.endswith(".pdb")]

In [22]:
# transfer protein into fingerprint
fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))
atom_dict = defaultdict(lambda: len(atom_dict))
mod_names, adjacencies, fingerprints = [],[],[]
sample_mod = mod_pdb_files[0]


for sample_mod in mod_pdb_files:
    inchi_key = sample_mod.split(".")[0]
    print(f"processing {inchi_key}")
    pdb_path = os.path.join(mod_pdb_folder, sample_mod)

    ind, atoms, connect = parse_PDB(pdb_path)
    atom = [atom_dict[a] for a in atoms]
    tmp_dict = dict(zip(ind, atom))
    fp,adj = create_fingerprints_and_adj(connect,tmp_dict,atoms)
    mod_names.append(inchi_key)
    fingerprints.append(fp)
    adjacencies.append(adj)
    # transfer protein into adj list 
    np.save(os.path.join(adj_folder,inchi_key),np.array(adj,dtype=object),allow_pickle=True)
    # transfer protein into fingerprint list 
    np.save(os.path.join(fp_folder,inchi_key),np.array(fp,dtype=object),allow_pickle=True)


processing CQECYVMYMPYNBF-UHFFFAOYSA-N
processing RDJXPXHQENRCNG-UHFFFAOYSA-N
processing UTLUMEAXMVYDSF-UHFFFAOYSA-N
processing GLZIDKKPKNREPI-UHFFFAOYSA-N
processing RLRVAHVRDIDRIV-KRWDZBQOSA-N
processing KFYARNZFJDIZMC-CABCVRRESA-N
processing WYYLCRHWQCODSD-UHFFFAOYSA-N
processing MSKVYWUEYDHWBP-MDWZMJQESA-N
processing NDYKJROXBBBTGO-FQEVSTJZSA-N
processing CLQBKVZRHWAIKX-UHFFFAOYSA-N
processing CMABMSIBRBPFRT-UHFFFAOYSA-N
processing DRKGLYHLBGWKHL-UHFFFAOYSA-N
processing HYXMPCIQSWHPTK-UHFFFAOYSA-N
processing DRZSFOQHHTVBQV-VQIMIIECSA-N
processing DBPWSSGDRRHUNT-CEGNMAFCSA-N
processing KAXCHLAOBYVWBT-UHFFFAOYSA-N
processing LCZZWLIDINBPRC-FQEVSTJZSA-N
processing LMYJMGMGHUALSD-UHFFFAOYSA-N
processing ASFCKFRAVIIYOS-UHFFFAOYSA-N
processing NGJVQDDXILMUHF-UHFFFAOYSA-N
processing FJAWIBGKKKXXAL-LLVKDONJSA-N
processing KAHVJMIONISDRY-MRVPVSSYSA-N
processing AKKBBBALMHJWAE-UHFFFAOYSA-N
processing MZWOOQCTMMZMHY-UHFFFAOYSA-N
processing FEQUIPXIENTMJN-SFHVURJKSA-N
processing OHBNIJCKIIYSBQ

In [26]:
adjacencies = np.array(adjacencies, dtype=object)
fingerprints = np.array(fingerprints, dtype=object)

In [27]:
np.save(os.path.join(mod_fp_folder, "mod_names.npy"), np.array(mod_names), allow_pickle=True)
np.save(os.path.join(mod_fp_folder, "mod_adjacencies.npy"), np.array(adjacencies), allow_pickle=True)
np.save(os.path.join(mod_fp_folder, "mod_fingerprints.npy"), np.array(fingerprints), allow_pickle=True)

In [28]:
dump_dictionary(fingerprint_dict,os.path.join(mod_fp_folder,"mod_fingerprint_dict.pickle"))

# debug

In [54]:
test_adj = np.load("protein_fingerprints/old/protein_adjacencies.npy",allow_pickle=True)


In [57]:
np.save("test_adj.npy",adjacecnies)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (361,) + inhomogeneous part.

In [70]:
test_dict = np.load(os.path.join(protein_fp_folder,"fingerprint_dict.pickle"),allow_pickle=True)

FileNotFoundError: [Errno 2] No such file or directory: 'protein_fingerprints/fingerprint_dict.pickle'

In [19]:
print("threshold = 7")
len(fingerprint_dict)

threshold = 7


59238

In [16]:
fingerprints[200]

array([ 2937,  6674,   729,   503,  2397,  2375,  1803,  1861,  4245,
        1861,  5642,   503,   608,  7253,  1226,  1307,  1327,  4335,
        5064,  5648, 42649,  1828, 10680,   392, 17999,  2822,  4584,
       17663,  1442, 16858,  8057,  4992, 42650,  5542, 21056, 42651,
       42652, 42653, 11230, 20433, 42654,  1236, 42655,  4803,  1302,
        4164,  4548,  1303, 10063, 42656,  1255,  2939,  4965,  8998,
        8997, 42657,   452,  8405, 39169, 16114, 42658, 42659, 42660,
       42661, 42662, 42663, 42664, 42665, 42666, 42667, 11562,    79,
         824, 42668, 19181, 11517, 42669, 42670, 42671, 42672, 42673,
       42674, 42675, 38748,  2454, 42676, 42677, 42678, 42679, 42680,
       42681, 42682, 42683, 42684, 42685, 42686,  5632, 21160, 42687,
       42688, 42689, 42690, 42691, 42692, 42693, 42694, 42695, 42696,
       42697, 42698, 42699,   522, 10898,   503,  2527, 42700, 42701,
       42702, 42703, 42704, 42705, 42706, 15948, 22292, 42313,   503,
       11543, 42314,

In [24]:
# test_dict = np.load(os.path.join(protein_fp_folder,"protein_fingerprints.npy"),allow_pickle=True)

In [29]:
test_dict = np.load("protein_fingerprints/fingerprint_dict.pickle",allow_pickle=True)

In [30]:
test_dict

{(0, (2,)): 0,
 (0, (2, 2)): 1,
 (1, (2, 2)): 2,
 (2, (2, 2, 2)): 3,
 (2, (2,)): 4,
 (2, (2, 2)): 5,
 (1, (2, 2, 2)): 6,
 (1, (2,)): 7,
 (2, (2, 2, 2, 2)): 8,
 (3, (0, 0, 0, 0)): 9,
 (3, (1, 2)): 10,
 (3, (2, 2)): 11,
 (1, (1, 2, 2)): 12,
 (4, (2,)): 13,
 (0, (1, 2)): 14,
 (1, (2, 2, 5)): 15,
 (3, (2,)): 16,
 (6, (2,)): 17,
 (7, (2,)): 18,
 (8, (2,)): 19,
 (1, (2, 2, 2, 2)): 20,
 (1, (1,)): 21,
 (1, (1, 1, 2)): 22,
 (0, (1,)): 23,
 (0, (1, 1)): 24,
 (9, (0, 0, 0, 2)): 25,
 (9, (2,)): 26,
 (3, (1, 1)): 27,
 (9, (0, 0, 0, 0)): 28,
 (9, (0,)): 29,
 (1, (1, 2)): 30,
 (3, (3, 2)): 31,
 (3, (0, 2)): 32,
 (1, (1, 1)): 33,
 (3, (0, 0, 2, 2)): 34,
 (9, (1,)): 35,
 (3, (0,)): 36,
 (1, (1, 1, 1, 2)): 37,
 (3, (3,)): 38,
 (3, (0, 0, 1, 2)): 39,
 (3, (0, 0, 0, 2)): 40,
 (3, (9, 9)): 41,
 (9, (0, 0, 0)): 42}

In [10]:
prots = []
protein_folder = "protein_pdbs"
protein_fingerprints_folder = "protein_fingerprints"
for file in os.listdir(protein_folder):

SyntaxError: incomplete input (4064245176.py, line 4)