In [1]:
import pandas as pd

df = pd.read_csv("example_dataset.csv")
df.head()

Unnamed: 0,PDB,Epitope
0,5VX4,"A:GLN:148, A:PHE:122, A:ASP:192, A:ASN:125, A:..."
1,5F4J,"A:THR:485, A:LEU:527, A:SER:526, A:SER:519, A:..."
2,4E9O,"X:SER:64, X:LEU:203, X:THR:35, X:ASN:175, X:AS..."
3,6H3S,"B:TYR:612, B:HIS:616, B:THR:582, B:GLY:615, B:..."
4,1OQE,"L:CYS:9, L:HIS:16, L:CYS:20, L:LEU:12, L:VAL:1..."


In [2]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from graphein.protein.edges.distance import add_distance_threshold
from graphein.protein.visualisation import plotly_protein_structure_graph
from functools import partial
from graphein.protein.features.nodes.aaindex import aaindex1
from graphein.protein.features.nodes.amino_acid import amino_acid_one_hot
from graphein.protein.features.nodes.amino_acid import expasy_protein_scale

# edge construction within 5 Angstrome
edge_funcs = {"edge_construction_functions": [partial(add_distance_threshold, long_interaction_threshold=1, threshold=5)]}

node_funcs = {"node_metadata_functions": [expasy_protein_scale, 
                                          amino_acid_one_hot]}

config = ProteinGraphConfig(**edge_funcs, **node_funcs) #**node_funcs

To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
DEBUG:matplotlib:matplotlib data path: /home/sjchoi/anaconda3/envs/torch/lib/python3.8/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/home/sjchoi/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux


DEBUG:matplotlib:CACHEDIR=/home/sjchoi/.cache/matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /home/sjchoi/.cache/matplotlib/fontlist-v330.json
DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.
DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.
To do so, use the following command: conda install -c pytorch3d pytorch3d
To do so, use the following command: conda install -c pytorch3d pytorch3d


In [3]:
graphs_list = []
pdb_dir = "example_pdb"

for pdb in df["PDB"]:
    print("pdb :", pdb)
    g = construct_graph(config=config, pdb_path=f"{pdb_dir}/{pdb.lower()}.pdb")
    graphs_list.append(g)

DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 162 total nodes
DEBUG:graphein.protein.features.nodes.amino_acid:Reading Expasy protein scales from: /home/sjchoi/anaconda3/envs/torch/lib/python3.8/site-packages/graphein/protein/features/nodes/amino_acid_properties.csv
INFO:graphein.protein.edges.distance:Found: 604 distance edges
INFO:graphein.protein.edges.distance:Added 442 distance edges. (162 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 306 total nodes
INFO:graphein.protein.edges.distance:Found: 1090 distance edges
INFO:graphein.protein.edges.distance:Added 784 distance edges. (306 removed by LIN)


pdb : 5VX4
pdb : 5F4J
pdb : 4E9O


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 230 total nodes
INFO:graphein.protein.edges.distance:Found: 848 distance edges
INFO:graphein.protein.edges.distance:Added 618 distance edges. (230 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 816 total nodes
INFO:graphein.protein.edges.distance:Found: 2824 distance edges


pdb : 6H3S


INFO:graphein.protein.edges.distance:Added 2008 distance edges. (816 removed by LIN)


pdb : 1OQE


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 1681 total nodes
INFO:graphein.protein.edges.distance:Found: 6183 distance edges
INFO:graphein.protein.edges.distance:Added 4502 distance edges. (1681 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 568 total nodes
INFO:graphein.protein.edges.distance:Found: 2056 distance edges
INFO:graphein.protein.edges.distance:Added 1488 distance edges. (568 removed by LIN)


pdb : 2J0O
pdb : 1EPW


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 1287 total nodes
INFO:graphein.protein.edges.distance:Found: 4499 distance edges
INFO:graphein.protein.edges.distance:Added 3212 distance edges. (1287 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 446 total nodes
INFO:graphein.protein.edges.distance:Found: 1622 distance edges
INFO:graphein.protein.edges.distance:Added 1176 distance edges. (446 removed by LIN)


pdb : 3BIK
pdb : 6ACC


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 3195 total nodes
INFO:graphein.protein.edges.distance:Found: 11079 distance edges
INFO:graphein.protein.edges.distance:Added 7884 distance edges. (3195 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 126 total nodes
INFO:graphein.protein.edges.distance:Found: 426 distance edges
INFO:graphein.protein.edges.distance:Added 300 distance edges. (126 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 766 total nodes
INFO:graphein.protein.edges.distance:Found: 2728 distance edges


pdb : 2QKH
pdb : 5GSR


INFO:graphein.protein.edges.distance:Added 1962 distance edges. (766 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 244 total nodes
INFO:graphein.protein.edges.distance:Found: 830 distance edges
INFO:graphein.protein.edges.distance:Added 586 distance edges. (244 removed by LIN)


pdb : 1Z92
pdb : 3Q5Y


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 951 total nodes
INFO:graphein.protein.edges.distance:Found: 3431 distance edges
INFO:graphein.protein.edges.distance:Added 2480 distance edges. (951 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 690 total nodes
INFO:graphein.protein.edges.distance:Found: 2438 distance edges
INFO:graphein.protein.edges.distance:Added 1748 distance edges. (690 removed by LIN)


pdb : 1DHK
pdb : 6EY5


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 585 total nodes
INFO:graphein.protein.edges.distance:Found: 2079 distance edges
INFO:graphein.protein.edges.distance:Added 1494 distance edges. (585 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 333 total nodes
INFO:graphein.protein.edges.distance:Found: 1119 distance edges
INFO:graphein.protein.edges.distance:Added 786 distance edges. (333 removed by LIN)


pdb : 6FNZ
pdb : 2KBH


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 66 total nodes
INFO:graphein.protein.edges.distance:Found: 240 distance edges
INFO:graphein.protein.edges.distance:Added 174 distance edges. (66 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 260 total nodes
INFO:graphein.protein.edges.distance:Found: 956 distance edges
INFO:graphein.protein.edges.distance:Added 696 distance edges. (260 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 397 total nodes
INFO:graphein.protein.edges.distance:Found: 1391 distance edges
INFO:graphein.protein.edges.distance:Added 994 distance edges. (397 removed by LIN)


pdb : 4WE2
pdb : 4NFG
pdb : 1DQT


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 468 total nodes
INFO:graphein.protein.edges.distance:Found: 1694 distance edges
INFO:graphein.protein.edges.distance:Added 1226 distance edges. (468 removed by LIN)


pdb : 5J11


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 500 total nodes
INFO:graphein.protein.edges.distance:Found: 1732 distance edges
INFO:graphein.protein.edges.distance:Added 1232 distance edges. (500 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 247 total nodes
INFO:graphein.protein.edges.distance:Found: 823 distance edges
INFO:graphein.protein.edges.distance:Added 576 distance edges. (247 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 253 total nodes
INFO:graphein.protein.edges.distance:Found: 845 distance edges
INFO:graphein.protein.edges.distance:Added 592 distance edges. (253 removed by LIN)


pdb : 5URV
pdb : 6TFB
pdb : 5W0E


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 114 total nodes
INFO:graphein.protein.edges.distance:Found: 370 distance edges
INFO:graphein.protein.edges.distance:Added 256 distance edges. (114 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 508 total nodes
INFO:graphein.protein.edges.distance:Found: 1806 distance edges
INFO:graphein.protein.edges.distance:Added 1298 distance edges. (508 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 425 total nodes


pdb : 4NZQ
pdb : 5NGI


INFO:graphein.protein.edges.distance:Found: 1475 distance edges
INFO:graphein.protein.edges.distance:Added 1050 distance edges. (425 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 464 total nodes
INFO:graphein.protein.edges.distance:Found: 1652 distance edges
INFO:graphein.protein.edges.distance:Added 1188 distance edges. (464 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 604 total nodes
INFO:graphein.protein.edges.distance:Found: 2140 distance edges


pdb : 2EC8
pdb : 6ICC


INFO:graphein.protein.edges.distance:Added 1536 distance edges. (604 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 293 total nodes
INFO:graphein.protein.edges.distance:Found: 1119 distance edges
INFO:graphein.protein.edges.distance:Added 826 distance edges. (293 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 204 total nodes
INFO:graphein.protein.edges.distance:Found: 726 distance edges
INFO:graphein.protein.edges.distance:Added 522 distance edges. (204 removed by LIN)


pdb : 3RKC
pdb : 3VTT
pdb : 6FFY


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 1139 total nodes
INFO:graphein.protein.edges.distance:Found: 3989 distance edges
INFO:graphein.protein.edges.distance:Added 2850 distance edges. (1139 removed by LIN)


pdb : 2KYH


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 147 total nodes
INFO:graphein.protein.edges.distance:Found: 503 distance edges
INFO:graphein.protein.edges.distance:Added 356 distance edges. (147 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 281 total nodes
INFO:graphein.protein.edges.distance:Found: 1011 distance edges
INFO:graphein.protein.edges.distance:Added 730 distance edges. (281 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 884 total nodes


pdb : 3V6B
pdb : 1FYH


INFO:graphein.protein.edges.distance:Found: 2990 distance edges
INFO:graphein.protein.edges.distance:Added 2106 distance edges. (884 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 161 total nodes
INFO:graphein.protein.edges.distance:Found: 555 distance edges
INFO:graphein.protein.edges.distance:Added 394 distance edges. (161 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 301 total nodes
INFO:graphein.protein.edges.distance:Found: 1043 distance edges
INFO:graphein.protein.edges.distance:Added 742 distance edges. (301 removed by LIN)


pdb : 4ZIH
pdb : 6F8P
pdb : 2B4J


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 444 total nodes
INFO:graphein.protein.edges.distance:Found: 1544 distance edges
INFO:graphein.protein.edges.distance:Added 1100 distance edges. (444 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 556 total nodes
INFO:graphein.protein.edges.distance:Found: 2010 distance edges
INFO:graphein.protein.edges.distance:Added 1454 distance edges. (556 removed by LIN)


pdb : 1TFX
pdb : 1M6B


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 1133 total nodes
INFO:graphein.protein.edges.distance:Found: 4119 distance edges
INFO:graphein.protein.edges.distance:Added 2986 distance edges. (1133 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 356 total nodes
INFO:graphein.protein.edges.distance:Found: 1258 distance edges
INFO:graphein.protein.edges.distance:Added 902 distance edges. (356 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 504 total nodes
INFO:graphein.protein.edges.distance:Found: 1786 distance edges
INFO:graphein.protein.edges.distance:Added 1282 distance edges. (504 removed by LIN)


pdb : 3LES
pdb : 5NIU


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 336 total nodes
INFO:graphein.protein.edges.distance:Found: 1196 distance edges
INFO:graphein.protein.edges.distance:Added 860 distance edges. (336 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 467 total nodes
INFO:graphein.protein.edges.distance:Found: 1585 distance edges
INFO:graphein.protein.edges.distance:Added 1118 distance edges. (467 removed by LIN)


pdb : 1DJS
pdb : 3HX4
pdb : 2XJY


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 166 total nodes
INFO:graphein.protein.edges.distance:Found: 552 distance edges
INFO:graphein.protein.edges.distance:Added 386 distance edges. (166 removed by LIN)
DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 1039 total nodes


pdb : 1Z2C


INFO:graphein.protein.edges.distance:Found: 3531 distance edges
INFO:graphein.protein.edges.distance:Added 2492 distance edges. (1039 removed by LIN)


In [4]:
import pickle
with open("./example_graphs_5A.pkl", "wb") as f:
    pickle.dump(graphs_list, f)