This code parses an SDF data file and builds a DataFrame by extracting text and numeric fields from it.

####  About BindingDB https://www.bindingdb.org/rwd/bind/chemsearch/marvin/Download.jsp  
It is a database of experimentally determined protein-ligand binding affinities, currently containing binding affinities for about 20,000 protein-ligand pairs from 110 proteins and about 11,000 small molecules.

In [1]:
# The following code reads through the data. For each protein-ligand pair it reads 'PubChem SID',
# 'BindingDB Target Chain Sequence', 'Number of Protein Chains in Target', and '<Kd'. Note that 'PubChem SID' is an
# ID number for a ligand, which we will use later to get its SMILES.
# The code reads the file line by line, looks for these key words/phrases, and reads
# the relevant data from the line immediately following each one.

In [2]:
import numpy as np
import pandas as pd
import re 

In [3]:
# Path of the SDF file that the cells below open and parse.
file_path = "BindingDB_All_3D.sdf"

# Empty result table; process_chunk() appends one row at a time to it.
df = pd.DataFrame(
    columns=[
        "kd",
        "ligand_cid",
        "no_of_complexes",
        "protein_seq",
    ],
)
print(df)

Empty DataFrame
Columns: [kd, ligand_cid, no_of_complexes, protein_seq]
Index: []


In [None]:
# Size hint passed to file.readlines() for every read: 1 GB.
chunk_size = 1024 ** 3

# Fields gathered so far for the row currently being assembled.
row_dict = {}

# One flag per extracted field. process_chunk() sets a flag when it sees that
# field's header line and clears it after consuming the following value line.
flag_lig = False
flag_pr = False
flag_noc = False
flag_kd = False

# An SDF record is terminated by a line consisting of exactly "$$$$".
_RECORD_DELIMITER = "$$$$"


def _parse_number(text, cast):
    """Return ``cast(text.strip())``, or NaN when *text* is not a valid number."""
    try:
        return cast(text.strip())
    except ValueError:
        # Empty or non-numeric values (e.g. ">1000") are recorded as missing.
        return np.nan


def process_chunk(chunk):
    """Parse a batch of SDF text lines and append completed rows to ``df``.

    Every line is checked for one of four field-header phrases.  When a
    header is found, the line that follows it is taken as that field's
    value.  Values accumulate in the module-level ``row_dict``; as soon as
    all four fields are present the row is appended to the module-level
    ``df`` and ``row_dict`` is emptied.

    Parser state (``row_dict`` and the four ``flag_*`` variables) is kept at
    module level so that a record, or a header/value pair, that is split
    across two consecutive calls is still parsed correctly.

    A ``$$$$`` line (the SDF end-of-record marker) discards any partially
    collected fields and pending flags.  Without this, a record that repeats
    one of the fields or lacks one would leak values into the next record
    and misalign every following row.

    Args:
        chunk: Iterable of text lines, with or without trailing newlines.

    Returns:
        None.  Results are written to the global ``df``.
    """
    global row_dict, flag_lig, flag_pr, flag_noc, flag_kd

    for line in chunk:
        line = line.rstrip()

        # End of record: never carry half-filled data into the next record.
        if line == _RECORD_DELIMITER:
            row_dict = {}
            flag_lig = flag_pr = flag_noc = flag_kd = False
            continue

        # Header lines only arm a flag; the value is on the following line.
        if 'PubChem SID' in line:
            flag_lig = True
            continue
        if 'BindingDB Target Chain Sequence' in line:
            flag_pr = True
            continue
        if 'Number of Protein Chains in Target' in line:
            flag_noc = True
            continue
        if '<Kd' in line:
            flag_kd = True
            continue

        # Value line: consume it for whichever field is currently armed.
        if flag_lig:
            row_dict['ligand_cid'] = _parse_number(line, int)
            flag_lig = False
        elif flag_pr:
            row_dict['protein_seq'] = line.strip()
            flag_pr = False
        elif flag_noc:
            row_dict['no_of_complexes'] = _parse_number(line, int)
            flag_noc = False
        elif flag_kd:
            row_dict['kd'] = _parse_number(line, float)
            flag_kd = False

        # All four fields collected: emit the row and start a fresh one.
        if len(row_dict) == 4:
            df.loc[len(df.index)] = row_dict
            row_dict = {}

c = 0  # number of chunks processed so far, printed to monitor progress

with open(file_path, 'r', encoding="utf8") as file:
    while True:
        # readlines(hint) already returns a list of whole lines whose total
        # size is roughly chunk_size, so no extra list() copy is needed.
        lines = file.readlines(chunk_size)

        # An empty list means the end of the file has been reached.
        if not lines:
            break
        c += 1
        # Parse this batch; completed rows are appended to df.
        process_chunk(lines)
        print(c)
        # info() prints its report itself and returns None, so it is called
        # directly rather than wrapped in print().
        df.info()
        # Checkpoint the rows parsed so far after every chunk.
        df.to_csv('df_raw.csv', index = False)

print(df.shape)
df.info()

1
<class 'pandas.core.frame.DataFrame'>
Index: 134450 entries, 0 to 134449
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   kd               3438 non-null    float64
 1   ligand_cid       134450 non-null  int64  
 2   no_of_complexes  134450 non-null  int64  
 3   protein_seq      134450 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 5.1+ MB
None
2
<class 'pandas.core.frame.DataFrame'>
Index: 259598 entries, 0 to 259597
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   kd               4498 non-null    float64
 1   ligand_cid       259598 non-null  int64  
 2   no_of_complexes  259598 non-null  int64  
 3   protein_seq      259598 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 9.9+ MB
None
3
<class 'pandas.core.frame.DataFrame'>
Index: 381972 entries, 0 to 381971
Data columns 

In [None]:
# Write the table to CSV without the index column; this uses the same
# file name as the per-chunk checkpoint written by the read loop.
df.to_csv(
    'df_raw.csv',
    index=False,
)