In [65]:
import sys
import numpy as np

sys.path.append("./src/")
from pdb_numpy import Coor, Model


In [66]:
import shlex
from collections import OrderedDict

def parse_raw_mmcif_lines(mmcif_lines):
    """Parse raw mmCIF lines into a nested dictionary.

    Each line is dispatched on its prefix:

    * ``#`` closes the current ``loop_`` table.
    * ``loop_`` opens a table; the ``_category.attribute`` lines that follow
      declare its columns and the remaining lines are its rows.
    * ``_category.attribute value`` outside a table stores a single value.
    * Any other non-blank line outside a table is stored as the value of the
      most recent ``_category.attribute`` (value written on its own line).
      Before any attribute has been seen, such a line (normally
      ``data_XXXX``) is stored in ``result["title"]["title"]``.

    Lines are tokenised with ``shlex.split(line, posix=False)``, so quoted
    values keep their surrounding quotes. Semicolon-delimited multi-line text
    fields get no special treatment.

    Parameters
    ----------
    mmcif_lines : list of str
        Lines of an mmCIF file.

    Returns
    -------
    collections.OrderedDict
        Maps each category name (leading underscore included) to either

        * an ``OrderedDict`` ``{attribute: value}`` for single-value
          categories, or
        * a dict ``{'col_names': [...], 'value': [[...], ...]}`` for ``loop_``
          tables, where ``value[k]`` is the list of strings of column ``k``.

    """

    data_mmCIF = OrderedDict()
    tabular = False
    # Tokens of a table row that is wrapped over several physical lines.
    pending_token = []

    category = "title"
    attribute = "title"

    for line in mmcif_lines:

        if line.startswith("#"):
            tabular = False
            pending_token = []

        elif line.startswith("loop_"):
            tabular = True
            pending_token = []

        elif line.startswith("_"):
            token = shlex.split(line, posix=False)
            category, attribute = token[0].split(".")

            if tabular:
                if category not in data_mmCIF:
                    data_mmCIF[category] = {'col_names': [], 'value': []}
                data_mmCIF[category]['col_names'].append(attribute)
                data_mmCIF[category]['value'].append([])
                pending_token = []
            else:
                if category not in data_mmCIF:
                    data_mmCIF[category] = OrderedDict()
                # With a single token the value sits on the next line and is
                # picked up by the final ``else`` branch below.
                if len(token) == 2:
                    data_mmCIF[category][attribute] = token[1]

        elif tabular:
            token = shlex.split(line, posix=False)
            n_col = len(data_mmCIF[category]['col_names'])

            if len(token) == n_col:
                # A full row on one line; a partial row still pending at this
                # point cannot be completed any more, so it is dropped.
                pending_token = []
            else:
                pending_token += token
                if len(pending_token) < n_col:
                    # Row still incomplete, wait for the next line.
                    continue
                if len(pending_token) > n_col:
                    # Too many tokens for one row: malformed, drop it so the
                    # following rows are still parsed.
                    pending_token = []
                    continue
                # Flush the wrapped row as soon as it is complete.
                token = pending_token
                pending_token = []

            for column, value in zip(data_mmCIF[category]['value'], token):
                column.append(value)

        else:
            token = shlex.split(line, posix=False)
            if not token:
                # Blank line: nothing to store.
                continue
            if category not in data_mmCIF:
                data_mmCIF[category] = OrderedDict()
            data_mmCIF[category][attribute] = token[0]

    return data_mmCIF
         

In [79]:

def parse_mmcif_lines(self, mmcif_lines):
    """Parse mmCIF lines and append one Model per model number to ``self.models``.

    The ``_atom_site`` table is read with ``parse_raw_mmcif_lines``, converted
    column by column to numpy arrays and split on ``pdbx_PDB_model_num``.

    Parameters
    ----------
    self : Coor
        Coor object
    mmcif_lines : list
        list of mmCIF lines

    Returns
    -------
    None
        ``self.models`` is extended with one ``Model`` per model number. Each
        ``Model.atom_dict`` holds the keys ``field``, ``num_resid_uniqresid``,
        ``name_resname``, ``alterloc_chain_insertres``, ``xyz`` and
        ``occ_beta``.

    Raises
    ------
    AssertionError
        If the ``id`` column is not 1, 2, 3, ... over the whole table.
    KeyError, ValueError
        If the ``_atom_site`` table or one of the required columns is missing.

    """
    # Local import: no module-level ``logger`` is defined next to this function.
    import logging

    atom_site = parse_raw_mmcif_lines(mmcif_lines)['_atom_site']

    def column(name):
        """Return the raw string values of the ``_atom_site.<name>`` column."""
        return atom_site['value'][atom_site['col_names'].index(name)]

    # int32 rather than int16: atom ids of large entries exceed 32767.
    model_array = np.array(column('pdbx_PDB_model_num')).astype(np.int32)
    model_list = np.unique(model_array)

    # field list: first letter of group_PDB
    field_array = np.array([field[0] for field in column('group_PDB')], dtype="|S1")

    # "num_resid_uniqresid"
    num_array = np.array(column('id')).astype(np.int32)
    # check that num_array is consecutive (Maybe useless)
    assert np.array_equal(num_array, np.arange(1, len(num_array) + 1)), "Atom numbering is not consecutive"

    resid_array = np.array(column('auth_seq_id')).astype(np.int32)
    # Index of each residue number among the sorted distinct residue numbers.
    # NOTE(review): only auth_seq_id is used, so residues sharing a number
    # (e.g. in different chains) get the same unique index - confirm intended.
    uniq_resid_array = np.unique(resid_array, return_inverse=True)[1]

    num_resid_uniqresid_array = np.column_stack((num_array, resid_array, uniq_resid_array))

    # "name_resname"
    name_array = np.array(column('label_atom_id'), dtype="|S4")
    resname_array = np.array(column('label_comp_id'), dtype="|S4")

    # NOTE(review): the third column is the integer unique residue index stacked
    # next to two byte-string columns - confirm this is the intended content.
    name_resname_array = np.column_stack((name_array, resname_array, uniq_resid_array))

    # "alterloc_chain_insertres" (each value truncated to one byte by "|S1")
    alterloc_array = np.array(column('label_alt_id'), dtype="|S1")
    chain_array = np.array(column('label_asym_id'), dtype="|S1")
    insertres_array = np.array(column('pdbx_PDB_ins_code'), dtype="|S1")

    alterloc_chain_insertres_array = np.column_stack((alterloc_array, chain_array, insertres_array))

    # "xyz"
    x_array = np.array(column('Cartn_x')).astype(np.float32)
    y_array = np.array(column('Cartn_y')).astype(np.float32)
    z_array = np.array(column('Cartn_z')).astype(np.float32)

    xyz_array = np.column_stack((x_array, y_array, z_array))

    # "occ_beta"
    occ_array = np.array(column('occupancy')).astype(np.float32)
    beta_array = np.array(column('B_iso_or_equiv')).astype(np.float32)

    occ_beta_array = np.column_stack((occ_array, beta_array))

    # Need to extract atom symbols ?

    for model in model_list:
        # Boolean mask selecting the rows of the current model.
        model_index = (model_array == model)

        local_model = Model()
        local_model.atom_dict = {
            "field": field_array[model_index],
            "num_resid_uniqresid": num_resid_uniqresid_array[model_index],
            "name_resname": name_resname_array[model_index],
            "alterloc_chain_insertres": alterloc_chain_insertres_array[model_index],
            "xyz": xyz_array[model_index],
            "occ_beta": occ_beta_array[model_index],
        }

        # Compare with the previously stored model as soon as there is one, so
        # that the second model is checked against the first as well.
        if len(self.models) > 0 and local_model.len != self.models[-1].len:
            logging.getLogger(__name__).warning(
                f"The atom number is not the same in the model {len(self.models)-1} and the model {len(self.models)}."
            )

        self.models.append(local_model)

Coor.parse_mmcif_lines = parse_mmcif_lines

In [80]:
# Check how column_stack pairs two 1-D arrays: row k is (a[k], b[k]).
a = np.arange(4)
b = np.arange(6, 10)

tmp = np.column_stack([a, b])
tmp[0]

array([0, 6])

In [81]:
# Read the 2rri mmCIF test file as a list of lines (line endings kept).
test_mmcif = "src/pdb_numpy/tests/input/2rri.cif"
with open(test_mmcif) as f:
    mmcif_lines = list(f)

In [82]:
# Build an empty Coor and fill it from the mmCIF lines through the
# parse_mmcif_lines function attached to Coor above.
coor_1 = Coor()

coor_1.parse_mmcif_lines(mmcif_lines)

In [83]:


def get_PDB_mmcif(self, pdb_ID):
    """Download an mmCIF file from the RCSB PDB and return it parsed.

    Parameters
    ----------
    self : object
        Not read by this function; kept so the signature matches a method.
    pdb_ID : str
        pdb ID

    Returns
    -------
    dict
        The nested dictionary produced by ``parse_raw_mmcif_lines`` for the
        downloaded file.

    :Example:
    >>> mmcif_dict = get_PDB_mmcif(None, '1y0m')
    """
    # Imported here so the function does not depend on a later cell having
    # imported urllib.request first.
    import urllib.request

    # Get the mmCIF file from the PDB (over HTTPS):
    with urllib.request.urlopen(
        f"https://files.rcsb.org/download/{pdb_ID}.cif"
    ) as response:
        # keepends=True: the parser receives lines with their line endings,
        # like the output of file.readlines().
        pdb_lines = response.read().decode("utf-8").splitlines(keepends=True)

    mmcif_dict = parse_raw_mmcif_lines(pdb_lines)

    return mmcif_dict


In [221]:
import urllib.request

# get_PDB_mmcif never reads its first argument, so a placeholder (1) is
# passed in place of `self`.
mmcif_dict = get_PDB_mmcif(1, '1y0m')

In [231]:
# Sorted list of the distinct model numbers found in the _atom_site table.
index_model = test['_atom_site']['col_names'].index('pdbx_PDB_model_num')
model_list = sorted(int(i) for i in set(test['_atom_site']['value'][index_model]))
len(model_list)

20

In [233]:
# Column names of the _atom_site table, in the order they were parsed.
test['_atom_site']['col_names']

['group_PDB',
 'id',
 'type_symbol',
 'label_atom_id',
 'label_alt_id',
 'label_comp_id',
 'label_asym_id',
 'label_entity_id',
 'label_seq_id',
 'pdbx_PDB_ins_code',
 'Cartn_x',
 'Cartn_y',
 'Cartn_z',
 'occupancy',
 'B_iso_or_equiv',
 'pdbx_formal_charge',
 'auth_seq_id',
 'auth_comp_id',
 'auth_asym_id',
 'auth_atom_id',
 'pdbx_PDB_model_num']

In [232]:
# Display the sorted model numbers.
model_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [222]:
# Check that the downloaded entry's _atom_site table has a model-number column.
'pdbx_PDB_model_num' in mmcif_dict["_atom_site"]['col_names']

True

In [3]:
# NOTE(review): the next visible cell reassigns test_mmcif to the 2rri file
# before anything reads this value.
test_mmcif = "src/pdb_numpy/tests/input/1y0m.cif"

In [5]:
# Read the 2rri mmCIF test file as a list of lines (line endings kept).
test_mmcif = "src/pdb_numpy/tests/input/2rri.cif"
with open(test_mmcif) as f:
    mmcif_lines = list(f)

In [6]:
# Parse the raw lines into the nested category dictionary inspected by the
# other cells through the name `test`.
test = parse_raw_mmcif_lines(mmcif_lines)

In [225]:
# Distinct model-number strings present in the _atom_site table (values are
# still strings at this stage, hence the lexicographic display order).
index_model = test['_atom_site']['col_names'].index('pdbx_PDB_model_num')
set(test['_atom_site']['value'][index_model])

{'1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '20',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9'}

In [200]:

def get_mmcif_string_from_dict(mmcif_dict):
    """Return a mmcif dict as a mmcif string.

    Parameters
    ----------
    mmcif_dict : dict
        mmcif dict: maps ``'title'`` to ``{'title': str}`` and every other
        category either to ``{attribute: value}`` or, for a ``loop_`` table,
        to ``{'col_names': [...], 'value': [[...], ...]}`` where ``value[k]``
        lists the strings of column ``k``.

    Returns
    -------
    str
        mmcif dict as a mmcif string. Every category other than ``'title'``
        is preceded by a ``"# "`` line and the string ends with one. Loop
        columns are left-aligned to their widest value. Categories and loops
        without any entry are written with their header lines only.

    """

    # Collect the pieces and join once at the end instead of growing a string.
    chunks = []

    for category, content in mmcif_dict.items():
        if category == 'title':
            chunks.append(f"{content['title']}\n")
            continue

        chunks.append("# \n")

        if 'col_names' in content:
            col_names = content['col_names']
            columns = content['value']
            n_col = len(col_names)

            chunks.append("loop_\n")
            chunks.extend(f"{category}.{col_name} \n" for col_name in col_names)

            # default=0 keeps a table without rows from raising in max().
            raw_width = [
                max((len(value) for value in columns[j]), default=0)
                for j in range(n_col)
            ]
            n_row = len(columns[0]) if columns else 0

            for i in range(n_row):
                row = "".join(
                    f"{columns[j][i]:{raw_width[j]}} " for j in range(n_col)
                )
                chunks.append(row + "\n")
        else:
            # Pad every "category.attribute" name to the longest one plus a margin.
            max_len = (
                max((len(attribute) for attribute in content), default=0)
                + len(category)
                + 3
            )
            for attribute, value in content.items():
                name = '.'.join([category, attribute])
                local_str = f"{name:{max_len}} {value} \n"
                # Lines that would be too long put the value on its own line.
                if len(local_str) > 125:
                    chunks.append(f"{name:{max_len}} \n{value} \n")
                else:
                    chunks.append(local_str)

    chunks.append("# \n")

    return "".join(chunks)


In [201]:
# Lists have no .join(); joining is a str method: '.'.join(['1', '2']).
['1', '2'].join('.')

AttributeError: 'list' object has no attribute 'join'

In [202]:
# Attribute names stored for the _pdbx_database_related category.
test['_pdbx_database_related'].keys()

odict_keys(['db_id', 'db_name', 'content_type', 'details'])

In [203]:
# max(..., key=len) returns the longest string itself, so `max_len` holds a
# string here; len() then gives the column width.
max_len = max(test['_database_2']['value'][0], key=len)
len(max_len)

5

In [204]:
# Nested replacement field: the width in the format spec is taken from len_str.
tmp = 'test'
len_str = 6
print(f"#{tmp:{len_str}}#")

#test  #


In [205]:
# The writer is defined as get_mmcif_string_from_dict; calling the old name
# get_mmcif_string fails on a fresh kernel.
print(get_mmcif_string_from_dict(test))

data_2RRI
# 
_entry.id   2RRI 
# 
_audit_conform.dict_name       mmcif_pdbx.dic 
_audit_conform.dict_version    5.279 
_audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx.dic 
# 
loop_
_database_2.database_id 
_database_2.database_code 
PDB   2RRI         
RCSB  RCSB150199   
BMRB  11420        
WWPDB D_1000150199 
# 
_pdbx_database_related.db_id          11420 
_pdbx_database_related.db_name        BMRB 
_pdbx_database_related.content_type   unspecified 
_pdbx_database_related.details        . 
# 
_pdbx_database_status.deposit_site                    BMRB 
_pdbx_database_status.entry_id                        2RRI 
_pdbx_database_status.process_site                    PDBJ 
_pdbx_database_status.recvd_initial_deposition_date   2010-12-21 
_pdbx_database_status.SG_entry                        ? 
_pdbx_database_status.status_code                     REL 
_pdbx_database_status.status_code_mr                  REL 
_pdbx_database_status.status_code_sf         

In [163]:
# Length of a sample attribute line, to compare with the 125-character limit
# used by the mmCIF writer when deciding to move a value to its own line.
len("_pdbx_nmr_sample_details.contents         '0.5mM [U-13C; U-15N] entity; 1% DPC; 50mM potassium phosphate; 90% H2O/10% D2O' ")

123

In [130]:
# NOTE(review): this profile was recorded when parse_mmcif_lines took only
# `mmcif_lines` (see the signature in the recorded output). With the
# definitions in this notebook that signature belongs to
# parse_raw_mmcif_lines - confirm which function should be profiled.
%lprun -f parse_mmcif_lines parse_mmcif_lines(mmcif_lines)

Timer unit: 1e-09 s

Total time: 1.33601 s
File: /tmp/ipykernel_673/3205869451.py
Function: parse_mmcif_lines at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
     4                                           def parse_mmcif_lines(mmcif_lines):
     5                                               """Parse the mmcif lines and return atom information as a dictionary
     6                                           
     7                                               Parameters
     8                                               ----------
     9                                               self : Coor
    10                                                   Coor object
    11                                               mmcif_lines : list
    12                                                   list of pdb lines
    13                                           
    14                                               Returns
    15                                 

In [25]:
# NOTE(review): parse_mmcif_lines as defined in this notebook takes
# (self, mmcif_lines); this one-argument call presumably dates from an earlier
# definition - confirm which function should be timed.
%timeit parse_mmcif_lines(mmcif_lines)

81.2 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
sys.path.append("./src/")
from pdb_numpy import Coor
from pdb_numpy.tests.datafiles import PDB_1Y0M

# Time building a Coor from the PDB_1Y0M test datafile.
%timeit Coor(PDB_1Y0M)

2.52 ms ± 40.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [1]:
import sys
import numpy as np

sys.path.append("./src/")

from importlib import reload
import pdb_numpy
reload(pdb_numpy)

<module 'pdb_numpy' from './src/pdb_numpy/__init__.py'>

In [2]:
# NOTE(review): this path points into a user-specific pytest temporary
# directory, so the cell presumably only runs on the machine and session that
# produced that file.
test = pdb_numpy.Coor('../../../../../tmp/pytest-of-murail/pytest-56/test_read_write_pdb_models0/test_2rri.pdb')
#test.crystal_pack.strip()

In [3]:
# Keys of the atom dictionary of the first model.
test.models[0].atom_dict.keys()

dict_keys(['field', 'num_resid_uniqresid', 'name_resname_elem', 'alterloc_chain_insertres', 'xyz', 'occ_beta'])

In [4]:
# Load the 2rri mmCIF test file (presumably the same entry as `test`) so the
# two parsers can be compared.
test2 = pdb_numpy.Coor('src/pdb_numpy/tests/input/2rri.cif')


In [5]:
# Compare, model by model, the arrays parsed from the PDB file (`test`) with
# those parsed from the mmCIF file (`test2`).
for model, model_2 in zip(test.models, test2.models):
    for key in model.atom_dict:
        print(key)
        if key == "num_resid_uniqresid":
            # Atom index can differ, so only the remaining columns are compared.
            print(model.atom_dict[key][:20])
            print(model_2.atom_dict[key][:20])
            assert (
                model.atom_dict[key][:, 1:]
                == model_2.atom_dict[key][:, 1:]
            ).all()
        else:
            # Show the first entry of both sides (the second print used to
            # repeat `model` instead of showing `model_2`).
            print(model.atom_dict[key][0])
            print(model_2.atom_dict[key][0])
            assert (
                model.atom_dict[key] == model_2.atom_dict[key]
            ).all()

field
b'A'
b'A'
num_resid_uniqresid
[[ 1  1  0]
 [ 2  1  0]
 [ 3  1  0]
 [ 4  1  0]
 [ 5  1  0]
 [ 6  1  0]
 [ 7  1  0]
 [ 8  1  0]
 [ 9  1  0]
 [10  1  0]
 [11  1  0]
 [12  1  0]
 [13  1  0]
 [14  1  0]
 [15  1  0]
 [16  1  0]
 [17  1  0]
 [18  1  0]
 [19  2  1]
 [20  2  1]]
[[ 1  1  0]
 [ 2  1  0]
 [ 3  1  0]
 [ 4  1  0]
 [ 5  1  0]
 [ 6  1  0]
 [ 7  1  0]
 [ 8  1  0]
 [ 9  1  0]
 [10  1  0]
 [11  1  0]
 [12  1  0]
 [13  1  0]
 [14  1  0]
 [15  1  0]
 [16  1  0]
 [17  1  0]
 [18  1  0]
 [19  2  1]
 [20  2  1]]
name_resname_elem
[b'N' b'HIS' b'N']
[b'N' b'HIS' b'N']
alterloc_chain_insertres
[b'' b'A' b'']
[b'' b'A' b'']
xyz
[-11.432  14.757 -14.63 ]
[-11.432  14.757 -14.63 ]
occ_beta
[1. 0.]
[1. 0.]
field
b'A'
b'A'
num_resid_uniqresid
[[ 1  1  0]
 [ 2  1  0]
 [ 3  1  0]
 [ 4  1  0]
 [ 5  1  0]
 [ 6  1  0]
 [ 7  1  0]
 [ 8  1  0]
 [ 9  1  0]
 [10  1  0]
 [11  1  0]
 [12  1  0]
 [13  1  0]
 [14  1  0]
 [15  1  0]
 [16  1  0]
 [17  1  0]
 [18  1  0]
 [19  2  1]
 [20  2  1]]
[[480   1   1

AssertionError: 

In [12]:
# dtype of the coordinate array of the first model.
test.models[0].atom_dict['xyz'].dtype

dtype('float32')