### extract data from mmCIF files for summary dataframe

In [1]:
from Bio.PDB import PDBList                  # fetches/saves PDB data
from Bio.PDB.MMCIF2Dict import MMCIF2Dict    # parses data in mmCIF files
import pandas as pd
import os

In [3]:
# define CIF tokens to process. some are repeated in the asu and the assembly, some are not, 
# as the assembly can have fewer instances of entities. also, some others are not present in assembly, 
# like title and id, src, etc. so we need to grab some from asu and some from assembly.
# key is name of field (column) in database (dataframe), and value is the CIF token.

# tokens from asu entry are all single element entries
asuTokens = { 'pdbid': '_entry.id',\
            'date': '_pdbx_database_status.recvd_initial_deposition_date',\
            'method': '_exptl.method',\
            'title': '_struct.title',\
            'gene': '_entity_src_gen.pdbx_gene_src_gene',\
            'species': '_entity_src_gen.pdbx_gene_src_scientific_name',\
            'keywords': '_struct_keywords.pdbx_keywords',\
            'text': '_struct_keywords.text'
          }
# all keys from assembly are lists of multiple entries
assemblyTokens = {'polyid': '_entity_poly.entity_id',\
                'polytype': '_entity_poly.type',\
                'seq': '_entity_poly.pdbx_seq_one_letter_code_can',\
                'polystrand': '_entity_poly.pdbx_strand_id',\
                'entityid': '_entity.id',\
                'entitytype': '_entity.type',\
                'descr': '_entity.pdbx_description',\
                'MW': '_entity.formula_weight',\
                'number': '_entity.pdbx_number_of_molecules'
               }

In [5]:
# define pdb code, data directories. load and parse both asu and assembly file
pdbCode           = '4nm6'
asuDirectory      = 'asu'
assemblyDirectory = 'assembly'
pdblist = PDBList()
pdblist.retrieve_pdb_file(pdbCode,pdir=asuDirectory,file_format='mmCif')
pdblist.retrieve_assembly_file(pdbCode,1,pdir=assemblyDirectory,file_format='mmCif')
asucif       = MMCIF2Dict(asuDirectory+'/'+pdbCode+'.cif')
assemblycif  = MMCIF2Dict(assemblyDirectory+'/'+pdbCode+'-assembly1.cif')

Structure exists: 'asu/4nm6.cif' 
Structure exists: 'assembly/4nm6-assembly1.cif' 


In [7]:
# create the asuDict and the assemblyDict
asuDict = {}
for k,v in asuTokens.items():
    asuDict[k] = asucif[v]
    print(k,asuDict[k])
assemblyDict = {}
for k,v in assemblyTokens.items():
    assemblyDict[k] = assemblycif[v]
    print(k,assemblyDict[k])


pdbid ['4NM6']
date ['2013-11-14']
method ['X-RAY DIFFRACTION']
title ['Crystal structure of TET2-DNA complex']
gene ['TET2, KIAA1546, Nbla00191', 'TET2, KIAA1546, Nbla00191']
species ['Homo sapiens', 'Homo sapiens']
keywords ['OXIDOREDUCTASE/DNA']
text ['DNA hydroxylation, OXIDOREDUCTASE-DNA complex']
polyid ['1', '2']
polytype ['polypeptide(L)', 'polydeoxyribonucleotide']
seq ['GGSDFPSCRCVEQIIEKDEGPFYTHLGAGPNVAAIREIMEERFGQKGKAIRIERVIYTGKEGKSSQGCPIAKWVVRRSSS\nEEKLLCLVRERAGHTCEAAVIVILILVWEGIPLSLADKLYSELTETLRKYGTLTNRRCALNEERTCACQGLDPETCGASF\nSFGCSWSMYYNGCKFARSKIPRKFKLLGDDPKEEEKLESHLQNLSTLMAPTYKKLAPDAYNNQIEYEHRAPECRLGLKEG\nRPFSGVTACLDFCAHAHRDLHNMQNGSTLVCTLTREDNREFGGKPEDEQLHVLPLYKVSDVDEFGSVEAQEEKKRSGAIQ\nVLSSFRRKVRMLAEPVKTCRQRKLEAKKAAAEKLSGGGGSGGGGSGGGGSDEVWSDSEQSFLDPDIGGVAVAPTHGSILI\nECAKRELHATTPLKNPNRNHPTRISLVFYQHKSMNEPKHGLALWEAKMAEKAREKEEECEKYG', 'ACCACCGGTGGT']
polystrand ['A', 'B,C']
entityid ['1', '2', '3', '4', '5', '6']
entitytype ['polymer', 'polymer', 'non-polymer', 'non-polymer

In [25]:
polymers=list(zip(assemblyDict['polyid'],assemblyDict['polytype'],assemblyDict['seq'],assemblyDict[ 'polystrand' ]) )
entities=list(zip(assemblyDict['entityid'],assemblyDict['entitytype'],assemblyDict['descr'],assemblyDict['MW'],assemblyDict['number']))
print('polymers:')
for p in polymers:
    print(p)
    if p[1]=='polypeptide(L)':
        proteinSequence = p[2]
        proteinChain = p[-1]
    elif p[1]=='polydeoxyribonucleotide':
        dnaSequence = p[2]
        dnaChain = p[-1]
print('\nentities:')
MW = []
number = []
for e in entities:
    print(e)
    if e[1] == 'polymer': 
        MW.append( e[-2] )
        number.append( e[-1] )

polymers:
('1', 'polypeptide(L)', 'GGSDFPSCRCVEQIIEKDEGPFYTHLGAGPNVAAIREIMEERFGQKGKAIRIERVIYTGKEGKSSQGCPIAKWVVRRSSS\nEEKLLCLVRERAGHTCEAAVIVILILVWEGIPLSLADKLYSELTETLRKYGTLTNRRCALNEERTCACQGLDPETCGASF\nSFGCSWSMYYNGCKFARSKIPRKFKLLGDDPKEEEKLESHLQNLSTLMAPTYKKLAPDAYNNQIEYEHRAPECRLGLKEG\nRPFSGVTACLDFCAHAHRDLHNMQNGSTLVCTLTREDNREFGGKPEDEQLHVLPLYKVSDVDEFGSVEAQEEKKRSGAIQ\nVLSSFRRKVRMLAEPVKTCRQRKLEAKKAAAEKLSGGGGSGGGGSGGGGSDEVWSDSEQSFLDPDIGGVAVAPTHGSILI\nECAKRELHATTPLKNPNRNHPTRISLVFYQHKSMNEPKHGLALWEAKMAEKAREKEEECEKYG', 'A')
('2', 'polydeoxyribonucleotide', 'ACCACCGGTGGT', 'B,C')

entities:
('1', 'polymer', 'Methylcytosine dioxygenase TET2', '51454.398', '1')
('2', 'polymer', "5'-D(*AP*CP*CP*AP*CP*(5CM)P*GP*GP*TP*GP*GP*T)-3'", '3677.419', '2')
('3', 'non-polymer', 'ZINC ION', '65.409', '3')
('4', 'non-polymer', 'FE (II) ION', '55.845', '1')
('5', 'non-polymer', 'N-OXALYLGLYCINE', '147.086', '1')
('6', 'water', 'water', '18.015', '131')


In [27]:
print('protein sequence:',proteinSequence)
print('protein chain:', proteinChain )
print('dna sequence:', dnaSequence )
print('dna chain:', dnaChain)
print('MW:',MW)
print('number:',number)

protein sequence: GGSDFPSCRCVEQIIEKDEGPFYTHLGAGPNVAAIREIMEERFGQKGKAIRIERVIYTGKEGKSSQGCPIAKWVVRRSSS
EEKLLCLVRERAGHTCEAAVIVILILVWEGIPLSLADKLYSELTETLRKYGTLTNRRCALNEERTCACQGLDPETCGASF
SFGCSWSMYYNGCKFARSKIPRKFKLLGDDPKEEEKLESHLQNLSTLMAPTYKKLAPDAYNNQIEYEHRAPECRLGLKEG
RPFSGVTACLDFCAHAHRDLHNMQNGSTLVCTLTREDNREFGGKPEDEQLHVLPLYKVSDVDEFGSVEAQEEKKRSGAIQ
VLSSFRRKVRMLAEPVKTCRQRKLEAKKAAAEKLSGGGGSGGGGSGGGGSDEVWSDSEQSFLDPDIGGVAVAPTHGSILI
ECAKRELHATTPLKNPNRNHPTRISLVFYQHKSMNEPKHGLALWEAKMAEKAREKEEECEKYG
protein chain: A
dna sequence: ACCACCGGTGGT
dna chain: B,C
MW: ['51454.398', '3677.419']
number: ['1', '2']


In [None]:
print(entities)

In [None]:
a[1].append(200)

In [None]:
a