<a href="https://colab.research.google.com/github/rcsb/rcsb-training-resources/blob/master/training-events/2024/utilizing-binary-cif/RCSB_mmCIF_BCIF_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demonstration of working with mmCIF and BCIF using RCSB PDB Python Packages

## Set-up
Install the required packages (and re-run the upgrade often to keep them up to date!)

In [None]:
!pip install --upgrade rcsb.utils.io
!pip install --upgrade mmcif

# Keep these packages up to date: re-run the upgrade commands above regularly!


## 1. Reading an mmCIF or BCIF File

In [None]:
from rcsb.utils.io.MarshalUtil import MarshalUtil

# Single MarshalUtil instance reused for the import, export and remove calls in this notebook
mU = MarshalUtil()

# Reading mmCIF
# Load from remote URL (here the gzip-compressed 4HHB entry); fmt selects the mmCIF reader.
# The result is indexed and iterated as a list of data containers in the following cells.
dataContainerList = mU.doImport("https://files.rcsb.org/download/4HHB.cif.gz", fmt="mmcif")

# Or, load from a local file (either compressed or uncompressed)
# dataContainerList = mU.doImport("local/path/to/file.cif", fmt="mmcif")

In [None]:
# Reading BCIF (note the URL change: models.rcsb.org host and .bcif.gz file)
# Same doImport call as for mmCIF but with fmt="bcif"; this rebinds
# dataContainerList, replacing any previously loaded result.
dataContainerList = mU.doImport("https://models.rcsb.org/4HHB.bcif.gz", fmt="bcif")


## 2. Accessing Data Categories

Once the data is loaded, you can access and inspect the data categories:

In [None]:
# Get the first data container (usually there's only one per file)
# NOTE: indexing with [0] raises IndexError if the import produced an empty list
dataContainer = dataContainerList[0]

# Print the container name
containerName = dataContainer.getName()
print(f"Container Name: {containerName}")

In [None]:
# Get the list of categories held by the first data container.
# categoryNames is a snapshot taken at this point; later cells that rename,
# remove or add categories do not update it until it is reassigned.
categoryNames = dataContainer.getObjNameList()
print("Categories:", categoryNames)

In [None]:
# Access a specific category and print its first rows, one attribute dict per row
maxRows = 5  # number of rows to preview
if dataContainer.exists("atom_site"):
    atomSiteObj = dataContainer.getObj("atom_site")
    # Bound the range by the preview size so that exactly maxRows rows are
    # printed (or fewer, when the category is shorter than that)
    for i in range(min(maxRows, atomSiteObj.getRowCount())):
        rowData = atomSiteObj.getRowAttributeDict(i)
        print(rowData)

In [None]:
# Creating a dictionary from the DataContainers, shaped as:
#   {containerName: {categoryName: [rowAttributeDict, ...]}}
dcD = {}

for dataContainer in dataContainerList:
    eName = dataContainer.getName()
    # Ask each container for its own category names instead of reusing a list
    # captured from the first container, so categories that only occur in
    # later containers are not silently skipped.
    for catName in dataContainer.getObjNameList():
        dObj = dataContainer.getObj(catName)
        # One dict per row, keyed by attribute name; a category without rows
        # adds no entry to the result.
        for ii in range(dObj.getRowCount()):
            dD = dObj.getRowAttributeDict(ii)
            dcD.setdefault(eName, {}).setdefault(catName, []).append(dD)

print(dcD)

In [None]:
# FYI—You can also export and import JSON and pickle data:
# Works for any type of json or dictionary—doesn't need to be CIF-related!
# Both calls write the same dcD dictionary; only the fmt argument and file name differ.
# NOTE: pickle files can execute code when loaded, so only import pickles you trust.
mU.doExport("4HHB.json", dcD, fmt="json")
mU.doExport("4HHB.pic", dcD, fmt="pickle")

## 3. Manipulating, Deleting, and Adding Categories

In [None]:
### Renaming a category
# For example, to rename "citation" to "citation_reference"
# Arguments are (current name, new name). The change is made on the container
# object in memory; the categoryNames list captured earlier is not updated by it.
dataContainer.rename("citation", "citation_reference")

In [None]:
### Delete a Category
# For example, to delete all EM-related categories (names starting with "em"):
# Take a fresh snapshot of the names from the container itself, so the loop
# reflects the container's current contents (not a list captured in an earlier
# cell) and is not affected by the removals performed while iterating.
for catName in list(dataContainer.getObjNameList()):
    if catName.startswith("em"):
        dataContainer.remove(catName)

In [None]:
### Add a New Category
# Build a small demonstration category and attach it to the data container:
from mmcif.api.DataCategory import DataCategory

# Column (attribute) names, followed by the rows to store; each row lists its
# values in the same order as the attribute names
newAttributeNames = ["ordinal", "attribute1", "attribute2"]
newRows = [
    [1, "a", "b"],
    [2, "c", "d"],
    [3, "e", "f"],
    [4, "g", "h"],
]

# Create the empty category, then fill it one row at a time
newCategory = DataCategory("new_category", attributeNameList=newAttributeNames)
for newRow in newRows:
    newCategory.append(newRow)

# Attach the finished category to the data container
dataContainer.append(newCategory)

# Refresh the category list and print it to confirm "new_category" is present
categoryNames = dataContainer.getObjNameList()
print("Categories:", categoryNames)

In [None]:
### All available Data Container methods:

# Short alias for the first data container
dc = dataContainerList[0]
# The listing below appears to be tab-completion output for `dc.` in an
# interactive session; the exact set may differ between package versions.
# >>> dc.
# dc.append(                 dc.getGlobal()             dc.getObjNameList()        dc.invokeDataBlockMethod(  dc.rename(                 dc.setProp(
# dc.copy(                   dc.getName()               dc.getProp(                dc.merge(                  dc.replace(                dc.setType(
# dc.exists(                 dc.getObj(                 dc.getPropCatalog()        dc.printIt(                dc.setGlobal()             dc.toJSON()
# dc.filterObjectNameList(   dc.getObjCatalog()         dc.getType()               dc.remove(                 dc.setName(

# Use help() on any of these methods to print its built-in documentation
help(dc.remove)


## 4. Exporting Data

In [None]:
### Export as mmCIF
# Writes every container in dataContainerList to a local file; fmt selects the mmCIF writer
mU.doExport("4HHB_new.cif", dataContainerList, fmt="mmcif")

In [None]:
### Export as BCIF
# Same call as the mmCIF export, with fmt="bcif" and a .bcif file name
mU.doExport("4HHB_new.bcif", dataContainerList, fmt="bcif")

In [None]:
# Export as a BCIF file (with gzip compression - note that this will create a temporary file as well):
# The only difference from the plain BCIF export is the ".gz" suffix on the output file name.
mU.doExport("4HHB_new_2.bcif.gz", dataContainerList, fmt="bcif")

In [None]:
### Compress the file with Gzip
# To export the data in a compressed (gzipped) format:
from rcsb.utils.io.FileUtil import FileUtil
# FileUtil configured with the current directory as its work path
fU = FileUtil(workPath=".")
# First argument: the existing uncompressed BCIF file; second: the gzipped file to create
fU.compress("4HHB_new.bcif", "4HHB_new.bcif.gz")

In [None]:
# Remove the uncompressed file (the gzipped copy "4HHB_new.bcif.gz" is not touched by this call)
mU.remove("4HHB_new.bcif")

## 5. Working with computed structure models (CSMs)

In [None]:
# Read in a CSM (from AlphaFold DB)
# The file is an uncompressed mmCIF, so fmt="mmcif"; this rebinds
# dataContainerList, replacing any previously loaded entry.
dataContainerList = mU.doImport("https://alphafold.ebi.ac.uk/files/AF-P24854-F1-model_v4.cif", fmt="mmcif")

In [None]:
# Export as BCIF (using default PDBx/mmCIF dictionaries)
# dataContainerList holds the AlphaFold model read in for this section, so the
# output file is named after that model rather than after the 4HHB entry.
mU.doExport("AF-P24854-F1_default.bcif", dataContainerList, fmt="bcif")

In [None]:
# Export as BCIF, this time naming the dictionary files explicitly through
# the dictFilePathL keyword (passed in this order: PDBx/mmCIF, then ModelCIF)
pdbxDictUrl = "https://raw.githubusercontent.com/wwpdb-dictionaries/mmcif_pdbx/master/dist/mmcif_pdbx_v5_next.dic"
modelCifDictUrl = "https://raw.githubusercontent.com/ihmwg/ModelCIF/master/dist/mmcif_ma_ext.dic"

mU.doExport("AF-P24854-F1.bcif", dataContainerList, fmt="bcif", dictFilePathL=[pdbxDictUrl, modelCifDictUrl])