In [1]:
# Note: Cell is tagged to not show up in the mkdocs build
# Load IPython's autoreload extension so edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

<div class="admonition abstract highlight">
    <p class="admonition-title">In short</p>
    <p>This tutorial shows how to create datasets from PDB files using the .zarr format.</p>
</div>

### Dummy PDB example

In [1]:
import zarr
import platformdirs

import numpy as np
import datamol as dm
import pandas as pd

from polaris.dataset import DatasetFactory
from polaris.dataset.converters import SDFConverter, PDBConverter

# Per-user cache folder where this tutorial stores downloaded PDB files and
# the Zarr archives it creates.
SAVE_DIR = dm.fs.join(
    platformdirs.user_cache_dir(appname="polaris-tutorials"),
    "002",
)

### Fetch PDB files from RCSB PDB

In [13]:
import biotite.database.rcsb as rcsb

# Download entry 6s89 from RCSB in PDB format into the tutorial cache folder.
# fetch() hands back the path of the downloaded file, which is printed below.
pdb_path = rcsb.fetch("6s89", format="pdb", target_path=SAVE_DIR)
print(pdb_path)

/Users/lu.zhu/Library/Caches/polaris-tutorials/002/6s89.pdb


### Create dataset from PDB file

In [14]:
# Zarr archive that will back this dataset.
save_dst = dm.fs.join(SAVE_DIR, "tutorial_pdb.zarr")

# Create a factory that writes to that archive and reset it to the same path.
factory = DatasetFactory(zarr_root_path=save_dst)
factory.reset(save_dst)

# Register a PDB converter under the "pdb" key; its output is stored in the
# "pdb" column of the dataset.
factory.register_converter(
    "pdb",
    PDBConverter(pdb_column="pdb"),
)
factory.add_from_file(pdb_path)

# Build the dataset
dataset = factory.build()

### Check the dataset

In [15]:
# Bare expression: the notebook renders the dataset's summary (metadata,
# default adapters and per-column annotations).
dataset

0,1
name,
description,
tags,
user_attributes,
owner,
polaris_version,0.7.10.dev22+g8edf177.d20240814
default_adapters,pdbARRAY_TO_PDB
zarr_root_path,/Users/lu.zhu/Library/Caches/polaris-tutorials/002/tutorial_pdb.zarr
readme,
annotations,pdbis_pointerTruemodalityPROTEIN_3DdescriptionNoneuser_attributesdtypeobject

0,1
pdb,ARRAY_TO_PDB

0,1
pdb,is_pointerTruemodalityPROTEIN_3DdescriptionNoneuser_attributesdtypeobject

0,1
is_pointer,True
modality,PROTEIN_3D
description,
user_attributes,
dtype,object


### Check data table

In [16]:
# The "pdb" column is a pointer column: the table holds a reference such as
# `pdb/6s89`, not the structure itself.
dataset.table

Unnamed: 0,pdb
0,pdb/6s89


### Get PDB data from specific row
An array of `biotite.Atom` objects will be returned.
For more details, see [fastpdb](https://github.com/biotite-dev/fastpdb) and [Atom](https://github.com/biotite-dev/biotite/blob/main/src/biotite/structure/atoms.py).

In [17]:
# Load the structure referenced by row 0 of the "pdb" column.
dataset.get_data(0, "pdb")

array([
	Atom(np.array([ -7.009,  66.478, -50.24 ], dtype=float32), chain_id="A", res_id=697, ins_code="", res_name="GLU", hetero=False, atom_name="N", element="N", atom_id=1, b_factor=80.67, charge=0, occupancy=1.0),
	Atom(np.array([ -6.543,  66.534, -48.848], dtype=float32), chain_id="A", res_id=697, ins_code="", res_name="GLU", hetero=False, atom_name="CA", element="C", atom_id=2, b_factor=87.32, charge=0, occupancy=1.0),
	Atom(np.array([ -5.031,  66.811, -48.762], dtype=float32), chain_id="A", res_id=697, ins_code="", res_name="GLU", hetero=False, atom_name="C", element="C", atom_id=3, b_factor=82.26, charge=0, occupancy=1.0),
	Atom(np.array([ -4.303,  66.672, -49.737], dtype=float32), chain_id="A", res_id=697, ins_code="", res_name="GLU", hetero=False, atom_name="O", element="O", atom_id=4, b_factor=76.67, charge=0, occupancy=1.0),
	Atom(np.array([ -6.888,  65.234, -48.11 ], dtype=float32), chain_id="A", res_id=697, ins_code="", res_name="GLU", hetero=False, atom_name="CB", elemen

### Create dataset from multiple PDB files

In [7]:
# Passing a list of PDB IDs downloads each entry and yields one file path per ID.
pdb_paths = rcsb.fetch(
    ["1l2y", "4i23"],
    format="pdb",
    target_path=SAVE_DIR,
)
print(pdb_paths)

['/Users/lu.zhu/Library/Caches/polaris-tutorials/002/1l2y.pdb', '/Users/lu.zhu/Library/Caches/polaris-tutorials/002/4i23.pdb']


In [8]:
# Build the archive path with dm.fs.join. SAVE_DIR is a plain string, so
# SAVE_DIR.join("pdbs.zarr") would invoke str.join and interleave SAVE_DIR
# between the characters of "pdbs.zarr", silently producing a garbage path
# instead of "<SAVE_DIR>/pdbs.zarr".
factory = DatasetFactory(zarr_root_path=dm.fs.join(SAVE_DIR, "pdbs.zarr"))

# Register a PDB converter under the "pdb" key.
converter = PDBConverter()
factory.register_converter("pdb", converter)

# axis=0: each file contributes its own row (the resulting table has one row
# per PDB file).
factory.add_from_files(pdb_paths, axis=0)
dataset = factory.build()

In [9]:
# One row per input PDB file, each holding a pointer to its structure.
dataset.table

Unnamed: 0,pdb
0,pdb/1l2y
1,pdb/4i23


In [11]:
# Load the structure referenced by row 1 of the "pdb" column (`pdb/4i23`).
dataset.get_data(1, "pdb")

array([
	Atom(np.array([  4.893, -71.44 , -55.064], dtype=float32), chain_id="A", res_id=695, ins_code="", res_name="SER", hetero=False, atom_name="N", element="N", atom_id=1, b_factor=100.04, charge=0, occupancy=1.0),
	Atom(np.array([  5.038, -70.058, -54.508], dtype=float32), chain_id="A", res_id=695, ins_code="", res_name="SER", hetero=False, atom_name="CA", element="C", atom_id=2, b_factor=99.34, charge=0, occupancy=1.0),
	Atom(np.array([  6.262, -69.963, -53.589], dtype=float32), chain_id="A", res_id=695, ins_code="", res_name="SER", hetero=False, atom_name="C", element="C", atom_id=3, b_factor=98.3, charge=0, occupancy=1.0),
	Atom(np.array([  6.492, -70.845, -52.752], dtype=float32), chain_id="A", res_id=695, ins_code="", res_name="SER", hetero=False, atom_name="O", element="O", atom_id=4, b_factor=97.86, charge=0, occupancy=1.0),
	Atom(np.array([  3.77 , -69.674, -53.731], dtype=float32), chain_id="A", res_id=695, ins_code="", res_name="SER", hetero=False, atom_name="CB", elemen

Completing the dataset's metadata and uploading it to the hub follows the same steps outlined in the [dataset_zarr.ipynb](docs/tutorials/dataset_zarr.ipynb) tutorial.

The End. 