In [1]:
import os
import shutil
from pathlib import Path

from parquetdb import ParquetDB

from crystpqdb.loaders import get_loader


# Project layout, resolved relative to the directory the notebook is run from.
# The data folder is expected to sit next to that directory, under its parent.
CURRENT_DIR = Path.cwd()
ROOT_DIR = CURRENT_DIR.parent
DATA_DIR = ROOT_DIR.joinpath("data")

# Echo the resolved locations so a wrong working directory is obvious up front.
print("ROOT_DIR: {}".format(ROOT_DIR))
print("DATA_DIR: {}".format(DATA_DIR))
print("CURRENT_DIR: {}".format(CURRENT_DIR))

# Directory of the ParquetDB dataset that the later cells open and fill.
DB_DIR = DATA_DIR.joinpath("crystpqdb")


[INFO] 2025-09-01 09:57:47 - parquetdb.utils.config[37][load_config] - Config file: C:\Users\lllang\AppData\Local\parquetdb\parquetdb\config.yml


  from .autonotebook import tqdm as notebook_tqdm


ROOT_DIR: c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database
DATA_DIR: c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\data
CURRENT_DIR: c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\examples


## Initialize or Download the database

## Initialize from scratch

In [2]:
# Start from a clean slate: delete any existing database directory (and
# everything inside it) before opening a ParquetDB at the same path.
# WARNING: this irreversibly removes the current contents of DB_DIR.
if DB_DIR.exists():
    shutil.rmtree(DB_DIR)
pqdb = ParquetDB(DB_DIR)

[INFO] 2025-09-01 09:25:38 - parquetdb.core.parquetdb[201][__init__] - Initializing ParquetDB with db_path: c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\data\crystpqdb
[INFO] 2025-09-01 09:25:38 - parquetdb.core.parquetdb[203][__init__] - verbose: 1


## Download or initialize from local directory

In [2]:
# Reuse the local copy of the database when DB_DIR already exists; otherwise
# download it into DB_DIR first.
if not DB_DIR.exists():
    print("Downloading the database...")
    # Imported here so the download module is only loaded when it is needed.
    from crystpqdb.download import download
    download(DB_DIR)
# Open the database at DB_DIR (freshly downloaded or already present).
pqdb = ParquetDB(DB_DIR)

[INFO] 2025-09-01 09:54:33 - parquetdb.core.parquetdb[201][__init__] - Initializing ParquetDB with db_path: c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\data\crystpqdb
[INFO] 2025-09-01 09:54:33 - parquetdb.core.parquetdb[203][__init__] - verbose: 1


## Load the datasets into the database with the loaders

In [None]:
# (source, dataset) pairs; each pair matches the first two arguments of one
# get_loader(...) call in the per-dataset cells of this notebook.
# NOTE(review): no visible cell reads this list — the loaders are invoked one
# by one — so it currently serves only as an overview.
datasets = [
    ("alex", "3d"),
    ("alex", "2d"),
    ("alex", "1d"),
    ("mp", "summary"),
    ("materialscloud", "mc3d"),
]

### Alexandria3D (alex, 3d)

In [None]:
# Alexandria 3D: build the ("alex", "3d") loader rooted at DATA_DIR, run it to
# obtain a table, and write that table into the database.
# NOTE(review): convert_to_fixed_shape=False presumably keeps nested list
# columns variable-length — confirm against the ParquetDB documentation.
loader = get_loader("alex", "3d", data_dir=DATA_DIR)
table = loader.run()
pqdb.create(table, convert_to_fixed_shape=False)

### Alexandria2D (alex, 2d)

In [None]:
# Alexandria 2D: build the ("alex", "2d") loader rooted at DATA_DIR, run it to
# obtain a table, and write that table into the database.
# `loader` and `table` are rebound here, dropping this cell's reference to any
# previously loaded table.
loader = get_loader("alex", "2d", data_dir=DATA_DIR)
table = loader.run()
pqdb.create(table, convert_to_fixed_shape=False)

### Alexandria1D (alex, 1d)

In [None]:
# Alexandria 1D: build the ("alex", "1d") loader rooted at DATA_DIR, run it to
# obtain a table, and write that table into the database.
# `loader` and `table` are rebound here, dropping this cell's reference to any
# previously loaded table.
loader = get_loader("alex", "1d", data_dir=DATA_DIR)
table = loader.run()
pqdb.create(table, convert_to_fixed_shape=False)

### Materials Project (mp, summary)

In [4]:
# Materials Project summary: build the ("mp", "summary") loader rooted at
# DATA_DIR, run it to obtain a table, and write that table into the database.
# NOTE(review): judging by the recorded log output, run() reuses an existing
# non-empty raw directory instead of downloading again — confirm in the loader.
loader = get_loader("mp", "summary", data_dir=DATA_DIR)
table = loader.run()
pqdb.create(table, convert_to_fixed_shape=False)

[INFO] 2025-09-01 09:04:29 - crystpqdb.loaders.base[147][download] - Directory c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\data\materials_project\summary\raw already exists and is not empty
Loading from c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\data\materials_project\summary\raw into c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\data\materials_project\summary\interim\pqdb


### Materials Cloud (materialscloud, mc3d)

In [3]:
from crystpqdb.loaders import LoaderConfig

# Materials Cloud MC3D: same load-and-create steps as the other datasets, but
# with an explicit loader configuration.
# NOTE(review): ingest_from_scratch=False presumably reuses previously
# ingested data instead of re-ingesting the raw files — confirm in LoaderConfig.
config = LoaderConfig(ingest_from_scratch=False)
loader = get_loader("materialscloud", "mc3d", data_dir=DATA_DIR, config=config)
table = loader.run()
pqdb.create(table, convert_to_fixed_shape=False)

[INFO] 2025-09-01 09:25:39 - crystpqdb.loaders.base[147][download] - Directory c:\Users\lllang\Desktop\Current_Projects\Crystal-Parquet-Database\data\materialscloud\mc3d\raw already exists and is not empty


## Check the database

In [4]:
# Summarise the database size as a (rows, columns) pair.
db_shape = (pqdb.n_rows, pqdb.n_columns)
print("Shape of the database: ", db_shape)

Shape of the database:  (5465421, 87)


### Field names

In [8]:
# One line per schema field: name and type, each left-aligned in a
# 30-character column.
for field in pqdb.get_schema():
    print(f"{field.name:<30} {str(field.type):<30}")

cart_coords                    list<element: list<element: double>>
data.band_gap                  double                        
data.band_gap_dir              double                        
data.band_gap_ind              double                        
data.dos_ef                    double                        
data.e_electronic              double                        
data.e_ionic                   double                        
data.e_total                   double                        
data.energy_above_hull         double                        
data.energy_corrected          double                        
data.energy_formation          double                        
data.energy_phase_seperation   double                        
data.energy_total              double                        
data.energy_uncorrected        double                        
data.g_reuss                   double                        
data.g_voigt                   double                        
da

## Distribution of data across row groups and files

In [22]:
# Horizontal rule reused between the sections of the report.
separator = "=" * 80

# Section 1: how many row groups each file holds.
n_row_groups_per_file = pqdb.get_number_of_row_groups_per_file()
print(f"Number of row groups per file: {n_row_groups_per_file}\n")

# Section 2: size of every file, printed in MB.
file_sizes = pqdb.get_file_sizes()
print(separator)
print("File sizes:")
for filename, size in file_sizes.items():
    print(f"{filename}: {size:.2f} MB")
print(separator)

# Section 3: row count and size of every row group, file by file.
print("Number of rows/size per row group per file:")
row_counts_by_file = pqdb.get_n_rows_per_row_group_per_file()
group_sizes_by_file = pqdb.get_row_group_sizes_per_file()
# NOTE(review): row counts are looked up by position `i`, sizes by filename;
# this assumes both results list the files in the same order — confirm.
for i, filename in enumerate(group_sizes_by_file):
    print(f"File {i+1}: {filename} | {file_sizes[filename]:.2f} MB")
    row_counts = row_counts_by_file[i]
    for row_group_index, row_group_size in group_sizes_by_file[filename].items():
        n_rows = row_counts[row_group_index]
        print(f"  Row group {row_group_index}: {n_rows} rows | {row_group_size:.2f} MB")
print(separator)

Number of row groups per file: [84, 1]

File sizes:
crystpqdb_0.parquet: 6077.85 MB
crystpqdb_1.parquet: 55.69 MB
Number of rows/size per row group per file:
File 1: crystpqdb_0.parquet | 6077.85 MB
  Row group 0: 65536 rows | 92.70 MB
  Row group 1: 65536 rows | 101.62 MB
  Row group 2: 65536 rows | 103.12 MB
  Row group 3: 65536 rows | 98.98 MB
  Row group 4: 65536 rows | 99.54 MB
  Row group 5: 65536 rows | 85.15 MB
  Row group 6: 65536 rows | 107.19 MB
  Row group 7: 65536 rows | 94.16 MB
  Row group 8: 65536 rows | 91.69 MB
  Row group 9: 65536 rows | 103.27 MB
  Row group 10: 65536 rows | 103.24 MB
  Row group 11: 65536 rows | 98.97 MB
  Row group 12: 65536 rows | 105.41 MB
  Row group 13: 65536 rows | 96.83 MB
  Row group 14: 65536 rows | 98.39 MB
  Row group 15: 65536 rows | 96.53 MB
  Row group 16: 65536 rows | 98.54 MB
  Row group 17: 65536 rows | 95.39 MB
  Row group 18: 65536 rows | 101.66 MB
  Row group 19: 65536 rows | 112.19 MB
  Row group 20: 65536 rows | 110.58 MB
  Ro

> Note: The database currently contains a single ~6 GB file. It should probably be normalized so that the data is distributed across files of about 2 GB each.

# Uploading to Hugging Face


The upload has to be executed from a script to benefit from hf-transfer, because a Jupyter notebook already runs inside an event loop.

In [2]:
from crystpqdb.download import upload


# Upload the whole database directory in a single call. An earlier draft
# globbed DB_DIR for "*.parquet" files and looped over them; that loop was
# disabled in favour of passing the directory itself.
upload(DB_DIR)

  0%|          | 0/1 [01:06<?, ?it/s]


KeyboardInterrupt: 