In [None]:
# First, let's check if we're in Google Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab")
    # Clone the CHEMELEON repository
    !git clone https://github.com/hspark1212/chemeleon-dng.git
    %cd chemeleon-dng
    !pip install -e .
else:
    print("Running locally")
    # Ensure you have the package installed locally
    # You should have already cloned and installed chemeleon-dng

In [None]:
# Import required libraries
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pymatgen.core import Structure, Composition
from pymatgen.analysis.structure_matcher import StructureMatcher
from pymatgen.io.cif import CifWriter
import warnings
warnings.filterwarnings('ignore')

# Add chemeleon-dng to path if running locally.
# The checkout location can be overridden with the CHEMELEON_DNG_PATH
# environment variable; the previous hard-coded location remains the default
# so existing setups keep working.
if not IN_COLAB:
    chemeleon_path = Path(
        os.environ.get('CHEMELEON_DNG_PATH', '/home/ryan/informatics/chemeleon-dng')
    ).expanduser()
    if chemeleon_path.exists():
        # Avoid stacking duplicate entries when the cell is re-run.
        if str(chemeleon_path) not in sys.path:
            sys.path.insert(0, str(chemeleon_path))
    else:
        # Make the otherwise silent skip visible so a wrong path is easy to spot.
        print(
            f"Note: {chemeleon_path} not found; "
            "set CHEMELEON_DNG_PATH to your chemeleon-dng checkout."
        )

print("Libraries imported successfully!")

In [None]:
# Crystal structure prediction (--task=csp) for a single formula, NaClO4.
# Produces `output_dir` (where the sampler is asked to write its results)
# and `result` (the completed subprocess).
import shlex
import subprocess
import sys
import tempfile
from pathlib import Path

# Create output directory
output_dir = tempfile.mkdtemp(prefix="chemeleon_csp_")
print(f"Output directory: {output_dir}")

# Locate the repository: the Colab clone lives under /content, a local run
# uses the checkout configured in the imports cell.
repo_root = Path("/content/chemeleon-dng") if IN_COLAB else chemeleon_path
sample_script = repo_root / "chemeleon_dng" / "sample.py"

# Set up the command for NaClO4.
# - sys.executable guarantees the subprocess uses the kernel's interpreter
#   (and therefore the environment chemeleon-dng was installed into).
# - An argument list (no shell=True) avoids shell quoting problems with paths.
cmd = [
    sys.executable,
    str(sample_script),
    "--task=csp",
    "--formulas=NaClO4",
    "--num_samples=5",
    f"--output_dir={output_dir}",
    "--device=cpu",
]

print(f"Running: {shlex.join(cmd)}")

# Run the generation command
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.returncode != 0:
    # A non-zero exit status is the reliable failure signal.
    print(f"Generation failed with exit code {result.returncode}")
    print("Errors:", result.stderr)
elif result.stderr:
    # stderr on a successful run typically holds warnings / progress output.
    print("Log output (stderr):", result.stderr)


In [None]:
# Let's examine the generated structures: load every CIF written to
# `output_dir` and print a short summary of each.
from pathlib import Path
import glob

# Find all generated CIF files (sorted so the listing is reproducible;
# glob itself returns files in arbitrary order).
cif_files = sorted(glob.glob(os.path.join(output_dir, "*.cif")))
print(f"Found {len(cif_files)} generated structures:")
if not cif_files:
    print("  No CIF files found - check the output of the generation cell above.")

# Load and analyze the structures
generated_structures = []
for cif_file in cif_files:
    structure = Structure.from_file(cif_file)
    generated_structures.append(structure)
    print(f"\n{Path(cif_file).name}:")
    print(f"  Formula: {structure.composition.reduced_formula}")
    # get_space_group_info() returns a tuple; element 0 is what is displayed here.
    print(f"  Space group: {structure.get_space_group_info()[0]}")
    print(f"  Volume: {structure.volume:.2f} Å³")
    print(f"  Density: {structure.density:.2f} g/cm³")

In [None]:
# Crystal structure prediction (--task=csp) for several formulas in one run.
# Produces `output_dir_multi` and `result` (the completed subprocess).
import os
import shlex
import subprocess
import sys
import tempfile
from pathlib import Path

formulas_list = ['NaClO4', 'NaBF4', 'NaPF6']
# The sampler receives the formulas as a single comma-separated value.
formulas_str = ','.join(formulas_list)

# Create output directory
output_dir_multi = tempfile.mkdtemp(prefix="chemeleon_multi_")
print(f"Output directory: {output_dir_multi}")

# Locate the repository: the Colab clone lives under /content, a local run
# uses the checkout configured in the imports cell.
repo_root = Path("/content/chemeleon-dng") if IN_COLAB else chemeleon_path
sample_script = repo_root / "chemeleon_dng" / "sample.py"

# Build the command as an argument list: sys.executable keeps the subprocess
# in the kernel's environment and avoiding shell=True removes quoting issues.
cmd = [
    sys.executable,
    str(sample_script),
    "--task=csp",
    f"--formulas={formulas_str}",
    "--num_samples=3",
    f"--output_dir={output_dir_multi}",
    "--device=cpu",
]

print(f"Generating structures for: {formulas_list}")
print(f"Running: {shlex.join(cmd)}")

# Run the command and capture output
result = subprocess.run(cmd, capture_output=True, text=True)

# Print both stdout and stderr for full visibility
print(result.stdout)
if result.returncode != 0:
    # A non-zero exit status is the reliable failure signal.
    print(f"Generation failed with exit code {result.returncode}")
    print("Errors:", result.stderr)
elif result.stderr:
    # stderr on a successful run typically holds warnings / progress output.
    print("Log output (stderr):", result.stderr)


In [None]:
# Analyze the generated battery material structures: group the CIFs in
# `output_dir_multi` by reduced formula, then count how many structures per
# formula are distinct according to StructureMatcher.
battery_structures = {}
# Sorted for a reproducible processing order: which of several matching
# structures is kept as the "unique" representative depends on that order.
cif_files_multi = sorted(glob.glob(os.path.join(output_dir_multi, "*.cif")))

for cif_file in cif_files_multi:
    structure = Structure.from_file(cif_file)
    formula = structure.composition.reduced_formula
    battery_structures.setdefault(formula, []).append(structure)

# Compare structures for each formula
matcher = StructureMatcher()

for formula, structures in battery_structures.items():
    print(f"\n{formula}: Generated {len(structures)} structures")

    # Keep a structure only if it matches none of the representatives kept so far.
    unique_structures = []
    for s in structures:
        if not any(matcher.fit(s, u) for u in unique_structures):
            unique_structures.append(s)

    print(f"  Unique structures: {len(unique_structures)}")
    for i, s in enumerate(unique_structures, start=1):
        print(f"  Structure {i}: SG {s.get_space_group_info()[0]}, V={s.volume:.1f} Å³")

In [None]:
# Run the sampler with --task=dng (presumably "de novo generation"; no formula
# is supplied) and write the results to a fresh temporary directory.
import tempfile

# Unique temporary directory for this run; read back by the analysis cell below.
output_dir_dng = tempfile.mkdtemp(prefix="chemeleon_dng_")
print(f"Output directory: {output_dir_dng}")

# IPython shell escape: {output_dir_dng} is interpolated from the Python
# namespace before the command is handed to the shell.
# NOTE(review): the script path is hard-coded under /content, which looks like
# the Colab clone location - confirm/adjust before running this cell locally.
!python /content/chemeleon-dng/chemeleon_dng/sample.py \
  --task=dng \
  --num_samples=20 \
  --batch_size=5 \
  --output_dir="{output_dir_dng}" \
  --device=cpu


In [None]:
# Summarise how varied the structures in `output_dir_dng` are: distinct
# compositions, distinct space groups, and the most frequently used elements.
dng_cif_files = glob.glob(os.path.join(output_dir_dng, "*.cif"))
print(f"Generated {len(dng_cif_files)} novel structures")

# Per-structure records (parallel lists) plus an element -> structure-count map
compositions, space_groups, volumes = [], [], []
elements_count = {}

for path in dng_cif_files:
    struct = Structure.from_file(path)
    composition = struct.composition

    compositions.append(composition.reduced_formula)
    space_groups.append(struct.get_space_group_info()[0])
    # Normalise by the number of sites so cells of different size are comparable
    volumes.append(struct.volume / len(struct))

    # Each element is counted once per structure it appears in
    for symbol in map(str, composition.elements):
        elements_count[symbol] = elements_count.get(symbol, 0) + 1

# Display statistics
n_unique_compositions = len(set(compositions))
n_unique_space_groups = len(set(space_groups))
print(f"\nUnique compositions: {n_unique_compositions}")
print(f"Unique space groups: {n_unique_space_groups}")
print("\nMost common elements:")
# Ten most frequent elements, highest count first (ties keep first-seen order)
sorted_elements = sorted(elements_count.items(), key=lambda pair: -pair[1])[:10]
for symbol, n_structures in sorted_elements:
    print(f"  {symbol}: {n_structures} structures")

In [None]:
# Visualize the distribution of generated structures using the `volumes` and
# `compositions` lists collected in the previous cell.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
volume_ax, complexity_ax = axes

# Volume distribution (left panel)
volume_ax.hist(volumes, bins=20, edgecolor='black', alpha=0.7)
volume_ax.set_xlabel('Volume per atom (Å³)')
volume_ax.set_ylabel('Count')
volume_ax.set_title('Volume Distribution of Generated Structures')

# Number of elements per structure (right panel)
n_elements = [len(Composition(comp).elements) for comp in compositions]
# Sorted so bars and tick labels appear in ascending order regardless of set ordering.
unique_counts = sorted(set(n_elements))
count_freq = [n_elements.count(i) for i in unique_counts]

complexity_ax.bar(unique_counts, count_freq, edgecolor='black', alpha=0.7)
complexity_ax.set_xlabel('Number of unique elements')
complexity_ax.set_ylabel('Count')
complexity_ax.set_title('Chemical Complexity Distribution')
complexity_ax.set_xticks(unique_counts)

plt.tight_layout()
plt.show()