<a href="https://colab.research.google.com/github/dellacortelab/chronosort/blob/main/chronosort_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> 

# **Chronosort Colab Pipeline**
Run the Chronosort PCA workflow directly in Google Colab using uploaded mmCIF structures.

**Workflow overview**
- Install dependencies and fetch the Chronosort source code
- Upload mmCIF files (optionally bundled inside .zip or .tar archives)
- Configure PCA parameters and execute the pipeline
- Download the resulting trajectory, eigenvectors, projections, and plots

**Tips**
- Each code cell can be run with the ▶ button on the left.
- Uploads are placed in a scratch directory for the current Colab session only.
- If you re-run the upload cell, previously uploaded data will be replaced.
- Larger uploads may take a few minutes to transfer; keep the browser tab open until the upload finishes.

**Colab notes**
- Runtime: GPU is optional for this workflow, but the notebook retains Colab's default GPU metadata.
- Session storage is ephemeral; download results before ending the session.

In [None]:
%%time
#@title Install dependencies and fetch Chronosort
#@markdown This cell clones the Chronosort repository and installs the required Python packages.
import sys
import subprocess
from pathlib import Path

# Remote location of the Chronosort sources and the local checkout directory
# (relative to the notebook's working directory).
repo_url = "https://github.com/dellacortelab/chronosort.git"
repo_path = Path("chronosort")

# Clone only when no checkout is present, so re-running the cell is cheap.
# "--depth 1" fetches just the latest commit.
clone_cmd = ["git", "clone", "--depth", "1", repo_url, str(repo_path)]
if not repo_path.exists():
    subprocess.run(clone_cmd, check=True)

# Install the packages listed by the repository into the running kernel's
# interpreter; check=True aborts the cell if pip reports a failure.
requirements_path = repo_path / "requirements.txt"
pip_cmd = [sys.executable, "-m", "pip", "install", "-q", "-r", str(requirements_path)]
subprocess.run(pip_cmd, check=True)
print("Chronosort repository ready.")

In [None]:
#@title Upload CIF files or archives
#@markdown Upload one or more `.cif` files, or archives (`.zip`, `.tar`, `.tar.gz`, `.tgz`, `.tar.bz2`) containing them.
from pathlib import Path
import io
import zipfile
import tarfile
import shutil

try:
    from google.colab import files
except ImportError as exc:
    raise RuntimeError("This cell is intended to run inside Google Colab.") from exc

# Scratch layout: raw uploads go to user_uploads/, extracted CIF files to
# user_uploads/cif_inputs/.
upload_root = Path("user_uploads")
cif_dir = upload_root.joinpath("cif_inputs")

# Drop CIF files left over from an earlier run of this cell, then recreate the
# (now empty) directory together with any missing parents.
if cif_dir.exists():
    shutil.rmtree(cif_dir)
cif_dir.mkdir(parents=True, exist_ok=True)

def _unique_target(directory, filename):
    """Return a path inside *directory* for *filename* that does not exist yet.

    Only the final path component of *filename* is used, so archive members
    stored in sub-folders are flattened into *directory*. If that name is
    already taken, ``_1``, ``_2``, ... is inserted before the suffix until a
    free name is found. Nothing is created on disk.
    """
    leaf = Path(Path(filename).name)
    stem, suffix = leaf.stem, leaf.suffix
    target = directory / leaf.name
    attempt = 0
    while target.exists():
        attempt += 1
        target = directory / f"{stem}_{attempt}{suffix}"
    return target

def _should_skip(filename: str) -> bool:
    """Return True for macOS metadata entries that must not be treated as input.

    An entry is skipped when its final path component starts with ``._``
    (AppleDouble resource fork) or ``__MACOSX``, or when any parent component
    is the ``__MACOSX`` folder that macOS adds to zip archives.

    Args:
        filename: Uploaded file name or archive member path.

    Returns:
        True if the entry should be ignored, False otherwise.
    """
    parts = Path(filename).parts
    if not parts:
        return False
    base = parts[-1]
    if base.startswith(("._", "__MACOSX")):
        return True
    # "__MACOSX" is a directory name, so it must be looked for among the
    # parent components rather than in the basename alone.
    return "__MACOSX" in parts[:-1]

# Open Colab's upload dialog. The result is iterated below as
# {file name: file content as bytes}.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No files were uploaded. Please provide at least one CIF or archive.")


def _lowercase_cif_name(filename):
    """Return the basename of *filename* with its ``.cif`` extension lower-cased.

    Callers only pass names that already end with ``.cif`` in some letter case.
    Extensions are matched case-insensitively, but the later
    ``cif_dir.glob("*.cif")`` is case-sensitive on Linux, so e.g. ``MODEL.CIF``
    must be stored as ``MODEL.cif`` to be picked up.
    """
    base = Path(filename).name
    return base[: -len(".cif")] + ".cif"


for name, data in uploaded.items():
    if _should_skip(name):
        print(f"Skipped hidden file: {name}")
        continue
    # Keep a raw copy of every accepted upload next to the extracted CIF files.
    file_path = upload_root / name
    file_path.write_bytes(data)
    lower_name = name.lower()
    if lower_name.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(data)) as zf:
            for member in zf.namelist():
                # Entries ending in "/" are directories.
                if member.endswith("/") or _should_skip(member):
                    continue
                if member.lower().endswith(".cif"):
                    target = _unique_target(cif_dir, _lowercase_cif_name(member))
                    # Stream the member to disk instead of loading it whole.
                    with zf.open(member) as source, open(target, "wb") as dest:
                        shutil.copyfileobj(source, dest)
    elif lower_name.endswith((".tar", ".tar.gz", ".tgz", ".tar.bz2")):
        # The default open mode lets tarfile detect the compression itself.
        with tarfile.open(fileobj=io.BytesIO(data)) as tf:
            for member in tf.getmembers():
                if not member.isfile() or _should_skip(member.name):
                    continue
                if member.name.lower().endswith(".cif"):
                    target = _unique_target(cif_dir, _lowercase_cif_name(member.name))
                    with tf.extractfile(member) as source, open(target, "wb") as dest:
                        shutil.copyfileobj(source, dest)
    elif lower_name.endswith(".cif"):
        target = _unique_target(cif_dir, _lowercase_cif_name(name))
        shutil.copy2(file_path, target)
    else:
        print(f"Skipped unsupported file: {name}")

# Gather the extracted CIF files in name order, ignoring AppleDouble ("._")
# leftovers.
cif_files = sorted(
    entry for entry in cif_dir.glob("*.cif") if not entry.name.startswith("._")
)
if not cif_files:
    raise RuntimeError("No CIF files found after processing uploads.")

# Publish the absolute input directory for the analysis cell.
CIF_INPUT_DIR = str(cif_dir.resolve())
print(f"Added {len(cif_files)} CIF files to {CIF_INPUT_DIR}")

# Show at most five file names, followed by a count of the rest.
for sample in cif_files[:5]:
    print(" -", sample.name)
hidden_count = len(cif_files) - 5
if hidden_count > 0:
    print(f"... and {hidden_count} more")

In [None]:
%%time
#@title Run Chronosort PCA pipeline
#@markdown Configure paths and parameters, then launch the Chronosort analysis.
import sys
import subprocess
from pathlib import Path

# The upload cell stores the extracted CIF directory in the global
# CIF_INPUT_DIR; stop early with a clear message if that cell has not been run.
if "CIF_INPUT_DIR" not in globals():
    raise RuntimeError("No CIF input directory detected. Please run the upload cell first.")

# Colab form fields. The trailing "# @param" markers are Colab form
# annotations, so keep each one on its assignment line.
# cif_dir_override: directory to use instead of CIF_INPUT_DIR; leave empty to use the uploaded files.
cif_dir_override = ""  # @param {type:"string"}
# scale: value forwarded to the pipeline's --scale option.
scale = 30.0  # @param {type:"number"}
components_text = "0"  # @param {type:"string"}
# components_text accepts 0-based PCA indices separated by commas or semicolons (e.g., "0,1") to choose which eigenvectors drive the projection
# The three file names below are created inside the chronosort_outputs directory.
trajectory_filename = "trajectory.pdb"  # @param {type:"string"}
vecs_filename = "vecs.txt"  # @param {type:"string"}
projection_filename = "projection.pdb"  # @param {type:"string"}

# Use the override when one was typed in, otherwise the directory published by
# the upload cell. The path is made absolute because the pipeline subprocess
# runs with a different working directory (the repository checkout), where a
# relative path would point somewhere else.
cif_dir_path = (
    Path(cif_dir_override.strip()) if cif_dir_override.strip() else Path(CIF_INPUT_DIR)
).resolve()
if not cif_dir_path.exists():
    raise FileNotFoundError(f"cif_dir '{cif_dir_path}' does not exist.")

# Parse the component indices. Commas, semicolons and whitespace are all
# accepted as separators, so "0 1", "0,1" and "0; 1" are equivalent; empty
# pieces are ignored.
components = []
for piece in components_text.replace(";", " ").replace(",", " ").split():
    try:
        components.append(int(piece))
    except ValueError as exc:
        raise ValueError(
            f"Invalid PCA component index {piece!r}; expected integers such as '0 1' or '0,1'."
        ) from exc
# Fall back to the first component when nothing was entered.
if not components:
    components = [0]

# All results are collected in one directory next to the notebook.
output_root = Path("chronosort_outputs")
output_root.mkdir(parents=True, exist_ok=True)

# Place each requested output file inside that directory.
trajectory_path, vecs_path, projection_path = (
    output_root / filename
    for filename in (trajectory_filename, vecs_filename, projection_filename)
)

# Assemble the command line option by option. Output paths are made absolute
# because the script is launched from inside the repository checkout.
repo_root = Path("chronosort")
cmd = [sys.executable, "scripts/run_analysis.py"]
cmd += ["--cif_dir", str(cif_dir_path)]
cmd += ["--trajectory_file", str(trajectory_path.resolve())]
cmd += ["--vecs_file", str(vecs_path.resolve())]
cmd += ["--projection_file", str(projection_path.resolve())]
cmd += ["--scale", str(scale)]
# --components takes one argument per selected index.
cmd += ["--components", *map(str, components)]

# Run the analysis script from inside the repository checkout and capture its
# output as text so it can be echoed into the notebook.
print("Running (cwd=chronosort):", " ".join(cmd))
result = subprocess.run(cmd, cwd=repo_root, text=True, capture_output=True)
if result.stdout:
    print(result.stdout)
# Show stderr on success as well as on failure, so anything the pipeline wrote
# there during a successful run is not silently lost.
if result.stderr:
    print("stderr:\n" + result.stderr)
if result.returncode != 0:
    raise RuntimeError(f"Chronosort pipeline failed with exit code {result.returncode}.")

# If an eigenvalue plot is present under the repository's output/ folder, copy
# it next to the other results so it ends up in the download archive.
eigenvalues_path = repo_root / "output" / "eigenvalues.png"
if eigenvalues_path.exists():
    target = output_root / "eigenvalues.png"
    target.write_bytes(eigenvalues_path.read_bytes())
    print(f"Copied eigenvalue plot to {target}")

print("\nGenerated files:")
for path in sorted(output_root.glob("*")):
    print(" -", path)

In [None]:
%%time
#@title Download results archive
#@markdown Package the generated outputs into a zip archive and download them locally.
from pathlib import Path
import shutil
import datetime

try:
    from google.colab import files
except ImportError as exc:
    raise RuntimeError("This cell is intended to run inside Google Colab.") from exc

# Refuse to build an archive when the results directory is missing or empty;
# an empty zip would only hide the fact that the analysis produced nothing.
output_root = Path("chronosort_outputs")
if not output_root.is_dir() or not any(output_root.iterdir()):
    raise RuntimeError("No outputs found. Run the analysis before downloading.")

# UTC timestamp keeps archive names unique across runs. datetime.timezone.utc
# is used instead of datetime.UTC, which only exists on Python 3.11+.
timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
archive_name = f"chronosort_results_{timestamp}"
# The zip is written to the working directory (outside output_root), so it
# never ends up inside itself.
archive_path = shutil.make_archive(archive_name, "zip", root_dir=output_root)
print(f"Created archive: {archive_path}")
files.download(archive_path)