# Setup & Environment Check

This notebook verifies the environment and shows a tiny EDA preview.

## Create tiny dataset artifacts (collection step)
This cell runs the repo's dataset scripts if they are present; if `data/pairs.csv` still does not exist afterwards (scripts missing or failed), it writes a minimal placeholder `data/pairs.csv`.

In [3]:
import csv
import json
import os
import subprocess
import sys
from pathlib import Path

# Make sure the output folders are present before anything writes into them.
# mkdir is a no-op for a folder that already exists.
for data_dir in (Path("data"), Path("data/images")):
    data_dir.mkdir(parents=True, exist_ok=True)

def try_run(cmd):
    """Run an external command and report the outcome instead of raising.

    Parameters
    ----------
    cmd : str or list/tuple of str
        A string is handed to the system shell, exactly as before.
        A list or tuple is executed directly as an argument vector with no
        shell involved, so arguments containing spaces or shell
        metacharacters need no quoting.

    Returns
    -------
    bool
        True when the command exits with status 0. False when it exits with
        a non-zero status or cannot be started; the reason is printed.
    """
    print(">>", cmd)
    # Only argument vectors bypass the shell; every other input keeps the
    # original shell=True behaviour.
    use_shell = not isinstance(cmd, (list, tuple))
    try:
        subprocess.check_call(cmd, shell=use_shell)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # These are the failures subprocess documents: non-zero exit status,
        # executable not found / not runnable, and invalid Popen arguments.
        # Anything else is a programming error and is allowed to surface.
        print("   (skipping) —", e)
        return False
    print("   ✓ OK")
    return True

# Run the repo's dataset-building scripts when they are present. Each script is
# launched with the interpreter that runs this kernel (sys.executable) rather
# than whatever `python` resolves to on PATH, so the scripts see the same
# installed packages as the notebook and still run where only `python3` exists.
python_exe = sys.executable or "python"  # sys.executable may be empty or None
ran_any = False  # becomes True once at least one script exits successfully
for script in ("scripts/pdb_fetch_render.py", "scripts/build_captions.py"):
    if Path(script).exists():
        # The command is passed as a shell string, so the interpreter path is
        # quoted in case it contains spaces.
        ran_any = try_run(f'"{python_exe}" {script}') or ran_any

# If no pairs.csv exists at this point (scripts absent, failed, or produced no
# file), write a three-row placeholder so the EDA cell has something to load.
pairs_path = Path("data/pairs.csv")
if not pairs_path.exists():
    print("pairs.csv not found — creating a tiny placeholder file...")
    # NOTE: the image_path values are placeholders; this cell does not create
    # the image files they point to.
    rows = [
        {"id":"000001","pdb_id":"4MBS","caption":"Surface view of CCR5 bound to maraviroc (PDB 4MBS); front orientation.","image_path":"data/images/4MBS_front_surface.png","view":"front"},
        {"id":"000002","pdb_id":"3OE0","caption":"Cartoon view of CXCR4 with antagonist IT1t (PDB 3OE0); side orientation.","image_path":"data/images/3OE0_side_cartoon.png","view":"side"},
        {"id":"000003","pdb_id":"1GC1","caption":"Surface view of HIV-1 gp120 bound to CD4 (PDB 1GC1); top orientation.","image_path":"data/images/1GC1_top_surface.png","view":"top"},
    ]
    # Write UTF-8 explicitly: pandas.read_csv decodes as UTF-8 by default,
    # whereas open() would otherwise use the platform's locale encoding.
    with pairs_path.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["id","pdb_id","caption","image_path","view"])
        w.writeheader()
        w.writerows(rows)
    print("Created:", pairs_path)
else:
    print("Found existing:", pairs_path)


pairs.csv not found — creating a tiny placeholder file...
Created: data/pairs.csv


## Environment check

In [None]:
import sys, torch, transformers, numpy as np, pandas as pd, matplotlib

# Report the interpreter and the version of every package imported above, so a
# mismatched or outdated install is visible at a glance. A missing package
# stops the cell at the import line with an error naming that package.
print('Python', sys.version)
print('Torch', torch.__version__)
print('Transformers', transformers.__version__)
print('NumPy', np.__version__)
print('pandas', pd.__version__)
print('Matplotlib', matplotlib.__version__)


## Sample plot

In [None]:
import matplotlib.pyplot as plt

# Draw a three-point line purely to confirm that matplotlib can render a figure.
fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 0])
ax.set_title('Environment OK — Sample Plot')
plt.show()


## Example PDB IDs (HIV-related)
We will render these in future milestones.

In [None]:
# PDB entries used as the running examples in this notebook.
pdb_ids = [
    '4MBS',  # CCR5 bound to maraviroc
    '3OE0',  # CXCR4 with antagonist IT1t
    '1GC1',  # HIV-1 gp120 bound to CD4
]
pdb_ids  # last expression in the cell, so the notebook displays the list


## Early EDA: load `data/pairs.csv`

In [None]:
import pandas as pd

# Read the identifier columns as text. With pandas' default type inference an
# id such as "000001" is parsed as the integer 1 (losing the zero padding), and
# a pdb_id column whose values all look numeric (e.g. "1E10") would be parsed
# as floats.
df = pd.read_csv("data/pairs.csv", dtype={"id": str, "pdb_id": str})
display(df.head())
print("Rows:", len(df))
