# Vision Pipeline - Google Colab Controller

This notebook allows you to run the vision pipeline on Google Colab using a GPU.
It mounts Google Drive for storage and sets up the environment automatically.

## 1. Environment Setup

In [None]:
# @title Clone Repository & Install Dependencies
import os

# --- Configuration ---
REPO_URL = "https://github.com/ppopgi-pang/ppopgipang-vision-auto-labeler.git" # @param {type:"string"}
BRANCH = "main" # @param {type:"string"}
# ---------------------

# Clone repository
repo_name = REPO_URL.split("/")[-1].replace(".git", "")
if not os.path.exists(repo_name):
    print(f"Cloning {REPO_URL}...")
    !git clone {REPO_URL}
else:
    print(f"Repository {repo_name} already exists.")

# Move into the project directory
%cd {repo_name}

# Install dependencies
print("Installing dependencies...")
if os.path.exists("requirements.txt"):
    !pip install -r requirements.txt
else:
    print("requirements.txt not found, installing default dependencies...")
    # Fallback based on pyproject.toml inspection
    !pip install pyyaml pydantic pydantic-settings python-dotenv requests pillow numpy opencv-python-headless ImageHash
    !pip install torch torchvision transformers ultralytics openai playwright nest_asyncio

print("Installing system dependencies for Playwright...")
!apt-get update
!apt-get install -y libatk1.0-0 libatk-bridge2.0-0 libgtk-3-0 libnss3 libx11-xcb1 \\
  libxcomposite1 libxdamage1 libxrandr2 libgbm1 libasound2 libpangocairo-1.0-0 \\
  libpango-1.0-0 libcups2 libdrm2 libxkbcommon0 libxfixes3 libxext6
print("Installing Playwright browsers...")
!playwright install chromium

print("Done.")


## 1.1 API Keys Configuration
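**Important:** Run this cell BEFORE importing any project modules to ensure keys are loaded correctly. The cell reads `OPENAI_API_KEY` from Colab Secrets (the key icon in the left sidebar), so add the secret there and grant this notebook access to it first.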

In [None]:
import os

# @markdown ### API Keys
# @markdown Store `OPENAI_API_KEY` in the Colab Secrets panel (key icon in the left sidebar) and grant this notebook access. These are required for crawling and LLM verification.
from google.colab import userdata

# Read the key from Colab Secrets. userdata.get raises when the secret does not
# exist or when this notebook has not been granted access to it; re-raise with
# instructions so the failure is actionable.
try:
    OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as exc:
    raise RuntimeError(
        "Could not read OPENAI_API_KEY from Colab Secrets. Add it in the Secrets panel "
        "and enable notebook access, then re-run this cell."
    ) from exc

# An empty secret would otherwise be exported silently and only fail much later,
# inside the pipeline.
if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
    raise RuntimeError("OPENAI_API_KEY is empty. Set a real key in Colab Secrets and re-run this cell.")

# Inject into environment
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Validation Warning
if "YOUR_" in OPENAI_API_KEY:
    print("WARNING: Default placeholder detected for OPENAI_API_KEY. Please update with actual keys.")

print("Environment variables set.")

## 2. GPU Verification

In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this runtime; the device name
# is only queried when one is available.
has_gpu = torch.cuda.is_available()
if not has_gpu:
    print("WARNING: GPU not found. Please enable it in Runtime > Change runtime type > Hardware accelerator > GPU")
else:
    print(f"SUCCESS: GPU is available - {torch.cuda.get_device_name(0)}")

## 3. Google Drive Mount & Path Setup

In [None]:
from google.colab import drive
import sys
import os

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# The current working directory is taken as the project root. The nested
# 'vision_pipeline' directory is registered as well so that imports written
# relative to it ('pipelines', 'domain', etc.) can be resolved.
PROJECT_ROOT = os.getcwd()
PIPELINE_ROOT = os.path.join(PROJECT_ROOT, 'vision_pipeline')

for _import_root in (PROJECT_ROOT, PIPELINE_ROOT):
    if _import_root in sys.path:
        continue  # already registered, e.g. when the cell is re-run
    sys.path.append(_import_root)
    print(f"Added {_import_root} to sys.path")

# Output directory on Drive, created on first use.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/vision_pipeline_data" # @param {type:"string"}
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

# Exported as OUTPUT_DIR, which is meant to override the default in config.py.
os.environ["OUTPUT_DIR"] = BASE_OUTPUT_DIR
print(f"Output directory set to: {BASE_OUTPUT_DIR}")

## 4. Pipeline Execution

In [None]:
# nest_asyncio patches asyncio so that an already-running event loop can be
# re-entered. It is applied before the project modules are imported.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the pipeline drives asyncio code (e.g. Playwright) — confirm.
import nest_asyncio
nest_asyncio.apply()

# Project entry points: the job description, the runner that executes it, and
# the settings object whose output_dir is read below.
from vision_pipeline.domain.job import Job
from vision_pipeline.services.pipeline_runner import PipelineRunner
from vision_pipeline.config import settings
import logging

# --- Run Configuration ---
# KEYWORDS: a single comma-separated string of search phrases; it is split on ','
#           further down in this cell, so a phrase may contain spaces.
# TARGET_OBJECT: passed to Job as target_class and used in the job ID.
# LIMIT: passed to Job as limit.
KEYWORDS = "anime character doll, anime figure doll, anime plush doll, anime character plush, anime collectible doll, anime vinyl figure, anime chibi doll, anime character toy, anime action figure, anime mascot doll, アニメ キャラクター 人形, アニメ フィギュア, アニメ ぬいぐるみ, キャラクター フィギュア, キャラクター ぬいぐるみ, 美少女 フィギュア, デフォルメ フィギュア, アニメ グッズ 人形, キャラクター ドール, フィギュア 写真, 애니 캐릭터 인형, 애니 피규어, 캐릭터 인형, 캐릭터 피규어, 애니메이션 인형, 애니 굿즈 인형, 캐릭터 굿즈 피규어, 미소녀 피규어, SD 피규어, 애니 캐릭터 장난감, anime doll figure, anime plush toy, anime figure collection, anime doll collection, anime figure photography, anime toy figure, anime doll toy, anime character merchandise doll, anime figure close up, anime doll product photo, anime figure real photo, anime doll real life, anime figure unboxing, anime figure review, anime plush doll photo, anime figure shelf, anime figure display, anime doll collection photo, anime figure product shot, anime figure desk setup, anime plush, anime stuffed doll, character plush doll, anime soft toy, anime mascot plush, anime plush collection, character plush toy, anime doll plush, anime plush figure, anime plush photo, anime scale figure, anime PVC figure, anime resin figure, anime Nendoroid, anime garage kit, anime action figure toy, anime figure model, anime collectible figure, anime statue figure, anime figure closeup, Japanese character doll, Japanese anime figure, otaku figure collection, otaku room figure, anime goods figure, anime hobby figure, anime character merchandise, anime toy collection, anime doll merchandise, anime figure goods, anime doll photo site:jp, anime figure photo site:jp, アニメ フィギュア 写真, キャラクター 人形 写真, anime plush photo site:jp, フィギュア 実物 写真, anime figure blog, anime doll review blog, フィギュア レビュー, アニメ グッズ 写真" # @param {type:"string"}
TARGET_OBJECT = "doll" # @param {type:"string"}
LIMIT = 50000 # @param {type:"integer"}

# Parse keywords: KEYWORDS is one comma-separated string. Each entry is trimmed
# of surrounding whitespace and blank entries are dropped, so a phrase such as
# "anime plush doll" stays a single keyword.
keyword_list = [kw for kw in (part.strip() for part in KEYWORDS.split(',')) if kw]
if not keyword_list:
    # A blank KEYWORDS field would otherwise start a job with an empty search term.
    raise ValueError("KEYWORDS must contain at least one non-empty, comma-separated search term.")

# Echo the effective run configuration before starting.
print(f"Processing Keywords: {keyword_list}")
print(f"Target Object: {TARGET_OBJECT}")
print(f"Limit: {LIMIT}")
print(f"Saving to: {settings.output_dir}")

def run_pipeline():
    """Build a Job from this cell's run configuration and hand it to PipelineRunner.

    Reads the module-level ``keyword_list``, ``TARGET_OBJECT``, ``LIMIT`` and
    ``settings``. The job ID is ``job_<TARGET_OBJECT>_<first keyword>`` with the
    keyword's spaces replaced by underscores. An exception raised by
    ``runner.run`` is printed and logged with its traceback, not propagated.
    Returns None.
    """
    # Job ID from the target object and the first keyword.
    keyword_slug = "_".join(keyword_list[0].split(' '))
    job_id = f"job_{TARGET_OBJECT}_{keyword_slug}"

    # Only informational: an existing job directory is reported, never touched here.
    existing_job_dir = os.path.join(settings.output_dir, job_id)
    if os.path.exists(existing_job_dir):
        print(f"\n[INFO] Job directory {existing_job_dir} already exists. Pipeline will attempt to resume or skip accomplished steps.")

    # Describe the job.
    job_fields = {
        "keywords": keyword_list,
        "target_class": TARGET_OBJECT,
        "limit": LIMIT,
        "job_id": job_id,
    }
    job = Job(**job_fields)

    # Execute it; failures are reported instead of raised.
    runner = PipelineRunner()
    try:
        runner.run(job)
        print("\nWith Great Success! Pipeline finished.")
    except Exception as failure:
        print(f"\n[ERROR] Pipeline failed: {failure}")
        logging.exception("Pipeline Failure")

# Execute
# When a notebook cell runs, __name__ is "__main__", so this guard passes and the
# pipeline starts as soon as the cell is executed.
if __name__ == "__main__":
    run_pipeline()