In [2]:
pip install git+https://github.com/openai/CLIP.git torch torchvision pillow scikit-learn tqdm joblib


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ffd45xw5
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-ffd45xw5
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=9e0b43a67f75c1b37f2ef90a5239286e943e81fad70f00aff84304195f8b20f4
  Stored in directory: /tmp/pip-ephem-wheel-cache-38_f38_z/wheels/35/3e/df/3d24cbfb3b6a06f17

In [None]:
from google.colab import files
import zipfile, os

# Upload ONE zip with /data/train/... and /data/predict/ inside
uploaded = files.upload()
zip_name = next(iter(uploaded.keys()))

# Extract while skipping macOS junk
with zipfile.ZipFile(zip_name, 'r') as z:
    members = [m for m in z.namelist()
               if "__MACOSX" not in m
               and not m.endswith("/")
               and not m.split("/")[-1].startswith("._")
               and not m.split("/")[-1].startswith(".DS_Store")]
    z.extractall(".", members=members)

# Show tree to confirm
!find data -maxdepth 3 -type d -print | sed 's|^|/|'


KeyboardInterrupt: 

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read
# through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import numpy as np, csv, random, os, shutil
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torch, clip
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import joblib

In [4]:
# Dataset locations on the mounted Drive.
# TRAIN_DIR holds the labelled training images (one subfolder per class);
# PREDICT_DIR holds the images to be classified.
TRAIN_DIR = "/content/drive/MyDrive/Data/Train"
PREDICT_DIR = "/content/drive/MyDrive/Data/Predict"


In [None]:
!find Data -maxdepth 3 -type d -print | sed 's|^|/|'

find: ‘Data’: No such file or directory


In [None]:
# Optional local override, for when the dataset sits in ./Data instead of on
# Drive. It is applied only when both folders exist, so running every cell in
# order never replaces working paths with missing ones.
if os.path.isdir("Data/Train") and os.path.isdir("Data/Predict"):
    TRAIN_DIR = "Data/Train"
    PREDICT_DIR = "Data/Predict"

In [5]:

# Reproducibility and hardware selection.
SEED = 42
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Where results are written.
OUTPUT_CSV = "results.csv"       # per-image predictions
COPY_TO = "output"               # root folder; images are copied into one subfolder per predicted label
UNKNOWN_LABEL = "notlabeled"     # label assigned when the top probability is below the threshold
UNKNOWN_THRESHOLD = 0.80         # raise for more "notlabeled" results, lower for fewer

# File suffixes (compared in lower case) that count as images.
EXTS = {
    ".jpg", ".jpeg", ".png", ".webp", ".bmp",
    ".tiff", ".gif", ".heic", ".heif",
}

# Seed every random number generator in use.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7d4b643847b0>

In [6]:
def is_junk(p: Path) -> bool:
    """Tell whether a path is OS metadata rather than a real data file.

    A path counts as junk when one of its components is exactly ``__MACOSX``
    or when its file name starts with ``._``, ``.DS_Store`` or any other dot.

    Args:
        p: Path to inspect; only its components and file name are examined.

    Returns:
        True if the path should be skipped, False otherwise.
    """
    if "__MACOSX" in p.parts:
        return True
    return p.name.startswith(("._", ".DS_Store", "."))



In [7]:
def list_imgs(root, exts=None, junk_filter=None):
    """Recursively collect image files under ``root`` in a stable order.

    Args:
        root: Directory (str or Path) to search recursively.
        exts: Optional iterable of file suffixes (with leading dot) to accept,
            compared case-insensitively. Defaults to the notebook-level
            ``EXTS`` set.
        junk_filter: Optional callable taking a Path and returning True for
            files to skip. Defaults to the notebook-level ``is_junk``.

    Returns:
        A sorted list of Path objects. Sorting matters because ``rglob`` yields
        entries in filesystem order, which would otherwise make the training
        order and the CSV row order vary between runs.
    """
    root = Path(root)
    allowed = EXTS if exts is None else {e.lower() for e in exts}
    skip = is_junk if junk_filter is None else junk_filter
    found = [p for p in root.rglob("*")
             if p.is_file() and (p.suffix.lower() in allowed) and not skip(p)]
    return sorted(found)



In [8]:
def embed(paths, model, preprocess, device):
    """Encode images into L2-normalised feature vectors, one row per path.

    Args:
        paths: Iterable of image paths.
        model: Object with an ``encode_image`` method (here the loaded CLIP model).
        preprocess: Callable turning a PIL image into a tensor for ``model``.
        device: Device string the input tensors are moved to.

    Returns:
        A float32 array of shape ``[len(paths), D]``. Rows stay index-aligned
        with ``paths``: an image that cannot be read or encoded gets an
        all-zero row, and every such failure is reported instead of being
        swallowed silently. An empty ``paths`` yields a ``[0, D]`` array.
    """
    paths = list(paths)
    # Width for placeholder rows: taken from the model if it exposes
    # `visual.output_dim`, otherwise 512; refined from real features below.
    dim = getattr(getattr(model, "visual", None), "output_dim", 512)
    if not paths:
        return np.zeros((0, dim), dtype=np.float32)

    feats = [None] * len(paths)
    failed = []
    for i, p in enumerate(tqdm(paths, desc="Embedding")):
        try:
            # Context manager closes the underlying file once the image is converted.
            with Image.open(p) as im:
                x = preprocess(im.convert("RGB")).unsqueeze(0).to(device)
            with torch.no_grad():
                # Cast to float32 so normalisation and the output dtype do not
                # depend on whether the model runs in half precision.
                f = model.encode_image(x).float()
                f = f / f.norm(dim=-1, keepdim=True)
            feats[i] = f.cpu().numpy()
            dim = feats[i].shape[-1]
        except Exception as exc:
            failed.append((p, exc))

    if failed:
        print(f"Warning: {len(failed)} of {len(paths)} image(s) could not be embedded; "
              "their rows are all-zero placeholders.")
        for p, exc in failed[:10]:
            print(f"  {p}: {type(exc).__name__}: {exc}")

    placeholder = np.zeros((1, dim), dtype=np.float32)  # keeps index alignment
    return np.concatenate([f if f is not None else placeholder for f in feats], axis=0)


In [9]:
# Load the CLIP "ViT-B/32" model on the selected device, together with the
# `preprocess` callable that is applied to each image before encoding.
model, preprocess = clip.load("ViT-B/32", device=DEVICE)


100%|███████████████████████████████████████| 338M/338M [00:22<00:00, 16.1MiB/s]


In [10]:
# (label, folder) pairs: the label is what appears in the results, the folder
# is where that class's training images live under TRAIN_DIR.
class_dirs = [
    ("Portraits", Path(TRAIN_DIR) / "Potraits"),  # folder = Potraits, label = Portraits
    ("Protests",  Path(TRAIN_DIR) / "Protests"),
    ("Casualities", Path(TRAIN_DIR) / "Casualities"),
    ("Posts",    Path(TRAIN_DIR) / "Posts")
]

# Verify every class folder exists
for cname, cdir in class_dirs:
    if not cdir.is_dir():
        raise SystemExit(f"Missing folder: {cdir} (expected {cname} folder under {TRAIN_DIR})")

# Build training set
classes = [c for c, _ in class_dirs]   # label names; position == integer label used in y
X_paths, y = [], []
for ci, (cname, cdir) in enumerate(class_dirs):
    ps = list_imgs(cdir)
    if len(ps) == 0:
        # A class without samples would leave its index out of y, so the
        # classifier's probability columns would no longer line up with
        # `classes`. Stop here rather than mislabel predictions later.
        raise SystemExit(f"No images in {cdir}: every class needs at least one training image")
    X_paths += ps
    y += [ci] * len(ps)

y = np.array(y)
X = embed(X_paths, model, preprocess, DEVICE)
assert len(X) == len(y), "embeddings and labels must stay index-aligned"

Embedding: 100%|██████████| 106/106 [01:22<00:00,  1.29it/s]


In [11]:
# Balanced per-class weights so classes with fewer training images are not
# dominated by larger ones.
labels_for_weight = np.arange(len(classes))
cw = compute_class_weight(class_weight='balanced', classes=labels_for_weight, y=y)
class_weight = {int(k):float(v) for k,v in zip(labels_for_weight, cw)}

# `multi_class` is intentionally not passed: "auto" was already the default,
# and the parameter is deprecated in recent scikit-learn releases, where
# passing it only triggers a FutureWarning.
clf = LogisticRegression(max_iter=2000, C=2.0, class_weight=class_weight)
clf.fit(X, y)
# Save the fitted classifier with its label names so it can be reloaded later.
joblib.dump({"clf": clf, "classes": classes}, "clip_probe.joblib")




['clip_probe.joblib']

In [12]:
# Embed every image under PREDICT_DIR and score it against each class.
P_paths = list_imgs(PREDICT_DIR)
if not P_paths:
    raise SystemExit(f"No images found under {PREDICT_DIR}")
PX = embed(P_paths, model, preprocess, DEVICE)
prob = clf.predict_proba(PX)            # [N, len(classes)]
# Column i must correspond to classes[i]; otherwise the labels below are wrong.
assert prob.shape[1] == len(classes), "classifier outputs do not match the class list"
maxp = prob.max(axis=1)                 # top probability per image
pred_idx = prob.argmax(axis=1)          # index into `classes` of the top class

Embedding: 100%|██████████| 2940/2940 [03:31<00:00, 13.93it/s]


In [13]:
# One output subfolder per class, plus one for low-confidence images.
os.makedirs(COPY_TO, exist_ok=True)
for c in classes + [UNKNOWN_LABEL]:
    os.makedirs(os.path.join(COPY_TO, c), exist_ok=True)

copy_errors = []   # (source path, exception) for every copy that failed
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    # One probability column per class (not just the first two).
    w.writerow(["path", "pred_label", "confidence"] + [f"p({c})" for c in classes])
    for p, mp, pi, probs in zip(P_paths, maxp, pred_idx, prob):
        # Below the threshold the image is filed under UNKNOWN_LABEL.
        label = classes[pi] if mp >= UNKNOWN_THRESHOLD else UNKNOWN_LABEL
        # NOTE(review): copies are flat by file name, so two source images with
        # the same name that land in the same label folder collide and only the
        # first one is copied.
        dst = Path(COPY_TO, label, p.name)
        if not dst.exists():
            try:
                shutil.copy2(p, dst)
            except Exception as exc:
                # Best effort: keep going, but remember the failure for the report.
                copy_errors.append((p, exc))
        w.writerow([str(p), label, round(float(mp),4)] +
                   [round(float(v),4) for v in probs])

print(f"Done. Wrote {OUTPUT_CSV} and organized copies in '{COPY_TO}/'.")
print(f"Device: {DEVICE} | Threshold: {UNKNOWN_THRESHOLD}")
if copy_errors:
    print(f"Warning: {len(copy_errors)} file(s) could not be copied:")
    for src, exc in copy_errors[:10]:
        print(f"  {src}: {type(exc).__name__}: {exc}")


Done. Wrote results.csv and organized copies in 'output/'.
Device: cuda | Threshold: 0.8


In [16]:
import csv

# Destination for the CSV that carries one probability column per class.
OUTPUT_CSV = "results_4.csv"

header = ["path", "pred_label", "confidence"]
header.extend(f"p({c})" for c in classes)

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as fh:
    writer = csv.writer(fh)
    writer.writerow(header)

    for img_path, top_p, top_idx, class_probs in zip(P_paths, maxp, pred_idx, prob):
        # Images whose top probability misses the threshold get the unknown label.
        if top_p >= UNKNOWN_THRESHOLD:
            label = classes[top_idx]
        else:
            label = UNKNOWN_LABEL
        fields = [str(img_path), label, round(float(top_p), 4)]
        fields.extend(round(float(v), 4) for v in class_probs)
        writer.writerow(fields)

print(f"CSV written to {OUTPUT_CSV}")


CSV written to results_4.csv
