<a href="https://colab.research.google.com/github/sofiasilingardi16/Misogyny-Repeated-and-Reposted/blob/main/Coding_SUB_RQ_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install git+https://github.com/openai/CLIP.git
!pip install transformers torch torchvision pandas pillow tqdm python-dotenv

import os
import torch
import clip
import pandas as pd
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
from transformers import AutoImageProcessor, SiglipForImageClassification

# Run the models on the GPU when the runtime has one attached; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-2rpwkyud
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-2rpwkyud
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [13]:
# Mount Google Drive
from google.colab import drive
import os
from dotenv import load_dotenv
import pandas as pd

drive.mount('/content/drive')

# Record the dataset locations in a local .env file so both paths live in one place.
with open(".env", "w") as env_file:
    env_file.write("TSV_PATH=/content/drive/MyDrive/MA_THESIS/CODE_FOR_THESIS/test.tsv\n")
    env_file.write("IMAGE_DIR=/content/drive/MyDrive/MA_THESIS/test_images\n")

# Load the file that was just written. override=True makes a re-run pick up edited
# paths instead of keeping values already exported into this kernel's environment.
load_dotenv(".env", override=True)

TSV_PATH = os.getenv("TSV_PATH")
IMAGE_DIR = os.getenv("IMAGE_DIR")

# Echo the resolved paths so a wrong location is visible immediately.
print("TSV_PATH:", TSV_PATH)
print("IMAGE_DIR:", IMAGE_DIR)

# Fail early, before any parsing or model work, when the Drive paths are wrong.
# NOTE(review): the export cells further down save to ".../MyDrive/MA THESIS/"
# (with a space) while these paths use "MA_THESIS" — confirm which folder name
# actually exists on Drive.
if not TSV_PATH or not os.path.isfile(TSV_PATH):
    raise FileNotFoundError(
        f"TSV file not found: {TSV_PATH!r}. Check the folder name on Google Drive "
        "and update TSV_PATH above."
    )
if not IMAGE_DIR or not os.path.isdir(IMAGE_DIR):
    raise FileNotFoundError(
        f"Image directory not found: {IMAGE_DIR!r}. Check the folder name on Google Drive "
        "and update IMAGE_DIR above."
    )

# Read the tab-separated annotations; malformed lines are skipped rather than aborting the load.
df = pd.read_csv(TSV_PATH, sep="\t", on_bad_lines='skip')

# Keep only the rows whose label equals 1 and renumber them from zero.
df = df[df["label"] == 1].reset_index(drop=True)

# Resolve every file name to its full path inside the image directory.
df["image_path_full"] = df["file_name"].apply(lambda name: os.path.join(IMAGE_DIR, name))

# Preview the DataFrame
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
TSV_PATH: /content/drive/MyDrive/MA_THESIS/CODE_FOR_THESIS/test.tsv
IMAGE_DIR: /content/drive/MyDrive/MA_THESIS/test_images


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/MA_THESIS/CODE_FOR_THESIS/test.tsv'

In [None]:
# CLIP: load the ViT-B/32 model together with the image preprocessing callable
# that clip.load returns alongside it.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# SigLIP image classifier: build the processor and the model from the same
# checkpoint name, then move the model onto the selected device.
# NOTE(review): trust_remote_code=True allows the Hub repository to execute its
# own Python code at load time — confirm this checkpoint actually requires it.
siglip_model_name = "prithivMLmods/siglip2-x256-explicit-content"
siglip_processor = AutoImageProcessor.from_pretrained(siglip_model_name, trust_remote_code=True)
siglip_model = SiglipForImageClassification.from_pretrained(siglip_model_name)
siglip_model = siglip_model.to(device)

In [None]:
def compute_clip_sim(text, img_path):
    """Return the CLIP cosine similarity between a caption and an image.

    Both embeddings are L2-normalised before the dot product, so the result is
    a cosine similarity.

    Args:
        text: Caption to compare against the image.
        img_path: Path of the image file to open.

    Returns:
        The similarity as a Python float, or None when anything fails
        (unreadable image, invalid text, model error). The failure is printed
        with a "[CLIP ERROR]" prefix rather than raised, so a long batch run
        keeps going.
    """
    try:
        # The context manager closes the file handle as soon as preprocessing
        # has read the pixels; a bare Image.open leaves one handle open per image.
        with Image.open(img_path) as img:
            image = clip_preprocess(img).unsqueeze(0).to(device)
        # truncate=True clips captions longer than CLIP's context window instead
        # of raising, which previously caused such rows to be dropped as failures.
        tokens = clip.tokenize([text], truncate=True).to(device)
        with torch.no_grad():
            img_vec = clip_model.encode_image(image).float()
            txt_vec = clip_model.encode_text(tokens).float()
            img_vec /= img_vec.norm(dim=-1, keepdim=True)
            txt_vec /= txt_vec.norm(dim=-1, keepdim=True)
        return (img_vec @ txt_vec.T).item()
    except Exception as e:
        # Deliberate best-effort: report the failing path and let the caller drop the row.
        print(f"[CLIP ERROR] {img_path}: {e}")
        return None

def compute_explicitness(img_path, class_indices=(3, 4)):
    """Return the summed softmax probability of the selected classifier outputs.

    Args:
        img_path: Path of the image file to classify.
        class_indices: Positions in the classifier output whose probabilities
            are added together. The default (3, 4) keeps the original
            behaviour, which the code labelled "Pornography + Enticing or
            Sensual".
            NOTE(review): confirm these positions against
            siglip_model.config.id2label.

    Returns:
        The summed probability as a Python float, or None when anything fails.
        The failure is printed with a "[SIGLIP ERROR]" prefix rather than
        raised, so a long batch run keeps going.
    """
    try:
        # convert() produces an independent in-memory image, so the file handle
        # can be closed right away instead of staying open for every image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        inputs = siglip_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            logits = siglip_model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=1).squeeze().cpu().tolist()
        return sum(probs[i] for i in class_indices)
    except Exception as e:
        # Deliberate best-effort: report the failing path and let the caller drop the row.
        print(f"[SIGLIP ERROR] {img_path}: {e}")
        return None

In [None]:
# Register progress_apply on pandas objects so the slow model passes show a progress bar.
tqdm.pandas()

# Text-image similarity needs two columns per row, so this pass goes row by row.
df["clip_sim"] = df.progress_apply(
    lambda row: compute_clip_sim(row["text"], row["image_path_full"]),
    axis=1
)

# Explicitness only reads the image path, so map over that single column
# instead of building a full row object for every image.
df["explicitness_score"] = df["image_path_full"].progress_apply(compute_explicitness)

# Keep only rows where both scores were produced (the helpers return None on failure)
# and report how many rows were lost, so the drop is not silent.
df_clean = df.dropna(subset=["clip_sim", "explicitness_score"]).reset_index(drop=True)
print(
    f"Scored {len(df_clean)} of {len(df)} rows; "
    f"dropped {len(df) - len(df_clean)} where CLIP or SigLIP scoring failed."
)

In [None]:
# Cut-off for "high" text-image similarity: the 75th percentile of clip_sim.
threshold = df_clean["clip_sim"].quantile(0.75)

# Rows at or above the cut-off are labelled "amplify"; every other row is "obscure".
df_clean["alignment"] = (
    df_clean["clip_sim"]
    .ge(threshold)
    .map({True: "amplify", False: "obscure"})
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (30 bins) of the CLIP similarity scores with a KDE curve overlaid.
fig, ax = plt.subplots()
sns.histplot(df_clean["clip_sim"], bins=30, kde=True, ax=ax)
ax.set_title("CLIP Similarity Distribution")
ax.set_xlabel("clip_sim")
ax.set_ylabel("count")
plt.show()

In [None]:
# Box plot of the explicitness score, one box per alignment category.
fig, ax = plt.subplots()
sns.boxplot(data=df_clean, x="alignment", y="explicitness_score", ax=ax)
ax.set_title("Explicitness Score by Alignment Category")
ax.set_xlabel("Alignment")
ax.set_ylabel("Explicitness Score (SIGLIP)")
plt.show()

In [None]:
# Print the pairwise correlation matrix of the two scores
# (DataFrame.corr uses Pearson correlation by default).
print(df_clean.loc[:, ["clip_sim", "explicitness_score"]].corr())

In [None]:
# Pick four rows for manual inspection: the two highest-similarity "amplify" rows
# followed by the two lowest-similarity "obscure" rows.
examples = pd.concat(
    [
        df_clean.query("alignment == 'amplify'").nlargest(n=2, columns="clip_sim"),
        df_clean.query("alignment == 'obscure'").nsmallest(n=2, columns="clip_sim"),
    ]
)

# Write only the columns needed for manual review to a CSV in the working directory.
examples.loc[
    :, ["file_name", "text", "clip_sim", "explicitness_score", "alignment"]
].to_csv("manual_examples.csv", index=False)

In [None]:
# Plain-text preview of the selected example rows (head shows at most five).
print(examples.head(n=5))

In [None]:
# Export the scored rows to a CSV in the runtime's working directory, without the index column.
df_clean.to_csv(path_or_buf="subrq3_clip_results.csv", index=False)

In [None]:
# Destination on the mounted Google Drive for a copy of the results.
# NOTE(review): this folder is spelled "MA THESIS" (with a space) while the input
# paths in the data-loading cell use "MA_THESIS" — confirm which one exists on Drive.
drive_path = "/content/drive/MyDrive/MA THESIS/subrq3_clip_results.csv"

# Write the scored rows there, without the index column.
df_clean.to_csv(path_or_buf=drive_path, index=False)

In [None]:
# Second view of the clip_sim histogram (30 bins, KDE overlay) with default axis labels.
fig, ax = plt.subplots()
sns.histplot(df_clean["clip_sim"], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of CLIP Similarity")
plt.show()

In [None]:
# Second view of the explicitness box plot per alignment category, with default axis labels.
fig, ax = plt.subplots()
sns.boxplot(data=df_clean, x="alignment", y="explicitness_score", ax=ax)
ax.set_title("Explicitness by Alignment")
plt.show()

In [None]:
# Summary statistics of the explicitness score for each alignment category.
df_clean["explicitness_score"].groupby(df_clean["alignment"]).describe()

In [None]:
# Display the correlation matrix of the two scores as the cell's rich output.
df_clean.loc[:, ["clip_sim", "explicitness_score"]].corr()

In [None]:
# Destination on the mounted Google Drive for the manual-review examples.
# NOTE(review): this folder is spelled "MA THESIS" (with a space) while the input
# paths in the data-loading cell use "MA_THESIS" — confirm which one exists on Drive.
save_path = "/content/drive/MyDrive/MA THESIS/manual_examples.csv"

# Export the review columns only, without the index column.
examples.loc[
    :, ["file_name", "text", "clip_sim", "explicitness_score", "alignment"]
].to_csv(path_or_buf=save_path, index=False)

In [None]:
# Read the exported manual-review examples back from the working directory
# and print them as plain text.
import pandas as pd

print(pd.read_csv(filepath_or_buffer="manual_examples.csv"))