## Complete Multi-Round Labeling Notebook

1. Initialization Cell

In [None]:
import os
import time
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

# --------------------------------
# SETUP
# --------------------------------
# Every path below is derived from the current working directory.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "..", "data")

# Input for this round, the growing master table, and the "still unlabeled" output.
SOURCE_FILE = os.path.join(DATA_DIR, "remaining_to_label_14.csv")
MASTER_FILE = os.path.join(DATA_DIR, "full_covid_abuse.csv")
REMAINING_FILE = os.path.join(DATA_DIR, "remaining_to_label_15.csv")

# Working directories for the batch request chunks and the downloaded outputs.
BATCH_DIR = os.path.join(DATA_DIR, "batches_14")
CHUNK_DIR = os.path.join(BATCH_DIR, "chunks")
OUTPUT_DIR = os.path.join(BATCH_DIR, "outputs")

# Create the data directory first, then the batch directories nested under it.
for folder in (DATA_DIR, BATCH_DIR, CHUNK_DIR, OUTPUT_DIR):
    os.makedirs(folder, exist_ok=True)


# --------------------------------
# LOAD API KEY
# --------------------------------
# The key is read from a dotenv file kept outside the notebook, never hardcoded.
env_path = os.path.join(BASE_DIR, "..", "credentials", "openai.env")
load_dotenv(env_path)

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Request parameters shared by every batch request.
MODEL_NAME = "gpt-4o-mini"
TEXT_COL = "original_text"
TEMP = 0

print("Initialization complete.")


Initialization complete.


2. Load Data or Resume State

In [80]:
# --------------------------------------
# Resume from MASTER_FILE when it exists; otherwise bootstrap it from SOURCE_FILE
# --------------------------------------
if not os.path.exists(MASTER_FILE):
    # First round: start fresh with empty label columns
    src = pd.read_csv(SOURCE_FILE, dtype=str)
    for label_col in ("is_abusive", "new_covid_terms", "label_round"):
        src[label_col] = None

    master_df = src.copy()
    print(f"Initialized master: {len(master_df)} rows")

else:
    master_df = pd.read_csv(MASTER_FILE, dtype=str)
    print(f"Loaded master: {len(master_df)} rows")


# --------------------------------------
# Rows without a label are what is left to do; shuffle them with a fixed seed
# --------------------------------------
unlabeled_mask = master_df["is_abusive"].isnull()
remaining_df = (
    master_df[unlabeled_mask]
    .copy()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Remaining to label:", len(remaining_df))

remaining_df.to_csv(REMAINING_FILE, index=False)


Loaded master: 1884097 rows
Remaining to label: 140000


3. Define Prompt

In [81]:
# System message sent with every batch request. It asks the model for a single
# JSON object with two keys: "abusive" (0 or 1) and "new" (a list of COVID-era
# terms found in the tweet, possibly empty). The parsing cell reads exactly
# these two keys, so keep the key names in sync if this prompt is edited.
SYSTEM_PROMPT = """
You are an expert in abusive language detection and content moderation.

Given ONE tweet, perform:

1. Abusive vs Non-abusive:
   - Return "abusive": 1 if the tweet contains insulting, demeaning, harassing,
     hateful, threatening, or abusive language (explicit or implicit).
   - Otherwise return "abusive": 0.

2. Detect newly coined pandemic or COVID-related terms, hashtags, or neologisms,
   such as "scamdemic", "plandemic", "covidiot", "anti-masker", "no-mask",
   "anti-vax", "anti-vaxxer", and similar COVID-era conspiracy or slang terms.
   Return them as a JSON list of strings (empty list if none).

Return ONLY a single JSON object in the format:
{
  "abusive": 0 or 1,
  "new": ["term1", "term2"]
}

Do NOT return arrays. Do NOT return explanations. Do NOT add extra fields.
Use double quotes only.
"""


4. Generate JSONL Batch Request

In [82]:
JSONL_MASTER = os.path.join(BATCH_DIR, "batch_master.jsonl")

# Write one chat-completion request per unlabeled tweet, one JSON object per line.
# The two needed columns are iterated directly instead of using
# DataFrame.iterrows(): iterrows builds a Series per row (slow for large frames)
# and does not preserve dtypes across the row, which could turn a numeric
# tweet_id into a float before it is stringified.
with open(JSONL_MASTER, "w", encoding="utf-8") as f:
    for tweet_id, text in zip(remaining_df["tweet_id"], remaining_df[TEXT_COL]):

        req = {
            # custom_id is how each response is matched back to its tweet later.
            "custom_id": str(tweet_id),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": MODEL_NAME,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    # Newlines are flattened so the tweet reads as a single line.
                    {"role": "user", "content": str(text).replace("\n", " ")}
                ],
                "temperature": TEMP
            }
        }

        f.write(json.dumps(req) + "\n")

print("JSONL ready:", JSONL_MASTER)


JSONL ready: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\batches_14\batch_master.jsonl


5. Chunk the JSONL

In [83]:
# Upper bound on the number of requests written to a single chunk file.
MAX_LINES = 20000

chunk_files = []
writer = None

with open(JSONL_MASTER, "r", encoding="utf-8") as reader:
    for position, record in enumerate(reader):
        # Every MAX_LINES-th record starts a new chunk file.
        if position % MAX_LINES == 0:
            if writer is not None:
                writer.close()
            # Chunks are numbered from 0 in creation order.
            chunk_path = os.path.join(CHUNK_DIR, f"chunk_{len(chunk_files)}.jsonl")
            writer = open(chunk_path, "w", encoding="utf-8")
            chunk_files.append(chunk_path)

        writer.write(record)

# Close the last chunk; nothing was opened if the master file was empty.
if writer is not None:
    writer.close()

print("Chunks:", len(chunk_files))


Chunks: 7


6. Upload + Create Batch Jobs

In [84]:
batch_ids = []

def upload_and_create(path):
    """Upload one JSONL chunk and start a batch job for it.

    Parameters
    ----------
    path : str
        Path of the JSONL chunk file to upload.

    Returns
    -------
    str
        The id of the created batch job.
    """
    # Open the chunk in a context manager so the handle is closed as soon as the
    # upload call returns, instead of leaking one open file per chunk.
    with open(path, "rb") as chunk:
        upload = client.files.create(file=chunk, purpose="batch")
    batch = client.batches.create(
        input_file_id=upload.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )
    return batch.id

# Submit one batch job per chunk and remember its id for polling/downloading.
for fp in chunk_files:
    b_id = upload_and_create(fp)
    batch_ids.append(b_id)
    print("Started:", b_id)


Started: batch_6937a4176458819092abb1e1c80fd813
Started: batch_6937a42dd21081909d18771244f3d8fe
Started: batch_6937a43cc55481908f6873b09abc0c11
Started: batch_6937a44f7b9c8190be60bf4c50237d3d
Started: batch_6937a45ebe608190920dc83fcbab0384
Started: batch_6937a46cec8c81908f5dd9af4cf08126
Started: batch_6937a47c32f081909115e22187c11849


7. Poll Until Completion

In [85]:
print("Polling for completion ...")

# Statuses after which a batch will not change any more. "expired" and
# "cancelled" are included alongside "completed" and "failed": a batch that
# ends in either of them would otherwise keep this loop running forever.
TERMINAL_STATUSES = {"completed", "failed", "expired", "cancelled"}

pending = batch_ids.copy()

while pending:
    print("Checking:", len(pending), "batches")
    still_running = []
    for b in pending:
        s = client.batches.retrieve(b)
        print(b, "→", s.status)
        if s.status not in TERMINAL_STATUSES:
            still_running.append(b)

    pending = still_running
    # Wait only when there is something left to check, so the cell returns
    # immediately once the last batch has reached a terminal status.
    if pending:
        time.sleep(120)

print("All batches finished.")


Polling for completion ...
Checking: 7 batches
batch_6937a4176458819092abb1e1c80fd813 → in_progress
batch_6937a42dd21081909d18771244f3d8fe → in_progress
batch_6937a43cc55481908f6873b09abc0c11 → in_progress
batch_6937a44f7b9c8190be60bf4c50237d3d → validating
batch_6937a45ebe608190920dc83fcbab0384 → in_progress
batch_6937a46cec8c81908f5dd9af4cf08126 → in_progress
batch_6937a47c32f081909115e22187c11849 → validating
Checking: 7 batches
batch_6937a4176458819092abb1e1c80fd813 → in_progress
batch_6937a42dd21081909d18771244f3d8fe → in_progress
batch_6937a43cc55481908f6873b09abc0c11 → in_progress
batch_6937a44f7b9c8190be60bf4c50237d3d → in_progress
batch_6937a45ebe608190920dc83fcbab0384 → in_progress
batch_6937a46cec8c81908f5dd9af4cf08126 → in_progress
batch_6937a47c32f081909115e22187c11849 → in_progress
Checking: 7 batches
batch_6937a4176458819092abb1e1c80fd813 → in_progress
batch_6937a42dd21081909d18771244f3d8fe → in_progress
batch_6937a43cc55481908f6873b09abc0c11 → in_progress
batch_6937a44f

8. Download Output

In [86]:
output_files = []

# Fetch the result file of every batch that produced one and store it locally
# under the batch id.
for batch_id in batch_ids:
    batch_info = client.batches.retrieve(batch_id)
    file_id = batch_info.output_file_id

    if file_id:
        target = os.path.join(OUTPUT_DIR, f"{batch_id}.jsonl")
        payload = client.files.content(file_id).read()

        # Written as raw bytes, exactly as returned by the API.
        with open(target, "wb") as sink:
            sink.write(payload)

        output_files.append(target)
        print("Downloaded:", target)
    else:
        print("No output for:", batch_id)


Downloaded: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\batches_14\outputs\batch_6937a4176458819092abb1e1c80fd813.jsonl
Downloaded: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\batches_14\outputs\batch_6937a42dd21081909d18771244f3d8fe.jsonl
Downloaded: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\batches_14\outputs\batch_6937a43cc55481908f6873b09abc0c11.jsonl
Downloaded: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\batches_14\outputs\batch_6937a44f7b9c8190be60bf4c50237d3d.jsonl
Downloaded: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\batches_14\outputs\batch_6937a45ebe608190920dc83fcbab0384.jsonl
Downloaded: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\batches_14\outputs\batch_6937a46cec8c81908f5dd9af4cf08126.jsonl
Downloaded: c:\Users\Sama\Documents\Acti

9. Parse and Merge Results Into Master

In [87]:
results = {}
bad = []

# ------------------------------------
# Parse each output file safely
# ------------------------------------
for fp in output_files:
    # The output files were saved as raw bytes; decode them explicitly as UTF-8
    # (the standard JSON encoding) rather than with the platform default codec,
    # which on Windows is a locale code page and can fail or garble non-ASCII text.
    with open(fp, "r", encoding="utf-8") as f:
        for line in f:
            try:
                resp = json.loads(line)
            except Exception:
                print("FAILED to load JSON line, skipping:", line[:200])
                continue

            # A line that is valid JSON but not an object carries no custom_id.
            if not isinstance(resp, dict):
                print("FAILED to load JSON line, skipping:", line[:200])
                continue

            tid = resp.get("custom_id")

            # Any missing or null level in the response lands in the except branch.
            try:
                content = resp["response"]["body"]["choices"][0]["message"]["content"]
            except Exception:
                bad.append((tid, "MISSING content"))
                results[tid] = {"abusive": None, "new": []}
                continue

            # Try to parse JSON returned by the model
            try:
                parsed = json.loads(content)
            except Exception:
                bad.append((tid, content))
                parsed = {"abusive": None, "new": []}
            else:
                # Valid JSON that is not an object (e.g. a list or a number) cannot
                # be read with .get() later on; record it as a bad response too.
                if not isinstance(parsed, dict):
                    bad.append((tid, content))
                    parsed = {"abusive": None, "new": []}

            results[tid] = parsed

print("======================================")
print("Parsed rows:", len(results))
print("Bad JSON rows:", len(bad))
print("======================================")

# ======================================================
# Persist unparseable responses so they can be inspected by hand
# ======================================================
if len(bad) > 0:
    bad_path = os.path.join(DATA_DIR, "bad_json_responses_round.csv")
    bad_df = pd.DataFrame(bad, columns=["tweet_id", "raw_content"])
    bad_df.to_csv(bad_path, index=False)
    print("Saved bad responses to:", bad_path)


# ======================================================
# BUILD results_df: one record per parsed response
# ======================================================
records = []
for tid, v in results.items():
    records.append(
        {
            "tweet_id": tid,
            "is_abusive": v.get("abusive"),
            # The term list is stored as a JSON string so it fits in one CSV cell.
            "new_covid_terms": json.dumps(v.get("new", [])),
        }
    )

results_df = pd.DataFrame(records)

print("results_df rows:", len(results_df))


# ======================================================
# Make sure label_round exists and is numeric
# ======================================================
if "label_round" not in master_df.columns:
    master_df["label_round"] = None

master_df["label_round"] = pd.to_numeric(master_df["label_round"], errors="coerce")


# ======================================================
# The next round is one past the highest round seen so far
# (or round 1 when no row carries a round yet)
# ======================================================
current_max = master_df["label_round"].max()
current_max = 0 if pd.isna(current_max) else current_max

next_round = int(current_max) + 1

print("Current max round:", current_max)
print("This batch is round:", next_round)


# ======================================================
# Stamp every result of this batch with that round
# ======================================================
results_df["label_round"] = next_round

Parsed rows: 140000
Bad JSON rows: 0
results_df rows: 140000
Current max round: 13.0
This batch is round: 14


10. Merge Without Overwriting Past Labels

In [88]:
# Merge this round's results next to the existing columns; the incoming values
# get a "_new" suffix so nothing is overwritten by the merge itself.
master_df = master_df.merge(
    results_df,
    on="tweet_id",
    how="left",
    suffixes=("", "_new")
)

# Rows that had no label before and actually received one in this round.
# Rows whose response could not be parsed have a null "is_abusive_new" and are
# excluded here. Otherwise they would be stamped with the placeholder terms
# and this round's number while still unlabeled, and the fillna below would then
# refuse to replace those values when they are labeled in a later round.
newly_labeled = master_df["is_abusive"].isna() & master_df["is_abusive_new"].notna()

# Only fill missing values, and only on the newly labeled rows.
for col in ["is_abusive", "new_covid_terms", "label_round"]:
    filled = master_df[col].fillna(master_df[f"{col}_new"])
    master_df[col] = master_df[col].where(~newly_labeled, filled)

# Drop temp
master_df = master_df.drop(columns=["is_abusive_new", "new_covid_terms_new", "label_round_new"], errors="ignore")

master_df.to_csv(MASTER_FILE, index=False)
print("Master saved:", MASTER_FILE)


Master saved: c:\Users\Sama\Documents\Active learninig\experiments\notebooks\..\data\covid_tweets\classified_master.csv


11. Generate Next Iteration Inputs

In [89]:
# Whatever is still unlabeled after the merge becomes the input of the next round,
# shuffled with a fixed seed.
still_unlabeled = master_df["is_abusive"].isnull()
remaining_df = master_df[still_unlabeled].copy().sample(frac=1, random_state=42)

remaining_df.to_csv(REMAINING_FILE, index=False)

print("Remaining for next round:", len(remaining_df))


Remaining for next round: 0


12. Summary Output

In [91]:
# Final counts for this round, gathered first and printed in one pass.
summary_lines = [
    ("Master total rows:", len(master_df)),
    ("Labeled rows:", master_df["is_abusive"].notnull().sum()),
    ("Remaining rows:", len(remaining_df)),
    ("Latest round:", master_df["label_round"].max()),
]

print("\n================ SUMMARY ================\n")
for caption, value in summary_lines:
    print(caption, value)
print("\n========================================\n")




Master total rows: 1884097
Labeled rows: 1884097
Remaining rows: 0
Latest round: 14.0


