# Step 1 - Install the required dependencies, set up W&B, and make sure the Python version is 3.10 or above

In [None]:
!pip install -q wandb datasets transformers evaluate tqdm emoji regex pandas pyarrow scikit-learn

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import wandb

# Never paste an API key into the notebook: it ends up in version control and
# in shared copies. Called without a key, wandb.login() picks up the
# WANDB_API_KEY environment variable or previously stored credentials, and
# otherwise prompts for the key interactively.
wandb.login()

In [None]:
import sys

# Report the interpreter that is actually running this kernel. `!python` would
# report whichever `python` comes first on PATH, which can be a different one.
# This lab expects Python 3.10 or above.
print(sys.version)

In [None]:
#imports and config:
import re, regex, emoji
import pandas as pd
import numpy as np
import tqdm

import wandb
from datasets import load_dataset
from transformers import pipeline
import evaluate


# WANDB CONFIG
# These three values are passed to wandb.init() in Step 6 as project, entity and name.
PROJECT = "mlip-lab4-slices-2025"    # W&B project the run is logged to
ENTITY = None                        # NOTE(review): None presumably selects your default W&B entity — confirm
RUN_NAME = "tweet_eval_roberta_vs_gpt2"  # display name of the run


In [None]:
# Models to compare
# Key:   short name written to the `model` column of the prediction tables.
# Value: model id handed to transformers.pipeline(model=...) in run_pipeline.
MODELS = {
    "roberta": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "gpt2":    "LYTinn/finetuning-sentiment-model-tweet-gpt2",
}

In [None]:

# Label normalization 
# ID2LABEL converts the dataset's integer labels into strings (applied in Step 2).
ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"}
# HF_LABEL_MAP converts generic "LABEL_n" pipeline outputs into the same strings;
# run_pipeline leaves any label that is not a key here unchanged.
HF_LABEL_MAP = {"LABEL_0":"negative","LABEL_1":"neutral","LABEL_2":"positive"}

USE_HF_DATASET = True   # set False to use tweets.csv fallback
SEED = 42
# Seeds NumPy's global random generator.
# NOTE(review): no cell visible here draws random numbers from NumPy — confirm it is needed.
np.random.seed(SEED)


# Step 2 - Load a dataset from Hugging Face

In [None]:
if not USE_HF_DATASET:
    # Fallback: read a local CSV and tolerate stray whitespace in its header.
    df = pd.read_csv("tweets.csv")
    stripped_names = {col: col.strip() for col in df.columns}
    df = df.rename(columns=stripped_names)
    # Ensure it has 'text' and 'label' columns
    assert {"text","label"}.issubset(df.columns), "tweets.csv must include text,label"
else:
    # Take the first 500 rows of the tweet_eval sentiment test split and
    # replace the integer labels with their string names.
    ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")
    df = (
        pd.DataFrame(ds["test"])
        .head(500)
        .assign(label=lambda frame: frame["label"].map(ID2LABEL))
    )

# Keep only the two columns used downstream and drop incomplete rows.
df = df.loc[:, ["text", "label"]].dropna().reset_index(drop=True)
df.head(3)


# Step 3 - Add MetaData for slicing

In this step, you'll add **5 metadata columns** to your dataset to enable slicing later in **Weights & Biases (W&B)**.

You can use:
- **Value matching** (e.g., tweets with hashtags)
- **Regex** (e.g., strong positive words like *love*, *great*)
- **Heuristics** (e.g., emoji count, all-caps detection, tweet length)

These columns will be carried forward when you run inference in Step 4 and will appear in the final `predictions_table` that you log to W&B in Step 6.

---

Once inference is complete, your W&B table (`df_long`) will include:
- Original tweet text
- Ground-truth labels
- Model predictions and confidence scores
- All slicing metadata you define here

Later, in the W&B UI, you can use the ➕ `Filter` option in the table view to explore model behavior across these slices.

In [None]:
# Step 3 – Add Slicing Metadata
# Add new columns for filtering in W&B later

# Example: count emojis in each tweet
def count_emojis(text):
    """Return the number of emojis contained in ``text``.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Counting is delegated to ``emoji.emoji_count`` so that an emoji
    made of several code points (for example a flag, a skin-tone variant or a
    ZWJ sequence) is counted once. Testing each code point on its own against
    ``emoji.EMOJI_DATA`` miscounts such emojis.
    """
    return emoji.emoji_count(str(text))

# New slicing column: number of emojis per tweet.
df["emoji_count"] = df["text"].apply(count_emojis).astype(int)

df.head(5)

In [None]:
# Transformers requires a backend (PyTorch/TensorFlow/Flax). We'll use PyTorch.
import sys

# Only a failed torch import is reported as "install PyTorch". Any other
# problem (for example a missing transformers package) surfaces with its own
# error instead of being mislabelled.
try:
    import torch
except ImportError as e:
    raise RuntimeError("Install PyTorch before proceeding: pip install torch torchvision torchaudio") from e

import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Python:", sys.executable)

#  Step 4 – Run Inference on Tweets Using Two Sentiment Models

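In this step, you'll run inference on your dataset with the two Hugging Face sentiment-analysis models defined in `MODELS`: `cardiffnlp/twitter-roberta-base-sentiment-latest` (RoBERTa) and `LYTinn/finetuning-sentiment-model-tweet-gpt2` (GPT-2).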

In [None]:
from tqdm.auto import tqdm

def run_pipeline(model_id, texts):
    """Classify every text with the given Hugging Face model.

    Builds a ``text-classification`` pipeline for ``model_id`` and calls it
    once per text, keeping the first entry of each result.

    Args:
        model_id: model identifier passed to ``pipeline(model=...)``.
        texts: iterable of input texts.

    Returns:
        A ``(labels, scores)`` pair of lists aligned with ``texts``. Labels
        are translated through ``HF_LABEL_MAP`` when present there and kept
        unchanged otherwise; scores are Python floats.
    """
    classifier = pipeline(
        "text-classification",
        model=model_id,
        truncation=True,
        framework="pt",
        device=-1,
    )
    labels = []
    scores = []
    for tweet in tqdm(texts, desc=f"Infer: {model_id}"):
        top = classifier(tweet)[0]
        raw_label = top["label"]
        labels.append(HF_LABEL_MAP.get(raw_label, raw_label))
        scores.append(float(top["score"]))
    return labels, scores

# Run every model over the same tweets and stack the results in long format:
# one row per (tweet, model) pair, keeping all columns of df.
pred_frames = []
for model_name, model_id in MODELS.items():
    yhat, conf = run_pipeline(model_id, df["text"].tolist())
    tmp = df.assign(model=model_name, pred=yhat, conf=conf)
    pred_frames.append(tmp)

df_long = pd.concat(pred_frames, ignore_index=True)
df_long.head(5)

# Step 5: Compute Metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics_sklearn(y_true, y_pred, average="macro"):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: iterable of ground-truth labels.
        y_pred: iterable of predicted labels, aligned with ``y_true``.
        average: averaging mode forwarded to the precision, recall and F1
            scorers (accuracy does not take it).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``,
        in that order.
    """
    truth, guesses = list(y_true), list(y_pred)
    metrics = {"accuracy": accuracy_score(truth, guesses)}
    averaged_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in averaged_scorers:
        # zero_division=0 is passed to each averaged scorer.
        metrics[metric_name] = scorer(truth, guesses, average=average, zero_division=0)
    return metrics

# Per-model metrics: one row per model (index "model"), one column per metric.
# Only the two columns the metric function needs are selected before apply(),
# so the grouping column is not passed into the applied function (pandas 2.2+
# emits a DeprecationWarning when apply() operates on the grouping columns).
overall = (
    df_long.groupby("model")[["label", "pred"]]
           .apply(lambda g: compute_metrics_sklearn(g["label"], g["pred"]))
           .apply(pd.Series)  # expand each metrics dict into columns
)
overall

# Step 6: Log to Wandb:

In [None]:
# Use the run as a context manager so it is always finished, even if building
# the table or writing the summary raises. Without this, a failure would leave
# the run open in the kernel.
with wandb.init(project=PROJECT, entity=ENTITY, name=RUN_NAME, config={
    "models": MODELS,
    "n_rows": len(df),
    "use_hf_dataset": USE_HF_DATASET
}) as run:
    # Main predictions table: one row per (example, model)
    pred_table = wandb.Table(dataframe=df_long)
    run.log({"predictions_table": pred_table})

    # Log overall metrics per model into run summary, keyed "<model>_<metric>"
    for m, row in overall.iterrows():
        for k, v in row.items():
            run.summary[f"{m}_{k}"] = float(v)

Now go to W&B → open the run → Tables → `predictions_table`.
Use “+ Add filter” to create slices from the metadata columns you added in Step 3 (e.g., `emoji_count > 0`, or, if you created such columns, `has_hashtag` is True, `token_count < 8`, `has_url` is True, `strong_neg_lexicon` is True). You can add panels and apply a different filter to each panel.

In [None]:
# Students: replace the empty placeholder string below with your
# 1–2 sentence insights, one list entry per note.
saved_slice_notes = [
    "",
]
pd.DataFrame(saved_slice_notes)


After successfully creating the two slices, come up with three *additional* slices you want to check and **create** the slices & view them in Wandb.

There are two directions to identify useful slices:
- Top-down: Think about what kinds of things the model can struggle with, and come up with some slices.
- Bottom-up: Look at model (mis-)predictions, come up with hypotheses, and translate them into data slices.

3. [YOUR CHOICE]
4. [YOUR CHOICE]
5. [YOUR CHOICE]

In [None]:
# Describe your three additional slices here, add the matching metadata
# columns, then re-run the notebook to see them on Wandb.
additional_slice_ideas = [""]
additional_slice_ideas

# Step 7 - Generate more test cases with Large Language Models

Select one of the three slices you wrote down and use an LLM to generate **10 test cases** for it. These can include average cases, boundary cases, and difficult cases.

Your input can be in the following format:

> Examples:
> - @user @user That’s coming, but I think the victims are going to be Medicaid recipients.
> - I think I may be finally in with the in crowd #mannequinchallenge  #grads2014 @user
> 
> Generate more tweets that use slang.

The first part (**Examples**) conditions the LLM on the style, length, and content of the examples. The second part (**Instructions**) tells the LLM what kind of examples you want it to generate.

Use our provided GPTs to start the task: [llm-based-test-case-generator](https://chatgpt.com/g/g-982cylVn2-llm-based-test-case-generator). If you do not have access to GPTs, use the plain ChatGPT or other LLM providers you have access to instead.

In [None]:
# Paste your 10 generated tweets here:
# generated_slice names the slice these cases belong to; it is written to the
# `slice` column when the models are run on the cases.
generated_slice = "Sarcasm about products"

generated_cases = [
    "Love how my new laptop dies after 30 minutes. Revolutionary battery life.",
    "This update made my phone faster — at crashing.",
    "Amazing headphones: premium static included.",
    "Customer support put me on hold just long enough to meditate. Namaste.",
    "Five stars to the delivery for inventing ‘lost in transit’.",
    "My smartwatch is so smart it forgot the time.",
    "Great blender — now I own a cup of chunks.",
    # Removed a stray "/" from "finally", which had corrupted this test input.
    "Thanks to the camera update, I can finally take abstract photos.",
    "Alarm app works perfectly, except in the morning.",
    "Truly premium earbuds: they pair with everyone except me.",
]

# One row per generated tweet, in a single `text` column.
gen_df = pd.DataFrame({"text": generated_cases})


In [None]:
# Run both models on generated cases
# The result is in long format: one row per (generated tweet, model) pair.
rows = []
for model_name, model_id in MODELS.items():
    yhat, conf = run_pipeline(model_id, gen_df["text"].tolist())
    tmp = gen_df.assign(model=model_name, pred=yhat, conf=conf)
    rows.append(tmp)

# Tag every row with the slice the cases were generated for.
gen_long = pd.concat(rows, ignore_index=True).assign(slice=generated_slice)
gen_long.head()
