# Step 1 - Install the required dependencies, set up W&B and make sure the python version is 3.10 and above

In [24]:
!pip install -q wandb datasets transformers evaluate tqdm emoji regex pandas pyarrow scikit-learn

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [25]:
import wandb
# wandb.login(key = "")
wandb.login(key="ca7aad83edf119127bfff120288b123954d34330") #replace with your key



True

In [26]:
!python --version

Python 3.12.3


In [27]:
#imports and config:
import re, regex, emoji
import pandas as pd
import numpy as np
import tqdm

import wandb
from datasets import load_dataset
from transformers import pipeline
import evaluate


# WANDB CONFIG
PROJECT = "mlip-lab4-slices-2025"    
ENTITY = None                        
RUN_NAME = "tweet_eval_roberta_vs_gpt2"


In [28]:
# Models to compare
MODELS = {
    "roberta": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "gpt2":    "LYTinn/finetuning-sentiment-model-tweet-gpt2",
}

In [29]:

# Label normalization 
ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"}
HF_LABEL_MAP = {"LABEL_0":"negative","LABEL_1":"neutral","LABEL_2":"positive"}

USE_HF_DATASET = True   # set False to use tweets.csv fallback
SEED = 42
np.random.seed(SEED)


# Step 2 - Load a dataset from Hugging Face

In [30]:
if USE_HF_DATASET:
    ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")
    df = pd.DataFrame(ds["test"]).head(500).copy()
    df["label"] = df["label"].map(ID2LABEL)
else:
    df = pd.read_csv("tweets.csv")
    # Ensure it has 'text' and 'label' columns
    df = df.rename(columns={c: c.strip() for c in df.columns})
    assert {"text","label"}.issubset(df.columns), "tweets.csv must include text,label"

df = df[["text","label"]].dropna().reset_index(drop=True)
df.head(3)


Unnamed: 0,text,label
0,@user @user what do these '1/2 naked pics' hav...,neutral
1,OH: “I had a blue penis while I was this” [pla...,neutral
2,"@user @user That's coming, but I think the vic...",neutral


# Step 3 - Add MetaData for slicing

In this step, you'll add **5 metadata columns** to your dataset to enable slicing later in **Weights & Biases (W&B)**.

You can use:
- **Value matching** (e.g., tweets with hashtags)
- **Regex** (e.g., strong positive words like *love*, *great*)
- **Heuristics** (e.g., emoji count, all-caps detection, tweet length)

These columns will be carried forward when you run inference in Step 6 and will appear in your final `predictions_table` logged to W&B.

---

Once inference is complete, your W&B table (`df_long`) will include:
- Original tweet text
- Ground-truth labels
- Model predictions and confidence scores
- All slicing metadata you define here

Later, in the W&B UI, you can use the ➕ `Filter` option in the table view to explore model behavior across these slices.

In [31]:
# Step 3 – Add Slicing Metadata
# Add new columns for filtering in W&B later

# Example: count emojis in each tweet & create a slice for tweets with >3 emojis
def count_emojis(text):
    return sum(ch in emoji.EMOJI_DATA for ch in str(text))

df["emoji_count"] = df["text"].apply(count_emojis).astype(int)

def get_slices(df):
    return {
        "emoji_gt3": df["emoji_count"] > 3,
    }


In [32]:
# --- Step 3: Add Slicing Metadata (5 new columns) ---

# Regex patterns for slicing
HASHTAG_RE    = re.compile(r'(?<!\w)#\w+', re.UNICODE)  # start or whitespace before '#'
MENTION_RE    = re.compile(r'(?<!\w)@\w+', re.UNICODE)
URL_RE        = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
STRONG_POS_RE = re.compile(r'\b(?:love|great|awesome|fantastic|amazing|best|excellent|wonderful|terrific)\b',
                           re.IGNORECASE)

df["has_hashtag"]    = df["text"].str.contains(HASHTAG_RE, na=False)
df["has_mention"]    = df["text"].str.contains(MENTION_RE, na=False)
df["url_count"]      = df["text"].str.findall(URL_RE).str.len().astype(int)
df["strong_pos_lex"] = df["text"].str.contains(STRONG_POS_RE, na=False)


In [33]:
# Transformers requires a backend (PyTorch/TensorFlow/Flax). We'll use PyTorch.
try:
    import torch, transformers, sys
    print("torch:", torch.__version__)
    print("transformers:", transformers.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("Python:", sys.executable)
except Exception as e:
    raise RuntimeError("Install PyTorch before proceeding: pip install torch torchvision torchaudio") from e

torch: 2.8.0+cpu
transformers: 4.56.1
CUDA available: False
Python: /home/cat/cmu-mlip-model-testing-lab/.venv/bin/python


#  Step 4 – Run Inference on Tweets Using Two Sentiment Models

In this step, you'll use two HuggingFace sentiment analysis models to run inference on your dataset:

In [34]:
import transformers, transformers.pipelines as _pp
from transformers.utils import is_torch_available
print("HF:", transformers.__version__, "is_torch_available():", is_torch_available())
_pp.torch = torch

HF: 4.56.1 is_torch_available(): True


In [35]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

device = -1  # CPU
def run_pipeline(model_id, texts):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_id)
    clf = pipeline("text-classification", model=mdl, tokenizer=tok, truncation=True, device=device)
    preds, confs = [], []
    for t in texts:
        out = clf(t)[0]
        preds.append(HF_LABEL_MAP.get(out["label"], out["label"]))
        confs.append(float(out["score"]))
    return preds, confs


pred_frames = []
for model_name, model_id in MODELS.items():
    yhat, conf = run_pipeline(model_id, df["text"].tolist())
    tmp = df.copy()
    tmp["model"] = model_name
    tmp["pred"]  = yhat
    tmp["conf"]  = conf
    pred_frames.append(tmp)

df_long = pd.concat(pred_frames, ignore_index=True)
df_long.head(5)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cpu


Unnamed: 0,text,label,emoji_count,has_hashtag,has_mention,url_count,strong_pos_lex,model,pred,conf
0,@user @user what do these '1/2 naked pics' hav...,neutral,0,False,True,0,False,roberta,negative,0.804726
1,OH: “I had a blue penis while I was this” [pla...,neutral,0,False,False,0,False,roberta,neutral,0.866949
2,"@user @user That's coming, but I think the vic...",neutral,0,False,True,0,False,roberta,neutral,0.763724
3,I think I may be finally in with the in crowd ...,positive,0,True,True,0,False,roberta,positive,0.774047
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",negative,0,False,True,0,False,roberta,neutral,0.416397


# Step 5: Compute Metrics

In [36]:
#compute metrics model-wise
from sklearn.metrics import accuracy_score

def compute_accuracy(y_true, y_pred):
    y_true = list(y_true)
    y_pred = list(y_pred)
    return accuracy_score(y_true, y_pred)

overall = (
    df_long.groupby("model")[["label", "pred"]]
           .apply(lambda g: accuracy_score(g["label"], g["pred"]))
           .rename("accuracy")
)

slice_table = wandb.Table(columns=["slice", "model", "accuracy"])
slice_metrics = {}

for slice_name, mask in get_slices(df_long).items():
    slice_metrics[slice_name] = {}  # Initialize inner dict

    for model_name, g in df_long[mask].groupby("model"):
        acc = compute_accuracy(g["label"], g["pred"])
        acc = float(acc) 
        # Add to wandb Table
        slice_table.add_data(slice_name, model_name, acc)
        # Add to dict
        slice_metrics[slice_name][model_name] = acc

# Step 6: Log to Wandb:

In [37]:
run = wandb.init(project=PROJECT, entity=ENTITY, name=RUN_NAME, config={
    "models": MODELS,
    "n_rows": len(df),
    "use_hf_dataset": USE_HF_DATASET
})

# Main predictions table: one row per (example, model)
pred_table = wandb.Table(dataframe=df_long)
wandb.log({"predictions_table": pred_table})

# Log overall accuracy to wandb summary
for model_name, acc in overall.items():
    wandb.summary[f"{model_name}_accuracy"] = float(acc)

# wandb.log({"slice_accuracy_table": slice_table})
for slice_name, model_dict in slice_metrics.items():
    for model_name, acc in model_dict.items():
        metric_name = f"slice/{slice_name}/{model_name}_accuracy"
        wandb.log({metric_name: acc})


wandb.log({"slice_metrics": slice_table})

run.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
overall/gpt2_accuracy,▁
overall/roberta_accuracy,▁

0,1
overall/gpt2_accuracy,0.398
overall/roberta_accuracy,0.698


0,1
slice/emoji_gt3/gpt2_accuracy,▁
slice/emoji_gt3/roberta_accuracy,▁

0,1
gpt2_accuracy,0.398
roberta_accuracy,0.698
slice/emoji_gt3/gpt2_accuracy,1.0
slice/emoji_gt3/roberta_accuracy,0.0


## Instructions: Exploring Slice-Based Evaluation in W&B
 Step 1: Open the W&B Project
- Click on the **project link** above.
- Click on the **latest run** near the top.
Step 2: View Tables
- Click the **"Tables"** tab.
- You should see:
  - `predictions_table`
  - `slice_metrics`

Step 3: Use Filters in `predictions_table`
- Click on `predictions_table`.
- Use the filter bar to explore:
Example (see image):
  ```python
  col2 == 0
Step 4: 
- Check slice_metrics table: It shows accuracy of each model for every slice.
- Add a Bar Chart Panel: Click the "Add panels" button (top-right).
- Choose Bar chart under "Charts".
Try to create bar charts comparing accuracies of both models for a slice. Do it for 2 slices.

Discuss your findings with your TA.

# Filtering: 
<img src="images/filtering.png" alt="Predictions Table" width="600">

## Plotting:
<img src="images/plotting.png" alt="Predictions Table" height="300">
<img src="images/bar-charts.png" alt="Predictions Table" width="600">


In [38]:
# Students: replace the placeholders below with 1–2 sentence insights
saved_slice_notes = [""
]
pd.DataFrame(saved_slice_notes)

Unnamed: 0,0
0,



After successfully creating the two slices, come up with three *additional* slices you want to check and **create** the slices & view them in Wandb.

There are two directions to identify useful slices:
- Top-down: Think about what kinds of things the model can struggle with, and come up with some slices.
- Bottom-up: Look at model (mis-)predictions, come up with hypotheses, and translate them into data slices.

3. has_url — tweets containing at least one URL
4. negation_present — tweets with explicit negation (hard for sentiment models)
5. short_text_le20 — very short tweets (≤20 tokens)


In [39]:
# Add these three slices & re-run the notebook to see them on Wandb.

additional_slice_ideas = [
""
]
additional_slice_ideas

['']

In [None]:
# --- Metadata needed (add if not already present) ---
# URL presence
df["url_count"] = df["text"].str.findall(re.compile(r'(?:https?://\S+|www\.\S+)', re.I)).str.len().astype(int)
df["has_url"] = df["url_count"] > 0

# Negation presence
NEGATION_RE = re.compile(
    r"\b(?:no|not|never|without|n't|cannot|can't|won't|don't|didn't|isn't|aren't|"
    r"couldn't|shouldn't|wasn't|weren't)\b", re.I
)
df["negation_present"] = df["text"].str.contains(NEGATION_RE, na=False)

# Short text (by tokens)
df["token_len"] = df["text"].astype(str).str.split().str.len().astype(int)
df["short_text_le20"] = df["token_len"] <= 20


In [52]:
# 1) Ensure required metadata exist on df
# URL presence
if "url_count" not in df.columns:
    URL_RE = re.compile(r'(?:https?://\S+|www\.\S+)', re.I)
    df["url_count"] = df["text"].astype(str).str.findall(URL_RE).str.len().astype(int)
df["has_url"] = df["url_count"] > 0

# Negation presence
NEGATION_RE = re.compile(
    r"\b(?:no|not|never|without|n't|cannot|can't|won't|don't|didn't|isn't|aren't|"
    r"couldn't|shouldn't|wasn't|weren't)\b",
    re.I
)
df["negation_present"] = df["text"].astype(str).str.contains(NEGATION_RE, na=False)

# Length features
df["token_len"] = df["text"].astype(str).str.split().str.len().astype(int)
df["short_text_le20"] = df["token_len"] <= 20

# 2) Propagate metadata columns from df -> df_long using modulo index alignment
meta_cols = ["has_url", "negation_present", "token_len", "short_text_le20"]
n_base = len(df)
idx_map = (np.arange(len(df_long)) % n_base)

for col in meta_cols:
    df_long[col] = df[col].values[idx_map]


In [53]:
# 3) Slice vs. unslice metrics
from sklearn.metrics import accuracy_score

def acc_on_mask(frame, mask):
    sub = frame[mask]
    if len(sub) == 0:
        return None
    return float(accuracy_score(sub["label"], sub["pred"]))

# Optional: overall per-model
overall = (
    df_long.groupby("model")[["label","pred"]]
           .apply(lambda g: (g["label"] == g["pred"]).mean())
           .rename("accuracy")
)

run = wandb.init(project=PROJECT, entity=ENTITY, name=RUN_NAME, reinit=True)
for model_name, acc in overall.items():
    wandb.run.summary[f"{model_name}_accuracy"] = float(acc)

SLICES = {
    "has_url":           df_long["has_url"],
    "negation_present":  df_long["negation_present"],
    "short_text_le20":   df_long["short_text_le20"],
}

for slice_name, mask in SLICES.items():
    tbl = wandb.Table(columns=["group", "model", "accuracy", "n_rows"])
    for model_name, g in df_long.groupby("model"):
        acc_slice = acc_on_mask(g, mask)
        acc_unslc = acc_on_mask(g, ~mask)
        tbl.add_data(f"{slice_name}=True",  model_name, acc_slice,  int(mask.sum()))
        tbl.add_data(f"{slice_name}=False", model_name, acc_unslc, int((~mask).sum()))
        if acc_slice is not None:
            wandb.log({f"slice/{slice_name}/{model_name}_accuracy": acc_slice})
        if acc_unslc is not None:
            wandb.log({f"slice/{slice_name}_complement/{model_name}_accuracy": acc_unslc})
    wandb.log({f"{slice_name}_accuracy": tbl})

run.finish()


0,1
gpt2_accuracy,0.398
roberta_accuracy,0.698


  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
  sub = frame[mask]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
slice/has_url/gpt2_accuracy,▁
slice/has_url/roberta_accuracy,▁
slice/has_url_complement/gpt2_accuracy,▁
slice/has_url_complement/roberta_accuracy,▁
slice/negation_present/gpt2_accuracy,▁
slice/negation_present/roberta_accuracy,▁
slice/negation_present_complement/gpt2_accuracy,▁
slice/negation_present_complement/roberta_accuracy,▁
slice/short_text_le20/gpt2_accuracy,▁
slice/short_text_le20/roberta_accuracy,▁

0,1
gpt2_accuracy,0.398
roberta_accuracy,0.698
slice/has_url/gpt2_accuracy,0.5625
slice/has_url/roberta_accuracy,0.5
slice/has_url_complement/gpt2_accuracy,0.39256
slice/has_url_complement/roberta_accuracy,0.70455
slice/negation_present/gpt2_accuracy,0.25641
slice/negation_present/roberta_accuracy,0.67949
slice/negation_present_complement/gpt2_accuracy,0.42417
slice/negation_present_complement/roberta_accuracy,0.70142


# Step 7 - Write down three addition data slices you want to create but do not have the metadata for slicing

In the previous step, you might have already come up with some slices you wanted to create but found it hard to do with existing metadata. Write down three of such slices in this step.

Example: 
- I want to create a slice on tweets using slangs
- I want to create a slice on non-English tweets (if any)

In [40]:
## Write down three additional data slices here:

additional_slice_descriptions = [
    "",
]

In [44]:
additional_slice_descriptions = [
    "has_url: tweets containing at least one URL (url_count > 0)",
    "negation_present: tweets with explicit negation terms (e.g., not, never, can't)",
    "short_text_le20: tweets with token_len ≤ 20",
]

# Step 8 - Generate more test cases with Large Language Models

Select one slice from the three you wrote down and generate **10 test cases** using LLMs, which can include average case, boundary case, or difficult case.

Your input can be in the following format:

> Examples:
> - @user @user That’s coming, but I think the victims are going to be Medicaid recipients.
> - I think I may be finally in with the in crowd #mannequinchallenge  #grads2014 @user
> 
> Generate more tweets using slangs.

The first part of **Examples** conditions the LLM on the style, length, and content of examples. The second part of **Instructions** instructs what kind of examples you want LLM to generate.

Use our provided GPTs to start the task: [llm-based-test-case-generator](https://chatgpt.com/g/g-982cylVn2-llm-based-test-case-generator). If you do not have access to GPTs, use the plain ChatGPT or other LLM providers you have access to instead.

In [42]:
# Paste your 10 generated tweets here:
generated_slice_description = ""

generated_cases = [""
]

In [46]:
generated_slice_description = "negation_present — tweets containing explicit negation tokens (e.g., no, not, never, can’t, won’t, don’t, didn’t, isn’t, ain’t, without). Mix average, boundary, and difficult cases (polarity flips, litotes, sarcasm), with varied lengths and occasional hashtags/@mentions/URLs."

generated_cases = [
    "Not gonna lie, this rollout isn’t the disaster people said.",
    "Can’t say it’s great, but it’s not terrible either.",
    "No way this isn’t broken for EU users, @support.",
    "I’m not unhappy with the update—just not convinced it fixes the lag.",
    "Won’t pretend the price hike helps; it doesn’t.",
    "Didn’t hate the beta, didn’t love it; not sure I’d recommend it yet.",
    "Without sounding dramatic, the sync still isn’t reliable. #productivity",
    "Never thought I’d say this, but it’s not bad (for once): https://ex.am/ple",
    "Ain’t no way the ‘instant’ checkout isn’t timing out again, @brand.",
    "It’s not that I’m mad, I just can’t use this for work given the bugs."
]