In [1]:
# Cell 1: Imports and setup (only run once)
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import numpy as np
from tqdm.auto import tqdm

bias_classes = [
    "no_bias", "group_1", "group_2"
]

In [2]:
# Cell 2: Model loading (only run once)
model_name = "microsoft/Phi-4-mini-instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print("Model loaded successfully!")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.91M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/54.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded successfully!


In [3]:
import ast

# Cell 3: Prompt(only run once)
def classify_bias(description):
    class_list = ", ".join(bias_classes)
    prompt = [
             {"role": "system", "content": (
                f"You are a policy-analysis assistant.\n"
                f"Given a policy excerpt, output *exactly one* of these three labels-nothing else, all lower-case:\n"
                f"- group_2\n"
                f"- group_1\n"
                f"- no_bias\n\n"
                f"Definitions:\n"
                f"- *group_2: policies related to identity, social, and economic status*, including:\n"
                f"• economic: references income, poverty, homelessness, wealth, financial assistance, costs, payments, fees, funding, housing, area living in, etc.\n"
                f"• racial_cultural: references race, ethnicity, culture, personal beliefs, etc.\n"
                f"• age: references children, youth, child welfare policies, adults, elderly, seniors, age-related policies, etc.\n"
                f"• religion: references religious beliefs, regligious groups, faith-based accommodations, etc.\n"
                f"• gender: references women, men sex, gender identity, use of only one pronoun for a role, sexual harassment, reproductive rights, pregnancy, etc.\n\n"
                f"- *group_1: policies related to legal, institutional, or civic systems*, including:\n"
                f"• political: references voting rights, politics, elections, campaigns, government, war, international relations, etc.\n"
                f"• criminal_justice: refernces crime, criminals, court, law enforcement, policing, prison, etc.\n"
                f"• citizenship: references immigration, immigration status, deportation, visas, border control, etc.\n"
                f"• disability: references physical or mental impairments, accommodations for impairments, accessibility, illness, etc.\n"
                f"• education: curriculum, degrees, teaching credentials, language proficiency, language required to learn, standardized testing, school admission, etc.\n\n"
                f"- no_bias: procedural or factual text.\n\n"
                f"Instructions - Follow the following steps exactly, when picking a label:\n"
                f"Step 1: Does the excerpt fit under economic, racial_cultural, age, religion, or gender? If yes, pick group_2.\n"
                f"Step 2: If the answer to Step 1 was \"no\", does the excerpt fit under political, criminal_justice, citizenship, disability, or education? If yes, pick group_1.\n"
                f"Step 3: If the excerpt does not fit under group_1 and group_2, it is purely a procedure, a definition, or administrative, so pick no_bias.\n"
                f"- Do not default to any label. All labels are equally possible for an excerpt.\n"
                f"- Return just the label; no explanations.\n"
                f"- IMPORTANT: *Avoid defaulting to group_1 or no_bias.* Fully consider if an exerpt is group_2 bias"
             )},
            {"role": "user", "content": (
                    f"Classify the following policy excerpt.\n"
                    f"Think step by step:\n"
                    f"1. Are any groups of people mentioned?\n"
                    f"2. Are there any implications of different treatment?\n"
                    f"3. Which category does this best fit?\n\n"
                    f"Return *only* one of: group_2, group_1, no_bias\n\n"
                    f"Excerpt:\n"
                    f"\"\"\"\n"
                    f"{description}\n"
                    f"\"\"\""
            )}
     ]
    result = pipe(prompt, max_new_tokens=10, do_sample=False)

    # The result is a list with a dictionary that has 'generated_text'
    full_text = str(result[0]['generated_text'])
    #print("RAW MODEL OUTPUT:", full_text)

    try:
        parsed = ast.literal_eval(full_text)  # Safely evaluate the string as a list of dicts
        for entry in parsed:
            if entry.get("role") == "assistant":
                label = entry.get("content", "").strip().lower()
                if label in {"group_1", "group_2", "no_bias"}:
                    return label
    except Exception as e:
        print("Parsing error:", e)

    # Fallback: search for keywords in text
    for cls in ["group_1", "group_2", "no_bias"]:
        if cls in full_text.lower():
            return cls

    return "Unknown"


In [None]:
'''
# Cell 4: Process data and calculate metrics for first 100 rows
# First, run the classification on the dataset
df = pd.read_csv("FINAL_DATASET.csv")

# Check dataset distribution
print("Dataset class distribution:")
print(df['bias_type_merged'].value_counts())

first_100 = df.head(100)  # Limit to first 100 rows

# Process with your classify_bias function
print("Processing first 100 rows...")
first_100['predicted_bias'] = first_100['policy'].apply(classify_bias)

# Ensure we're working with clean labels
y_true = first_100['bias_type_merged'].str.strip().str.lower()  # Changed from bias_type to bias_type_merged
y_pred = first_100['predicted_bias']

# Calculate accuracy
accuracy = (y_true == y_pred).mean()

# Print results in the requested format
print("\n===== RESULTS FOR FIRST 100 ROWS =====")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_true, y_pred, labels=bias_classes))

# Also print the custom table format showing counts
result_df = pd.DataFrame(index=bias_classes)
result_df['Dataset Count'] = [sum(y_true == cls) for cls in bias_classes]
result_df['Predictions Count'] = [sum(y_pred == cls) for cls in bias_classes]

# Calculate class-wise accuracy
for cls in bias_classes:
    cls_instances = first_100[y_true == cls]
    if len(cls_instances) > 0:
        result_df.loc[cls, 'Accuracy'] = sum(cls_instances['predicted_bias'] == cls) / len(cls_instances)
    else:
        result_df.loc[cls, 'Accuracy'] = 0

print("\nDetailed Counts and Class Accuracy:")
print(result_df)

# Save results to a CSV file
#output_file = "bias_classification_results_first100.csv"
#first_100.to_csv(output_file, index=False)
#print(f"Results saved to {output_file}")
'''

Dataset class distribution:
bias_type_merged
group_1    7339
group_2    6969
no_bias    6386
Name: count, dtype: int64
Processing first 100 rows...

===== RESULTS FOR FIRST 100 ROWS =====
Accuracy: 60.00%
              precision    recall  f1-score   support

     no_bias       0.65      0.94      0.77        54
     group_1       0.42      0.42      0.42        19
     group_2       0.50      0.04      0.07        27

    accuracy                           0.60       100
   macro avg       0.52      0.47      0.42       100
weighted avg       0.56      0.60      0.51       100


Detailed Counts and Class Accuracy:
         Dataset Count  Predictions Count  Accuracy
no_bias             54                 79  0.944444
group_1             19                 19  0.421053
group_2             27                  2  0.037037


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_100['predicted_bias'] = first_100['policy'].apply(classify_bias)


In [None]:
'''
# Cell 5: Create balanced sample and calculate metrics
# Create a balanced sample with 33 rows from each class
balanced_df = pd.DataFrame()

# Debug class counts in the dataset
for cls in bias_classes:
    cls_count = sum(df['bias_type_merged'].str.strip().str.lower() == cls)
    print(f"Class {cls} has {cls_count} instances in the full dataset")

for cls in bias_classes:
    # Get all rows of this class
    cls_rows = df[df['bias_type_merged'].str.strip().str.lower() == cls]

    # Sample 100 rows or all available if less
    sample_size = min(100, len(cls_rows))
    if sample_size < 100:
        print(f"Warning: Only {sample_size} rows available for class {cls}")

    sampled = cls_rows.sample(sample_size, random_state=42)
    balanced_df = pd.concat([balanced_df, sampled])

# Shuffle the rows
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Process with your classify_bias function
print(f"\nProcessing balanced sample with {len(balanced_df)} rows...")
balanced_df['predicted_bias'] = balanced_df['policy'].apply(classify_bias)

# Ensure we're working with clean labels
y_true = balanced_df['bias_type_merged'].str.strip().str.lower()
y_pred = balanced_df['predicted_bias']

# Calculate accuracy
accuracy = (y_true == y_pred).mean()

# Print results in the requested format
print("\n===== RESULTS FOR BALANCED SAMPLE =====")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_true, y_pred, labels=bias_classes))

# Also print the custom table format showing counts
balanced_result = pd.DataFrame(index=bias_classes)
balanced_result['Dataset Count'] = [sum(y_true == cls) for cls in bias_classes]
balanced_result['Predictions Count'] = [sum(y_pred == cls) for cls in bias_classes]

# Calculate class-wise accuracy
for cls in bias_classes:
    cls_instances = balanced_df[y_true == cls]
    if len(cls_instances) > 0:
        balanced_result.loc[cls, 'Accuracy'] = sum(cls_instances['predicted_bias'] == cls) / len(cls_instances)
    else:
        balanced_result.loc[cls, 'Accuracy'] = 0

print("\nDetailed Counts and Class Accuracy:")
print(balanced_result)

# Save results to a CSV file
#output_file_balanced = "bias_classification_results_balanced.csv"
#balanced_df.to_csv(output_file_balanced, index=False)
#print(f"Results saved to {output_file_balanced}")

# Create a download link in Colab
#from google.colab import files
#files.download(output_file)
#files.download(output_file_balanced)
'''

Class no_bias has 6386 instances in the full dataset
Class group_1 has 7339 instances in the full dataset
Class group_2 has 6969 instances in the full dataset

Processing balanced sample with 300 rows...

===== RESULTS FOR BALANCED SAMPLE =====
Accuracy: 57.00%
              precision    recall  f1-score   support

     no_bias       0.55      0.70      0.62       100
     group_1       0.50      0.56      0.53       100
     group_2       0.73      0.45      0.56       100

    accuracy                           0.57       300
   macro avg       0.59      0.57      0.57       300
weighted avg       0.59      0.57      0.57       300


Detailed Counts and Class Accuracy:
         Dataset Count  Predictions Count  Accuracy
no_bias            100                127      0.70
group_1            100                111      0.56
group_2            100                 62      0.45


In [4]:
# Cell 6: Process full dataset with batch processing

# Load full dataset
print("Loading full dataset...")
df = pd.read_csv("FINAL_DATASET.csv")

# Check dataset distribution
print("Full dataset class distribution:")
print(df['bias_type_merged'].value_counts())

# Create empty column for predictions
df['predicted_bias'] = np.nan

# Set up batch processing
BATCH_SIZE = 32
num_batches = len(df) // BATCH_SIZE + (1 if len(df) % BATCH_SIZE > 0 else 0)

# Process in batches with progress bar
print(f"\nProcessing full dataset in {num_batches} batches of size {BATCH_SIZE}...")

for i in tqdm(range(num_batches)):
    # Get batch indices
    start_idx = i * BATCH_SIZE
    end_idx = min((i + 1) * BATCH_SIZE, len(df))

    # Process batch
    batch = df.iloc[start_idx:end_idx]
    batch_predictions = [classify_bias(text) for text in batch['policy']]

    # Store predictions
    df.loc[start_idx:end_idx-1, 'predicted_bias'] = batch_predictions

# Ensure we're working with clean labels
y_true = df['bias_type_merged'].str.strip().str.lower()
y_pred = df['predicted_bias']

# Calculate overall accuracy
accuracy = (y_true == y_pred).mean()

# Print results
print("\n===== RESULTS FOR FULL DATASET =====")
print(f"Total samples processed: {len(df)}")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_true, y_pred, labels=bias_classes))

# Create detailed results table
result_df = pd.DataFrame(index=bias_classes)
result_df['Dataset Count'] = [sum(y_true == cls) for cls in bias_classes]
result_df['Predictions Count'] = [sum(y_pred == cls) for cls in bias_classes]

# Calculate class-wise accuracy
for cls in bias_classes:
    cls_instances = df[y_true == cls]
    if len(cls_instances) > 0:
        result_df.loc[cls, 'Accuracy'] = sum(cls_instances['predicted_bias'] == cls) / len(cls_instances)
    else:
        result_df.loc[cls, 'Accuracy'] = 0

print("\nDetailed Counts and Class Accuracy:")
print(result_df)

# Create confusion matrix
confusion_df = pd.DataFrame(index=bias_classes, columns=bias_classes)
for true_cls in bias_classes:
    for pred_cls in bias_classes:
        confusion_df.loc[true_cls, pred_cls] = sum((y_true == true_cls) & (y_pred == pred_cls))

print("\nConfusion Matrix:")
print(confusion_df)

# Save results to a CSV file
output_file = "phi4_mini_full_dataset_results.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

Loading full dataset...
Full dataset class distribution:
bias_type_merged
group_1    7339
group_2    6969
no_bias    6386
Name: count, dtype: int64

Processing full dataset in 647 batches of size 32...


  0%|          | 0/647 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  df.loc[start_idx:end_idx-1, 'predicted_bias'] = batch_predictions



===== RESULTS FOR FULL DATASET =====
Total samples processed: 20694
Accuracy: 53.99%
              precision    recall  f1-score   support

     no_bias       0.51      0.69      0.59      6386
     group_1       0.50      0.52      0.51      7339
     group_2       0.67      0.42      0.52      6969

    accuracy                           0.54     20694
   macro avg       0.56      0.54      0.54     20694
weighted avg       0.56      0.54      0.54     20694


Detailed Counts and Class Accuracy:
         Dataset Count  Predictions Count  Accuracy
no_bias           6386               8670  0.689790
group_1           7339               7687  0.523777
group_2           6969               4337  0.419572

Confusion Matrix:
        no_bias group_1 group_2
no_bias    4405    1538     443
group_1    2525    3844     970
group_2    1740    2305    2924
Results saved to phi4_mini_full_dataset_results.csv
