# Part B: Zero-Shot, Few-Shot & Chain-of-Thought Prompting

**Experiments can be conducted using either:**
1. **Automated:** HuggingFace Inference API (uncomment code to run automatically)
2. **Manual:** https://gpt-oss.com/ (copy prompts, paste predictions)

For the automated method, set `HF_TOKEN` in a `.env` file and uncomment the `get_completion()` calls.

## Setup

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd

# Read the .env file so HF_TOKEN becomes available in the environment
load_dotenv()

# OpenAI-compatible client aimed at the HuggingFace router (automated runs);
# falls back to a placeholder key when HF_TOKEN is not set
client = OpenAI(
    api_key=os.getenv("HF_TOKEN", "your_token_here"),
    base_url="https://router.huggingface.co/v1",
)

def get_completion(prompt: str, model: str = "openai/gpt-oss-20b:groq") -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one ``user`` message of the chat request.
        model: Model identifier passed through to the chat-completions call.

    Returns:
        The message content of the first choice. An empty string is returned
        when the reply carries no text content, and any exception raised
        while making the request or reading the response is reported as
        ``"Error: <message>"`` instead of being propagated.
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        content = completion.choices[0].message.content
    except Exception as e:
        # Failures are deliberately reported as text rather than raised.
        return f"Error: {e}"
    # The content field can be None; always hand back a str so callers can
    # chain string methods such as .strip().lower() on the result.
    return content if content is not None else ""

# Labeled demonstrations to draw on when building the few-shot prompts
train_examples = [
    {"text": review, "label": sentiment}
    for review, sentiment in (
        ("(Lawrence bounces) All over the stage, dancing, running, sweating, mopping his face and generally displaying the wacky talent that brought him fame in the first place.", "positive"),
        ("Despite all evidence to the contrary, this clunker has somehow managed to pose as an actual feature movie, the kind that charges full admission and gets hyped on tv and purports to amuse small children and ostensible adults.", "negative"),
        ("For the first time in years, de niro digs deep emotionally, perhaps because he's been stirred by the powerful work of his co-stars.", "positive"),
        ("I'll bet the video game is a lot more fun than the film.", "negative"),
        ("A masterpiece of storytelling with brilliant performances throughout.", "positive"),
    )
]

# Held-out reviews that every prompt variant is scored against
test_examples = [
    {"text": review, "label": sentiment}
    for review, sentiment in (
        ("An absolute waste of time and money. Terrible acting and nonsensical plot.", "negative"),
        ("Captivating from start to finish. A must-see film!", "positive"),
        ("Boring and predictable. I fell asleep halfway through.", "negative"),
        ("Brilliant cinematography and a touching story that stays with you.", "positive"),
        ("The worst movie I've seen this year. Complete disaster.", "negative"),
    )
]

# Expected labels, in the same order as test_examples
ground_truth = [example["label"] for example in test_examples]

# Quick overview: dataset sizes, then a 50-character preview of each test review
for title, dataset in (("Training set", train_examples), ("Test set", test_examples)):
    print(f"{title}: {len(dataset)} examples")
print("\nTest examples:")
for number, example in enumerate(test_examples, start=1):
    preview = example["text"][:50]
    print(f"{number}. [{example['label']}] {preview}...")

---
## Experiment 1: Zero-Shot (Baseline)

In [None]:
# Zero-shot template: the task instruction and the review, no demonstrations
prompt_template = (
    "Classify the sentiment of the following movie review as either 'positive' or 'negative'.\n"
    "\n"
    "Review: {text}\n"
    "Sentiment:"
)

# Show the filled-in prompt for the first test review
print(prompt_template.format(text=test_examples[0]["text"]))

In [None]:
# Option 1: collect predictions through get_completion() (uncomment to use)
# zero_shot_predictions = []
# for ex in test_examples:
#     prompt = f"""Classify the sentiment of the following movie review as either 'positive' or 'negative'.
#
# Review: {ex['text']}
# Sentiment:"""
#     response = get_completion(prompt).strip().lower()
#     zero_shot_predictions.append(response)

# Option 2: predictions pasted in by hand, one per test review, in order
zero_shot_predictions = [
    "negative",  # test review 1
    "positive",  # test review 2
    "negative",  # test review 3
    "positive",  # test review 4
    "negative",  # test review 5
]

# Score by exact string match against the expected labels
correct = sum(pred == truth for pred, truth in zip(zero_shot_predictions, ground_truth))
accuracy = correct / len(ground_truth)

# Per-review outcome followed by the overall accuracy
print("Zero-Shot Results:")
for test_no, (pred, truth) in enumerate(zip(zero_shot_predictions, ground_truth), start=1):
    status = "✗" if pred != truth else "✓"
    print(f"Test {test_no}: {status} Predicted={pred}, Expected={truth}")
print(f"\nAccuracy: {correct}/{len(ground_truth)} = {accuracy:.1%}")

---
## Experiment 2: One-Shot

In [None]:
# One-shot template: instruction, a single labeled demonstration, then the query
prompt_template = (
    "Classify the sentiment of movie reviews as either 'positive' or 'negative'.\n"
    "\n"
    "Review: {train_ex1}\n"
    "Sentiment: {train_label1}\n"
    "\n"
    "Review: {test_text}\n"
    "Sentiment:"
)

# Show the prompt for the first test review, using the first training example
print(
    prompt_template.format(
        test_text=test_examples[0]["text"],
        train_ex1=train_examples[0]["text"],
        train_label1=train_examples[0]["label"],
    )
)

In [None]:
# Fill in the model's answers here, one per test review, in order
one_shot_predictions = [
    "",  # test review 1
    "",  # test review 2
    "",  # test review 3
    "",  # test review 4
    "",  # test review 5
]

# Score by exact string match against the expected labels
correct = sum(pred == truth for pred, truth in zip(one_shot_predictions, ground_truth))
accuracy = correct / len(ground_truth)

# Per-review outcome followed by the overall accuracy
print("One-Shot Results:")
for test_no, (pred, truth) in enumerate(zip(one_shot_predictions, ground_truth), start=1):
    status = "✗" if pred != truth else "✓"
    print(f"Test {test_no}: {status} Predicted={pred}, Expected={truth}")
print(f"\nAccuracy: {correct}/{len(ground_truth)} = {accuracy:.1%}")

---
## Experiment 3: Three-Shot

In [None]:
# Three-shot template: instruction, three labeled demonstrations, then the query
prompt_template = (
    "Classify the sentiment of movie reviews as either 'positive' or 'negative'.\n"
    "\n"
    "Review: {train_ex1}\n"
    "Sentiment: {train_label1}\n"
    "\n"
    "Review: {train_ex2}\n"
    "Sentiment: {train_label2}\n"
    "\n"
    "Review: {train_ex3}\n"
    "Sentiment: {train_label3}\n"
    "\n"
    "Review: {test_text}\n"
    "Sentiment:"
)

# Show the prompt for the first test review, using training examples 1-3
print(
    prompt_template.format(
        test_text=test_examples[0]["text"],
        train_ex1=train_examples[0]["text"],
        train_ex2=train_examples[1]["text"],
        train_ex3=train_examples[2]["text"],
        train_label1=train_examples[0]["label"],
        train_label2=train_examples[1]["label"],
        train_label3=train_examples[2]["label"],
    )
)

In [None]:
# Fill in the model's answers here, one per test review, in order
three_shot_predictions = [
    "",  # test review 1
    "",  # test review 2
    "",  # test review 3
    "",  # test review 4
    "",  # test review 5
]

# Score by exact string match against the expected labels
correct = sum(pred == truth for pred, truth in zip(three_shot_predictions, ground_truth))
accuracy = correct / len(ground_truth)

# Per-review outcome followed by the overall accuracy
print("Three-Shot Results:")
for test_no, (pred, truth) in enumerate(zip(three_shot_predictions, ground_truth), start=1):
    status = "✗" if pred != truth else "✓"
    print(f"Test {test_no}: {status} Predicted={pred}, Expected={truth}")
print(f"\nAccuracy: {correct}/{len(ground_truth)} = {accuracy:.1%}")

---
## Experiment 4: Five-Shot

In [None]:
# Five-shot prompt: the instruction, all five training demonstrations, then
# the first test review.
# Sections are joined with a blank line so the layout matches the one-shot
# and three-shot templates; otherwise the shot-count comparison would also
# vary the prompt formatting.
prompt_parts = ["Classify the sentiment of movie reviews as either 'positive' or 'negative'."]
prompt_parts.extend(
    f"Review: {ex['text']}\nSentiment: {ex['label']}" for ex in train_examples
)

# The query review goes last, with its sentiment left for the model to fill in
prompt_parts.append(f"Review: {test_examples[0]['text']}\nSentiment:")
print("\n\n".join(prompt_parts))

In [None]:
# Fill in the model's answers here, one per test review, in order
five_shot_predictions = [
    "",  # test review 1
    "",  # test review 2
    "",  # test review 3
    "",  # test review 4
    "",  # test review 5
]

# Score by exact string match against the expected labels
correct = sum(pred == truth for pred, truth in zip(five_shot_predictions, ground_truth))
accuracy = correct / len(ground_truth)

# Per-review outcome followed by the overall accuracy
print("Five-Shot Results:")
for test_no, (pred, truth) in enumerate(zip(five_shot_predictions, ground_truth), start=1):
    status = "✗" if pred != truth else "✓"
    print(f"Test {test_no}: {status} Predicted={pred}, Expected={truth}")
print(f"\nAccuracy: {correct}/{len(ground_truth)} = {accuracy:.1%}")

---
## Experiment 5: Shuffled Examples

In [None]:
# Same five demonstrations as the five-shot prompt, presented in the order
# 4, 5, 1, 3, 2 (1-based); shuffled_order holds the matching 0-based indices.
shuffled_order = [3, 4, 0, 2, 1]
shuffled_examples = [train_examples[i] for i in shuffled_order]

# Sections are joined with a blank line so the layout matches the one-shot
# and three-shot templates and only the demonstration order changes.
prompt_parts = ["Classify the sentiment of movie reviews as either 'positive' or 'negative'."]
prompt_parts.extend(
    f"Review: {ex['text']}\nSentiment: {ex['label']}" for ex in shuffled_examples
)

# The query review goes last, with its sentiment left for the model to fill in
prompt_parts.append(f"Review: {test_examples[0]['text']}\nSentiment:")
print("\n\n".join(prompt_parts))

In [None]:
# Fill in the model's answers here, one per test review, in order
shuffled_predictions = [
    "",  # test review 1
    "",  # test review 2
    "",  # test review 3
    "",  # test review 4
    "",  # test review 5
]

# Score by exact string match against the expected labels
correct = sum(pred == truth for pred, truth in zip(shuffled_predictions, ground_truth))
accuracy = correct / len(ground_truth)

# Per-review outcome followed by the overall accuracy
print("Shuffled Examples Results:")
for test_no, (pred, truth) in enumerate(zip(shuffled_predictions, ground_truth), start=1):
    status = "✗" if pred != truth else "✓"
    print(f"Test {test_no}: {status} Predicted={pred}, Expected={truth}")
print(f"\nAccuracy: {correct}/{len(ground_truth)} = {accuracy:.1%}")

---
## Experiment 6: Mislabeled Examples

In [None]:
# Mislabeled demonstrations: the first three training reviews, each paired
# with the opposite of its true label. The flip is derived from the true
# label so it stays correct if the training examples are edited.
mislabeled_examples = [
    {
        "text": ex["text"],
        "label": "negative" if ex["label"] == "positive" else "positive",
    }
    for ex in train_examples[:3]
]

# Sections are joined with a blank line so the layout matches the three-shot
# template and only the demonstration labels differ from that experiment.
prompt_parts = ["Classify the sentiment of movie reviews as either 'positive' or 'negative'."]
prompt_parts.extend(
    f"Review: {ex['text']}\nSentiment: {ex['label']}" for ex in mislabeled_examples
)

# The query review goes last, with its sentiment left for the model to fill in
prompt_parts.append(f"Review: {test_examples[0]['text']}\nSentiment:")
print("\n\n".join(prompt_parts))

In [None]:
# Fill in the model's answers here, one per test review, in order
mislabeled_predictions = [
    "",  # test review 1
    "",  # test review 2
    "",  # test review 3
    "",  # test review 4
    "",  # test review 5
]

# Score by exact string match against the expected labels
correct = sum(pred == truth for pred, truth in zip(mislabeled_predictions, ground_truth))
accuracy = correct / len(ground_truth)

# Per-review outcome followed by the overall accuracy
print("Mislabeled Examples Results:")
for test_no, (pred, truth) in enumerate(zip(mislabeled_predictions, ground_truth), start=1):
    status = "✗" if pred != truth else "✓"
    print(f"Test {test_no}: {status} Predicted={pred}, Expected={truth}")
print(f"\nAccuracy: {correct}/{len(ground_truth)} = {accuracy:.1%}")

---
## Experiment 7: Chain-of-Thought

In [None]:
# Chain-of-thought prompt: two demonstrations that spell out their reasoning
# before the label, then the first test review with the reasoning left open
prompt = """Classify the sentiment of movie reviews as either 'positive' or 'negative'.
Think step by step about the language used before making your classification.

Review: "{demo_positive}"
Reasoning: The review uses positive descriptive words like "talent" and "fame" and describes energetic, entertaining actions.
Sentiment: positive

Review: "{demo_negative}"
Reasoning: The review uses negative words like "clunker" and sarcastic phrases like "despite all evidence" suggesting disappointment.
Sentiment: negative

Review: "{query}"
Reasoning:""".format(
    demo_positive=train_examples[0]["text"],
    demo_negative=train_examples[1]["text"],
    query=test_examples[0]["text"],
)

print(prompt)

In [None]:
# Fill in the final sentiment taken from each full response, one per test
# review, in order
cot_predictions = [
    "",  # test review 1
    "",  # test review 2
    "",  # test review 3
    "",  # test review 4
    "",  # test review 5
]

# Score by exact string match against the expected labels
correct = sum(pred == truth for pred, truth in zip(cot_predictions, ground_truth))
accuracy = correct / len(ground_truth)

# Per-review outcome followed by the overall accuracy
print("Chain-of-Thought Results:")
for test_no, (pred, truth) in enumerate(zip(cot_predictions, ground_truth), start=1):
    status = "✗" if pred != truth else "✓"
    print(f"Test {test_no}: {status} Predicted={pred}, Expected={truth}")
print(f"\nAccuracy: {correct}/{len(ground_truth)} = {accuracy:.1%}")

---
## Summary: Compare All Experiments

In [None]:
import pandas as pd

# After filling in all predictions above, calculate accuracies
def calc_accuracy(predictions):
    """Return the share of ``ground_truth`` labels that ``predictions`` match.

    Predictions are compared position by position with ``ground_truth`` using
    exact string equality, and the number of matches is divided by the number
    of ground-truth labels. When every prediction is falsy (for example the
    unfilled ``""`` placeholders) or the list is empty, 0.0 is returned.
    """
    if any(predictions):
        hits = sum(pred == truth for pred, truth in zip(predictions, ground_truth))
        return hits / len(ground_truth)
    # Nothing has been filled in yet
    return 0.0

# One row per experiment: its display name and its accuracy formatted as a
# percentage with one decimal place
results = pd.DataFrame(
    [
        (name, f"{calc_accuracy(predictions):.1%}")
        for name, predictions in (
            ("Zero-Shot", zero_shot_predictions),
            ("One-Shot", one_shot_predictions),
            ("Three-Shot", three_shot_predictions),
            ("Five-Shot", five_shot_predictions),
            ("Shuffled", shuffled_predictions),
            ("Mislabeled", mislabeled_predictions),
            ("Chain-of-Thought", cot_predictions),
        )
    ],
    columns=["Experiment", "Accuracy"],
)

# Banner, table without the index column, closing rule
print(
    "\n" + "=" * 50,
    "FINAL RESULTS SUMMARY",
    "=" * 50,
    results.to_string(index=False),
    "=" * 50,
    sep="\n",
)