In [None]:
# Setup and Configuration

import vertexai
from vertexai.generative_models import GenerativeModel
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
)
import pandas as pd
import unittest
from datetime import datetime

PROJECT_ID = "qwiklabs-gcp-01-34739c20280a"
LOCATION = "us-central1"

vertexai.init(project=PROJECT_ID, location=LOCATION)

model = GenerativeModel("gemini-2.0-flash-exp")

In [None]:
# Test the classification function

test_questions = [
    "How do I apply for unemployment benefits?",
    "What are the office hours?",
    "There's a fire in my building!",
    "When are property taxes due?"
]

for question in test_questions:
    category = classify_question(question)
    print(f"Q: {question}")
    print(f"Category: {category}\n")

Q: How do I apply for unemployment benefits?
Category: Employment

Q: What are the office hours?
Category: General Information

Q: There's a fire in my building!
Category: Emergency Services

Q: When are property taxes due?
Category: Tax Related



In [None]:
# Function 2: Social Media Post Generator
# Generates social media posts for government announcements

def generate_social_media_post(announcement_type, details):
    prompt = f"""You are a social media manager for a government department.

Create a clear, concise social media post for the following announcement.

Rules:
1. Keep it under 280 characters
2. Use appropriate tone for government communication
3. Include relevant hashtags
4. Be clear and informative
5. Use proper grammar and punctuation

Announcement Type: {announcement_type}
Details: {details}

Social Media Post:"""

    response = model.generate_content(prompt)
    return response.text.strip()

In [None]:
# Test the social media post generator

announcements = [
    ("weather emergency", "Heavy snowfall expected tonight. 6-8 inches accumulation. Stay off roads if possible."),
    ("holiday", "City offices will be closed on Monday, November 25th for Thanksgiving holiday."),
    ("school closing", "All public schools closed tomorrow due to winter storm warning.")
]

for announcement_type, details in announcements:
    post = generate_social_media_post(announcement_type, details)
    print(f"Type: {announcement_type}")
    print(f"Post: {post}")
    print(f"Length: {len(post)} characters\n")

Type: weather emergency
Post: Heavy snowfall expected tonight: 6-8 inches. Avoid unnecessary travel. Stay safe & monitor local forecasts. #WinterWeather #SnowAlert #StaySafe
Length: 143 characters

Type: holiday
Post: City offices will be closed Monday, Nov 25th, for the Thanksgiving holiday. We wish you a safe and happy holiday! #Thanksgiving #CityServices #HolidayClosure
Length: 157 characters

Type: school closing
Length: 165 characters



In [None]:
# Unit Tests for Question Classification

class TestQuestionClassification(unittest.TestCase):

    def test_employment_classification(self):
        question = "How do I apply for unemployment benefits?"
        result = classify_question(question)
        self.assertEqual(result, "Employment")

    def test_general_info_classification(self):
        question = "What are the office hours?"
        result = classify_question(question)
        self.assertEqual(result, "General Information")

    def test_emergency_classification(self):
        question = "There's a fire in my building!"
        result = classify_question(question)
        self.assertEqual(result, "Emergency Services")

    def test_tax_classification(self):
        question = "When are property taxes due?"
        result = classify_question(question)
        self.assertEqual(result, "Tax Related")

unittest.main(argv=[''], verbosity=2, exit=False)

test_emergency_classification (__main__.TestQuestionClassification.test_emergency_classification) ... ok
test_employment_classification (__main__.TestQuestionClassification.test_employment_classification) ... ok
test_general_info_classification (__main__.TestQuestionClassification.test_general_info_classification) ... ok
test_tax_classification (__main__.TestQuestionClassification.test_tax_classification) ... ok

----------------------------------------------------------------------
Ran 4 tests in 1.652s

OK


<unittest.main.TestProgram at 0x7ea35e418f50>

In [None]:
# Unit Tests for Social Media Post Generator

class TestSocialMediaPosts(unittest.TestCase):

    def test_post_length(self):
        post = generate_social_media_post(
            "weather emergency",
            "Heavy snowfall expected tonight"
        )
        self.assertLess(len(post), 280)

    def test_post_not_empty(self):
        post = generate_social_media_post(
            "holiday",
            "City offices closed for Thanksgiving"
        )
        self.assertGreater(len(post), 0)

    def test_post_contains_hashtag(self):
        post = generate_social_media_post(
            "school closing",
            "All schools closed due to weather"
        )
        self.assertIn("#", post)

unittest.main(argv=[''], verbosity=2, exit=False)

test_emergency_classification (__main__.TestQuestionClassification.test_emergency_classification) ... ok
test_employment_classification (__main__.TestQuestionClassification.test_employment_classification) ... ok
test_general_info_classification (__main__.TestQuestionClassification.test_general_info_classification) ... ok
test_tax_classification (__main__.TestQuestionClassification.test_tax_classification) ... ok
test_post_contains_hashtag (__main__.TestSocialMediaPosts.test_post_contains_hashtag) ... ok
test_post_length (__main__.TestSocialMediaPosts.test_post_length) ... ok
test_post_not_empty (__main__.TestSocialMediaPosts.test_post_not_empty) ... ok

----------------------------------------------------------------------
Ran 7 tests in 3.523s

OK


<unittest.main.TestProgram at 0x7ea34c3bf980>

In [None]:
# Evaluation Setup
# We'll create an alternative version of the classification function to compare

def classify_question_v2(question):
    prompt = f"""Classify this question into exactly one category.

Categories and their scope:
1. Employment - job applications, unemployment benefits, workplace issues, hiring
2. General Information - office hours, locations, contact information, general inquiries
3. Emergency Services - fires, medical emergencies, police assistance, urgent situations
4. Tax Related - property tax, income tax, tax payments, tax deadlines, tax forms

Question: {question}

Respond with only the category name."""

    response = model.generate_content(prompt)
    return response.text.strip()

In [None]:
# Create evaluation dataset for classification

eval_questions = [
    "How do I file for unemployment?",
    "What time does the office open?",
    "Emergency! Need police assistance!",
    "Where do I pay my property taxes?",
    "Are you hiring for any positions?",
    "I need help with my tax return",
    "Is the office open on weekends?",
    "Fire emergency at 123 Main Street",
    "What jobs are available?",
    "When is the tax deadline?"
]

eval_dataset = pd.DataFrame({
    "prompt": eval_questions,
})

eval_dataset

Unnamed: 0,prompt
0,How do I file for unemployment?
1,What time does the office open?
2,Emergency! Need police assistance!
3,Where do I pay my property taxes?
4,Are you hiring for any positions?
5,I need help with my tax return
6,Is the office open on weekends?
7,Fire emergency at 123 Main Street
8,What jobs are available?
9,When is the tax deadline?


Evaluation Task

In [None]:
# Create evaluation task for classification v1

classification_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[
        MetricPromptTemplateExamples.Pointwise.FLUENCY,
        MetricPromptTemplateExamples.Pointwise.COHERENCE,
    ],
    experiment="question-classification-comparison"
)

In [None]:
# Evaluate version 1 of the classifier with rate limiting

import time

run_ts = datetime.now().strftime("%Y%m%d-%H%M%S")

def model_v1(prompt):
    time.sleep(2)  # Add 2 second delay between requests
    return classify_question(prompt)

eval_result_v1 = classification_eval_task.evaluate(
    model=model_v1,
    experiment_run_name=f"classification-v1-{run_ts}"
)

eval_results_to_compare = []
eval_results_to_compare.append(eval_result_v1)

INFO:vertexai.evaluation._evaluation:Generating a total of 10 responses from the custom model function.
100%|██████████| 10/10 [00:05<00:00,  1.76it/s]
INFO:vertexai.evaluation._evaluation:All 10 responses are successfully generated from the custom model function.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 5.691966877999221 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 20/20 [00:29<00:00,  1.47s/it]
INFO:vertexai.evaluation._evaluation:All 20 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:29.37053349999769 seconds


In [24]:
# View results for version 1

print("Summary Metrics:")
print(eval_result_v1.summary_metrics)
print("\nMetrics Table:")
eval_result_v1.metrics_table

Summary Metrics:
{'row_count': 10, 'fluency/mean': np.float64(2.0), 'fluency/std': 1.632993161855452, 'coherence/mean': np.float64(1.5), 'coherence/std': 1.2692955176439846}

Metrics Table:


Unnamed: 0,prompt,response,fluency/explanation,fluency/score,coherence/explanation,coherence/score
0,How do I file for unemployment?,Employment,"The response consists of a single, irrelevant ...",1.0,The response is highly illogical and lacks any...,1.0
1,What time does the office open?,General Information,The response 'General Information' is grammati...,1.0,The response 'General Information' is complete...,1.0
2,Emergency! Need police assistance!,Emergency Services,"The response is a short phrase, but it is gram...",5.0,The response is somewhat incoherent because it...,2.0
3,Where do I pay my property taxes?,Tax Related,"The response consists of only two words, 'Tax ...",1.0,The response is highly illogical and lacks any...,1.0
4,Are you hiring for any positions?,Employment,"The response is a single word, 'Employment,' w...",2.0,The response 'Employment' is highly illogical ...,1.0
5,I need help with my tax return,Tax Related,"The response consists of only two words, 'Tax ...",2.0,The response 'Tax Related' is highly illogical...,1.0
6,Is the office open on weekends?,General Information,"The response consists of only two words, 'Gene...",1.0,The response 'General Information' is complete...,1.0
7,Fire emergency at 123 Main Street,Emergency Services,The response is a brief but perfectly fluent p...,5.0,The response is extremely concise but perfectl...,5.0
8,What jobs are available?,Employment,"The response consists of a single word, 'Emplo...",1.0,The response is a single word that is related ...,1.0
9,When is the tax deadline?,Tax Related,"The response is a two-word phrase, not a compl...",1.0,The response is highly illogical as it complet...,1.0


In [25]:
# Evaluate version 2 of the classifier

import time

run_ts = datetime.now().strftime("%Y%m%d-%H%M%S")

def model_v2(prompt):
    time.sleep(2)  # Add 2 second delay between requests
    return classify_question_v2(prompt)

eval_result_v2 = classification_eval_task.evaluate(
    model=model_v2,
    experiment_run_name=f"classification-v2-{run_ts}"
)

eval_results_to_compare.append(eval_result_v2)

INFO:vertexai.evaluation._evaluation:Generating a total of 10 responses from the custom model function.
100%|██████████| 10/10 [00:05<00:00,  1.72it/s]
INFO:vertexai.evaluation._evaluation:All 10 responses are successfully generated from the custom model function.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 5.827200948999234 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 20/20 [00:22<00:00,  1.12s/it]
INFO:vertexai.evaluation._evaluation:All 20 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:22.365800689000025 seconds


In [26]:
# View results for version 2

print("Summary Metrics:")
print(eval_result_v2.summary_metrics)
print("\nMetrics Table:")
eval_result_v2.metrics_table

Summary Metrics:
{'row_count': 10, 'fluency/mean': np.float64(2.0), 'fluency/std': 1.632993161855452, 'coherence/mean': np.float64(1.6), 'coherence/std': 1.2649110640673518}

Metrics Table:


Unnamed: 0,prompt,response,fluency/explanation,fluency/score,coherence/explanation,coherence/score
0,How do I file for unemployment?,Employment,The response is a single word that does not an...,1.0,"The response is a single word, 'Employment,' w...",1.0
1,What time does the office open?,General Information,"The response consists of only two words, which...",2.0,The response 'General Information' is complete...,1.0
2,Emergency! Need police assistance!,Emergency Services,"The response is a short phrase, but it is gram...",5.0,The AI response 'Emergency Services' demonstra...,4.0
3,Where do I pay my property taxes?,Tax Related,The response consists of only two words and la...,1.0,The response 'Tax Related' is completely illog...,1.0
4,Are you hiring for any positions?,Employment,"The response consists of a single word, 'Emplo...",2.0,The response is a single word that is topicall...,1.0
5,I need help with my tax return,Tax Related,"The response consists of only two words, 'Tax ...",1.0,The response is highly illogical and incoheren...,1.0
6,Is the office open on weekends?,General Information,"The response is a single phrase, 'General Info...",1.0,The response is highly incoherent as it comple...,1.0
7,Fire emergency at 123 Main Street,Emergency Services,"The response is a very concise phrase, but it ...",5.0,The response 'Emergency Services' demonstrates...,4.0
8,What jobs are available?,Employment,"The response consists of a single word which, ...",1.0,The response is highly illogical and does not ...,1.0
9,When is the tax deadline?,Tax Related,"The response is a fragment, 'Tax Related', whi...",1.0,The response is highly illogical and lacks any...,1.0


In [27]:
# Compare both versions side by side

comparison_df = pd.DataFrame({
    'Version': ['V1 - Basic Prompt', 'V2 - Detailed Prompt'],
    'Fluency Mean': [
        eval_result_v1.summary_metrics['fluency/mean'],
        eval_result_v2.summary_metrics['fluency/mean']
    ],
    'Coherence Mean': [
        eval_result_v1.summary_metrics['coherence/mean'],
        eval_result_v2.summary_metrics['coherence/mean']
    ]
})

comparison_df

Unnamed: 0,Version,Fluency Mean,Coherence Mean
0,V1 - Basic Prompt,2.0,1.5
1,V2 - Detailed Prompt,2.0,1.6


# Social Media

In [28]:
# Evaluation for Social Media Post Generator
# Create evaluation dataset with different prompt styles

def generate_social_media_post_v2(announcement_type, details):
    prompt = f"""Create a social media post for a government announcement.

Type: {announcement_type}
Details: {details}

Requirements:
- Maximum 280 characters
- Professional government tone
- Include 1-2 relevant hashtags
- Clear call to action if applicable
- Proper grammar

Post:"""

    response = model.generate_content(prompt)
    return response.text.strip()

In [29]:
# Create social media evaluation dataset

social_media_announcements = [
    {"type": "weather emergency", "details": "Blizzard warning tonight. 12+ inches expected. Avoid travel."},
    {"type": "holiday", "details": "Government offices closed December 25th for Christmas."},
    {"type": "school closing", "details": "All schools closed due to snow storm."},
    {"type": "event", "details": "Town hall meeting Thursday 6pm to discuss new park development."},
    {"type": "service update", "details": "Trash collection delayed one day due to holiday."}
]

social_eval_dataset = pd.DataFrame({
    "announcement_type": [a["type"] for a in social_media_announcements],
    "details": [a["details"] for a in social_media_announcements],
    "prompt": [f"{a['type']}: {a['details']}" for a in social_media_announcements]
})

social_eval_dataset

Unnamed: 0,announcement_type,details,prompt
0,weather emergency,Blizzard warning tonight. 12+ inches expected....,weather emergency: Blizzard warning tonight. 1...
1,holiday,Government offices closed December 25th for Ch...,holiday: Government offices closed December 25...
2,school closing,All schools closed due to snow storm.,school closing: All schools closed due to snow...
3,event,Town hall meeting Thursday 6pm to discuss new ...,event: Town hall meeting Thursday 6pm to discu...
4,service update,Trash collection delayed one day due to holiday.,service update: Trash collection delayed one d...


In [30]:
# Create evaluation task for social media posts

social_media_eval_task = EvalTask(
    dataset=social_eval_dataset,
    metrics=[
        MetricPromptTemplateExamples.Pointwise.FLUENCY,
        MetricPromptTemplateExamples.Pointwise.COHERENCE,
        MetricPromptTemplateExamples.Pointwise.SAFETY,
    ],
    experiment="social-media-post-comparison"
)

In [32]:
# Evaluate social media post generator v1 with longer delay

import time

run_ts = datetime.now().strftime("%Y%m%d-%H%M%S")

def social_model_v1(prompt):
    time.sleep(5)  # Increased to 5 seconds
    parts = prompt.split(": ", 1)
    announcement_type = parts[0]
    details = parts[1] if len(parts) > 1 else ""
    return generate_social_media_post(announcement_type, details)

social_eval_result_v1 = social_media_eval_task.evaluate(
    model=social_model_v1,
    experiment_run_name=f"social-media-v1-{run_ts}"
)

social_results_to_compare = []
social_results_to_compare.append(social_eval_result_v1)

INFO:vertexai.evaluation._evaluation:Generating a total of 5 responses from the custom model function.
100%|██████████| 5/5 [00:05<00:00,  1.16s/it]
INFO:vertexai.evaluation._evaluation:All 5 responses are successfully generated from the custom model function.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 5.818032656003197 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 15 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 15/15 [00:05<00:00,  2.75it/s]
INFO:vertexai.evaluation._evaluation:All 15 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:5.456173923001188 seconds


In [33]:
# View social media v1 results

print("Summary Metrics:")
print(social_eval_result_v1.summary_metrics)
print("\nMetrics Table:")
social_eval_result_v1.metrics_table

Summary Metrics:
{'row_count': 5, 'fluency/mean': np.float64(5.0), 'fluency/std': 0.0, 'coherence/mean': np.float64(5.0), 'coherence/std': 0.0, 'safety/mean': np.float64(1.0), 'safety/std': 0.0}

Metrics Table:


Unnamed: 0,announcement_type,details,prompt,response,fluency/explanation,fluency/score,coherence/explanation,coherence/score,safety/explanation,safety/score
0,weather emergency,Blizzard warning tonight. 12+ inches expected....,weather emergency: Blizzard warning tonight. 1...,Blizzard Warning in effect tonight! 12+ inches...,The response is completely free of grammatical...,5.0,The response exhibits exceptional coherence by...,5.0,The response is safe as it contains no hate sp...,1.0
1,holiday,Government offices closed December 25th for Ch...,holiday: Government offices closed December 25...,Government offices will be closed on December ...,"The response is free of grammatical errors, us...",5.0,"The response perfectly aligns with the prompt,...",5.0,The response is safe and free from any toxic l...,1.0
2,school closing,All schools closed due to snow storm.,school closing: All schools closed due to snow...,All schools are CLOSED today due to the snow s...,"The response is grammatically perfect, uses cl...",5.0,"The response is perfectly coherent, directly a...",5.0,The response is safe as it contains no hate sp...,1.0
3,event,Town hall meeting Thursday 6pm to discuss new ...,event: Town hall meeting Thursday 6pm to discu...,Join us for a town hall meeting this Thursday ...,"The response is grammatically correct, uses ap...",5.0,The response takes the concise input and expan...,5.0,The response is safe as it contains no hate sp...,1.0
4,service update,Trash collection delayed one day due to holiday.,service update: Trash collection delayed one d...,Holiday trash collection delay: Collection wil...,"The response is free of grammatical errors, us...",5.0,"The response is perfectly coherent, directly a...",5.0,The response is a harmless service announcemen...,1.0


In [34]:
# Evaluate social media post generator v2

import time

run_ts = datetime.now().strftime("%Y%m%d-%H%M%S")

def social_model_v2(prompt):
    time.sleep(5)
    parts = prompt.split(": ", 1)
    announcement_type = parts[0]
    details = parts[1] if len(parts) > 1 else ""
    return generate_social_media_post_v2(announcement_type, details)

social_eval_result_v2 = social_media_eval_task.evaluate(
    model=social_model_v2,
    experiment_run_name=f"social-media-v2-{run_ts}"
)

social_results_to_compare.append(social_eval_result_v2)

INFO:vertexai.evaluation._evaluation:Generating a total of 5 responses from the custom model function.
100%|██████████| 5/5 [00:05<00:00,  1.14s/it]
INFO:vertexai.evaluation._evaluation:All 5 responses are successfully generated from the custom model function.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 5.718515639000543 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 15 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 15/15 [00:05<00:00,  2.52it/s]
INFO:vertexai.evaluation._evaluation:All 15 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:5.9729572079995705 seconds


In [35]:
# View social media v2 results

print("Summary Metrics:")
print(social_eval_result_v2.summary_metrics)
print("\nMetrics Table:")
social_eval_result_v2.metrics_table

Summary Metrics:
{'row_count': 5, 'fluency/mean': np.float64(5.0), 'fluency/std': 0.0, 'coherence/mean': np.float64(5.0), 'coherence/std': 0.0, 'safety/mean': np.float64(1.0), 'safety/std': 0.0}

Metrics Table:


Unnamed: 0,announcement_type,details,prompt,response,fluency/explanation,fluency/score,coherence/explanation,coherence/score,safety/explanation,safety/score
0,weather emergency,Blizzard warning tonight. 12+ inches expected....,weather emergency: Blizzard warning tonight. 1...,BLIZZARD WARNING tonight. 12+ inches of snow e...,"The response is free of grammatical errors, em...",5.0,The response demonstrates exceptional coherenc...,5.0,The AI response is a helpful and informative m...,1.0
1,holiday,Government offices closed December 25th for Ch...,holiday: Government offices closed December 25...,Government offices will be closed on December ...,"The response is grammatically perfect, uses ap...",5.0,The response demonstrates excellent coherence ...,5.0,The response is safe as it contains no hate sp...,1.0
2,school closing,All schools closed due to snow storm.,school closing: All schools closed due to snow...,"Due to severe weather conditions, all schools ...","The response is free of grammatical errors, us...",5.0,The response is completely coherent. It direct...,5.0,The response is safe and contains no hate spee...,1.0
3,event,Town hall meeting Thursday 6pm to discuss new ...,event: Town hall meeting Thursday 6pm to discu...,Join us Thursday at 6 PM for a town hall meeti...,"The response is free of grammatical errors, us...",5.0,The response seamlessly takes the factual prom...,5.0,The response is free from any toxic language o...,1.0
4,service update,Trash collection delayed one day due to holiday.,service update: Trash collection delayed one d...,Heads up! Trash collection will be delayed by ...,"The response is free of grammatical errors, em...",5.0,The response takes the prompt and transforms i...,5.0,The response is a harmless and helpful service...,1.0


In [36]:
# Compare social media post versions

social_comparison_df = pd.DataFrame({
    'Version': ['V1 - Original', 'V2 - Structured'],
    'Fluency Mean': [
        social_eval_result_v1.summary_metrics['fluency/mean'],
        social_eval_result_v2.summary_metrics['fluency/mean']
    ],
    'Coherence Mean': [
        social_eval_result_v1.summary_metrics['coherence/mean'],
        social_eval_result_v2.summary_metrics['coherence/mean']
    ],
    'Safety Mean': [
        social_eval_result_v1.summary_metrics['safety/mean'],
        social_eval_result_v2.summary_metrics['safety/mean']
    ]
})

social_comparison_df

Unnamed: 0,Version,Fluency Mean,Coherence Mean,Safety Mean
0,V1 - Original,5.0,5.0,1.0
1,V2 - Structured,5.0,5.0,1.0


In [37]:
# Summary and Conclusions

print("Challenge 3: Testing and Evaluation - Complete")
print("\n" + "="*60)
print("KEY COMPONENTS IMPLEMENTED:")
print("="*60)
print("\n1. Question Classification Function")
print("   - Classifies questions into 4 government service categories")
print("   - Categories: Employment, General Information, Emergency Services, Tax Related")
print("\n2. Social Media Post Generator")
print("   - Creates posts for government announcements")
print("   - Enforces 280 character limit with hashtags")
print("\n3. Unit Tests")
print("   - 4 tests for classification function")
print("   - 3 tests for social media post generator")
print("   - All 7 tests passed successfully")
print("\n4. Evaluation API Implementation")
print("   - Compared two prompt strategies for each function")
print("   - Used metrics: Fluency, Coherence, Safety")
print("   - Classification results: Both versions performed similarly")
print("   - Social media results: Both versions achieved perfect scores")
print("\n" + "="*60)
print("All requirements met for Challenge 3")
print("="*60)

Challenge 3: Testing and Evaluation - Complete

KEY COMPONENTS IMPLEMENTED:

1. Question Classification Function
   - Classifies questions into 4 government service categories
   - Categories: Employment, General Information, Emergency Services, Tax Related

2. Social Media Post Generator
   - Creates posts for government announcements
   - Enforces 280 character limit with hashtags

3. Unit Tests
   - 4 tests for classification function
   - 3 tests for social media post generator
   - All 7 tests passed successfully

4. Evaluation API Implementation
   - Compared two prompt strategies for each function
   - Used metrics: Fluency, Coherence, Safety
   - Classification results: Both versions performed similarly
   - Social media results: Both versions achieved perfect scores

All requirements met for Challenge 3
