In [1]:
from typing import List, Literal, Union
from IPython.display import HTML, display
from strux import CSVDataSource, RegressionConfig, Sequential, exact_match, absolute_deviation
from pydantic import BaseModel, Field

In [2]:
# Define schemas for review data
class Review(BaseModel):
    """Input schema for the analyzers: one customer review."""
    # Raw review text. Only this field is declared on the model.
    # NOTE(review): the sample CSV rows also carry "date" and "user_id" keys;
    # presumably they are dropped during parsing - confirm against the loader.
    text: str

class ReviewAnalysis(BaseModel):
    """Result schema returned by the analyzers and used for the annotation column."""
    # Sentiment label; restricted to exactly these three string values.
    sentiment: Literal['positive', 'negative', 'neutral']
    # Numeric rating; the Field constraints require 1 <= rating <= 5.
    rating: float = Field(ge=1, le=5)

In [3]:
# Define two different inference functions to compare
def analyze_review_v1(data: Review) -> ReviewAnalysis:
    """Baseline review analyzer based on keyword lookup.

    The review text is lower-cased and scanned for a small set of keywords.
    Matching is plain substring containment, so a keyword also matches when
    it appears inside a longer word. Positive keywords are checked before
    negative ones.

    Args:
        data: Review whose ``text`` is analysed.

    Returns:
        ReviewAnalysis of ("positive", 4.5) or ("negative", 1.5) when a
        keyword is found, otherwise ("neutral", 3.0).
    """
    lowered = data.text.lower()

    # Ordered rules: the first rule with any keyword present in the text wins.
    rules = (
        (("great", "love", "excellent"), "positive", 4.5),
        (("bad", "terrible", "horrible"), "negative", 1.5),
    )
    for keywords, label, score in rules:
        if any(word in lowered for word in keywords):
            return ReviewAnalysis(sentiment=label, rating=score)

    # No keyword matched.
    return ReviewAnalysis(sentiment="neutral", rating=3.0)

def analyze_review_v2(data: Review) -> ReviewAnalysis:
    """Second analyzer: wider keyword lists and a different rating scale.

    The review text is lower-cased and tested against ordered keyword groups
    using substring containment (a keyword also matches inside a longer
    word). The first group with a hit decides the result.

    Args:
        data: Review whose ``text`` is analysed.

    Returns:
        ReviewAnalysis of ("positive", 5.0), ("negative", 1.0) or
        ("neutral", 3.0) for the respective keyword group; ("neutral", 2.5)
        when no keyword is present at all.
    """
    lowered = data.text.lower()

    # Ordered rules: positive first, then negative, then explicit neutral words.
    rules = (
        (("great", "love", "excellent", "amazing"), "positive", 5.0),
        (("bad", "terrible", "horrible", "broke"), "negative", 1.0),
        (("okay", "fine"), "neutral", 3.0),
    )
    for keywords, label, score in rules:
        if any(word in lowered for word in keywords):
            return ReviewAnalysis(sentiment=label, rating=score)

    # Nothing matched: still neutral, but with a slightly lower rating.
    return ReviewAnalysis(sentiment="neutral", rating=2.5)


In [4]:
# Create the test CSV file.
# Two columns, each holding a JSON object stored as a quoted CSV field
# (inner double quotes are doubled, as CSV quoting requires):
#   review     -> keys "text", "date", "user_id"
#   annotation -> keys "sentiment", "rating"
test_data = """review,annotation
"{""text"": ""Great product, love it!"", ""date"": ""2024-01-19T10:00:00Z"", ""user_id"": ""user1""}","{""sentiment"": ""positive"", ""rating"": 5.0}"
"{""text"": ""Terrible experience, would not recommend"", ""date"": ""2024-01-19T11:00:00Z"", ""user_id"": ""user2""}","{""sentiment"": ""negative"", ""rating"": 1.0}"
"{""text"": ""It's okay, nothing special"", ""date"": ""2024-01-19T12:00:00Z"", ""user_id"": ""user3""}","{""sentiment"": ""neutral"", ""rating"": 3.0}"
"{""text"": ""Amazing quality and service!"", ""date"": ""2024-01-19T13:00:00Z"", ""user_id"": ""user4""}","{""sentiment"": ""positive"", ""rating"": 4.5}"
"{""text"": ""Product broke after one use"", ""date"": ""2024-01-19T14:00:00Z"", ""user_id"": ""user5""}","{""sentiment"": ""negative"", ""rating"": 1.5}"
"""

# Explicit encoding so the bytes written do not depend on the platform locale.
with open("test_reviews.csv", "w", encoding="utf-8") as f:
    f.write(test_data)

# Data source over the CSV written above. Both columns contain JSON and are
# mapped to the model each one should be parsed into.
data_source = CSVDataSource.from_csv(
    file_path="test_reviews.csv",
    schema=Review,
    json_columns=dict(review=Review, annotation=ReviewAnalysis),
)

# Validation config: each field gets a comparison strategy and a pass threshold,
# and is compared with the value in the "annotation" column.
config = (
    RegressionConfig(
        target_schema=ReviewAnalysis,
        annotation_field="annotation",
    )
    .configure_field(
        "sentiment",
        strategy=exact_match(),  # sentiment labels must match exactly
        threshold=0.8,  # 80% accuracy required
        compare_with_annotation=True,
    )
    .configure_field(
        "rating",
        strategy=absolute_deviation(0.5),  # ratings may differ by 0.5
        threshold=0.9,  # 90% of ratings must be within that tolerance
        compare_with_annotation=True,
    )
)

# Build the baseline pipeline: one step that applies the v1 analyzer.
# (It is only constructed here; running and saving happen in the next cell.)
pipeline_v1 = Sequential.from_steps(
    config=config,
    data_source=data_source,
    steps=[("analyze_review_v1", analyze_review_v1, ReviewAnalysis)],
)

In [5]:
# Execute the v1 pipeline over the CSV rows and keep its results object.
print("Running baseline model (v1)...")
results_v1 = pipeline_v1.run()
# Identifier of this run, kept for later reference.
baseline_run_id = results_v1.run_id

# Save baseline results
# The same path is passed to the v2 run later so it can be compared to v1.
# NOTE(review): assumes the "baselines/" directory exists or is created by
# save_as_baseline - confirm on a fresh checkout.
baseline_path = "baselines/review_baseline.json"
results_v1.save_as_baseline(baseline_path)

# Display v1 results
# The recorded output below shows v1 FAILED both thresholds on this data
# (score 0.60 for each field).
print("\nBaseline (v1) Results:")
print(results_v1.format_summary())

Running baseline model (v1)...

Baseline saved to: baselines/review_baseline.json
Use this baseline in future runs with:
pipeline.run(baseline_path='baselines/review_baseline.json')

Baseline (v1) Results:
Run ID: run_20250119_202249
Timestamp: 2025-01-20 04:22:49.405001+00:00
Status: FAILED

Results:
Step: analyze_review_v1
Status: FAILED

Field: sentiment
Score: 0.60 (threshold: 0.8)

  Row 1:
    Predicted: positive
    Expected:  positive

  Row 2:
    Predicted: negative
    Expected:  negative

  Row 3:
    Predicted: neutral
    Expected:  neutral

  Row 4:
    Predicted: neutral
    Expected:  positive

  Row 5:
    Predicted: neutral
    Expected:  negative

Field: rating
Score: 0.60 (threshold: 0.9)

  Row 1:
    Predicted: 4.5
    Expected:  5.0

  Row 2:
    Predicted: 1.5
    Expected:  1.0

  Row 3:
    Predicted: 3.0
    Expected:  3.0

  Row 4:
    Predicted: 3.0
    Expected:  4.5

  Row 5:
    Predicted: 3.0
    Expected:  1.5



In [6]:
# Build the candidate pipeline: same data source and config as v1, but the
# single step applies the v2 analyzer.
pipeline_v2 = Sequential.from_steps(
    config=config,
    data_source=data_source,
    steps=[("analyze_review_v2", analyze_review_v2, ReviewAnalysis)],
)

print("\nRunning new model (v2)...")
# Supplying baseline_path makes this run compare itself with the saved v1 baseline.
results_v2 = pipeline_v2.run(baseline_path=baseline_path)

# Print the textual summary of the v2 run.
print("\nNew Version (v2) Results:")
v2_summary = results_v2.format_summary()
print(v2_summary)

# Render the comparison report inline and report where it was written.
# NOTE(review): presumably to_html() writes the file at report_path and
# returns the markup - confirm against the strux API.
report_path = "comparison_report.html"
report_html = results_v2.to_html(report_path)
display(HTML(report_html))
print(f"\nDetailed comparison report saved to: {report_path}")


Running new model (v2)...

Comparing results:
Baseline steps: ['analyze_review_v1']
Current steps: ['analyze_review_v2']
No exact step match, using positional match
Matching analyze_review_v1 with analyze_review_v2
Comparing field sentiment
Baseline values: ['positive', 'negative', 'neutral', 'neutral', 'neutral']
Current values: ['positive', 'negative', 'neutral', 'positive', 'negative']
Comparing field rating
Baseline values: [4.5, 1.5, 3.0, 3.0, 3.0]
Current values: [5.0, 1.0, 3.0, 5.0, 1.0]

New Version (v2) Results:
Run ID: diff_current_vs_baseline
Timestamp: 2025-01-20 04:22:49.409031+00:00
Status: PASSED

Results:
Step: analyze_review_v2
Status: PASSED

Field: sentiment
Score: 1.00 (threshold: 0.8)

  Row 1:
    Predicted: positive
    Expected:  positive

  Row 2:
    Predicted: negative
    Expected:  negative

  Row 3:
    Predicted: neutral
    Expected:  neutral

  Row 4:
    Predicted: positive
    Expected:  positive

  Row 5:
    Predicted: negative
    Expected:  negative

Sample,Input,Baseline,Current,Expected,Status
1,"text='Great product, love it!'",positive,positive,positive,⚪
2,"text='Terrible experience, would not recommend'",negative,negative,negative,⚪
3,"text=""It's okay, nothing special""",neutral,neutral,neutral,⚪
4,text='Amazing quality and service!',neutral,positive,positive,🟢
5,text='Product broke after one use',neutral,negative,negative,🟢

Sample,Input,Baseline,Current,Expected
4,text='Amazing quality and service!',neutral,positive,positive
5,text='Product broke after one use',neutral,negative,negative

Sample,Input,Baseline,Current,Expected

Sample,Input,Baseline,Current,Expected,Status
1,"text='Great product, love it!'",4.5,5.0,5.0,🟢
2,"text='Terrible experience, would not recommend'",1.5,1.0,1.0,🟢
3,"text=""It's okay, nothing special""",3.0,3.0,3.0,⚪
4,text='Amazing quality and service!',3.0,5.0,4.5,⚪
5,text='Product broke after one use',3.0,1.0,1.5,⚪

Sample,Input,Baseline,Current,Expected
1,"text='Great product, love it!'",4.5,5.0,5.0
2,"text='Terrible experience, would not recommend'",1.5,1.0,1.0

Sample,Input,Baseline,Current,Expected



Detailed comparison report saved to: comparison_report.html
