# Movie Insights Analysis - Phase 3
## Transforming Features into Client-Ready Insights

**Purpose**: Generate 7-slide analysis structure for each of 10 folk horror films

**Input**: `reviews_enhanced.csv` (3,222 reviews × 76 features, 80.4% gender coverage)

**Output**: Structured JSON files with slide-ready data + direct quotes

---

## Module Structure:
- **Module 0**: Setup & Data Loading
- **Module 1**: Audience Segmentation Analysis (Slide 1)
- **Module 2**: What Resonated Analysis (Slide 3)
- **Module 3**: What Didn't Work Analysis (Slide 4)
- **Module 4**: Polarization Analysis (Slide 4 continued)
- **Module 5**: Marketing Disconnect Analysis (Slide 5) [TO BE BUILT]
- **Module 6**: Risk Factors Analysis (Slide 6) [TO BE BUILT]
- **Module 7**: Target Audience Recommendation (Slides 2 & 7) [TO BE BUILT]
- **Module 8**: Export Functions [TO BE BUILT]
- **Module 9**: Cross-Movie Roll-Up [TO BE BUILT]

---

## Module 0: Setup & Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
# Show every column (no elision) and up to 100 characters per cell when
# DataFrames are rendered in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

# Cell-completion marker for the notebook output.
print("‚úÖ Libraries imported successfully")

In [None]:
# Load the enhanced dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable location.
data_path = Path('/Users/jamesroot/Desktop/JAMES/Noetheca/Reviews/Data/reviews_enhanced.csv')

print(f"Loading dataset from: {data_path}")
# `df` is the notebook-wide DataFrame that the helper functions read.
df = pd.read_csv(data_path)

# Report the loaded shape and the number of reviews per movie.
print(f"\n‚úÖ Dataset loaded successfully!")
print(f"   Rows: {len(df):,}")
print(f"   Columns: {len(df.columns)}")
print(f"\nüìä Movies in dataset:")
print(df['Movie_Title'].value_counts())

In [None]:
# Define movie list and helper functions
# Distinct titles in order of first appearance in the dataset.
MOVIES = df['Movie_Title'].unique().tolist()

print(f"\nüìΩÔ∏è  Total movies: {len(MOVIES)}")
print(f"\nMovie list:")
# Numbered listing (starting at 1) with the review count for each title.
for i, movie in enumerate(MOVIES, 1):
    # Re-filters the full DataFrame once per movie to count its reviews.
    count = len(df[df['Movie_Title'] == movie])
    print(f"  {i}. {movie:30} ({count:4} reviews)")

In [None]:
# Helper Functions

def get_movie_reviews(movie_name):
    """
    Select the reviews belonging to a single movie

    Reads the notebook-level DataFrame `df` and matches `movie_name`
    exactly against the Movie_Title column.

    Args:
        movie_name: Title to match against Movie_Title

    Returns: Independent copy of the matching rows
    """
    is_requested_movie = df['Movie_Title'].eq(movie_name)
    return df.loc[is_requested_movie].copy()

def get_gender_segment(df_movie, gender):
    """
    Select the reviews attributed to one gender

    Args:
        df_movie: DataFrame filtered to one movie
        gender: 'male', 'female', or 'unknown', compared exactly
            against the username_gender_hint column

    Returns: Independent copy of the rows for that gender
    """
    matches_gender = df_movie['username_gender_hint'].eq(gender)
    segment = df_movie.loc[matches_gender]
    return segment.copy()

def safe_mean(series):
    """
    Calculate mean, handling empty or all-missing series gracefully

    Args:
        series: Numeric pandas Series (missing values are skipped by
            pandas when averaging)

    Returns: float, or None when there is nothing to average
        (empty series, or every value missing)
    """
    if len(series) == 0:
        return None

    value = float(series.mean())

    # An all-NaN series averages to NaN. NaN is truthy and is not valid
    # JSON, so report it the same way as "no data".
    if np.isnan(value):
        return None
    return value

def safe_percentage(count, total):
    """
    Express count as a percentage of total, rounded to one decimal

    Args:
        count: Numerator
        total: Denominator

    Returns: float percentage, or None when total is zero
    """
    if total != 0:
        share = count / total
        return round(share * 100, 1)
    return None

def extract_quotes(df_filtered, column, limit=10):
    """
    Extract quotes from reviews with Review_ID for traceability

    Args:
        df_filtered: DataFrame (pre-filtered for criteria); must contain
            Review_ID, Reviewer, Rating and total_votes columns
        column: Column containing text/lists to extract
        limit: Maximum number of quotes to return

    Returns: List of dicts with review_id, reviewer, rating, content and
        engagement, ordered by total_votes (highest first). 'rating' is
        None when the review has no rating; 'engagement' is 0 when the
        vote count is missing.
    """
    # Keep rows where the column has content
    has_content = df_filtered[df_filtered[column].notna()]

    # Drop stringified empty lists ('[]') on every row. Checking only the
    # first row would let empty entries through whenever that row happened
    # to hold something else.
    has_content = has_content[has_content[column] != '[]']

    # Take top N by engagement (total_votes)
    top_quotes = has_content.nlargest(limit, 'total_votes')

    quotes = []
    for _, row in top_quotes.iterrows():
        rating = row['Rating']
        votes = row['total_votes']
        quotes.append({
            'review_id': row['Review_ID'],
            'reviewer': row['Reviewer'],
            # int() raises on NaN, so guard the rating like the vote count
            'rating': int(rating) if pd.notna(rating) else None,
            'content': row[column],
            'engagement': int(votes) if pd.notna(votes) else 0
        })

    return quotes

# Cell-completion marker: shows that the helper definitions above executed.
print("‚úÖ Helper functions defined")

In [None]:
# Verify dataset features are loaded correctly
# Sanity-check cell: prints summary figures for the columns the analysis
# modules rely on. Nothing computed here is used by later functions.

print("üîç Verifying key features...\n")

# Check gender coverage
gender_counts = df['username_gender_hint'].value_counts()
# NOTE(review): rows whose hint is missing (NaN) compare unequal to 'unknown'
# and are therefore counted as identified - confirm the column has no NaN.
identified = len(df[df['username_gender_hint'] != 'unknown'])
# NOTE(review): raises ZeroDivisionError if the dataset is empty.
coverage = (identified / len(df)) * 100

print(f"Gender Detection:")
print(f"  Male: {gender_counts.get('male', 0):,}")
print(f"  Female: {gender_counts.get('female', 0):,}")
print(f"  Unknown: {gender_counts.get('unknown', 0):,}")
print(f"  Coverage: {coverage:.1f}%")

# Check emotion columns
emotion_cols = ['emotion_joy', 'emotion_trust', 'emotion_fear', 'emotion_surprise', 
                'emotion_sadness', 'emotion_disgust', 'emotion_anger', 'emotion_anticipation']
# Prints True only when all eight emotion columns are present.
print(f"\nEmotion Columns: {all(col in df.columns for col in emotion_cols)}")

# Check preference phrases
print(f"\nPreference Phrases:")
print(f"  Reviews with love_statements: {len(df[df['love_count'] > 0]):,}")
print(f"  Reviews with hate_statements: {len(df[df['hate_count'] > 0]):,}")
print(f"  Reviews with wish_statements: {len(df[df['wish_count'] > 0]):,}")

# Check engagement data
print(f"\nEngagement:")
print(f"  Reviews with votes: {len(df[df['has_engagement'] == True]):,}")
print(f"  Avg votes per review: {df['total_votes'].mean():.1f}")

print("\n‚úÖ Feature verification complete!")

---
## Module 1: Audience Segmentation Analysis
**Purpose**: Generate data for Slide 1 - Overview & Ratings Distribution

**Outputs**:
- Total review count
- Rating distribution (1-10)
- Gender breakdown (with coverage %)
- Average rating & variance
- Temporal segments (when reviews were written)
- Engagement patterns

In [None]:
def audience_breakdown(movie_name):
    """
    Generate Slide 1: Overview & Ratings Distribution

    Args:
        movie_name: Movie_Title value of the film to analyse

    Returns: Dictionary with audience segmentation data: review count,
        rating distribution and segments (lovers 8-10, mixed 4-7,
        haters 1-3), gender breakdown with coverage, temporal segments
        and engagement. Statistics that cannot be computed are None;
        in particular rating_variance is None for fewer than two
        ratings instead of NaN, which is not valid JSON.
    """
    reviews = get_movie_reviews(movie_name)
    total_reviews = len(reviews)
    ratings = reviews['Rating']

    # Rating distribution, keyed by rating value in ascending order
    rating_dist = ratings.value_counts().sort_index().to_dict()
    avg_rating = safe_mean(ratings)

    # Sample variance is NaN when fewer than two ratings exist
    variance = ratings.var()
    rating_variance = None if pd.isna(variance) else float(variance)

    # Gender breakdown; coverage is the share of reviews not marked 'unknown'
    gender_hints = reviews['username_gender_hint']
    gender_counts = gender_hints.value_counts().to_dict()
    identified = int((gender_hints != 'unknown').sum())
    gender_coverage = safe_percentage(identified, total_reviews)

    # Temporal segments (when reviews were written)
    temporal_dist = reviews['review_window'].value_counts().to_dict()

    # Engagement patterns
    has_engagement = int(reviews['has_engagement'].eq(True).sum())
    engagement_pct = safe_percentage(has_engagement, total_reviews)
    avg_votes = safe_mean(reviews['total_votes'])
    avg_helpfulness = safe_mean(reviews['helpfulness_ratio'])

    # Segment by rating groups
    lovers = int((ratings >= 8).sum())                     # 8-10
    mixed = int(((ratings >= 4) & (ratings <= 7)).sum())   # 4-7
    haters = int((ratings <= 3).sum())                     # 1-3

    return {
        'movie': movie_name,
        'total_reviews': total_reviews,
        'rating_distribution': rating_dist,
        'avg_rating': avg_rating,
        'rating_variance': rating_variance,

        'rating_segments': {
            'lovers_8_10': lovers,
            'lovers_pct': safe_percentage(lovers, total_reviews),
            'mixed_4_7': mixed,
            'mixed_pct': safe_percentage(mixed, total_reviews),
            'haters_1_3': haters,
            'haters_pct': safe_percentage(haters, total_reviews)
        },

        'gender_breakdown': {
            'male': gender_counts.get('male', 0),
            'female': gender_counts.get('female', 0),
            'unknown': gender_counts.get('unknown', 0),
            'coverage_pct': gender_coverage
        },

        'temporal_segments': temporal_dist,

        'engagement': {
            'reviews_with_votes': has_engagement,
            'engagement_pct': engagement_pct,
            'avg_votes_per_review': avg_votes,
            'avg_helpfulness_ratio': avg_helpfulness
        }
    }

# Cell-completion marker: shows that audience_breakdown() was defined.
print("‚úÖ audience_breakdown() function defined")

In [None]:
# Test Module 1 on The Witch (largest dataset)
# Smoke test: runs audience_breakdown() for one title and prints every
# section of the returned dictionary.
# NOTE(review): the numeric format specs below (e.g. :.2f) raise TypeError
# if a statistic comes back as None, which happens for a title with no reviews.

print("üß™ Testing Module 1: Audience Segmentation\n")
print("="*80)

test_movie = "The Witch"
result = audience_breakdown(test_movie)

# Display results in readable format
print(f"\nüìä AUDIENCE BREAKDOWN: {test_movie}")
print(f"\nTotal Reviews: {result['total_reviews']:,}")
print(f"Average Rating: {result['avg_rating']:.2f}/10")
print(f"Rating Variance: {result['rating_variance']:.2f}")

# Lovers / mixed / haters counts with their share of all reviews.
print(f"\nüìà Rating Segments:")
print(f"  Lovers (8-10): {result['rating_segments']['lovers_8_10']:,} ({result['rating_segments']['lovers_pct']}%)")
print(f"  Mixed (4-7):   {result['rating_segments']['mixed_4_7']:,} ({result['rating_segments']['mixed_pct']}%)")
print(f"  Haters (1-3):  {result['rating_segments']['haters_1_3']:,} ({result['rating_segments']['haters_pct']}%)")

print(f"\nüë• Gender Breakdown ({result['gender_breakdown']['coverage_pct']}% coverage):")
print(f"  Male:    {result['gender_breakdown']['male']:,}")
print(f"  Female:  {result['gender_breakdown']['female']:,}")
print(f"  Unknown: {result['gender_breakdown']['unknown']:,}")

# Windows are printed in alphabetical order of their label, not chronologically.
print(f"\nüìÖ Temporal Distribution:")
for window, count in sorted(result['temporal_segments'].items()):
    print(f"  {window:20} {count:4} reviews")

print(f"\nüí¨ Engagement:")
print(f"  Reviews with votes: {result['engagement']['reviews_with_votes']:,} ({result['engagement']['engagement_pct']}%)")
print(f"  Avg votes: {result['engagement']['avg_votes_per_review']:.1f}")
print(f"  Avg helpfulness: {result['engagement']['avg_helpfulness_ratio']:.2f}")

print("\n‚úÖ Module 1 test complete!")

---
## Module 2: What Resonated Analysis (Lovers)
**Purpose**: Generate data for Slide 3 - What Resonated (by demographic)

**Focus**: Reviews with Rating >= 8

**Outputs**:
- Total lovers count
- Gender segmentation of lovers
- Emotion profiles by gender (joy, trust, fear, anticipation)
- Love statements with quotes
- Writing style profiles (analytical vs emotional)
- Top themes/patterns

In [None]:
def what_resonated(movie_name):
    """
    Build the Slide 3 payload: What Resonated (by demographic)

    Only reviews rated 8 or above ("lovers") are analysed.

    Args:
        movie_name: Movie_Title value of the film to analyse

    Returns: Dictionary of lover insights (gender segmentation, emotion
        profiles, love-statement quotes, writing style, comparisons),
        or a short dictionary carrying a 'message' when the movie has
        no review rated 8 or above
    """
    all_reviews = get_movie_reviews(movie_name)
    fans = all_reviews[all_reviews['Rating'] >= 8].copy()
    fan_count = len(fans)

    # Nothing to profile without at least one high rating
    if fan_count == 0:
        return {
            'movie': movie_name,
            'total_lovers': 0,
            'message': 'No reviews with rating >= 8'
        }

    # Split the lovers by detected gender
    men = get_gender_segment(fans, 'male')
    women = get_gender_segment(fans, 'female')

    # Emotion names as reported; the dataset columns carry an 'emotion_' prefix
    emotion_names = ['joy', 'trust', 'fear', 'surprise',
                     'sadness', 'disgust', 'anger', 'anticipation']

    def emotion_profile(segment):
        """Mean score per emotion for one segment, keyed without the prefix"""
        profile = {}
        for name in emotion_names:
            profile[name] = safe_mean(segment['emotion_' + name])
        return profile

    overall_profile = emotion_profile(fans)
    men_profile = emotion_profile(men) if len(men) > 0 else {}
    women_profile = emotion_profile(women) if len(women) > 0 else {}

    # Lovers who wrote at least one love statement, plus their top quotes
    fans_with_love = fans[fans['love_count'] > 0]
    top_love_quotes = extract_quotes(fans_with_love, 'love_statements', limit=10)

    # Writing style averages
    reading_ease = safe_mean(fans['flesch_reading_ease'])
    grade_level = safe_mean(fans['flesch_kincaid_grade'])
    first_person = safe_mean(fans['first_person_ratio'])
    exclamations = safe_mean(fans['exclamation_count'])

    # First-person usage takes precedence over reading ease when labelling
    style_label = 'balanced'
    if first_person and first_person > 0.03:
        style_label = 'emotional/personal'
    elif reading_ease and reading_ease < 60:
        style_label = 'analytical/complex'

    # Lovers who compare the film with other films
    fans_comparing = fans[fans['has_comparisons'] == True]
    comparing_count = len(fans_comparing)
    love_count = len(fans_with_love)
    men_count = len(men)
    women_count = len(women)

    gender_segmentation = {
        'male': men_count,
        'male_pct': safe_percentage(men_count, fan_count),
        'female': women_count,
        'female_pct': safe_percentage(women_count, fan_count),
        'unknown': fan_count - men_count - women_count
    }

    emotion_profiles = {
        'all_lovers': overall_profile,
        'male_lovers': men_profile,
        'female_lovers': women_profile
    }

    love_statements = {
        'count': love_count,
        'percentage': safe_percentage(love_count, fan_count),
        'quotes': top_love_quotes
    }

    writing_style = {
        'profile': style_label,
        'avg_reading_ease': reading_ease,
        'avg_grade_level': grade_level,
        'avg_first_person_ratio': first_person,
        'avg_exclamations': exclamations
    }

    comparisons = {
        'reviews_with_comparisons': comparing_count,
        'comparison_pct': safe_percentage(comparing_count, fan_count)
    }

    return {
        'movie': movie_name,
        'total_lovers': fan_count,
        'lovers_pct_of_all_reviews': safe_percentage(fan_count, len(all_reviews)),
        'gender_segmentation': gender_segmentation,
        'emotion_profiles': emotion_profiles,
        'love_statements': love_statements,
        'writing_style': writing_style,
        'comparisons': comparisons
    }

# Cell-completion marker: shows that what_resonated() was defined.
print("‚úÖ what_resonated() function defined")

In [None]:
# Test Module 2 on The Witch
# Smoke test: runs what_resonated() for one title and prints each section.
# NOTE(review): assumes the title has at least one review rated >= 8; the
# early-return dictionary from what_resonated() lacks most keys read below.

print("üß™ Testing Module 2: What Resonated (Lovers)\n")
print("="*80)

test_movie = "The Witch"
result = what_resonated(test_movie)

print(f"\n‚ù§Ô∏è  WHAT RESONATED: {test_movie}")
print(f"\nLovers (Rating >= 8): {result['total_lovers']:,} ({result['lovers_pct_of_all_reviews']}% of all reviews)")

print(f"\nüë• Gender Breakdown of Lovers:")
print(f"  Male:   {result['gender_segmentation']['male']:,} ({result['gender_segmentation']['male_pct']}%)")
print(f"  Female: {result['gender_segmentation']['female']:,} ({result['gender_segmentation']['female_pct']}%)")
print(f"  Unknown: {result['gender_segmentation']['unknown']:,}")

print(f"\nüòä Emotion Profiles (All Lovers):")
for emotion, score in result['emotion_profiles']['all_lovers'].items():
    # Skips emotions whose score is None or exactly 0.
    if score:
        print(f"  {emotion.capitalize():12} {score:.3f}")

# The comparison is shown only when both gender profiles are non-empty.
if result['emotion_profiles']['male_lovers'] and result['emotion_profiles']['female_lovers']:
    print(f"\nüìä Gender Emotion Comparison (Top 3 emotions):")
    print(f"\n  Male Lovers:")
    # Sort by score descending, treating missing scores as 0, keep the top 3.
    male_sorted = sorted(result['emotion_profiles']['male_lovers'].items(), key=lambda x: x[1] if x[1] else 0, reverse=True)[:3]
    for emotion, score in male_sorted:
        if score:
            print(f"    {emotion.capitalize():12} {score:.3f}")
    
    print(f"\n  Female Lovers:")
    female_sorted = sorted(result['emotion_profiles']['female_lovers'].items(), key=lambda x: x[1] if x[1] else 0, reverse=True)[:3]
    for emotion, score in female_sorted:
        if score:
            print(f"    {emotion.capitalize():12} {score:.3f}")

print(f"\nüí¨ Love Statements:")
print(f"  Reviews with love statements: {result['love_statements']['count']} ({result['love_statements']['percentage']}%)")

if result['love_statements']['quotes']:
    print(f"\n  üìù Top 3 Love Quotes (by engagement):")
    for i, quote in enumerate(result['love_statements']['quotes'][:3], 1):
        print(f"\n  {i}. [{quote['review_id']}] by {quote['reviewer']} (Rating: {quote['rating']}/10, Votes: {quote['engagement']})")
        # Truncate long quote text to 200 characters for display.
        content = quote['content'][:200] + '...' if len(quote['content']) > 200 else quote['content']
        print(f"     {content}")

print(f"\n‚úçÔ∏è  Writing Style:")
print(f"  Profile: {result['writing_style']['profile']}")
print(f"  Avg Reading Ease: {result['writing_style']['avg_reading_ease']:.1f}")
print(f"  Avg Grade Level: {result['writing_style']['avg_grade_level']:.1f}")
print(f"  Avg First Person Usage: {result['writing_style']['avg_first_person_ratio']:.3f}")
print(f"  Avg Exclamations: {result['writing_style']['avg_exclamations']:.1f}")

print(f"\nüé¨ Comparisons:")
print(f"  Reviews mentioning other films: {result['comparisons']['reviews_with_comparisons']} ({result['comparisons']['comparison_pct']}%)")

print("\n‚úÖ Module 2 test complete!")

---
## Module 3: What Didn't Work Analysis (Haters)
**Purpose**: Generate data for Slide 4 - What Didn't Work (polarization points)

**Focus**: Reviews with Rating <= 3

**Outputs**:
- Total haters count
- Gender segmentation of haters
- Emotion profiles (anger, disgust, sadness)
- Hate statements and wish statements with quotes
- Comparison films (what did they expect vs what they got)
- Common pain points

In [None]:
def what_didnt_work(movie_name):
    """
    Generate Slide 4: What Didn't Work (polarization points)

    Analyzes reviews with Rating <= 3 ("haters").

    Args:
        movie_name: Movie_Title value of the film to analyse

    Returns: Dictionary with hater insights (gender segmentation,
        negative-emotion profiles, hate/wish statement quotes, films
        mentioned for comparison, writing indicators), or a short
        dictionary carrying a 'message' when the movie has no review
        rated 3 or below
    """
    # Function-scope imports keep this notebook cell self-contained
    import ast
    from collections import Counter

    reviews = get_movie_reviews(movie_name)
    haters = reviews[reviews['Rating'] <= 3].copy()

    total_haters = len(haters)

    if total_haters == 0:
        return {
            'movie': movie_name,
            'total_haters': 0,
            'message': 'No reviews with rating <= 3'
        }

    # Gender segmentation
    male_haters = get_gender_segment(haters, 'male')
    female_haters = get_gender_segment(haters, 'female')

    # Emotion profiles - focus on negative emotions
    negative_emotions = ['emotion_anger', 'emotion_disgust', 'emotion_sadness', 'emotion_fear']

    all_emotions = {col.replace('emotion_', ''): safe_mean(haters[col]) for col in negative_emotions}
    male_emotions = {col.replace('emotion_', ''): safe_mean(male_haters[col]) for col in negative_emotions} if len(male_haters) > 0 else {}
    female_emotions = {col.replace('emotion_', ''): safe_mean(female_haters[col]) for col in negative_emotions} if len(female_haters) > 0 else {}

    # Hate statements
    haters_with_hate = haters[haters['hate_count'] > 0]
    hate_quotes = extract_quotes(haters_with_hate, 'hate_statements', limit=10)

    # Wish statements ("I wish it had...")
    haters_with_wish = haters[haters['wish_count'] > 0]
    wish_quotes = extract_quotes(haters_with_wish, 'wish_statements', limit=10)

    # Comparison films - what did they expect?
    haters_with_comparisons = haters[haters['has_comparisons'] == True]
    comparison_pct = safe_percentage(len(haters_with_comparisons), total_haters)

    # Collect the films named by haters; cells hold stringified lists
    mentioned_movies = []
    if 'movies_mentioned' in haters.columns:
        for raw in haters['movies_mentioned'].dropna():
            if not isinstance(raw, str) or raw == '[]':
                continue
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Malformed cell: skip it rather than abort the analysis
                continue
            # Only real collections count; extending with a bare string
            # would add its individual characters as "movies"
            if isinstance(parsed, (list, tuple, set)):
                mentioned_movies.extend(parsed)

    movie_mentions = dict(Counter(mentioned_movies).most_common(10))

    # Writing style
    avg_reading_ease = safe_mean(haters['flesch_reading_ease'])
    avg_grade_level = safe_mean(haters['flesch_kincaid_grade'])
    avg_exclamations = safe_mean(haters['exclamation_count'])
    avg_caps = safe_mean(haters['caps_word_count'])

    # Question count - indicates confusion/disappointment
    haters_with_questions = haters[haters['question_count'] > 0]
    question_pct = safe_percentage(len(haters_with_questions), total_haters)

    return {
        'movie': movie_name,
        'total_haters': total_haters,
        'haters_pct_of_all_reviews': safe_percentage(total_haters, len(reviews)),

        'gender_segmentation': {
            'male': len(male_haters),
            'male_pct': safe_percentage(len(male_haters), total_haters),
            'female': len(female_haters),
            'female_pct': safe_percentage(len(female_haters), total_haters),
            'unknown': total_haters - len(male_haters) - len(female_haters)
        },

        'emotion_profiles': {
            'all_haters': all_emotions,
            'male_haters': male_emotions,
            'female_haters': female_emotions
        },

        'hate_statements': {
            'count': len(haters_with_hate),
            'percentage': safe_percentage(len(haters_with_hate), total_haters),
            'quotes': hate_quotes
        },

        'wish_statements': {
            'count': len(haters_with_wish),
            'percentage': safe_percentage(len(haters_with_wish), total_haters),
            'quotes': wish_quotes
        },

        'comparisons': {
            'reviews_with_comparisons': len(haters_with_comparisons),
            'comparison_pct': comparison_pct,
            'mentioned_movies': movie_mentions
        },

        'writing_indicators': {
            'avg_reading_ease': avg_reading_ease,
            'avg_grade_level': avg_grade_level,
            'avg_exclamations': avg_exclamations,
            'avg_caps_words': avg_caps,
            'reviews_with_questions': len(haters_with_questions),
            'question_pct': question_pct
        }
    }

# Cell-completion marker: shows that what_didnt_work() was defined.
print("‚úÖ what_didnt_work() function defined")

In [None]:
# Test Module 3 on The Witch
# Smoke test: runs what_didnt_work() for one title and prints each section.
# NOTE(review): assumes the title has at least one review rated <= 3; the
# early-return dictionary from what_didnt_work() lacks most keys read below.

print("üß™ Testing Module 3: What Didn't Work (Haters)\n")
print("="*80)

test_movie = "The Witch"
result = what_didnt_work(test_movie)

print(f"\nüíî WHAT DIDN'T WORK: {test_movie}")
print(f"\nHaters (Rating <= 3): {result['total_haters']:,} ({result['haters_pct_of_all_reviews']}% of all reviews)")

print(f"\nüë• Gender Breakdown of Haters:")
print(f"  Male:   {result['gender_segmentation']['male']:,} ({result['gender_segmentation']['male_pct']}%)")
print(f"  Female: {result['gender_segmentation']['female']:,} ({result['gender_segmentation']['female_pct']}%)")
print(f"  Unknown: {result['gender_segmentation']['unknown']:,}")

print(f"\nüò† Negative Emotion Profiles (All Haters):")
# Strongest emotion first; missing scores sort as 0 and are not printed.
for emotion, score in sorted(result['emotion_profiles']['all_haters'].items(), key=lambda x: x[1] if x[1] else 0, reverse=True):
    if score:
        print(f"  {emotion.capitalize():12} {score:.3f}")

print(f"\nüí¨ Hate Statements:")
print(f"  Reviews with hate statements: {result['hate_statements']['count']} ({result['hate_statements']['percentage']}%)")

if result['hate_statements']['quotes']:
    print(f"\n  üìù Top 3 Hate Quotes (by engagement):")
    for i, quote in enumerate(result['hate_statements']['quotes'][:3], 1):
        print(f"\n  {i}. [{quote['review_id']}] by {quote['reviewer']} (Rating: {quote['rating']}/10, Votes: {quote['engagement']})")
        # Truncate long quote text to 200 characters for display.
        content = quote['content'][:200] + '...' if len(quote['content']) > 200 else quote['content']
        print(f"     {content}")

print(f"\nüôè Wish Statements:")
print(f"  Reviews with wish statements: {result['wish_statements']['count']} ({result['wish_statements']['percentage']}%)")

if result['wish_statements']['quotes']:
    print(f"\n  üìù Top 3 Wish Quotes (by engagement):")
    for i, quote in enumerate(result['wish_statements']['quotes'][:3], 1):
        print(f"\n  {i}. [{quote['review_id']}] by {quote['reviewer']} (Rating: {quote['rating']}/10, Votes: {quote['engagement']})")
        content = quote['content'][:200] + '...' if len(quote['content']) > 200 else quote['content']
        print(f"     {content}")

print(f"\nüé¨ Comparisons (What They Expected):")
print(f"  Reviews mentioning other films: {result['comparisons']['reviews_with_comparisons']} ({result['comparisons']['comparison_pct']}%)")

if result['comparisons']['mentioned_movies']:
    print(f"\n  Most Mentioned Films:")
    # The function returns up to 10 films; only the first 5 are shown here.
    for movie, count in list(result['comparisons']['mentioned_movies'].items())[:5]:
        print(f"    {movie:40} {count:3} mentions")

print(f"\n‚úçÔ∏è  Writing Indicators (Emotional Intensity):")
print(f"  Avg Reading Ease: {result['writing_indicators']['avg_reading_ease']:.1f}")
print(f"  Avg Grade Level: {result['writing_indicators']['avg_grade_level']:.1f}")
print(f"  Avg Exclamations: {result['writing_indicators']['avg_exclamations']:.1f}")
print(f"  Avg CAPS Words: {result['writing_indicators']['avg_caps_words']:.1f}")
print(f"  Reviews with Questions: {result['writing_indicators']['reviews_with_questions']} ({result['writing_indicators']['question_pct']}%)")

print("\n‚úÖ Module 3 test complete!")

---
## 🎯 Next Steps

**Modules 1-3 Complete!** ✅

**Ready to build:**
- Module 4: Polarization Analysis
- Module 5: Marketing Disconnect Analysis
- Module 6: Risk Factors Analysis
- Module 7: Target Audience Recommendation
- Module 8: Export Functions (generate JSON files)
- Module 9: Cross-Movie Roll-Up

**Current Status:**
- ✅ Data loading and helper functions working
- ✅ Audience breakdown analysis (Slide 1 data)
- ✅ What resonated analysis (Slide 3 data)
- ✅ What didn't work analysis (Slide 4 data)
- ✅ Gender segmentation functioning at 80.4% coverage
- ✅ Quote extraction with traceability (Review_IDs)
- ✅ Emotion profiling by demographic

**Testing passed on:** The Witch (1,105 reviews)

----

# MODULE 4: POLARIZATION ANALYSIS

**Purpose**: Generate data for Slide 4 (continued) - What Divides Audiences

**Approach**: Compare lovers (8-10) vs haters (1-3) to identify polarizing elements

**Outputs**:
- Polarization metrics (rating variance, vote distribution)
- Emotion divergence (which emotions differ most between lovers/haters)
- Theme contradictions (what lovers praise vs haters criticize)
- Gender-based polarization patterns
- Review timing patterns (early vs late polarization)

In [None]:
def polarization_analysis(movie_name):
    """
    Generate Slide 4 (continued): Polarization Analysis

    Compares lovers (8-10) vs haters (1-3) to identify what divides audiences

    Args:
        movie_name: Movie_Title value of the film to analyse

    Returns: Dictionary with polarization insights: overall metrics and
        level, emotion divergence, writing style differences, engagement
        patterns, temporal and gender polarization, and comparison usage.
        Values that cannot be computed are None. A legitimate 0.0 is
        treated as data, not as "missing".
    """
    reviews = get_movie_reviews(movie_name)

    # Get lovers and haters
    lovers = reviews[reviews['Rating'] >= 8].copy()
    haters = reviews[reviews['Rating'] <= 3].copy()

    total_reviews = len(reviews)

    def finite_or_none(value):
        """Return value as float, or None when pandas produced NaN"""
        return None if pd.isna(value) else float(value)

    def group_gap(column):
        """Lovers/haters means for a column plus the haters-minus-lovers gap"""
        lovers_value = safe_mean(lovers[column])
        haters_value = safe_mean(haters[column])
        comparable = lovers_value is not None and haters_value is not None
        return {
            'lovers': lovers_value,
            'haters': haters_value,
            'difference': haters_value - lovers_value if comparable else None
        }

    # Basic polarization metrics (NaN for fewer than two ratings -> None)
    rating_variance = finite_or_none(reviews['Rating'].var())
    rating_std = finite_or_none(reviews['Rating'].std())

    # Bimodal distribution check (high variance + gap in middle ratings)
    middle_reviews = len(reviews[(reviews['Rating'] >= 4) & (reviews['Rating'] <= 7)])
    middle_pct = safe_percentage(middle_reviews, total_reviews)

    # Determine polarization level. A middle share of exactly 0.0 is the
    # most polarized case, so test for None explicitly instead of truthiness.
    if (rating_variance is not None and rating_variance > 8
            and middle_pct is not None and middle_pct < 30):
        polarization_level = "HIGHLY_POLARIZING"
    elif rating_variance is not None and rating_variance > 6:
        polarization_level = "MODERATELY_POLARIZING"
    else:
        polarization_level = "CONSENSUS"

    # Emotion divergence analysis
    emotion_cols = ['emotion_joy', 'emotion_trust', 'emotion_fear', 'emotion_surprise',
                    'emotion_sadness', 'emotion_disgust', 'emotion_anger', 'emotion_anticipation']

    emotion_divergence = {}
    for col in emotion_cols:
        lovers_mean = safe_mean(lovers[col])
        haters_mean = safe_mean(haters[col])

        # Only emotions measurable in both groups can diverge
        if lovers_mean is not None and haters_mean is not None:
            emotion_divergence[col.replace('emotion_', '')] = {
                'lovers_score': lovers_mean,
                'haters_score': haters_mean,
                'divergence': abs(lovers_mean - haters_mean)
            }

    # Sort by divergence to find most polarizing emotions
    sorted_emotions = sorted(emotion_divergence.items(),
                             key=lambda item: item[1]['divergence'],
                             reverse=True)

    # Writing style differences (each mean is computed once per group)
    style_differences = {
        'reading_ease': group_gap('flesch_reading_ease'),
        'grade_level': group_gap('flesch_kincaid_grade'),
        'first_person_usage': group_gap('first_person_ratio'),
        'exclamations': group_gap('exclamation_count')
    }

    # Engagement patterns
    lovers_avg_votes = safe_mean(lovers['total_votes'])
    haters_avg_votes = safe_mean(haters['total_votes'])

    lovers_polarization = safe_mean(lovers['vote_polarization'])
    haters_polarization = safe_mean(haters['vote_polarization'])

    # Temporal polarization (early vs late reviews)
    early_reviews = reviews[reviews['review_window'].isin(['Opening Year', 'Year 2'])]
    late_reviews = reviews[reviews['review_window'].isin(['Years 4-5', '5+ Years'])]

    early_avg_rating = safe_mean(early_reviews['Rating'])
    late_avg_rating = safe_mean(late_reviews['Rating'])

    temporal_shift = None
    if early_avg_rating is not None and late_avg_rating is not None:
        temporal_shift = late_avg_rating - early_avg_rating

    # 'stable' also covers the case where no shift could be computed
    if temporal_shift is not None and temporal_shift > 0:
        shift_direction = 'improved'
    elif temporal_shift is not None and temporal_shift < 0:
        shift_direction = 'declined'
    else:
        shift_direction = 'stable'

    # Gender polarization
    male_reviews = reviews[reviews['username_gender_hint'] == 'male']
    female_reviews = reviews[reviews['username_gender_hint'] == 'female']

    male_avg_rating = safe_mean(male_reviews['Rating'])
    female_avg_rating = safe_mean(female_reviews['Rating'])

    gender_rating_gap = None
    if male_avg_rating is not None and female_avg_rating is not None:
        gender_rating_gap = abs(male_avg_rating - female_avg_rating)

    # A gap of more than one rating point is reported as significant
    gap_is_significant = gender_rating_gap is not None and gender_rating_gap > 1.0

    # Identify contradictions (lovers praise vs haters criticize):
    # how differently the two groups use comparisons to other films
    lovers_comparisons = lovers[lovers['has_comparisons'] == True]
    haters_comparisons = haters[haters['has_comparisons'] == True]

    comparison_contradiction_pct = None
    if len(lovers) > 0 and len(haters) > 0:
        lovers_compare_pct = len(lovers_comparisons) / len(lovers) * 100
        haters_compare_pct = len(haters_comparisons) / len(haters) * 100
        comparison_contradiction_pct = abs(lovers_compare_pct - haters_compare_pct)

    return {
        'movie': movie_name,

        'polarization_metrics': {
            'level': polarization_level,
            'rating_variance': rating_variance,
            'rating_std_dev': rating_std,
            'middle_ground_pct': middle_pct,
            'lovers_pct': safe_percentage(len(lovers), total_reviews),
            'haters_pct': safe_percentage(len(haters), total_reviews)
        },

        'emotion_divergence': {
            'top_5_divergent': list(sorted_emotions[:5]),
            'all_emotions': emotion_divergence
        },

        'writing_style_differences': style_differences,

        'engagement_patterns': {
            'lovers_avg_votes': lovers_avg_votes,
            'haters_avg_votes': haters_avg_votes,
            'lovers_vote_polarization': lovers_polarization,
            'haters_vote_polarization': haters_polarization
        },

        'temporal_polarization': {
            'early_avg_rating': early_avg_rating,
            'late_avg_rating': late_avg_rating,
            'temporal_shift': temporal_shift,
            'shift_direction': shift_direction
        },

        'gender_polarization': {
            'male_avg_rating': male_avg_rating,
            'female_avg_rating': female_avg_rating,
            'gender_rating_gap': gender_rating_gap,
            'gap_significance': 'significant' if gap_is_significant else 'minimal'
        },

        'theme_contradictions': {
            # safe_percentage already yields None for an empty group
            'lovers_use_comparisons_pct': safe_percentage(len(lovers_comparisons), len(lovers)),
            'haters_use_comparisons_pct': safe_percentage(len(haters_comparisons), len(haters)),
            'comparison_gap': comparison_contradiction_pct
        }
    }

print("‚úÖ polarization_analysis() function defined")

In [None]:
# Test Module 4 on The Witch
#
# Smoke-test cell: runs polarization_analysis() for one film and prints every
# section of the returned dictionary for manual inspection.


def _fmt_metric(value, spec):
    """Format ``value`` with format ``spec``; return 'N/A' when it is None.

    polarization_analysis() reports None for metrics it could not compute
    (for example temporal_shift when a review window is empty), and applying
    a numeric format spec to None raises TypeError.
    """
    return 'N/A' if value is None else format(value, spec)


print("üß™ Testing Module 4: Polarization Analysis\n")
print("="*80)

test_movie = "The Witch"
result = polarization_analysis(test_movie)

print(f"\n‚ö° POLARIZATION ANALYSIS: {test_movie}")

metrics = result['polarization_metrics']
print(f"\nüìä Polarization Level: {metrics['level']}")
print(f"  Rating Variance: {_fmt_metric(metrics['rating_variance'], '.2f')}")
print(f"  Rating Std Dev: {_fmt_metric(metrics['rating_std_dev'], '.2f')}")
print(f"  Middle Ground (4-7): {metrics['middle_ground_pct']}%")
print(f"  Lovers (8-10): {metrics['lovers_pct']}%")
print(f"  Haters (1-3): {metrics['haters_pct']}%")

print(f"\nüòäüò† Emotion Divergence (Top 5 Most Polarizing):")
for emotion, data in result['emotion_divergence']['top_5_divergent']:
    print(f"\n  {emotion.capitalize()}:")
    print(f"    Lovers: {_fmt_metric(data['lovers_score'], '.3f')}")
    print(f"    Haters: {_fmt_metric(data['haters_score'], '.3f')}")
    print(f"    Gap: {_fmt_metric(data['divergence'], '.3f')}")

print(f"\n‚úçÔ∏è  Writing Style Differences:")
for metric, data in result['writing_style_differences'].items():
    # Metrics without a computable difference are skipped entirely
    if data['difference'] is not None:
        print(f"\n  {metric.replace('_', ' ').title()}:")
        print(f"    Lovers: {_fmt_metric(data['lovers'], '.2f')}")
        print(f"    Haters: {_fmt_metric(data['haters'], '.2f')}")
        print(f"    Difference: {_fmt_metric(data['difference'], '+.2f')}")

engagement = result['engagement_patterns']
print(f"\nüí¨ Engagement Patterns:")
print(f"  Lovers avg votes: {_fmt_metric(engagement['lovers_avg_votes'], '.1f')}")
print(f"  Haters avg votes: {_fmt_metric(engagement['haters_avg_votes'], '.1f')}")
print(f"  Lovers vote polarization: {_fmt_metric(engagement['lovers_vote_polarization'], '.3f')}")
print(f"  Haters vote polarization: {_fmt_metric(engagement['haters_vote_polarization'], '.3f')}")

temporal = result['temporal_polarization']
print(f"\nüìÖ Temporal Polarization:")
print(f"  Early reviews (Opening/Year 2) avg: {_fmt_metric(temporal['early_avg_rating'], '.2f')}")
print(f"  Late reviews (Years 4-5/5+) avg: {_fmt_metric(temporal['late_avg_rating'], '.2f')}")
print(f"  Temporal shift: {_fmt_metric(temporal['temporal_shift'], '+.2f')} ({temporal['shift_direction']})")

gender = result['gender_polarization']
print(f"\nüë• Gender Polarization:")
print(f"  Male avg rating: {_fmt_metric(gender['male_avg_rating'], '.2f')}")
print(f"  Female avg rating: {_fmt_metric(gender['female_avg_rating'], '.2f')}")
print(f"  Gender gap: {_fmt_metric(gender['gender_rating_gap'], '.2f')} ({gender['gap_significance']})")

themes = result['theme_contradictions']
print(f"\nüé¨ Theme Contradictions:")
print(f"  Lovers using comparisons: {themes['lovers_use_comparisons_pct']}%")
print(f"  Haters using comparisons: {themes['haters_use_comparisons_pct']}%")
print(f"  Comparison usage gap: {_fmt_metric(themes['comparison_gap'], '.1f')}%")

print("\n‚úÖ Module 4 test complete!")

---
## Module 5: Marketing Disconnect Analysis
**Purpose**: Generate data for Slide 5 - Marketing vs Audience Focus

**Approach**: Identify gaps between what marketing emphasized and what audiences discussed

**Outputs**:
- Comparison film mentions (what audiences compared to)
- Expectation indicators (wish statements, disappointment markers)
- Theme emphasis gaps (marketing focus vs review focus)
- Sentiment by review timing (early disappointment vs later satisfaction)

**Note**: This module requires manual input of marketing themes for complete analysis. 
Without marketing data, it focuses on audience discussion patterns and expectation mismatches.

In [None]:
def marketing_disconnect_analysis(movie_name, marketing_themes=None):
    """
    Generate Slide 5: Marketing Disconnect Analysis

    Identifies gaps between audience expectations/discussions and film delivery

    Args:
        movie_name: Name of the movie to analyze
        marketing_themes: Optional list of themes emphasized in marketing
                         Example: ['witch', 'period piece', 'family drama', 'horror']
                         The themes are only echoed back in the result; no
                         theme-frequency analysis is performed here.

    Returns: Dictionary with marketing disconnect insights, with the keys
        'movie', 'comparison_films', 'expectation_mismatch',
        'temporal_sentiment', 'comparison_patterns', 'sentiment_polarity',
        'marketing_theme_analysis' and 'disappointment_indicators'
    """
    import ast
    from collections import Counter

    reviews = get_movie_reviews(movie_name)

    # Segment by rating for expectation analysis (read-only views are enough)
    lovers = reviews[reviews['Rating'] >= 8]
    haters = reviews[reviews['Rating'] <= 3]

    # 1. COMPARISON FILMS - What did audiences compare this to?
    # 'movies_mentioned' holds stringified Python lists, e.g. "['Film A', 'Film B']"
    title_lower = movie_name.lower()
    mention_counts = Counter()
    for raw in reviews['movies_mentioned'].dropna():
        if not isinstance(raw, str) or raw == '[]':
            continue
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Malformed cell: skip it rather than abort the whole analysis
            continue
        if not isinstance(parsed, (list, tuple, set)):
            # A bare string would otherwise be counted character by character
            continue
        # Drop self-references BEFORE ranking so they cannot occupy top-15 slots
        mention_counts.update(
            film for film in parsed
            if isinstance(film, str) and title_lower not in film.lower()
        )

    top_films = mention_counts.most_common(15)
    comparison_films_filtered = dict(top_films)

    # 2. EXPECTATION MISMATCH INDICATORS
    # Wish statements by rating group
    lovers_with_wish = lovers[lovers['wish_count'] > 0]
    haters_with_wish = haters[haters['wish_count'] > 0]

    wish_quotes_lovers = extract_quotes(lovers_with_wish, 'wish_statements', limit=5)
    wish_quotes_haters = extract_quotes(haters_with_wish, 'wish_statements', limit=5)

    # Question patterns (confusion/uncertainty)
    lovers_with_questions = lovers[lovers['question_count'] > 0]
    haters_with_questions = haters[haters['question_count'] > 0]

    # 3. TEMPORAL SENTIMENT ANALYSIS
    # Early reviews often reflect marketing-driven expectations
    def _window_sentiment(window_reviews):
        """Summarise rating, VADER score and hate/wish rates for one review window."""
        window_size = len(window_reviews)
        return {
            'avg_rating': safe_mean(window_reviews['Rating']),
            'avg_vader_compound': safe_mean(window_reviews['vader_compound']),
            'hate_statement_pct': safe_percentage(
                len(window_reviews[window_reviews['hate_count'] > 0]), window_size),
            'wish_statement_pct': safe_percentage(
                len(window_reviews[window_reviews['wish_count'] > 0]), window_size)
        }

    early_reviews = reviews[reviews['review_window'].isin(['Opening Year', 'Year 2'])]
    late_reviews = reviews[reviews['review_window'].isin(['Years 4-5', '5+ Years'])]

    early_sentiment = _window_sentiment(early_reviews)
    late_sentiment = _window_sentiment(late_reviews)

    # A falsy average is treated as "no data", as in the rest of the notebook
    # (assumes ratings are on a 1-10 scale, so a real mean is never 0 - TODO confirm)
    rating_shift = None
    if early_sentiment['avg_rating'] and late_sentiment['avg_rating']:
        rating_shift = late_sentiment['avg_rating'] - early_sentiment['avg_rating']

    # Equal averages are 'stable', not 'declined'
    if rating_shift is None:
        shift_interpretation = 'unknown'
    elif rating_shift > 0:
        shift_interpretation = 'improved'
    elif rating_shift < 0:
        shift_interpretation = 'declined'
    else:
        shift_interpretation = 'stable'

    # 4. COMPARISON USAGE BY RATING GROUP
    # Do different groups reference comparisons differently?
    lovers_comparisons = lovers[lovers['has_comparisons'] == True]
    haters_comparisons = haters[haters['has_comparisons'] == True]

    lovers_compare_pct = safe_percentage(len(lovers_comparisons), len(lovers))
    haters_compare_pct = safe_percentage(len(haters_comparisons), len(haters))

    comparison_gap = None
    if (len(lovers) > 0 and len(haters) > 0
            and lovers_compare_pct is not None and haters_compare_pct is not None):
        comparison_gap = lovers_compare_pct - haters_compare_pct

    comparison_patterns = {
        'lovers_pct': lovers_compare_pct,
        'haters_pct': haters_compare_pct,
        'gap': comparison_gap
    }

    # 5. SENTIMENT POLARITY (positive vs negative language)
    lovers_vader = safe_mean(lovers['vader_compound'])
    haters_vader = safe_mean(haters['vader_compound'])

    # VADER compound can legitimately be 0.0, so test against None explicitly
    sentiment_gap = None
    if lovers_vader is not None and haters_vader is not None:
        sentiment_gap = lovers_vader - haters_vader

    # 6. MARKETING THEME ANALYSIS (if provided)
    if marketing_themes:
        marketing_analysis = {
            'provided_themes': marketing_themes,
            'note': 'Theme frequency analysis requires text mining implementation',
            'status': 'Manual review recommended'
        }
    else:
        marketing_analysis = {
            'status': 'No marketing themes provided',
            'note': 'Analysis limited to audience discussion patterns'
        }

    # 7. DISAPPOINTMENT INDICATORS
    # Reviews with a wish statement + low rating = expectation mismatch
    disappointed = reviews[(reviews['Rating'] <= 5) & (reviews['wish_count'] > 0)]
    disappointment_rate = safe_percentage(len(disappointed), len(reviews))

    disappointed_quotes = extract_quotes(disappointed, 'wish_statements', limit=10)

    return {
        'movie': movie_name,

        'comparison_films': {
            'all_mentions': comparison_films_filtered,
            'top_5': dict(top_films[:5]),
            'total_unique_films': len(comparison_films_filtered),
            'total_mentions': sum(comparison_films_filtered.values())
        },

        'expectation_mismatch': {
            'lovers_with_wishes': {
                'count': len(lovers_with_wish),
                'percentage': safe_percentage(len(lovers_with_wish), len(lovers)),
                'sample_quotes': wish_quotes_lovers
            },
            'haters_with_wishes': {
                'count': len(haters_with_wish),
                'percentage': safe_percentage(len(haters_with_wish), len(haters)),
                'sample_quotes': wish_quotes_haters
            },
            'confusion_indicators': {
                'lovers_with_questions_pct': safe_percentage(len(lovers_with_questions), len(lovers)),
                'haters_with_questions_pct': safe_percentage(len(haters_with_questions), len(haters))
            }
        },

        'temporal_sentiment': {
            'early_reviews': early_sentiment,
            'late_reviews': late_sentiment,
            'rating_shift': rating_shift,
            'interpretation': shift_interpretation
        },

        'comparison_patterns': comparison_patterns,

        'sentiment_polarity': {
            'lovers_vader': lovers_vader,
            'haters_vader': haters_vader,
            'sentiment_gap': sentiment_gap
        },

        'marketing_theme_analysis': marketing_analysis,

        'disappointment_indicators': {
            'disappointed_reviewer_pct': disappointment_rate,
            'disappointed_count': len(disappointed),
            'sample_disappointment_quotes': disappointed_quotes
        }
    }

print("‚úÖ marketing_disconnect_analysis() function defined")

In [None]:
# Test Module 5 on The Witch (without marketing themes)
#
# Smoke-test cell: runs marketing_disconnect_analysis() for one film and
# prints every section of the returned dictionary for manual inspection.


def _fmt_metric(value, spec):
    """Format ``value`` with format ``spec``; return 'N/A' when it is None.

    marketing_disconnect_analysis() reports None for metrics it could not
    compute (for example rating_shift, gap and sentiment_gap), and applying
    a numeric format spec to None raises TypeError.
    """
    return 'N/A' if value is None else format(value, spec)


print("üß™ Testing Module 5: Marketing Disconnect Analysis\n")
print("="*80)

test_movie = "The Witch"
result = marketing_disconnect_analysis(test_movie)

print(f"\nüìä MARKETING DISCONNECT ANALYSIS: {test_movie}")
print(f"\nNote: Analysis performed WITHOUT marketing theme data")
print(f"      Focus is on audience discussion patterns and expectation gaps\n")

comparison_films = result['comparison_films']
print(f"üé¨ Comparison Films (What Audiences Referenced):")
print(f"  Total unique films mentioned: {comparison_films['total_unique_films']}")
print(f"  Total mentions: {comparison_films['total_mentions']}")
print(f"\n  Top 5 Most Compared Films:")
for i, (film, count) in enumerate(list(comparison_films['top_5'].items())[:5], 1):
    print(f"    {i}. {film:40} ({count:3} mentions)")

mismatch = result['expectation_mismatch']
print(f"\n‚ùì Expectation Mismatch Indicators:")
# Same report for both rating groups: count/percentage plus the first sample quote
for group_key, group_label, reviewer_label in (
    ('lovers_with_wishes', 'Lovers (8-10)', 'lover'),
    ('haters_with_wishes', 'Haters (1-3)', 'hater'),
):
    group = mismatch[group_key]
    print(f"\n  {group_label} expressing wishes:")
    print(f"    Count: {group['count']} ({group['percentage']}%)")

    if group['sample_quotes']:
        print(f"\n    Sample wish from {reviewer_label}:")
        quote = group['sample_quotes'][0]
        # Truncate long quotes to 150 characters for readable output
        content = quote['content'][:150] + '...' if len(quote['content']) > 150 else quote['content']
        print(f"      [{quote['review_id']}] Rating: {quote['rating']}/10")
        print(f"      {content}")

print(f"\n  Confusion indicators (question usage):")
print(f"    Lovers with questions: {mismatch['confusion_indicators']['lovers_with_questions_pct']}%")
print(f"    Haters with questions: {mismatch['confusion_indicators']['haters_with_questions_pct']}%")

temporal = result['temporal_sentiment']
print(f"\nüìÖ Temporal Sentiment Shift:")
for window_key, window_header in (
    ('early_reviews', "  Early reviews (Opening/Year 2):"),
    ('late_reviews', "\n  Late reviews (Years 4-5/5+):"),
):
    window = temporal[window_key]
    print(window_header)
    print(f"    Avg rating: {_fmt_metric(window['avg_rating'], '.2f')}")
    print(f"    VADER sentiment: {_fmt_metric(window['avg_vader_compound'], '.3f')}")
    print(f"    Hate statements: {window['hate_statement_pct']}%")
    print(f"    Wish statements: {window['wish_statement_pct']}%")

print(f"\n  Rating shift: {_fmt_metric(temporal['rating_shift'], '+.2f')} ({temporal['interpretation']})")

patterns = result['comparison_patterns']
print(f"\nüîç Comparison Usage Patterns:")
print(f"  Lovers using comparisons: {patterns['lovers_pct']}%")
print(f"  Haters using comparisons: {patterns['haters_pct']}%")
print(f"  Gap: {_fmt_metric(patterns['gap'], '+.1f')} percentage points")

polarity = result['sentiment_polarity']
print(f"\nüòäüò† Sentiment Polarity (VADER):")
print(f"  Lovers: {_fmt_metric(polarity['lovers_vader'], '.3f')}")
print(f"  Haters: {_fmt_metric(polarity['haters_vader'], '.3f')}")
print(f"  Gap: {_fmt_metric(polarity['sentiment_gap'], '.3f')}")

disappointment = result['disappointment_indicators']
print(f"\nüíî Disappointment Indicators:")
print(f"  Disappointed reviewers (low rating + wishes): {disappointment['disappointed_count']} ({disappointment['disappointed_reviewer_pct']}%)")

if disappointment['sample_disappointment_quotes']:
    print(f"\n  Top 3 Disappointment Quotes (by engagement):")
    for i, quote in enumerate(disappointment['sample_disappointment_quotes'][:3], 1):
        print(f"\n    {i}. [{quote['review_id']}] Rating: {quote['rating']}/10, Votes: {quote['engagement']}")
        content = quote['content'][:150] + '...' if len(quote['content']) > 150 else quote['content']
        print(f"       {content}")

print(f"\nüì¢ Marketing Theme Analysis:")
print(f"  Status: {result['marketing_theme_analysis']['status']}")
print(f"  Note: {result['marketing_theme_analysis']['note']}")

print("\n‚úÖ Module 5 test complete!")

In [None]:
# Test Module 5 WITH marketing themes (optional - demonstrates full functionality)
#
# Runs the same analysis as the previous test, but passes an example theme
# list so the 'marketing_theme_analysis' section includes 'provided_themes'.

print("üß™ Testing Module 5: Marketing Disconnect Analysis WITH Themes\n")
print("=" * 80)

# Example marketing themes for The Witch
# (These would come from actual marketing materials)
marketing_themes_example = [
    'witch', 'period piece', 'new england', '1630s',
    'family', 'horror', 'supernatural', 'folktale',
]

test_movie = "The Witch"
result = marketing_disconnect_analysis(
    test_movie,
    marketing_themes=marketing_themes_example,
)

print(f"\nüìä MARKETING DISCONNECT ANALYSIS: {test_movie}")
print("\nNote: Analysis performed WITH example marketing themes\n")

print("üì¢ Marketing Theme Analysis:")
theme_report = result['marketing_theme_analysis']
print(f"  Status: {theme_report['status']}")
joined_themes = ', '.join(theme_report['provided_themes'])
print(f"  Provided themes: {joined_themes}")
print(f"  Note: {theme_report['note']}")

print("\n‚úÖ Full Module 5 test complete!")

---

## Module 6: Risk Factors Analysis

In [None]:
def extract_contextual_quotes(df_filtered, search_terms=None, limit=5, context_sentences=2):
    """
    Extract quotes from full review text with surrounding context

    Candidate reviews are taken in descending 'total_votes' order (at most
    limit * 3 of them). Reviews containing a search term are returned first,
    each quoted as the matching sentence plus its neighbours. Reviews without
    a match are quoted by their opening sentences and are only used to fill
    slots that matching reviews left empty.

    Args:
        df_filtered: Pre-filtered DataFrame (already segmented by risk criteria).
                     Reads the columns Review_ID, Reviewer, Rating, Review_Title,
                     Review_Text and total_votes
        search_terms: List of keywords to search for (optional - if None, just returns top reviews).
                      Matching is a case-insensitive substring test; empty terms are ignored
        limit: Maximum number of quotes to return
        context_sentences: Number of sentences before/after to include for context

    Returns: List of dicts with review_id, reviewer, rating (None when the
        rating is missing), review_title, quote, engagement and matched_term
        (None for opening-of-review fallback quotes)
    """
    import re

    if limit <= 0:
        return []

    def _build_quote(row, quote_text, matched_term):
        """Assemble one quote record, tolerating missing rating/title/votes."""
        rating = row['Rating']
        title = row['Review_Title']
        votes = row['total_votes']
        return {
            'review_id': row['Review_ID'],
            'reviewer': row['Reviewer'],
            'rating': int(rating) if pd.notna(rating) else None,
            'review_title': title if pd.notna(title) else '',
            'quote': quote_text,
            'engagement': int(votes) if pd.notna(votes) else 0,
            'matched_term': matched_term
        }

    # Sort by engagement to get most impactful reviews; over-fetch so that
    # matching reviews below the top `limit` can still be found
    pool_size = limit * 3
    if len(df_filtered) > pool_size:
        candidates = df_filtered.nlargest(pool_size, 'total_votes')
    else:
        candidates = df_filtered.sort_values('total_votes', ascending=False)

    # Pair each term with its lowercase form once; an empty term would match everything
    terms = [(term, term.lower()) for term in (search_terms or []) if term]

    matched_quotes = []
    fallback_quotes = []

    for _, row in candidates.iterrows():
        if len(matched_quotes) >= limit:
            break
        if not terms and len(fallback_quotes) >= limit:
            break

        text = row['Review_Text']
        if not isinstance(text, str) or not text:
            continue

        # Split into sentences (handle multiple punctuation patterns)
        sentences = [part.strip() for part in re.split(r'(?<=[.!?])\s+', text)]
        sentences = [s for s in sentences if len(s) > 10]  # Filter tiny fragments
        if not sentences:
            continue

        # Find the first sentence containing any search term
        hit_index = None
        hit_term = None
        for idx, sentence in enumerate(sentences):
            sentence_lower = sentence.lower()
            hit_term = next((term for term, lowered in terms if lowered in sentence_lower), None)
            if hit_term is not None:
                hit_index = idx
                break

        if hit_index is not None:
            # Get context window around the matching sentence
            start_idx = max(0, hit_index - context_sentences)
            end_idx = min(len(sentences), hit_index + context_sentences + 1)
            context = ' '.join(sentences[start_idx:end_idx])

            # Add ellipsis if we're not at the beginning/end
            if start_idx > 0:
                context = '...' + context
            if end_idx < len(sentences):
                context = context + '...'

            # Only one quote per review
            matched_quotes.append(_build_quote(row, context, hit_term))
        elif len(fallback_quotes) < limit:
            # No search terms or no match: use the first sentences as a representative quote
            opening = ' '.join(sentences[:3])
            if len(sentences) > 3:
                opening += '...'
            fallback_quotes.append(_build_quote(row, opening, None))

    # Matching quotes take priority; fallbacks only fill the remaining slots
    return (matched_quotes + fallback_quotes)[:limit]

print("‚úÖ extract_contextual_quotes() helper function defined")

In [None]:
def risk_factors_analysis(movie_name):
    """
    Generate Slide 6: Risk Factors Analysis
    
    Identifies potential marketing/audience risks based on review patterns
    Uses contextual quote extraction for better insight
    
    Returns: Dictionary with risk assessments and mitigation recommendations
    """
    reviews = get_movie_reviews(movie_name)
    
    if len(reviews) == 0:
        return {
            'movie': movie_name,
            'message': 'No reviews available for analysis'
        }
    
    # Segment by rating
    lovers = reviews[reviews['Rating'] >= 8].copy()
    mixed = reviews[(reviews['Rating'] >= 4) & (reviews['Rating'] <= 7)].copy()
    haters = reviews[reviews['Rating'] <= 3].copy()
    
    # Initialize risk flags
    risks = []
    
    # ========================================
    # RISK 1: INTENSITY WARNING
    # High fear + negative emotions = "too intense" for casual audiences
    # ========================================
    
    avg_fear = safe_mean(reviews['emotion_fear'])
    avg_disgust = safe_mean(reviews['emotion_disgust'])
    hate_review_pct = safe_percentage(len(haters), len(reviews))
    
    intensity_score = 0
    if avg_fear and avg_fear > 0.10:  # Top quartile fear
        intensity_score += 1
    if avg_disgust and avg_disgust > 0.08:  # Top quartile disgust
        intensity_score += 1
    if hate_review_pct and hate_review_pct > 20:  # High hate rate
        intensity_score += 1
    
    # Extract quotes mentioning intensity/fear/disturbing elements
    intensity_quotes = extract_contextual_quotes(
        haters,
        search_terms=['disturbing', 'scary', 'intense', 'graphic', 'dark', 'uncomfortable', 'creepy', 'frightening'],
        limit=3,
        context_sentences=1
    )
    
    intensity_risk = {
        'risk_level': 'HIGH' if intensity_score >= 2 else 'MODERATE' if intensity_score == 1 else 'LOW',
        'avg_fear': avg_fear,
        'avg_disgust': avg_disgust,
        'hate_review_pct': hate_review_pct,
        'intensity_score': intensity_score,
        'interpretation': None,
        'mitigation': None,
        'sample_quotes': intensity_quotes
    }
    
    if intensity_score >= 2:
        intensity_risk['interpretation'] = "Film likely too intense for mainstream horror audiences. High fear/disgust + significant hate rate suggests casual viewers will be turned off."
        intensity_risk['mitigation'] = "Market to horror enthusiasts, not casual fans. Use content warnings. Emphasize 'elevated horror' or 'arthouse' positioning. Target A24/Neon audience, not mainstream theaters."
        risks.append('INTENSITY_WARNING')
    elif intensity_score == 1:
        intensity_risk['interpretation'] = "Moderate intensity that may polarize. Some audiences will find it too much."
        intensity_risk['mitigation'] = "Clear genre positioning in marketing. Avoid misleading trailers that suggest lighter tone."
    
    # ========================================
    # RISK 2: COMPLEXITY BARRIER
    # High reading complexity + negative reviews = film too "difficult"
    # ========================================
    
    # Compare haters' vs lovers' writing complexity
    haters_reading_ease = safe_mean(haters['flesch_reading_ease'])
    lovers_reading_ease = safe_mean(lovers['flesch_reading_ease'])
    
    # If haters write SIMPLER reviews than lovers, suggests complexity barrier
    complexity_gap = None
    if haters_reading_ease and lovers_reading_ease:
        complexity_gap = haters_reading_ease - lovers_reading_ease
    
    # Extract quotes about pacing/confusion/arthouse criticism
    complexity_quotes = extract_contextual_quotes(
        haters,
        search_terms=['slow', 'boring', 'nothing happens', 'pretentious', 'confusing', 'pointless', 'dragged', 'waste of time', "doesn't make sense"],
        limit=3,
        context_sentences=1
    )
    
    complexity_risk = {
        'risk_level': 'HIGH' if complexity_gap and complexity_gap > 10 and hate_review_pct and hate_review_pct > 15 else 'MODERATE' if complexity_gap and complexity_gap > 5 else 'LOW',
        'haters_reading_ease': haters_reading_ease,
        'lovers_reading_ease': lovers_reading_ease,
        'complexity_gap': complexity_gap,
        'interpretation': None,
        'mitigation': None,
        'sample_quotes': complexity_quotes
    }
    
    if complexity_gap and complexity_gap > 10 and hate_review_pct and hate_review_pct > 15:
        complexity_risk['interpretation'] = "Significant complexity barrier detected. Haters write much simpler reviews than lovers, suggesting film is 'too arthouse' for general audiences."
        complexity_risk['mitigation'] = "Position as arthouse/festival film. Target film critics and serious horror fans. Consider platform release (streaming/VOD) rather than wide theatrical. Emphasize auteur credentials."
        risks.append('COMPLEXITY_BARRIER')
    elif complexity_gap and complexity_gap > 5:
        complexity_risk['interpretation'] = "Moderate complexity barrier. Film may be too slow/cerebral for some viewers."
        complexity_risk['mitigation'] = "Marketing should prepare audiences for 'slow burn' pacing. Use critic quotes emphasizing atmosphere over action."
    
    # ========================================
    # RISK 3: EARLY NEGATIVE BUZZ
    # Opening year reviews with low ratings + high engagement = bad word of mouth
    # ========================================
    
    early_reviews = reviews[reviews['review_window'].isin(['Opening Year', 'Year 2'])]
    early_negative = early_reviews[(early_reviews['Rating'] <= 5) & (early_reviews['total_votes'] > 20)]
    
    early_negative_pct = safe_percentage(len(early_negative), len(early_reviews)) if len(early_reviews) > 0 else None
    early_avg_rating = safe_mean(early_reviews['Rating'])
    
    # Extract most engaged early negative reviews
    early_buzz_quotes = extract_contextual_quotes(
        early_negative,
        search_terms=['disappointed', 'misled', 'expected', 'overhyped', 'waste', 'boring', 'misleading'],
        limit=3,
        context_sentences=2
    )
    
    early_buzz_risk = {
        'risk_level': 'HIGH' if early_negative_pct and early_negative_pct > 30 else 'MODERATE' if early_negative_pct and early_negative_pct > 15 else 'LOW',
        'early_negative_count': len(early_negative),
        'early_negative_pct': early_negative_pct,
        'early_avg_rating': early_avg_rating,
        'total_early_reviews': len(early_reviews),
        'interpretation': None,
        'mitigation': None,
        'sample_quotes': early_buzz_quotes
    }
    
    if early_negative_pct and early_negative_pct > 30:
        early_buzz_risk['interpretation'] = "High early negative buzz detected. Opening audiences were disappointed and vocal about it."
        early_buzz_risk['mitigation'] = "Avoid wide theatrical release. Consider festival circuit first to build critical support. Use critic screenings to generate positive reviews before public release. Platform release strategy."
        risks.append('EARLY_NEGATIVE_BUZZ')
    elif early_negative_pct and early_negative_pct > 15:
        early_buzz_risk['interpretation'] = "Moderate early negative buzz. Some opening audiences felt misled."
        early_buzz_risk['mitigation'] = "Ensure marketing accurately represents film tone. Screen for target audience first, not general public."
    
    # ========================================
    # RISK 4: PACING ISSUES
    # High boredom indicators from haters
    # ========================================
    
    haters_with_wishes = haters[haters['wish_count'] > 0]
    boredom_pct = safe_percentage(len(haters_with_wishes), len(haters)) if len(haters) > 0 else None
    
    # Extract pacing complaints with context
    pacing_quotes = extract_contextual_quotes(
        haters,
        search_terms=['slow', 'boring', 'nothing happens', 'dragged', 'pacing', 'tedious', 'dull', 'uneventful', 'wished', 'wanted more'],
        limit=3,
        context_sentences=2
    )
    
    pacing_risk = {
        'risk_level': 'HIGH' if boredom_pct and boredom_pct > 15 and hate_review_pct and hate_review_pct > 20 else 'MODERATE' if boredom_pct and boredom_pct > 10 else 'LOW',
        'haters_with_wishes_pct': boredom_pct,
        'interpretation': None,
        'mitigation': None,
        'sample_quotes': pacing_quotes
    }
    
    if boredom_pct and boredom_pct > 15 and hate_review_pct and hate_review_pct > 20:
        pacing_risk['interpretation'] = "Pacing issues detected. High percentage of haters expressing wishes (what they wanted but didn't get), suggesting film is too slow or doesn't deliver expected payoff."
        pacing_risk['mitigation'] = "Marketing must emphasize 'slow burn' nature upfront. Don't promise action/scares that aren't delivered. Target patient, atmosphere-focused horror fans. Consider re-edit if in post-production."
        risks.append('PACING_ISSUES')
    elif boredom_pct and boredom_pct > 10:
        pacing_risk['interpretation'] = "Moderate pacing concerns. Some viewers wanted more action/payoff."
        pacing_risk['mitigation'] = "Set expectations clearly in marketing. Use critic quotes about 'atmospheric' and 'meditative' qualities."
    
    # ========================================
    # RISK 5: EXPECTATION MISMATCH
    # Disappointed + comparisons = marketing misled audiences
    # ========================================
    
    disappointed = reviews[(reviews['Rating'] <= 5) & (reviews['wish_count'] > 0) & (reviews['has_comparisons'] == True)]
    disappointment_rate = safe_percentage(len(disappointed), len(reviews))
    
    # Extract expectation mismatch quotes
    expectation_quotes = extract_contextual_quotes(
        disappointed,
        search_terms=['expected', 'disappointed', 'thought it would', 'hoped for', 'nothing like', 'misleading', 'overhyped', 'not what I', 'wished'],
        limit=3,
        context_sentences=2
    )
    
    # What films did disappointed viewers compare to?
    disappointed_comparisons = []
    for movies in disappointed['movies_mentioned'].dropna():
        if isinstance(movies, str) and movies != '[]':
            import ast
            try:
                movie_list = ast.literal_eval(movies)
                disappointed_comparisons.extend(movie_list)
            except:
                pass
    
    from collections import Counter
    top_disappointed_comparisons = dict(Counter(disappointed_comparisons).most_common(5))
    
    expectation_risk = {
        'risk_level': 'HIGH' if disappointment_rate and disappointment_rate > 10 else 'MODERATE' if disappointment_rate and disappointment_rate > 5 else 'LOW',
        'disappointment_rate': disappointment_rate,
        'disappointed_count': len(disappointed),
        'top_comparison_films': top_disappointed_comparisons,
        'interpretation': None,
        'mitigation': None,
        'sample_quotes': expectation_quotes
    }
    
    if disappointment_rate and disappointment_rate > 10:
        expectation_risk['interpretation'] = "High expectation mismatch. Significant portion of audience felt misled - they had wishes unfulfilled and compared to films with different tone/style."
        expectation_risk['mitigation'] = "Audit marketing materials carefully. Ensure trailers/posters accurately represent film. Look at comparison films - are we being compared to wrong genre/style? Reposition marketing if needed."
        risks.append('EXPECTATION_MISMATCH')
    elif disappointment_rate and disappointment_rate > 5:
        expectation_risk['interpretation'] = "Moderate expectation issues. Some audiences felt the film didn't match their expectations."
        expectation_risk['mitigation'] = "Review marketing positioning. Ensure genre signals are clear and accurate."
    
    # ========================================
    # RISK 6: GENDER POLARIZATION
    # Significant gender rating gap = marketing may alienate one gender
    # ========================================
    
    male_reviews = reviews[reviews['username_gender_hint'] == 'male']
    female_reviews = reviews[reviews['username_gender_hint'] == 'female']
    
    male_avg_rating = safe_mean(male_reviews['Rating'])
    female_avg_rating = safe_mean(female_reviews['Rating'])
    
    gender_gap = None
    if male_avg_rating and female_avg_rating:
        gender_gap = abs(male_avg_rating - female_avg_rating)
    
    # Determine which gender rates higher
    gender_preference = None
    if male_avg_rating and female_avg_rating:
        if male_avg_rating > female_avg_rating:
            gender_preference = 'male'
        elif female_avg_rating > male_avg_rating:
            gender_preference = 'female'
        else:
            gender_preference = 'neutral'
    
    # Extract quotes from lower-rating gender
    lower_rating_gender = 'female' if gender_preference == 'male' else 'male'
    gender_segment = female_reviews if gender_preference == 'male' else male_reviews
    gender_haters = gender_segment[gender_segment['Rating'] <= 5]
    
    gender_quotes = extract_contextual_quotes(
        gender_haters,
        search_terms=None,  # Just get top engaged reviews from this segment
        limit=3,
        context_sentences=2
    )
    
    gender_risk = {
        'risk_level': 'HIGH' if gender_gap and gender_gap > 1.5 else 'MODERATE' if gender_gap and gender_gap > 1.0 else 'LOW',
        'gender_gap': gender_gap,
        'male_avg_rating': male_avg_rating,
        'female_avg_rating': female_avg_rating,
        'gender_preference': gender_preference,
        'male_count': len(male_reviews),
        'female_count': len(female_reviews),
        'interpretation': None,
        'mitigation': None,
        'sample_quotes': gender_quotes
    }
    
    if gender_gap and gender_gap > 1.5:
        gender_risk['interpretation'] = f"Significant gender polarization detected. {gender_preference.capitalize()} reviewers rate {gender_gap:.2f} points higher. Film may alienate {lower_rating_gender} audiences."
        gender_risk['mitigation'] = f"Marketing should acknowledge gender appeal skew. Target {gender_preference} audiences primarily. If trying to broaden appeal, understand WHY {lower_rating_gender} audiences dislike it and address in positioning."
        risks.append('GENDER_POLARIZATION')
    elif gender_gap and gender_gap > 1.0:
        gender_risk['interpretation'] = f"Moderate gender gap. {gender_preference.capitalize()} audiences prefer it somewhat."
        gender_risk['mitigation'] = "Consider gender-specific marketing angles for different platforms."
    
    # ========================================
    # OVERALL RISK ASSESSMENT
    # ========================================
    
    total_risk_score = sum([
        1 if intensity_risk['risk_level'] == 'HIGH' else 0.5 if intensity_risk['risk_level'] == 'MODERATE' else 0,
        1 if complexity_risk['risk_level'] == 'HIGH' else 0.5 if complexity_risk['risk_level'] == 'MODERATE' else 0,
        1 if early_buzz_risk['risk_level'] == 'HIGH' else 0.5 if early_buzz_risk['risk_level'] == 'MODERATE' else 0,
        1 if pacing_risk['risk_level'] == 'HIGH' else 0.5 if pacing_risk['risk_level'] == 'MODERATE' else 0,
        1 if expectation_risk['risk_level'] == 'HIGH' else 0.5 if expectation_risk['risk_level'] == 'MODERATE' else 0,
        1 if gender_risk['risk_level'] == 'HIGH' else 0.5 if gender_risk['risk_level'] == 'MODERATE' else 0
    ])
    
    overall_risk = 'HIGH' if total_risk_score >= 3 else 'MODERATE' if total_risk_score >= 1.5 else 'LOW'
    
    return {
        'movie': movie_name,
        'overall_risk_assessment': {
            'risk_level': overall_risk,
            'risk_score': total_risk_score,
            'high_risks': [r for r in risks],
            'risk_count': len(risks)
        },
        'intensity_risk': intensity_risk,
        'complexity_risk': complexity_risk,
        'early_buzz_risk': early_buzz_risk,
        'pacing_risk': pacing_risk,
        'expectation_risk': expectation_risk,
        'gender_risk': gender_risk
    }

print("‚úÖ risk_factors_analysis() function defined (with contextual quotes)")

In [None]:
# Test Module 6 (Rewritten) on The Witch
#
# Smoke test: runs risk_factors_analysis() for one film and prints every risk
# section (metrics, interpretation/mitigation text, sample quotes).


def _fmt_metric(value, spec):
    """Format a numeric metric with `spec`; return 'N/A' when the metric is None.

    Several averages in the result dict are guarded elsewhere as possibly None;
    applying a float format spec to None directly would raise TypeError.
    """
    return 'N/A' if value is None else format(value, spec)


def _print_guidance(risk):
    """Print interpretation and mitigation text, only when an interpretation is set."""
    if risk['interpretation']:
        print(f"\n  üí° Interpretation:")
        print(f"     {risk['interpretation']}")
        print(f"\n  üõ°Ô∏è  Mitigation:")
        print(f"     {risk['mitigation']}")


def _print_risk_quotes(quotes, heading, show_matched_term=True):
    """Print a numbered list of quote dicts under `heading`; no output when `quotes` is empty.

    Each quote dict is read for 'review_id', 'reviewer', 'rating', 'engagement',
    'review_title', 'quote' and (optionally) 'matched_term'.
    """
    if not quotes:
        return
    print(f"\n  üìù {heading}:")
    for i, quote in enumerate(quotes, 1):
        print(f"\n    {i}. [{quote['review_id']}] {quote['reviewer']} - {quote['rating']}/10 ({quote['engagement']} votes)")
        if quote['review_title']:
            print(f"       Title: \"{quote['review_title']}\"")
        if show_matched_term and quote.get('matched_term'):
            print(f"       Matched: '{quote['matched_term']}'")
        print(f"       \"{quote['quote']}\"")


print("üß™ Testing Module 6: Risk Factors Analysis (REWRITTEN - Contextual Quotes)\n")
print("="*80)

test_movie = "The Witch"
result = risk_factors_analysis(test_movie)

print(f"\n‚ö†Ô∏è  RISK FACTORS ANALYSIS: {test_movie}")

# Overall Assessment
overall = result['overall_risk_assessment']
print(f"\nüìä OVERALL RISK ASSESSMENT:")
print(f"  Risk Level: {overall['risk_level']}")
print(f"  Risk Score: {overall['risk_score']:.1f}/6.0")
print(f"  High Risks Identified: {overall['risk_count']}")

if overall['high_risks']:
    print(f"\n  ‚ö†Ô∏è  Critical Risk Flags:")
    for i, flag in enumerate(overall['high_risks'], 1):
        print(f"    {i}. {flag.replace('_', ' ').title()}")

# Detailed Risk Breakdown with CONTEXTUAL QUOTES
risk = result['intensity_risk']
print("\n" + "="*80)
print(f"\n1Ô∏è‚É£  INTENSITY RISK: {risk['risk_level']}")
print(f"  Avg Fear Score: {_fmt_metric(risk['avg_fear'], '.3f')}")
print(f"  Avg Disgust Score: {_fmt_metric(risk['avg_disgust'], '.3f')}")
print(f"  Hate Review %: {risk['hate_review_pct']}%")
_print_guidance(risk)
_print_risk_quotes(risk['sample_quotes'], "Sample Intensity Complaints (Contextual)")

risk = result['complexity_risk']
print("\n" + "="*80)
print(f"\n2Ô∏è‚É£  COMPLEXITY BARRIER: {risk['risk_level']}")
print(f"  Lovers Reading Ease: {_fmt_metric(risk['lovers_reading_ease'], '.1f')}")
print(f"  Haters Reading Ease: {_fmt_metric(risk['haters_reading_ease'], '.1f')}")
if risk['complexity_gap']:
    print(f"  Gap: {risk['complexity_gap']:+.1f} (haters write simpler)")
_print_guidance(risk)
_print_risk_quotes(risk['sample_quotes'], "Sample Complexity Complaints (Contextual)")

risk = result['early_buzz_risk']
print("\n" + "="*80)
print(f"\n3Ô∏è‚É£  EARLY NEGATIVE BUZZ: {risk['risk_level']}")
print(f"  Early Reviews: {risk['total_early_reviews']}")
print(f"  Early Negative (high engagement): {risk['early_negative_count']}")
if risk['early_negative_pct']:
    print(f"  Early Negative %: {risk['early_negative_pct']}%")
if risk['early_avg_rating']:
    print(f"  Early Avg Rating: {risk['early_avg_rating']:.2f}")
_print_guidance(risk)
_print_risk_quotes(risk['sample_quotes'], "Sample Early Negative Complaints (Contextual)")

risk = result['pacing_risk']
print("\n" + "="*80)
print(f"\n4Ô∏è‚É£  PACING ISSUES: {risk['risk_level']}")
if risk['haters_with_wishes_pct']:
    print(f"  Haters Expressing Wishes: {risk['haters_with_wishes_pct']}%")
_print_guidance(risk)
_print_risk_quotes(risk['sample_quotes'], "Sample Pacing Complaints (Contextual)")

risk = result['expectation_risk']
print("\n" + "="*80)
print(f"\n5Ô∏è‚É£  EXPECTATION MISMATCH: {risk['risk_level']}")
if risk['disappointment_rate']:
    print(f"  Disappointed Reviewers: {risk['disappointed_count']} ({risk['disappointment_rate']}%)")
if risk['top_comparison_films']:
    print(f"\n  Films Mentioned by Disappointed Viewers:")
    for film, count in list(risk['top_comparison_films'].items())[:3]:
        print(f"    - {film} ({count} mentions)")
_print_guidance(risk)
_print_risk_quotes(risk['sample_quotes'], "Sample Expectation Complaints (Contextual)")

risk = result['gender_risk']
print("\n" + "="*80)
print(f"\n6Ô∏è‚É£  GENDER POLARIZATION: {risk['risk_level']}")
print(f"  Male Avg Rating: {_fmt_metric(risk['male_avg_rating'], '.2f')} (n={risk['male_count']})")
print(f"  Female Avg Rating: {_fmt_metric(risk['female_avg_rating'], '.2f')} (n={risk['female_count']})")
if risk['gender_gap']:
    print(f"  Gender Gap: {risk['gender_gap']:.2f} points")
    print(f"  Preference: {risk['gender_preference'].upper()}")
_print_guidance(risk)

# The gender section never prints the matched term (quotes are not term-searched).
lower_gender = 'female' if risk['gender_preference'] == 'male' else 'male'
_print_risk_quotes(
    risk['sample_quotes'],
    f"Sample Quotes from {lower_gender.capitalize()} Haters (Contextual)",
    show_matched_term=False,
)

print("\n" + "="*80)
print("\n‚úÖ Module 6 test complete (with contextual quotes)!")

---

## Module 7: Target Audience Recommendation (Slides 2 & 7) - Reach Strategy

In [None]:
def infer_reach_strategy(audience_characteristics, comparison_films, risk_factors):
    """
    Infer marketing channels and positioning notes for an audience segment.

    Args:
        audience_characteristics: Dict read for the keys 'avg_reading_grade',
            'avg_engagement', 'top_emotions' and 'gender_preference'. A key
            that is missing or holds None is treated as "no signal"
            (0 for the numeric keys, an empty list for 'top_emotions').
        comparison_films: Films this audience mentions, either a dict keyed by
            film title (iteration order is taken as the ranking) or a plain
            list of titles. Only the first five are used.
        risk_factors: Dict in the shape returned by Module 6; only
            ['complexity_risk']['risk_level'] and
            ['intensity_risk']['risk_level'] are read. None is treated as
            an empty dict.

    Returns:
        Dict with two lists of strings: 'channels' (channel recommendations)
        and 'strategic_notes' (positioning notes).
    """
    channels = []
    notes = []

    # Sophistication-based channels.
    # `.get(key, 0)` would hand back a stored None unchanged, and `None > 10`
    # raises TypeError, so coerce falsy values explicitly.
    avg_grade = audience_characteristics.get('avg_reading_grade') or 0
    if avg_grade > 10:
        channels.extend([
            'Film festival audiences (Sundance, Fantastic Fest, SXSW)',
            'Letterboxd power users (500+ reviews, arthouse focus)',
            'Film Twitter influencers (critics, essayists)',
            'Criterion Channel subscribers'
        ])
        notes.append('High sophistication - position as arthouse/festival film')
    elif avg_grade > 8:
        channels.extend([
            'A24/Neon social media followers',
            'Letterboxd users rating similar films 7+',
            'Film podcast listeners (The Big Picture, Blank Check)'
        ])
        notes.append('Moderate sophistication - indie film audience')

    # Engagement-based channels
    avg_engagement = audience_characteristics.get('avg_engagement') or 0
    if avg_engagement > 50:
        channels.extend([
            'Reddit r/TrueFilm community (active discussers)',
            'YouTube film essayists (in-depth analysis)',
            'Film Discord servers'
        ])
        notes.append('High engagement - these are evangelists who drive word-of-mouth')

    # Genre/emotion-based channels
    emotions = audience_characteristics.get('top_emotions') or []
    if 'fear' in emotions[:2]:  # fear ranks among the top two emotions
        channels.extend([
            'Shudder subscribers',
            'Reddit r/horror community',
            'Horror podcasts (Faculty of Horror, Post Mortem)',
            'Fangoria / Rue Morgue readers'
        ])
        notes.append('Horror enthusiasts - not casual scary movie fans')

    # Risk factor positioning
    risk_factors = risk_factors or {}
    complexity_level = (risk_factors.get('complexity_risk') or {}).get('risk_level')
    intensity_level = (risk_factors.get('intensity_risk') or {}).get('risk_level')

    if complexity_level == 'HIGH':
        channels.append('Position as "elevated horror" / "slow burn" to set expectations')
        notes.append('CRITICAL: Marketing must emphasize arthouse nature to avoid complexity barrier')

    if intensity_level in ('HIGH', 'MODERATE'):
        notes.append('Content warnings recommended - intensity may turn off casual viewers')

    # Gender-based notes
    gender_pref = audience_characteristics.get('gender_preference')
    if gender_pref == 'male':
        notes.append('Male-skewing audience - consider male-focused horror communities')
    elif gender_pref == 'female':
        notes.append('Female-skewing audience - emphasize in marketing to female horror fans')

    # Comparison film strategy
    if comparison_films:
        # list() yields the keys of a dict or the items of a list, so both
        # input shapes are accepted.
        comp_text = ', '.join(list(comparison_films)[:5])
        channels.append(f'Cross-promote with fans of: {comp_text}')
        notes.append('Audiences actively compare to these films - use in positioning')

    # Platform strategy (always appended)
    channels.extend([
        'Platform release strategy: Limited theatrical ‚Üí VOD/Streaming',
        'Target: Specialty theaters in major metro areas (LA, NYC, Austin, Portland)'
    ])

    return {
        'channels': channels,
        'strategic_notes': notes
    }

print("‚úÖ infer_reach_strategy() helper function defined")

In [None]:
def _count_mentioned_films(segment, top_n):
    """Tally films named in a segment's 'movies_mentioned' column; return the top_n as {title: count}."""
    import ast
    from collections import Counter

    mentions = []
    # Cells are presumably stringified Python lists (the original code skipped
    # the literal '[]') - TODO confirm against the feature-extraction step.
    for raw in segment['movies_mentioned'].dropna():
        if not isinstance(raw, str) or raw == '[]':
            continue
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best-effort parsing: a malformed cell contributes no mentions.
            continue
        # Only extend from real sequences; a bare string would otherwise be
        # tallied character by character.
        if isinstance(parsed, (list, tuple, set)):
            mentions.extend(parsed)
    return dict(Counter(mentions).most_common(top_n))


def _gender_skew(male_count, female_count):
    """Return 'male' or 'female' when one count exceeds 1.5x the other, else 'neutral'."""
    if male_count > female_count * 1.5:
        return 'male'
    if female_count > male_count * 1.5:
        return 'female'
    return 'neutral'


def target_audience_recommendation(movie_name):
    """
    Generate Slides 2 & 7: Target Audience Recommendation

    Uses 4-criteria framework: Passionate + Understanding + Scalable + Reachable

    Args:
        movie_name: Title passed to get_movie_reviews() to select the reviews.

    Returns:
        Dictionary with 'movie', 'primary_audience', 'secondary_audience',
        'tertiary_avoid' and 'messaging_recommendations'. When the movie has
        no reviews, a short dict with 'movie' and 'message' is returned instead.
    """
    reviews = get_movie_reviews(movie_name)

    if len(reviews) == 0:
        return {
            'movie': movie_name,
            'message': 'No reviews available for analysis'
        }

    total_reviews = len(reviews)

    # Lovers (rating >= 8) feed the primary tier; haters (rating <= 3) are the avoid tier.
    lovers = reviews[reviews['Rating'] >= 8].copy()
    haters = reviews[reviews['Rating'] <= 3].copy()

    # ========================================
    # PRIMARY AUDIENCE: Passionate + Understanding + Scalable
    # ========================================

    # Criteria:
    # - Passionate: love_count > 0 OR exclamation_count > 0 OR high engagement (votes > 10)
    # - Understanding: reading grade > 8 AND has_comparisons = True (can contextualize)
    # - Scalable: n >= 30 (large enough to be a pattern)
    min_scalable_size = 30

    is_passionate = (
        (lovers['love_count'] > 0) |
        (lovers['exclamation_count'] > 0) |
        (lovers['total_votes'] > 10)
    )

    primary = lovers[
        is_passionate &
        (lovers['flesch_kincaid_grade'] > 8.0) &
        (lovers['has_comparisons'] == True)
    ].copy()

    if len(primary) < min_scalable_size:
        # Too small under the strict definition: drop the comparison
        # requirement and lower the reading-grade bar.
        primary = lovers[
            is_passionate &
            (lovers['flesch_kincaid_grade'] > 7.0)
        ].copy()

    primary_size = len(primary)
    primary_pct = safe_percentage(primary_size, total_reviews)
    is_scalable = primary_size >= min_scalable_size

    # Demographics
    primary_male = len(primary[primary['username_gender_hint'] == 'male'])
    primary_female = len(primary[primary['username_gender_hint'] == 'female'])
    primary_gender_pct = safe_percentage(primary_male + primary_female, primary_size)
    primary_gender_skew = _gender_skew(primary_male, primary_female)

    # Characteristics
    primary_avg_rating = safe_mean(primary['Rating'])
    primary_avg_grade = safe_mean(primary['flesch_kincaid_grade'])
    primary_avg_engagement = safe_mean(primary['total_votes'])

    # Passion indicators
    primary_love_count = len(primary[primary['love_count'] > 0])
    primary_exclamation_count = len(primary[primary['exclamation_count'] > 0])
    primary_high_engagement = len(primary[primary['total_votes'] > 20])

    # Psychographics - mean score per emotion column
    emotion_cols = ['emotion_joy', 'emotion_trust', 'emotion_fear', 'emotion_surprise',
                    'emotion_sadness', 'emotion_disgust', 'emotion_anger', 'emotion_anticipation']
    primary_emotions = {
        col.replace('emotion_', ''): safe_mean(primary[col])
        for col in emotion_cols
    }

    # Sort emotions by intensity (missing means sort as 0)
    sorted_emotions = sorted(primary_emotions.items(), key=lambda item: item[1] or 0, reverse=True)
    top_3_emotions = [name for name, _ in sorted_emotions[:3]]

    # Comparison films mentioned by primary audience
    primary_comparison_films = _count_mentioned_films(primary, 10)

    # Extract passion quotes (why they loved it)
    primary_passion_quotes = extract_contextual_quotes(
        primary,
        search_terms=['love', 'amazing', 'brilliant', 'masterpiece', 'perfect', 'favorite', 'incredible', 'beautiful'],
        limit=3,
        context_sentences=2
    )

    # Extract sophistication quotes (how they articulate themes)
    primary_sophistication_quotes = extract_contextual_quotes(
        primary[primary['has_comparisons'] == True],
        search_terms=None,  # Get most engaged reviews with comparisons
        limit=3,
        context_sentences=2
    )

    # ========================================
    # SECONDARY AUDIENCE: Positive but Less Passionate
    # ========================================

    # Criteria:
    # - Rating 7 up to (not including) 8
    # - OR lovers (rating >= 8) who did not make the primary tier
    secondary = reviews[
        ((reviews['Rating'] >= 7) & (reviews['Rating'] < 8)) |
        ((reviews['Rating'] >= 8) & (~reviews.index.isin(primary.index)))
    ].copy()

    secondary_size = len(secondary)
    secondary_pct = safe_percentage(secondary_size, total_reviews)

    # Demographics
    secondary_male = len(secondary[secondary['username_gender_hint'] == 'male'])
    secondary_female = len(secondary[secondary['username_gender_hint'] == 'female'])
    secondary_gender_skew = _gender_skew(secondary_male, secondary_female)

    # Characteristics
    secondary_avg_rating = safe_mean(secondary['Rating'])
    secondary_avg_grade = safe_mean(secondary['flesch_kincaid_grade'])
    secondary_avg_engagement = safe_mean(secondary['total_votes'])

    # ========================================
    # TERTIARY AUDIENCE: AVOID - Wrong Fit
    # ========================================

    # These are haters - wrong expectations, complexity barrier, etc.
    tertiary = haters.copy()

    tertiary_size = len(tertiary)
    tertiary_pct = safe_percentage(tertiary_size, total_reviews)

    # What did they expect? (comparison films)
    tertiary_comparison_films = _count_mentioned_films(tertiary, 5)

    # Top complaints
    tertiary_avg_grade = safe_mean(tertiary['flesch_kincaid_grade'])
    tertiary_wish_pct = safe_percentage(len(tertiary[tertiary['wish_count'] > 0]), tertiary_size)

    # Complexity barrier: primary audience writes more than 2.5 grade levels above haters
    complexity_barrier = bool(
        primary_avg_grade
        and tertiary_avg_grade
        and primary_avg_grade - tertiary_avg_grade > 2.5
    )

    # Extract warning quotes (why to avoid marketing to them)
    tertiary_warning_quotes = extract_contextual_quotes(
        tertiary,
        search_terms=['waste', 'boring', 'nothing happens', 'slow', 'disappointed', 'overhyped', 'pretentious'],
        limit=3,
        context_sentences=2
    )

    # ========================================
    # REACH STRATEGY INFERENCE
    # ========================================

    # Simplified stand-in for the Module 6 risk dict: only the complexity
    # signal computed above is forwarded.
    risk_factors_summary = {
        'complexity_risk': {
            'risk_level': 'HIGH' if complexity_barrier else 'LOW'
        }
    }

    primary_reach = infer_reach_strategy(
        audience_characteristics={
            # The averages are guarded as possibly falsy/None throughout this
            # function (presumably safe_mean returns None for an empty
            # segment - TODO confirm); send 0 so the numeric threshold
            # comparisons downstream cannot hit None.
            'avg_reading_grade': primary_avg_grade or 0,
            'avg_engagement': primary_avg_engagement or 0,
            'top_emotions': top_3_emotions,
            'gender_preference': primary_gender_skew
        },
        comparison_films=primary_comparison_films,
        risk_factors=risk_factors_summary
    )

    # ========================================
    # MESSAGING RECOMMENDATIONS
    # ========================================

    # Based on what resonated with primary audience
    messaging = {
        'tone': 'sophisticated' if primary_avg_grade and primary_avg_grade > 9 else 'accessible',
        'keywords': [],
        'avoid': []
    }

    # Add keywords based on emotions
    if 'fear' in top_3_emotions:
        messaging['keywords'].append('atmospheric horror')
    if 'anticipation' in top_3_emotions:
        messaging['keywords'].append('suspenseful')
    if 'sadness' in top_3_emotions:
        messaging['keywords'].append('emotional depth')

    # Add comparison film positioning
    if primary_comparison_films:
        top_comp = list(primary_comparison_films.keys())[:3]
        messaging['keywords'].append(f"For fans of: {', '.join(top_comp)}")

    # What to avoid based on tertiary complaints
    if tertiary_wish_pct and tertiary_wish_pct > 10:
        messaging['avoid'].append("Promising jump scares or action that isn't delivered")
    if complexity_barrier:
        messaging['avoid'].append('Marketing to mainstream horror audiences expecting conventional scares')

    # ========================================
    # RETURN STRUCTURE
    # ========================================

    return {
        'movie': movie_name,

        'primary_audience': {
            'tier': 'PRIMARY - Core Target',
            'size': primary_size,
            'percentage': primary_pct,
            'criteria': 'Passionate + Understanding + Scalable',
            'is_scalable': is_scalable,
            'scalability_note': 'Sufficient size for viable audience' if is_scalable else 'WARNING: Small sample - may be too niche',

            'demographics': {
                'gender_skew': primary_gender_skew,
                'male_count': primary_male,
                'female_count': primary_female,
                'male_pct': safe_percentage(primary_male, primary_size),
                'female_pct': safe_percentage(primary_female, primary_size),
                'gender_coverage': primary_gender_pct
            },

            'characteristics': {
                'avg_rating': primary_avg_rating,
                'avg_reading_grade': primary_avg_grade,
                'avg_engagement': primary_avg_engagement,
                'sophistication_level': 'High' if primary_avg_grade and primary_avg_grade > 10 else 'Moderate' if primary_avg_grade and primary_avg_grade > 8 else 'Standard'
            },

            'passion_indicators': {
                'love_statements': primary_love_count,
                'love_pct': safe_percentage(primary_love_count, primary_size),
                'exclamations': primary_exclamation_count,
                'exclamation_pct': safe_percentage(primary_exclamation_count, primary_size),
                'high_engagement': primary_high_engagement,
                'high_engagement_pct': safe_percentage(primary_high_engagement, primary_size)
            },

            'psychographics': {
                'top_3_emotions': top_3_emotions,
                'all_emotions': primary_emotions,
                'comparison_films': primary_comparison_films,
                'top_5_comparisons': dict(list(primary_comparison_films.items())[:5])
            },

            'reach_strategy': primary_reach['channels'],
            'strategic_notes': primary_reach['strategic_notes'],

            'sample_quotes': {
                'passion': primary_passion_quotes,
                'sophistication': primary_sophistication_quotes
            }
        },

        'secondary_audience': {
            'tier': 'SECONDARY - Broader Appeal',
            'size': secondary_size,
            'percentage': secondary_pct,
            'criteria': 'Positive (7-8 rating) or lovers not meeting primary criteria',

            'demographics': {
                'gender_skew': secondary_gender_skew,
                'male_count': secondary_male,
                'female_count': secondary_female
            },

            'characteristics': {
                'avg_rating': secondary_avg_rating,
                'avg_reading_grade': secondary_avg_grade,
                'avg_engagement': secondary_avg_engagement,
                'sophistication_level': 'Moderate' if secondary_avg_grade and secondary_avg_grade > 8 else 'Standard'
            },

            'reach_strategy': [
                'Broader horror platforms (Shudder, genre streaming)',
                'Social media (targeted ads to horror fans)',
                'VOD platforms (Amazon, iTunes horror categories)',
                'Genre festivals and conventions'
            ],

            'messaging_notes': 'More accessible positioning than primary audience - emphasize entertainment value alongside artistic merit'
        },

        'tertiary_avoid': {
            'tier': 'TERTIARY - Avoid Marketing To',
            'size': tertiary_size,
            'percentage': tertiary_pct,
            'warning': '‚ö†Ô∏è  DO NOT market to this segment - wrong expectations, will generate negative word-of-mouth',

            'characteristics': {
                'avg_rating': safe_mean(tertiary['Rating']),
                'avg_reading_grade': tertiary_avg_grade,
                'complexity_barrier': complexity_barrier,
                'wish_statement_pct': tertiary_wish_pct
            },

            'wrong_expectations': {
                'expected_films': tertiary_comparison_films,
                'top_complaints': 'Expected mainstream horror with scares' if tertiary_wish_pct and tertiary_wish_pct > 10 else 'Film too slow/complex'
            },

            'sample_warnings': tertiary_warning_quotes
        },

        'messaging_recommendations': messaging
    }

print("‚úÖ target_audience_recommendation() function defined")

In [None]:
# Test Module 7 on The Witch
#
# Smoke test: runs target_audience_recommendation() for one film and prints
# each section of the returned dictionary (primary / secondary / tertiary
# audiences, then the messaging recommendations).


def _fmt_num(value, spec):
    """Format `value` with format spec `spec`, or return 'N/A' for None.

    NOTE(review): the averages in the result look None-able (the producing
    function guards them with truthiness checks before comparing them), and
    applying a numeric format spec to None raises TypeError.
    """
    return 'N/A' if value is None else format(value, spec)


def _print_quotes(quotes, show_match=True):
    """Print a numbered list of quote dicts.

    Args:
        quotes: Iterable of quote dicts with review_id, reviewer, rating,
            engagement, review_title and quote keys.
        show_match: When True, also print the optional 'matched_term'.
    """
    for i, quote in enumerate(quotes, 1):
        print(f"\n  {i}. [{quote['review_id']}] {quote['reviewer']} - {quote['rating']}/10 ({quote['engagement']} votes)")
        if quote['review_title']:
            print(f"     Title: \"{quote['review_title']}\"")
        if show_match and quote.get('matched_term'):
            print(f"     Matched: '{quote['matched_term']}'")
        print(f"     \"{quote['quote']}\"")


print("üß™ Testing Module 7: Target Audience Recommendation\n")
print("="*80)

test_movie = "The Witch"
result = target_audience_recommendation(test_movie)

# Short aliases for the four top-level sections of the result
primary_aud = result['primary_audience']
secondary_aud = result['secondary_audience']
tertiary_aud = result['tertiary_avoid']
messaging_rec = result['messaging_recommendations']

print(f"\nüéØ TARGET AUDIENCE RECOMMENDATION: {test_movie}\n")

# ========================================
# PRIMARY AUDIENCE
# ========================================

print("="*80)
print(f"\n1Ô∏è‚É£  PRIMARY AUDIENCE - Core Target")
print(f"\nüìä Size & Scalability:")
print(f"  Count: {primary_aud['size']} ({primary_aud['percentage']}% of all reviews)")
print(f"  Scalable: {'‚úÖ YES' if primary_aud['is_scalable'] else '‚ö†Ô∏è  NO'}")
print(f"  Note: {primary_aud['scalability_note']}")
print(f"  Criteria: {primary_aud['criteria']}")

primary_demo = primary_aud['demographics']
print(f"\nüë• Demographics:")
print(f"  Gender Skew: {primary_demo['gender_skew'].upper()}")
print(f"  Male: {primary_demo['male_count']} ({primary_demo['male_pct']}%)")
print(f"  Female: {primary_demo['female_count']} ({primary_demo['female_pct']}%)")

primary_traits = primary_aud['characteristics']
print(f"\nüìà Characteristics:")
print(f"  Avg Rating: {_fmt_num(primary_traits['avg_rating'], '.2f')}/10")
print(f"  Avg Reading Grade: {_fmt_num(primary_traits['avg_reading_grade'], '.1f')}")
print(f"  Sophistication: {primary_traits['sophistication_level']}")
print(f"  Avg Engagement: {_fmt_num(primary_traits['avg_engagement'], '.1f')} votes")

passion = primary_aud['passion_indicators']
print(f"\nüî• Passion Indicators:")
print(f"  Love Statements: {passion['love_statements']} ({passion['love_pct']}%)")
print(f"  Exclamations: {passion['exclamations']} ({passion['exclamation_pct']}%)")
print(f"  High Engagement (20+ votes): {passion['high_engagement']} ({passion['high_engagement_pct']}%)")

psycho = primary_aud['psychographics']
print(f"\nüß† Psychographics:")
print(f"  Top 3 Emotions: {', '.join([e.capitalize() for e in psycho['top_3_emotions']])}")
print(f"\n  Top Comparison Films:")
for film, count in list(psycho['top_5_comparisons'].items())[:5]:
    print(f"    - {film} ({count} mentions)")

print(f"\nüì¢ Reach Strategy:")
for i, channel in enumerate(primary_aud['reach_strategy'], 1):
    print(f"  {i}. {channel}")

print(f"\nüí° Strategic Notes:")
for i, note in enumerate(primary_aud['strategic_notes'], 1):
    print(f"  {i}. {note}")

print(f"\nüí¨ Sample Quotes - PASSION:")
_print_quotes(primary_aud['sample_quotes']['passion'])

# Sophistication quotes are printed without the matched term
print(f"\nüí¨ Sample Quotes - SOPHISTICATION (Theme Articulation):")
_print_quotes(primary_aud['sample_quotes']['sophistication'], show_match=False)

# ========================================
# SECONDARY AUDIENCE
# ========================================

print("\n" + "="*80)
print(f"\n2Ô∏è‚É£  SECONDARY AUDIENCE - Broader Appeal")
print(f"\nüìä Size:")
print(f"  Count: {secondary_aud['size']} ({secondary_aud['percentage']}% of all reviews)")
print(f"  Criteria: {secondary_aud['criteria']}")

secondary_demo = secondary_aud['demographics']
print(f"\nüë• Demographics:")
print(f"  Gender Skew: {secondary_demo['gender_skew'].upper()}")
print(f"  Male: {secondary_demo['male_count']}")
print(f"  Female: {secondary_demo['female_count']}")

secondary_traits = secondary_aud['characteristics']
print(f"\nüìà Characteristics:")
print(f"  Avg Rating: {_fmt_num(secondary_traits['avg_rating'], '.2f')}/10")
print(f"  Avg Reading Grade: {_fmt_num(secondary_traits['avg_reading_grade'], '.1f')}")
print(f"  Sophistication: {secondary_traits['sophistication_level']}")

print(f"\nüì¢ Reach Strategy:")
for i, channel in enumerate(secondary_aud['reach_strategy'], 1):
    print(f"  {i}. {channel}")

print(f"\nüí° Messaging Note:")
print(f"  {secondary_aud['messaging_notes']}")

# ========================================
# TERTIARY - AVOID
# ========================================

print("\n" + "="*80)
print(f"\n3Ô∏è‚É£  TERTIARY AUDIENCE - AVOID MARKETING TO")
print(f"\n‚ö†Ô∏è  {tertiary_aud['warning']}")
print(f"\nüìä Size:")
print(f"  Count: {tertiary_aud['size']} ({tertiary_aud['percentage']}% of all reviews)")

tertiary_traits = tertiary_aud['characteristics']
print(f"\nüìâ Characteristics:")
print(f"  Avg Rating: {_fmt_num(tertiary_traits['avg_rating'], '.2f')}/10")
print(f"  Avg Reading Grade: {_fmt_num(tertiary_traits['avg_reading_grade'], '.1f')}")
print(f"  Complexity Barrier: {'YES - Too arthouse for them' if tertiary_traits['complexity_barrier'] else 'No'}")
print(f"  Wish Statements: {tertiary_traits['wish_statement_pct']}% (unfulfilled expectations)")

wrong_expectations = tertiary_aud['wrong_expectations']
print(f"\n‚ùå Wrong Expectations:")
print(f"  {wrong_expectations['top_complaints']}")
if wrong_expectations['expected_films']:
    print(f"\n  Films They Expected (but this isn't):")
    for film, count in list(wrong_expectations['expected_films'].items())[:3]:
        print(f"    - {film} ({count} mentions)")

print(f"\n‚ö†Ô∏è  Sample Warning Quotes:")
_print_quotes(tertiary_aud['sample_warnings'])

# ========================================
# MESSAGING RECOMMENDATIONS
# ========================================

print("\n" + "="*80)
print(f"\nüì£ MESSAGING RECOMMENDATIONS")
print(f"\n  Tone: {messaging_rec['tone'].upper()}")
print(f"\n  Keywords:")
for keyword in messaging_rec['keywords']:
    print(f"    ‚Ä¢ {keyword}")
print(f"\n  Avoid:")
for avoid in messaging_rec['avoid']:
    print(f"    ‚Ä¢ {avoid}")

print("\n" + "="*80)
print("\n‚úÖ Module 7 test complete!")

---

# Module 8: JSON Export

In [None]:
# ============================================================================
# MODULE 8: JSON EXPORT FUNCTIONS
# ============================================================================

import json
from pathlib import Path
from datetime import datetime

def export_movie_analysis(movie_name, output_dir='../insights'):
    """
    Run all analysis modules for a single movie and export to JSON

    Every module runs independently: if one raises, the message is printed
    and stored as {'error': <message>} under that module's key, and the
    remaining modules still run. Module 7 is optional; when
    target_audience_recommendation is not defined it is recorded as skipped.

    Args:
        movie_name: Name of the movie to analyze
        output_dir: Directory to save JSON file (default: ../insights)

    Returns: Dictionary with all analysis results

    Raises:
        TypeError / ValueError: if the collected results cannot be
            serialized to JSON (no file is written in that case).
    """

    print(f"\n{'='*80}")
    print(f"üé¨ Analyzing: {movie_name}")
    print(f"{'='*80}\n")

    results = {
        'movie': movie_name,
        'analysis_date': datetime.now().isoformat(),
        'modules': {}
    }

    # (result key, progress banner, deferred call). The lambdas postpone the
    # name lookup until call time, so an undefined module function is caught
    # below and recorded as an error instead of aborting the whole export.
    module_runs = [
        ('audience_breakdown',
         "üìä Module 1: Audience Breakdown...",
         lambda: audience_breakdown(movie_name)),
        ('what_resonated',
         "‚ù§Ô∏è  Module 2: What Resonated...",
         lambda: what_resonated(movie_name)),
        ('what_didnt_work',
         "üíî Module 3: What Didn't Work...",
         lambda: what_didnt_work(movie_name)),
        ('polarization',
         "‚ö° Module 4: Polarization Analysis...",
         lambda: polarization_analysis(movie_name)),
        ('marketing_disconnect',
         "üìä Module 5: Marketing Disconnect...",
         lambda: marketing_disconnect_analysis(movie_name)),
        ('risk_factors',
         "‚ö†Ô∏è  Module 6: Risk Factors...",
         lambda: risk_factors_analysis(movie_name)),
    ]

    for key, banner, run_module in module_runs:
        print(banner)
        try:
            results['modules'][key] = run_module()
            print("   ‚úÖ Complete")
        except Exception as e:
            print(f"   ‚ùå Error: {e}")
            results['modules'][key] = {'error': str(e)}

    # Module 7: Target Audience (if function exists).
    # The function is looked up explicitly rather than by catching NameError:
    # a NameError raised *inside* the function is a real failure and must be
    # reported as an error, not as "function not defined".
    target_fn = globals().get('target_audience_recommendation')
    if not callable(target_fn):
        print("üéØ Module 7: Target Audience... ‚è≠Ô∏è  Skipped (function not defined)")
        results['modules']['target_audience'] = {'status': 'skipped', 'reason': 'function not defined'}
    else:
        print("üéØ Module 7: Target Audience...")
        try:
            results['modules']['target_audience'] = target_fn(movie_name)
            print("   ‚úÖ Complete")
        except Exception as e:
            print(f"   ‚ùå Error: {e}")
            results['modules']['target_audience'] = {'error': str(e)}

    # Create output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Generate filename (sanitize movie name). Other cells rebuild this name
    # with the same rule to locate the file, so the rule must stay in sync.
    safe_movie_name = movie_name.lower().replace(' ', '_').replace("'", "")
    filepath = output_path / f"{safe_movie_name}.json"

    # Export to JSON. Serialize fully in memory first so that a
    # non-serializable value cannot leave a truncated file on disk.
    print(f"\nüíæ Exporting to JSON...")
    payload = json.dumps(results, indent=2, ensure_ascii=False)
    filepath.write_text(payload, encoding='utf-8')

    print(f"   ‚úÖ Saved: {filepath}")
    print(f"\n{'='*80}\n")

    return results


def export_all_movies(output_dir='../insights'):
    """
    Run analysis on every movie in MOVIES and export individual JSON files

    Also writes a master index, `_index.json`, into `output_dir` recording
    for each movie either the JSON filename (success) or the error message
    (failure).

    Args:
        output_dir: Directory to save JSON files (default: ../insights)

    Returns: Dictionary with the export date, movie count and per-movie status
    """

    all_results = {
        'export_date': datetime.now().isoformat(),
        'total_movies': len(MOVIES),
        'movies': {}
    }

    print(f"\nüé¨ BATCH EXPORT: Analyzing {len(MOVIES)} movies")
    print(f"üìÅ Output directory: {output_dir}\n")

    for i, movie in enumerate(MOVIES, 1):
        print(f"\n[{i}/{len(MOVIES)}] Processing: {movie}")

        try:
            export_movie_analysis(movie, output_dir)
        except Exception as e:
            print(f"   ‚ùå FAILED: {e}")
            all_results['movies'][movie] = {
                'status': 'failed',
                'error': str(e)
            }
        else:
            # Same filename rule export_movie_analysis() uses for its output
            safe_name = movie.lower().replace(' ', '_').replace("'", "")
            all_results['movies'][movie] = {
                'status': 'success',
                'filepath': safe_name + ".json"
            }

    # Export master index file. The directory is normally created by the
    # per-movie export, but not when MOVIES is empty or every export failed
    # before creating it - so make sure it exists before writing.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    master_file = output_path / '_index.json'

    with open(master_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*80}")
    print(f"‚úÖ BATCH EXPORT COMPLETE")
    print(f"{'='*80}")
    print(f"\nüìä Summary:")
    print(f"   Total movies: {len(MOVIES)}")

    successful = sum(1 for m in all_results['movies'].values() if m['status'] == 'success')
    failed = sum(1 for m in all_results['movies'].values() if m['status'] == 'failed')

    print(f"   Successful: {successful}")
    print(f"   Failed: {failed}")
    print(f"   Master index: {master_file}")
    print(f"\n")

    return all_results


def create_combined_export(output_dir='../insights', output_file='all_movies_combined.json'):
    """
    Merge the per-movie JSON exports into one combined JSON file

    For each title in MOVIES the matching `<sanitized title>.json` is read
    from `output_dir`; titles without a file are recorded as
    {'status': 'file_not_found'}.

    Args:
        output_dir: Directory containing individual JSON files
        output_file: Name of combined output file

    Returns: Combined results dictionary
    """

    print(f"\nüì¶ Creating combined export...")

    base_dir = Path(output_dir)
    combined_data = {
        'export_date': datetime.now().isoformat(),
        'total_movies': len(MOVIES),
        'movies': {}
    }
    per_movie = combined_data['movies']

    # Pull in each movie's export, flagging the ones that are absent
    for title in MOVIES:
        stem = title.lower().replace(' ', '_').replace("'", "")
        source = base_dir / (stem + ".json")

        if not source.exists():
            print(f"   ‚ö†Ô∏è  Missing: {title}")
            per_movie[title] = {'status': 'file_not_found'}
            continue

        with open(source, 'r', encoding='utf-8') as handle:
            per_movie[title] = json.load(handle)
        print(f"   ‚úÖ Loaded: {title}")

    # Write the merged document next to the individual files
    combined_filepath = base_dir / output_file
    with open(combined_filepath, 'w', encoding='utf-8') as handle:
        json.dump(combined_data, handle, indent=2, ensure_ascii=False)

    size_in_kb = combined_filepath.stat().st_size / 1024
    print(f"\n   üíæ Combined file saved: {combined_filepath}")
    print(f"   üìä Size: {size_in_kb:.1f} KB")
    print(f"\n")

    return combined_data


def quick_export(movie_name, output_dir='../insights'):
    """Export a single movie's analysis to JSON.

    Convenience alias: passes both arguments unchanged to
    export_movie_analysis() and returns whatever it returns.
    """
    exported = export_movie_analysis(movie_name, output_dir)
    return exported

print("‚úÖ Module 8: JSON Export Functions defined")

In [None]:
# Test Module 8 with The Witch
# Runs the single-movie export with the default output directory
# ('../insights'). Note: this rebinds the notebook-level name `result`.
print("üß™ Testing Module 8: Single Movie Export\n")

result = quick_export("The Witch")

---

# Export All Movies

In [None]:
# Batch-export every movie in MOVIES to the default '../insights' directory;
# `all_results` holds the per-movie status that is also written to _index.json.
all_results = export_all_movies()

---

# MODULE 9: CROSS-MOVIE SYNTHESIS

In [None]:

# MODULE 9: CROSS-MOVIE SYNTHESIS
# ============================================================================

import json
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np

print("="*80)
print("MODULE 9: CROSS-MOVIE SYNTHESIS")
print("="*80)
print("\nüì¶ Loading all movie data...\n")

# Read back the per-movie JSON exports, keyed by movie title.
# Titles whose export file is absent are reported and left out.
insights_dir = Path('../insights')
all_movies_data = {}

for movie in MOVIES:
    # Same filename rule the export step uses
    slug = movie.lower().replace(' ', '_').replace("'", "")
    json_path = insights_dir / (slug + ".json")

    if not json_path.exists():
        print(f"  ‚ö†Ô∏è  Missing: {movie}")
        continue

    with open(json_path, 'r', encoding='utf-8') as handle:
        all_movies_data[movie] = json.load(handle)
    print(f"  ‚úÖ Loaded: {movie}")

print(f"\n‚úÖ Loaded {len(all_movies_data)} movies")
print("="*80 + "\n")

In [None]:
# Helper functions for cross-movie analysis

def extract_rating_stats(movies_data):
    """Extract basic rating statistics across all movies.

    Args:
        movies_data: Mapping of movie title -> exported analysis dict
            (each with a 'modules' dict).

    Returns:
        List of dicts (one per movie with a usable 'audience_breakdown'
        module) holding movie, total_reviews, avg_rating, rating_variance,
        lovers_pct and haters_pct.
    """
    stats = []
    for movie, data in movies_data.items():
        mod = data['modules'].get('audience_breakdown')
        # The export step stores {'error': ...} under the module key when a
        # module fails; such entries have no statistics, so skip them rather
        # than crash with KeyError.
        if not mod or 'error' in mod:
            continue
        segments = mod['rating_segments']
        stats.append({
            'movie': movie,
            'total_reviews': mod['total_reviews'],
            'avg_rating': mod['avg_rating'],
            'rating_variance': mod['rating_variance'],
            'lovers_pct': segments['lovers_pct'],
            'haters_pct': segments['haters_pct']
        })
    return stats

def extract_risk_patterns(movies_data):
    """Collect the overall risk assessment of every movie that has one.

    Movies without a 'risk_factors' module, or whose module lacks an
    'overall_risk_assessment', are left out. 'high_risks' defaults to an
    empty list when the assessment does not carry it.
    """
    collected = []
    for title, payload in movies_data.items():
        modules = payload['modules']
        if 'risk_factors' not in modules:
            continue

        risk_mod = modules['risk_factors']
        if 'overall_risk_assessment' not in risk_mod:
            continue

        overall = risk_mod['overall_risk_assessment']
        row = {
            'movie': title,
            'risk_level': overall['risk_level'],
            'risk_score': overall['risk_score'],
            'high_risks': overall.get('high_risks', []),
        }
        collected.append(row)
    return collected

def extract_polarization_patterns(movies_data):
    """Extract polarization data across all movies.

    Args:
        movies_data: Mapping of movie title -> exported analysis dict
            (each with a 'modules' dict).

    Returns:
        List of dicts (one per movie with a usable 'polarization' module)
        holding movie, level, variance, temporal_shift and gender_gap.
    """
    polar = []
    for movie, data in movies_data.items():
        mod = data['modules'].get('polarization')
        # The export step stores {'error': ...} under the module key when a
        # module fails; skip those instead of raising KeyError.
        if not mod or 'error' in mod:
            continue
        polar.append({
            'movie': movie,
            'level': mod['polarization_metrics']['level'],
            'variance': mod['polarization_metrics']['rating_variance'],
            'temporal_shift': mod['temporal_polarization']['temporal_shift'],
            'gender_gap': mod['gender_polarization']['gender_rating_gap']
        })
    return polar

def extract_audience_quotes(movies_data, module_name, quote_type, limit=3):
    """Extract top quotes from a specific module across all movies.

    Args:
        movies_data: Mapping of movie title -> exported analysis dict.
        module_name: Key of the module inside each movie's 'modules' dict.
        quote_type: Key of the section inside that module; the section must
            be a dict with a 'quotes' list to contribute.
        limit: Maximum number of quotes taken per movie (first `limit`).

    Returns:
        List of quote dicts, each tagged with 'source_movie', sorted by
        'engagement' in descending order (missing/None counts as 0).
        The returned dicts are copies; `movies_data` is not modified.
    """
    all_quotes = []

    for movie, data in movies_data.items():
        mod = data['modules'].get(module_name)
        if not isinstance(mod, dict):
            continue

        # Navigate to quotes based on module structure
        section = mod.get(quote_type)
        if not isinstance(section, dict) or 'quotes' not in section:
            continue

        for q in section['quotes'][:limit]:
            # Tag a copy so the loaded movie data is not mutated in place
            all_quotes.append({**q, 'source_movie': movie})

    # Sort by engagement; `or 0` keeps a None engagement from breaking the sort
    all_quotes.sort(key=lambda x: x.get('engagement') or 0, reverse=True)
    return all_quotes

print("‚úÖ Helper functions defined")

In [None]:
print("\n" + "="*80)
print("ANALYSIS 1: SUCCESS FACTORS")
print("="*80 + "\n")

# Per-film rating summary rows
rating_stats = extract_rating_stats(all_movies_data)

# Ranking: highest average rating first (stable, ties keep their order)
rating_stats_sorted = list(rating_stats)
rating_stats_sorted.sort(key=lambda row: row['avg_rating'], reverse=True)

print("üìä Films Ranked by Average Rating:\n")
for rank, row in enumerate(rating_stats_sorted, start=1):
    print(f"  {rank}. {row['movie']:25} - {row['avg_rating']:.2f}/10 (Variance: {row['rating_variance']:.2f})")

# Tier the films: above 7.0 is a high performer, below 6.0 a low performer;
# anything in between belongs to neither list
high_performers = []
low_performers = []
for row in rating_stats:
    if row['avg_rating'] > 7.0:
        high_performers.append(row)
    elif row['avg_rating'] < 6.0:
        low_performers.append(row)

print(f"\n‚úÖ High Performers (>7.0): {len(high_performers)} films")
for film in high_performers:
    print(f"   ‚Ä¢ {film['movie']} - {film['avg_rating']:.2f}/10")

print(f"\n‚ùå Low Performers (<6.0): {len(low_performers)} films")
for film in low_performers:
    print(f"   ‚Ä¢ {film['movie']} - {film['avg_rating']:.2f}/10")

# Average lover/hater share within each tier (only when the tier is non-empty)
print("\nüîç Success Patterns:")
if high_performers:
    avg_lover_pct = np.mean([film['lovers_pct'] for film in high_performers])
    avg_hater_pct = np.mean([film['haters_pct'] for film in high_performers])
    print(f"   High performers average: {avg_lover_pct:.1f}% lovers, {avg_hater_pct:.1f}% haters")

if low_performers:
    avg_lover_pct_low = np.mean([film['lovers_pct'] for film in low_performers])
    avg_hater_pct_low = np.mean([film['haters_pct'] for film in low_performers])
    print(f"   Low performers average: {avg_lover_pct_low:.1f}% lovers, {avg_hater_pct_low:.1f}% haters")

print("\n" + "="*80 + "\n")

In [None]:
print("\n" + "="*80)
print("ANALYSIS 2: UNIVERSAL AUDIENCE PATTERNS")
print("="*80 + "\n")


def _share(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero.

    Guards the summary prints against ZeroDivisionError when no movie
    contributed any reviews (e.g. no export files were loaded).
    """
    return part / whole * 100 if whole else 0.0


# Aggregate audience data across all films
total_reviews = 0
total_lovers = 0
total_haters = 0
gender_distribution = {'male': 0, 'female': 0, 'unknown': 0}

for movie, data in all_movies_data.items():
    mod = data['modules'].get('audience_breakdown')
    # A failed export stores {'error': ...} under the module key; it has no
    # counts to aggregate, so skip it.
    if not mod or 'error' in mod:
        continue

    total_reviews += mod['total_reviews']
    total_lovers += mod['rating_segments']['lovers_8_10']
    total_haters += mod['rating_segments']['haters_1_3']

    for gender in ('male', 'female', 'unknown'):
        gender_distribution[gender] += mod['gender_breakdown'][gender]

print(f"üìä Aggregate Statistics Across {len(all_movies_data)} Films:\n")
print(f"   Total Reviews Analyzed: {total_reviews:,}")
print(f"   Total Lovers (8-10): {total_lovers:,} ({_share(total_lovers, total_reviews):.1f}%)")
print(f"   Total Haters (1-3): {total_haters:,} ({_share(total_haters, total_reviews):.1f}%)")

print(f"\nüë• Gender Distribution:\n")
identified = gender_distribution['male'] + gender_distribution['female']
print(f"   Male: {gender_distribution['male']:,} ({_share(gender_distribution['male'], total_reviews):.1f}%)")
print(f"   Female: {gender_distribution['female']:,} ({_share(gender_distribution['female'], total_reviews):.1f}%)")
print(f"   Unknown: {gender_distribution['unknown']:,} ({_share(gender_distribution['unknown'], total_reviews):.1f}%)")
print(f"   Gender ID Coverage: {_share(identified, total_reviews):.1f}%")

# Extract emotion patterns from lovers
print(f"\nüòä Emotion Patterns (Lovers Across All Films):\n")

all_lover_emotions = defaultdict(list)
for movie, data in all_movies_data.items():
    resonated = data['modules'].get('what_resonated')
    if not resonated or 'error' in resonated:
        continue
    emotions = resonated['emotion_profiles']['all_lovers']
    for emotion, score in emotions.items():
        # Falsy scores (None or 0) are left out of the average
        if score:
            all_lover_emotions[emotion].append(score)

# Average emotions across all films
avg_emotions = {emotion: np.mean(scores) for emotion, scores in all_lover_emotions.items()}
sorted_emotions = sorted(avg_emotions.items(), key=lambda x: x[1], reverse=True)

print("   Top 5 Emotions in Positive Reviews:")
for emotion, score in sorted_emotions[:5]:
    print(f"      {emotion.capitalize():12} {score:.3f}")

print("\n" + "="*80 + "\n")

In [None]:
print("\n" + "="*80)
print("ANALYSIS 3: POLARIZATION PATTERNS")
print("="*80 + "\n")

# Get polarization data
polar_data = extract_polarization_patterns(all_movies_data)

# NOTE: the loop variable is deliberately NOT called `pd`. A module-level
# `for pd in ...` rebinds the notebook-wide pandas alias `pd`, which breaks
# every later pandas call in the session.
print("‚ö° Polarization Levels Across Films:\n")
for entry in sorted(polar_data, key=lambda x: x['variance'], reverse=True):
    print(f"   {entry['movie']:25} - {entry['level']:20} (Variance: {entry['variance']:.2f})")

# Count polarization levels
level_counts = Counter([entry['level'] for entry in polar_data])
print(f"\nüìä Polarization Distribution:")
for level, count in level_counts.items():
    print(f"   {level}: {count} films ({count/len(polar_data)*100:.1f}%)")

# Temporal shift analysis (films with a missing or zero shift are not listed)
print(f"\nüìÖ Temporal Rating Shifts (Early vs Late Reviews):\n")
for entry in polar_data:
    if entry['temporal_shift']:
        direction = "üìà Improved" if entry['temporal_shift'] > 0 else "üìâ Declined"
        print(f"   {entry['movie']:25} {direction:12} {entry['temporal_shift']:+.2f} points")

# Gender gap analysis: only gaps above 0.5 are listed, largest first;
# a missing gap sorts as 0
print(f"\nüë• Gender Rating Gaps:\n")
for entry in sorted(polar_data, key=lambda x: x['gender_gap'] if x['gender_gap'] else 0, reverse=True):
    if entry['gender_gap'] and entry['gender_gap'] > 0.5:
        significance = "‚ö†Ô∏è  Significant" if entry['gender_gap'] > 1.0 else "Moderate"
        print(f"   {entry['movie']:25} {significance:15} {entry['gender_gap']:.2f} point gap")

print("\n" + "="*80 + "\n")

In [None]:
print("\n" + "="*80)
print("ANALYSIS 4: RISK PATTERNS")
print("="*80 + "\n")

# Overall risk assessment per film
risk_data = extract_risk_patterns(all_movies_data)

print("‚ö†Ô∏è  Risk Levels Across Films:\n")
by_score_desc = sorted(risk_data, key=lambda item: item['risk_score'], reverse=True)
for item in by_score_desc:
    print(f"   {item['movie']:25} - {item['risk_level']:10} (Score: {item['risk_score']:.1f}/6.0)")
    for flagged in (item['high_risks'] or []):
        print(f"      üö® {flagged}")

# Tally how often each HIGH risk factor occurs across films
all_high_risks = [flagged for item in risk_data for flagged in item['high_risks']]
risk_frequency = Counter(all_high_risks)

print(f"\nüìä Most Common HIGH Risk Factors:\n")
if not risk_frequency:
    print("   No HIGH risk factors identified across films")
else:
    for factor, films in risk_frequency.most_common():
        print(f"   {factor:30} - {films} films ({films/len(risk_data)*100:.1f}%)")

# One sample quote per film whose complexity risk is rated HIGH.
# The quote dict is tagged in place with the film it came from.
print(f"\nüí¨ Sample Risk Evidence (Complexity Barrier):\n")
complexity_quotes = []
for title, payload in all_movies_data.items():
    modules = payload['modules']
    if 'risk_factors' not in modules:
        continue
    if 'complexity_risk' not in modules['risk_factors']:
        continue

    complexity = modules['risk_factors']['complexity_risk']
    if complexity['risk_level'] != 'HIGH' or not complexity.get('sample_quotes'):
        continue

    sample = complexity['sample_quotes'][0]
    sample['source_movie'] = title
    complexity_quotes.append(sample)

for position, sample in enumerate(complexity_quotes[:3], start=1):
    print(f"   {position}. {sample['source_movie']} - {sample['reviewer']} ({sample['rating']}/10):")
    print(f"      \"{sample['quote'][:150]}...\"")
    print()

print("="*80 + "\n")

In [None]:
print("\n" + "="*80)
print("ANALYSIS 5: MARKETING STRATEGY INSIGHTS")
print("="*80 + "\n")

# Analyze temporal sentiment shifts (indicator of marketing alignment)
print("üìä Marketing Effectiveness (Temporal Shift Analysis):\n")

improved_films = []
declined_films = []

# NOTE: the loop variable is deliberately NOT called `pd`; a module-level
# `for pd in ...` would rebind the notebook-wide pandas alias `pd`.
# Thresholds are asymmetric as written: > +1.0 counts as improved,
# < -0.5 counts as declined.
for entry in polar_data:
    if entry['temporal_shift']:
        if entry['temporal_shift'] > 1.0:
            improved_films.append((entry['movie'], entry['temporal_shift']))
        elif entry['temporal_shift'] < -0.5:
            declined_films.append((entry['movie'], entry['temporal_shift']))

if improved_films:
    print("‚úÖ Films That Found Their Audience (Improved Over Time):\n")
    for movie, shift in sorted(improved_films, key=lambda x: x[1], reverse=True):
        print(f"   {movie:25} +{shift:.2f} points (late reviews better)")

    print("\n   üí° Interpretation: These films likely had misaligned initial marketing")
    print("      but found their true audience through word-of-mouth.\n")

if declined_films:
    print("‚ùå Films That Lost Momentum (Declined Over Time):\n")
    for movie, shift in sorted(declined_films, key=lambda x: x[1]):
        print(f"   {movie:25} {shift:.2f} points (late reviews worse)")
    print()

# Early negative buzz analysis
print("‚ö†Ô∏è  Early Negative Buzz Patterns:\n")

early_buzz_issues = []
for movie, data in all_movies_data.items():
    if 'risk_factors' in data['modules']:
        if 'early_buzz_risk' in data['modules']['risk_factors']:
            buzz = data['modules']['risk_factors']['early_buzz_risk']
            if buzz['risk_level'] in ['HIGH', 'MODERATE']:
                early_buzz_issues.append({
                    'movie': movie,
                    'level': buzz['risk_level'],
                    'negative_pct': buzz['early_negative_pct']
                })

for issue in sorted(early_buzz_issues, key=lambda x: x['negative_pct'] if x['negative_pct'] else 0, reverse=True):
    # The sort key already tolerates a missing percentage; the format must
    # too, because applying ':.1f' to None raises TypeError.
    negative_pct = issue['negative_pct']
    pct_text = f"{negative_pct:.1f}" if negative_pct is not None else "N/A"
    print(f"   {issue['movie']:25} {issue['level']:10} ({pct_text}% early negative)")

print("\n   üí° Recommendation: Films with high early negative buzz need:")
print("      ‚Ä¢ Festival circuit first (build critical support)")
print("      ‚Ä¢ Platform release (not wide theatrical)")
print("      ‚Ä¢ Accurate marketing (set correct expectations)\n")

print("="*80 + "\n")

In [None]:
# `datetime` is used below but is not among the Module 9 imports; import it
# here so this cell does not depend on the Module 8 cell having been run.
from datetime import datetime

print("\n" + "="*80)
print("GENERATING SYNTHESIS REPORT")
print("="*80 + "\n")


def _rate(count, total):
    """Return `count` as a 'NN.N%' string of `total` ('0.0%' when total is 0)."""
    return f"{(count / total * 100 if total else 0.0):.1f}%"


# Share of films classified HIGHLY_POLARIZING, derived from the data so the
# interpretation text cannot contradict the counts reported next to it.
films_with_level = sum(level_counts.values())
highly_polarizing_count = level_counts.get('HIGHLY_POLARIZING', 0)
highly_polarizing_share = (
    highly_polarizing_count / films_with_level * 100 if films_with_level else 0.0
)

synthesis_report = {
    'report_date': datetime.now().isoformat(),
    'total_movies_analyzed': len(all_movies_data),
    'total_reviews_analyzed': total_reviews,

    'key_findings': {
        'success_factors': {
            'high_performers': len(high_performers),
            'avg_lover_percentage': np.mean([hp['lovers_pct'] for hp in high_performers]) if high_performers else None,
            'interpretation': "High-performing folk horror films maintain 50%+ lover rate despite polarization"
        },

        'audience_profile': {
            'total_engaged_viewers': total_reviews,
            'lover_rate': _rate(total_lovers, total_reviews),
            'hater_rate': _rate(total_haters, total_reviews),
            # "male" only when men outnumber women by more than two to one
            'gender_skew': 'male' if gender_distribution['male'] > gender_distribution['female'] * 2 else 'balanced',
            'male_percentage': _rate(gender_distribution['male'], total_reviews),
            'female_percentage': _rate(gender_distribution['female'], total_reviews),
            'top_emotions': [emotion for emotion, score in sorted_emotions[:3]]
        },

        'polarization_insights': {
            'highly_polarizing_count': highly_polarizing_count,
            'consensus_count': level_counts.get('CONSENSUS', 0),
            'films_that_improved': len(improved_films),
            'interpretation': f"Folk horror is inherently polarizing - {highly_polarizing_share:.0f}% of films are HIGHLY_POLARIZING"
        },

        'risk_factors': {
            'most_common_risk': risk_frequency.most_common(1)[0][0] if risk_frequency else 'None',
            'risk_frequency': dict(risk_frequency),
            'interpretation': "Complexity barrier is the primary risk - arthouse positioning is critical"
        },

        'marketing_recommendations': {
            'festival_first': len([f for f in improved_films if f[1] > 1.5]),
            'platform_release_candidates': len(early_buzz_issues),
            'key_insight': "Films that improved over time needed better initial positioning"
        }
    }
}

# Save synthesis report (create the directory if it is not there yet)
synthesis_file = Path('../insights/module_9_synthesis.json')
synthesis_file.parent.mkdir(parents=True, exist_ok=True)
with open(synthesis_file, 'w', encoding='utf-8') as f:
    json.dump(synthesis_report, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Synthesis report saved: {synthesis_file}")
print(f"\nüìä Key Metrics Summary:")
print(f"   ‚Ä¢ {len(all_movies_data)} movies analyzed")
print(f"   ‚Ä¢ {total_reviews:,} total reviews")
print(f"   ‚Ä¢ {len(high_performers)} high performers identified")
print(f"   ‚Ä¢ {len(risk_frequency)} unique risk factors found")
print(f"   ‚Ä¢ {len(improved_films)} films improved over time")

print("\n" + "="*80 + "\n")

In [None]:
# Module 9 - executive summary.
# Prints a six-part, human-readable digest of the cross-movie aggregates
# computed in earlier cells. Nothing is written to disk here.
print("\n" + "="*80)
print("EXECUTIVE SUMMARY: FOLK HORROR LANDSCAPE")
print("="*80 + "\n")

print("üìå KEY FINDINGS:\n")

# Denominator for every percentage below. max(..., 1) avoids a
# ZeroDivisionError when no reviews were loaded; all shares then read 0.0%.
review_base = max(total_reviews, 1)

print("1. AUDIENCE SIZE & ENGAGEMENT")
# Film count comes from the data rather than a hard-coded 10, matching the
# "movies analyzed" figure reported by the synthesis cell.
print(f"   ‚Ä¢ {total_reviews:,} reviews across {len(all_movies_data)} films")
print(f"   ‚Ä¢ {total_lovers / review_base * 100:.1f}% are lovers (8-10 rating)")
print(f"   ‚Ä¢ {total_haters / review_base * 100:.1f}% are haters (1-3 rating)")
print("   ‚Ä¢ Genre is HIGHLY POLARIZING by nature\n")

print("2. AUDIENCE DEMOGRAPHICS")
print(f"   ‚Ä¢ {gender_distribution['male'] / review_base * 100:.1f}% male skew")
print(f"   ‚Ä¢ {gender_distribution['female'] / review_base * 100:.1f}% female")
print(f"   ‚Ä¢ Top emotions: {', '.join([e for e, _ in sorted_emotions[:3]])}")
print("   ‚Ä¢ Sophisticated viewers (higher reading grades)\n")

print("3. SUCCESS PATTERNS")
if high_performers:
    avg_lovers_pct = np.mean([hp['lovers_pct'] for hp in high_performers])
    print(f"   ‚Ä¢ High performers maintain ~{avg_lovers_pct:.0f}% lover rate")
    print("   ‚Ä¢ Films that succeed:")
    # Only the first three entries are listed.
    for hp in high_performers[:3]:
        print(f"     - {hp['movie']} ({hp['avg_rating']:.1f}/10)")
print()

print("4. COMMON RISKS")
if risk_frequency:
    # Look the top risk up once instead of once per film in the filter below.
    top_risk = risk_frequency.most_common(1)[0][0]
    affected_movies = [rd['movie'] for rd in risk_data if top_risk in rd['high_risks']]
    print(f"   ‚Ä¢ Most common: {top_risk}")
    print("   ‚Ä¢ Affects: " + ", ".join(affected_movies))
print()

print("5. MARKETING INSIGHTS")
if improved_films:
    print(f"   ‚Ä¢ {len(improved_films)} films improved +1.0 points over time")
    print("   ‚Ä¢ Indicates initial marketing misalignment")
    print("   ‚Ä¢ Festival-first strategy works better")
print()

# Section 6 is fixed advisory text; none of it is computed from the data.
print("6. POSITIONING IMPLICATIONS FOR 'ROOTS'")
print("   ‚Ä¢ Target arthouse horror audience (not mainstream)")
print("   ‚Ä¢ Expect 50%+ polarization (this is normal for genre)")
print("   ‚Ä¢ Platform release > wide theatrical")
print("   ‚Ä¢ Build through festivals and critics first")
print("   ‚Ä¢ Male-skewing but sophisticated audience")

print("\n" + "="*80 + "\n")
print("‚úÖ Module 9 Complete!")
print("="*80 + "\n")