In [None]:
import os
import warnings

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import RobustScaler, StandardScaler
warnings.filterwarnings('ignore')


In [2]:
# Root folder that contains the Raw/ and Processed/ data sets.
# Set the AAA_DATA_PATH environment variable to run the notebook on another
# machine; the original local path is kept as the fallback so existing
# setups behave exactly as before.
data_path = os.environ.get(
    "AAA_DATA_PATH",
    "C:/Users/russe/OneDrive/Desktop/Portfolio/Next AAA Title",
)

In [None]:
# Load every comma-separated input table from the data folder.
# Files under Raw/ are the source tables; files under Processed/ were
# produced by earlier steps of the project.
movies_ratings_df = pd.read_csv(f'{data_path}/Processed/movies_ratings.csv', sep=',')

movies_df = pd.read_csv(f'{data_path}/Raw/movies.csv', sep=',')

ratings_df = pd.read_csv(f'{data_path}/Raw/ratings.csv', sep=',')

actors_df = pd.read_csv(f'{data_path}/Raw/actors.csv', sep=',')

principals_df = pd.read_csv(f'{data_path}/Raw/principals.csv', sep=',')

movies_metrics_df = pd.read_csv(f'{data_path}/Processed/movies_metrics.csv', sep=',')

actors_metrics_df = pd.read_csv(f'{data_path}/Processed/actors_metrics.csv', sep=',')

In [35]:
# Global configuration

# Normalisation strategy. Accepted values: 'percentile', 'robust', 'zscore'.
SCORING_METHOD = 'percentile'

# Share of each component score in the final AAA score (the shares add up to 1).
COMPONENT_WEIGHTS = dict(
    critical_reception=0.30,
    commercial_proxy=0.25,
    audience_engagement=0.25,
    actor_popularity=0.20,
)

In [None]:
def normalize_score(series, method='percentile'):
    """
    Map a numeric Series onto a 0-100 scale.

    Parameters
    ----------
    series : pandas.Series
        Raw values; NaN entries are treated as missing.
    method : str
        'percentile' - percentile rank, ties share their average rank.
        'robust'     - linear rescale between the 5th and 95th percentiles
                       of the non-missing values, clipped to [0, 100].
        'zscore'     - logistic transform of the z-score (ddof=0).
        Any other value falls back to 'percentile'.

    Returns
    -------
    pandas.Series
        Scores aligned with ``series.index``. Missing inputs stay NaN for
        'percentile' and 'robust' and become 50 for 'zscore'. If the series
        holds no non-missing value at all, every entry is 50.
    """
    # All statistics are computed on the non-missing values only
    valid_data = series.dropna()

    if len(valid_data) == 0:
        return pd.Series(50, index=series.index)  # Default to middle score

    if method == 'robust':
        # Median/IQR pre-scaling (RobustScaler) is a positive affine map, so
        # it cancels out of the percentile-window rescale below. The window
        # is therefore taken on the raw values, and only on the non-missing
        # ones: a single NaN used to turn p5/p95, and every score, into NaN.
        p5, p95 = np.percentile(valid_data.to_numpy(dtype=float), [5, 95])
        spread = p95 - p5
        if spread == 0:
            # No spread inside the window: avoid 0/0 and use the middle score
            return pd.Series(
                np.where(series.isna(), np.nan, 50.0),
                index=series.index
            )
        return ((series - p5) / spread * 100).clip(lower=0, upper=100)

    if method == 'zscore':
        values = valid_data.to_numpy(dtype=float)
        if values.max() == values.min():
            # Constant input has zero variance; its z-score is defined as 0
            # here instead of the NaN a division by the zero std would give.
            z_scores = np.zeros_like(values)
        else:
            z_scores = np.asarray(stats.zscore(values))
        # Sigmoid transformation to 0-100 (z = 0 maps to 50)
        sigmoid = 1 / (1 + np.exp(-z_scores / 2))
        normalized = pd.Series(sigmoid * 100, index=valid_data.index)
        return normalized.reindex(series.index, fill_value=50)

    # 'percentile' and fallback for unknown method names
    return series.rank(pct=True, method='average') * 100


def print_score_statistics(df, score_column, component_name):
    """
    Print mean, median, standard deviation, minimum and maximum of one
    score column, each rounded to one decimal, under a header naming the
    component.
    """
    scores = df[score_column]
    report = [
        f"\n✅ {component_name} Statistics:",
        f"   Mean: {scores.mean():.1f}",
        f"   Median: {scores.median():.1f}",
        f"   Std Dev: {scores.std():.1f}",
        f"   Min: {scores.min():.1f}",
        f"   Max: {scores.max():.1f}",
    ]
    # One write; produces the same lines as printing them individually
    print("\n".join(report))


# ============================================
# Critical Reception Score Function
# ============================================

def calculate_critical_reception_score(df, method='percentile'):
    """
    Add the critical reception component score to a copy of ``df``.

    Reads the ``averageRating`` and ``numVotes`` columns and appends
    ``weighted_rating``, ``rating_consistency`` and ``vote_credibility``,
    their ``*_norm`` counterparts (produced by ``normalize_score`` with
    ``method``) and ``critical_reception_score``, which blends the three
    normalized metrics with weights 0.60 / 0.25 / 0.15.

    Returns the augmented copy; the input frame is left untouched.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("📊 CALCULATING CRITICAL RECEPTION SCORE")
    print(banner)

    df = df.copy()
    votes = df['numVotes']
    rating = df['averageRating']

    # 1. Weighted rating: each rating is pulled towards the median rating,
    #    the more so the fewer votes it has. The 25th percentile of the
    #    vote counts acts as the credibility threshold.
    vote_threshold = votes.quantile(0.25)
    typical_rating = rating.median()
    total_weight = votes + vote_threshold
    df['weighted_rating'] = (
        (votes / total_weight) * rating +
        (vote_threshold / total_weight) * typical_rating
    )

    # 2. Rating consistency: 1 at the median rating, decaying exponentially
    #    with the absolute distance from it (in either direction).
    df['rating_consistency'] = np.exp(-(rating - typical_rating).abs() / 2)

    # 3. Vote credibility: log-scaled vote count relative to the most-voted title
    df['vote_credibility'] = np.log10(votes + 10) / np.log10(votes.max() + 10)

    # Bring the three metrics onto the common 0-100 scale
    for raw_metric in ('weighted_rating', 'rating_consistency', 'vote_credibility'):
        df[f'{raw_metric}_norm'] = normalize_score(df[raw_metric], method)

    # Weighted blend of the normalized metrics
    df['critical_reception_score'] = (
        0.60 * df['weighted_rating_norm'] +
        0.25 * df['rating_consistency_norm'] +
        0.15 * df['vote_credibility_norm']
    )

    print_score_statistics(df, 'critical_reception_score', 'Critical Reception Score')

    return df


# ============================================
# Commercial Proxy Score Function
# ============================================

def _genre_relative_performance(df):
    """Return numVotes divided by the highest median vote count among each title's genres."""
    # Titles without genre information keep the neutral ratio 1.0
    ratio = np.ones(len(df), dtype=float)
    if 'genres' not in df.columns:
        return pd.Series(ratio, index=df.index)

    has_genres = df['genres'].notna().to_numpy()
    if not has_genres.any():
        return pd.Series(ratio, index=df.index)

    votes = df['numVotes'].to_numpy()

    # One record per (title, genre). Titles are tracked by position so that
    # duplicate index labels cannot mix rows up.
    pairs = pd.DataFrame({
        'row': np.flatnonzero(has_genres),
        'genre': df['genres'].to_numpy()[has_genres],
        'votes': votes[has_genres],
    })
    pairs['genre'] = pairs['genre'].str.split(',')
    pairs = pairs.explode('genre').reset_index(drop=True)
    pairs['genre'] = pairs['genre'].str.strip()
    # A genre repeated inside one title must count once in that genre's median
    pairs = pairs.drop_duplicates(['row', 'genre'])

    # Exact genre membership: a substring/regex search would let "Music"
    # absorb every "Musical" title and distort its median.
    genre_median = pairs.groupby('genre')['votes'].median()
    benchmark = pairs['genre'].map(genre_median).groupby(pairs['row']).max()

    rows = benchmark.index.to_numpy()
    ratio[rows] = votes[rows] / benchmark.to_numpy()
    return pd.Series(ratio, index=df.index)


def calculate_commercial_proxy_score(df, method='percentile'):
    """
    Add the commercial proxy component score to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``numVotes``. ``genres`` (comma-separated genre names)
        and ``startYear`` are optional.
    method : str
        Normalisation method forwarded to ``normalize_score``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``audience_reach``, ``genre_relative_performance``,
        ``year_percentile``, their ``*_norm`` columns and
        ``commercial_proxy_score`` (weights 0.50 / 0.30 / 0.20).

    Notes
    -----
    Titles with missing ``genres`` (or a frame without that column) get a
    genre-relative performance of 1.0. Without ``startYear`` the year
    percentile defaults to 50.
    """
    print("\n" + "="*60)
    print("💰 CALCULATING COMMERCIAL PROXY SCORE")
    print("="*60)

    df = df.copy()

    # 1. Audience reach (logarithmic scale)
    df['audience_reach'] = np.log10(df['numVotes'] + 1)

    # 2. Genre-relative performance: votes against the strongest genre
    #    benchmark (median votes) the title belongs to
    df['genre_relative_performance'] = _genre_relative_performance(df)

    # 3. Market penetration (percentile within release year)
    if 'startYear' in df.columns:
        df['year_percentile'] = df.groupby('startYear')['numVotes'].rank(pct=True) * 100
    else:
        df['year_percentile'] = 50  # Default if no year data

    # Normalize components
    df['audience_reach_norm'] = normalize_score(df['audience_reach'], method)
    df['genre_relative_norm'] = normalize_score(df['genre_relative_performance'], method)
    df['year_percentile_norm'] = df['year_percentile']  # Already in percentile

    # Combined score
    df['commercial_proxy_score'] = (
        0.50 * df['audience_reach_norm'] +
        0.30 * df['genre_relative_norm'] +
        0.20 * df['year_percentile_norm']
    )

    print_score_statistics(df, 'commercial_proxy_score', 'Commercial Proxy Score')

    return df


# ============================================
# Audience Engagement Score Function
# ============================================

def calculate_audience_engagement_score(df, method='percentile'):
    """
    Add the audience engagement component score to a copy of ``df``.

    Reads ``averageRating`` and ``numVotes`` (and ``runtimeMinutes`` when
    present) and appends four raw metrics, their ``*_norm`` counterparts
    (via ``normalize_score`` with ``method``) and
    ``audience_engagement_score``, a blend of the normalized metrics with
    weights 0.35 / 0.30 / 0.20 / 0.15.

    Returns the augmented copy; the input frame is left untouched.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("👥 CALCULATING AUDIENCE ENGAGEMENT SCORE")
    print(banner)

    df = df.copy()
    rating = df['averageRating']
    log_votes = np.log10(df['numVotes'] + 1)

    # 1. Rating achievement: smooth 0-100 step centred on a 7.0 rating
    df['rating_achievement'] = 100 / (1 + np.exp(-2 * (rating - 7.0)))

    # 2. Engagement velocity: rating scaled by the log of the vote count
    df['engagement_velocity'] = rating * log_votes

    # 3. Entertainment efficiency: rating points per hour of runtime.
    #    Missing runtimes take the median; without the column 120 is assumed.
    if 'runtimeMinutes' in df.columns:
        runtime = df['runtimeMinutes']
        df['runtime_clean'] = runtime.fillna(runtime.median())
    else:
        df['runtime_clean'] = 120

    # Runtimes are floored at 30 minutes before dividing
    df['entertainment_efficiency'] = (rating * 60) / df['runtime_clean'].clip(lower=30)

    # 4. Audience consensus: rating on a 0-1 scale times the log of the vote count
    df['audience_consensus'] = (rating / 10) * log_votes

    # Bring the four metrics onto the common 0-100 scale
    raw_metrics = (
        'rating_achievement',
        'engagement_velocity',
        'entertainment_efficiency',
        'audience_consensus',
    )
    for raw_metric in raw_metrics:
        df[f'{raw_metric}_norm'] = normalize_score(df[raw_metric], method)

    # Weighted blend of the normalized metrics
    df['audience_engagement_score'] = (
        0.35 * df['rating_achievement_norm'] +
        0.30 * df['engagement_velocity_norm'] +
        0.20 * df['entertainment_efficiency_norm'] +
        0.15 * df['audience_consensus_norm']
    )

    print_score_statistics(df, 'audience_engagement_score', 'Audience Engagement Score')

    return df


def calculate_actor_popularity_score(
    movies_df, 
    actor_metrics_df, 
    principals_df,
    min_actors: int = 1,
    verbose: bool = True
) :
    """
    Attach cast popularity aggregates to every movie.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        One row per movie; must contain ``tconst``.
    actor_metrics_df : pandas.DataFrame
        Must contain ``nconst`` and ``popularity_score``.
    principals_df : pandas.DataFrame
        Cast/crew records; must contain ``tconst``, ``nconst``, ``category``.
        Only rows whose category is 'actor' or 'actress' are used.
    min_actors : int
        Minimum scored cast size for a movie to be included in the printed
        summary. It does not affect the returned data.
    verbose : bool
        Print progress and summary information.

    Returns
    -------
    pandas.DataFrame
        ``movies_df`` left-merged with ``actor_popularity_score`` (mean),
        ``total_cast_popularity`` (sum), ``max_actor_popularity``,
        ``actor_popularity_std``, ``cast_actor_count`` and
        ``cast_diversity_score`` (std / mean). Movies without scored cast
        get 0 in all of them. If a required column is missing, a
        validation message is printed and ``movies_df`` is returned as is.
    """
    # Validate input DataFrames; only the first problem found is reported
    validation_error = None
    if 'tconst' not in movies_df.columns:
        validation_error = "movies_df must contain 'tconst' column"
    else:
        missing_actor_cols = [
            col for col in ('nconst', 'popularity_score')
            if col not in actor_metrics_df.columns
        ]
        missing_principal_cols = [
            col for col in ('tconst', 'nconst', 'category')
            if col not in principals_df.columns
        ]
        if missing_actor_cols:
            validation_error = f"actor_metrics_df missing columns: {missing_actor_cols}"
        elif missing_principal_cols:
            validation_error = f"principals_df missing columns: {missing_principal_cols}"

    if validation_error is not None:
        print(f"❌ Validation Error: {validation_error}")
        return movies_df

    # Start processing
    start_time = pd.Timestamp.now()

    # Step 1: Keep only acting credits
    is_acting_credit = principals_df['category'].isin(['actor', 'actress'])
    actor_principals = principals_df.loc[is_acting_credit, ['tconst', 'nconst']]

    if verbose:
        print(f"✅ Filtered to {len(actor_principals):,} actor/actress records")

    # Step 2: Attach each performer's popularity score
    actor_principals_with_scores = actor_principals.merge(
        actor_metrics_df[['nconst', 'popularity_score']],
        on='nconst',
        how='left'
    )

    # Drop credits without a positive score (NaN > 0 is False, so missing
    # scores are removed by the same comparison)
    actor_principals_with_scores = actor_principals_with_scores[
        actor_principals_with_scores['popularity_score'] > 0
    ]

    if verbose:
        print(f"✅ Found popularity scores for {len(actor_principals_with_scores):,} cast records")

    # Step 3: Aggregate per movie
    movie_actor_metrics = (
        actor_principals_with_scores
        .groupby('tconst')['popularity_score']
        .agg(
            actor_popularity_score='mean',
            total_cast_popularity='sum',
            max_actor_popularity='max',
            actor_popularity_std='std',
            cast_actor_count='count',
        )
        .round(2)
        .reset_index()
    )
    metric_columns = [col for col in movie_actor_metrics.columns if col != 'tconst']

    # Step 4: Merge back with movies_df. Columns left over from an earlier
    # run are dropped first; otherwise the merge would suffix them with
    # _x/_y and the lookups below would fail.
    stale_columns = [col for col in metric_columns if col in movies_df.columns]
    result_df = movies_df.drop(columns=stale_columns).merge(
        movie_actor_metrics,
        on='tconst',
        how='left'
    )

    # Step 5: Movies without scored cast get 0 everywhere
    for col in metric_columns:
        result_df[col] = result_df[col].fillna(0)
    result_df['cast_actor_count'] = result_df['cast_actor_count'].astype(int)

    # Coefficient of variation of the cast's popularity (0 when the mean is 0)
    result_df['cast_diversity_score'] = np.where(
        result_df['actor_popularity_score'] > 0,
        result_df['actor_popularity_std'] / result_df['actor_popularity_score'],
        0
    ).round(3)

    # Processing time
    processing_time = (pd.Timestamp.now() - start_time).total_seconds()

    if verbose:
        # Summary statistics
        movies_with_cast = result_df[result_df['cast_actor_count'] >= min_actors]

        print(f"\n📊 PROCESSING SUMMARY:")
        print(f"   Processing time: {processing_time:.2f} seconds")
        print(f"   Total movies processed: {len(result_df):,}")
        print(f"   Movies with cast data: {len(movies_with_cast):,}")
        print(f"   Average Actor Score: {movies_with_cast['actor_popularity_score'].mean():.2f}")
        print(f"   Median Actor Score: {movies_with_cast['actor_popularity_score'].median():.2f}")
        print(f"   Max Actor Score: {movies_with_cast['actor_popularity_score'].max():.2f}")

    return result_df


# ============================================
#  Actor Popularity Normalization Function
# ============================================

def normalize_actor_popularity_score(df, method='percentile'):
    """
    Add ``actor_popularity_score_norm`` (0-100) to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies, normally carrying ``actor_popularity_score``.
    method : str
        Normalisation method forwarded to ``normalize_score``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the normalized column. When
        ``actor_popularity_score`` is absent, a warning is printed and both
        the raw and the normalized column are set to 50.
    """
    print("\n" + "="*60)
    print("🌟 NORMALIZING ACTOR POPULARITY SCORE")
    print("="*60)

    # Work on a copy, like the other score functions, so the caller's
    # frame is never modified behind its back
    df = df.copy()

    if 'actor_popularity_score' not in df.columns:
        print("⚠️ Warning: actor_popularity_score column not found")
        print("   Creating default score of 50")
        df['actor_popularity_score'] = 50
        df['actor_popularity_score_norm'] = 50
        return df

    # Apply normalization
    df['actor_popularity_score_norm'] = normalize_score(df['actor_popularity_score'], method)

    print_score_statistics(df, 'actor_popularity_score_norm', 'Actor Popularity Score (Normalized)')

    return df

In [41]:
def calculate_aaa_score(df, weights=None):
    """
    Combine the four component scores into the final AAA score and tier.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies with ``critical_reception_score``, ``commercial_proxy_score``,
        ``audience_engagement_score`` and ``actor_popularity_score_norm``.
        A missing component column is filled with 50 after a warning.
    weights : dict, optional
        Weight per component, keyed by 'critical_reception',
        'commercial_proxy', 'audience_engagement' and 'actor_popularity'.
        Defaults to ``COMPONENT_WEIGHTS``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``aaa_score`` (weighted sum) and ``score_tier``
        ('F', 'D', 'C', 'B', 'A' in 20-point bands).

    Raises
    ------
    KeyError
        If ``weights`` lacks one of the four component keys.
    """
    print("\n" + "="*60)
    print("🏆 CALCULATING FINAL AAA SCORE")
    print("="*60)

    if weights is None:
        weights = COMPONENT_WEIGHTS

    # Map component names to column names
    score_columns = {
        'critical_reception': 'critical_reception_score',
        'commercial_proxy': 'commercial_proxy_score',
        'audience_engagement': 'audience_engagement_score',
        'actor_popularity': 'actor_popularity_score_norm'
    }

    # Fail early and explicitly instead of deep inside the weighted sum
    missing_weights = [name for name in score_columns if name not in weights]
    if missing_weights:
        raise KeyError(f"weights is missing component(s): {missing_weights}")

    # Work on a copy, like the component score functions
    df = df.copy()

    # Ensure all component scores exist
    for component, column in score_columns.items():
        if column not in df.columns:
            print(f"⚠️ Warning: {column} not found, using default value 50")
            df[column] = 50

    # Calculate weighted AAA score
    df['aaa_score'] = sum(
        weights[component] * df[column]
        for component, column in score_columns.items()
    )

    # Add score tier classification. include_lowest puts a score of exactly
    # 0 into 'F' (the first bin is otherwise open on the left), and the clip
    # keeps scores that stray outside 0-100 (custom weights, float rounding)
    # inside the outer tiers instead of leaving them without a tier.
    df['score_tier'] = pd.cut(
        df['aaa_score'].clip(lower=0, upper=100),
        bins=[0, 20, 40, 60, 80, 100],
        labels=['F', 'D', 'C', 'B', 'A'],
        include_lowest=True
    )

    # Print comprehensive statistics
    print(f"\n📊 FINAL AAA SCORE DISTRIBUTION:")
    print(f"   Mean: {df['aaa_score'].mean():.1f}")
    print(f"   Median: {df['aaa_score'].median():.1f}")
    print(f"   Std Dev: {df['aaa_score'].std():.1f}")
    print(f"   Min: {df['aaa_score'].min():.1f}")
    print(f"   Max: {df['aaa_score'].max():.1f}")

    print(f"\n📈 SCORE PERCENTILES:")
    valid_scores = df['aaa_score'].dropna()
    # np.percentile cannot handle an empty array, so skip the table then
    if len(valid_scores) > 0:
        for p in [10, 25, 50, 75, 90, 95, 99]:
            value = np.percentile(valid_scores, p)
            print(f"   {p}th percentile: {value:.1f}")

    print(f"\n🎯 TIER DISTRIBUTION:")
    tier_dist = df['score_tier'].value_counts().sort_index()
    for tier, count in tier_dist.items():
        pct = (count / len(df)) * 100
        print(f"   {tier}: {count:,} movies ({pct:.1f}%)")

    return df

In [42]:
def validate_score_distribution(df, score_column):
    """
    Run six sanity checks on the distribution of one score column and
    print a pass/fail line for each.

    The checks expect a 0-100 score centred near 50: mean and median within
    40-60, standard deviation within 20-35, a range wider than 70, a
    minimum below 20 and a maximum above 80.

    Returns True when every check passes, otherwise False.
    """
    print(f"\n🔍 Validating {score_column} distribution...")

    scores = df[score_column]
    mean, median, std = scores.mean(), scores.median(), scores.std()
    lowest, highest = scores.min(), scores.max()

    health_checks = dict(
        mean_near_50=40 <= mean <= 60,
        median_near_50=40 <= median <= 60,
        good_spread=20 <= std <= 35,
        full_range=highest - lowest > 70,
        min_reasonable=lowest < 20,
        max_reasonable=highest > 80,
    )

    print("Distribution Health Check:")
    for check_name in health_checks:
        marker = "✅" if health_checks[check_name] else "❌"
        print(f"  {marker} {check_name}")

    all_passed = all(health_checks.values())
    verdict = (
        f"✅ {score_column} distribution is healthy!"
        if all_passed
        else f"⚠️ {score_column} distribution needs adjustment"
    )
    print(verdict)

    return all_passed


def get_top_movies(df, n=10, score_column='aaa_score'):
    """
    Print and return the ``n`` highest-scoring movies.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score_column``. ``primaryTitle``, ``startYear``,
        ``score_tier``, ``averageRating``, ``numVotes`` and ``genres`` are
        shown when present and skipped otherwise.
    n : int
        Number of movies to list.
    score_column : str
        Column to rank by.

    Returns
    -------
    pandas.DataFrame
        The top ``n`` rows restricted to the displayed columns.
    """
    print(f"\n🏆 TOP {n} MOVIES BY {score_column.upper()}:")

    # Keep only the columns that exist, each once: the descriptive columns
    # are optional, and score_column may itself be one of them
    # (e.g. ranking by 'averageRating').
    candidate_columns = [
        'primaryTitle', 'startYear', score_column,
        'score_tier', 'averageRating', 'numVotes', 'genres'
    ]
    display_columns = []
    for col in candidate_columns:
        if col in df.columns and col not in display_columns:
            display_columns.append(col)

    top_movies = df.nlargest(n, score_column)[display_columns]

    for i, (_, movie) in enumerate(top_movies.iterrows(), 1):
        title = movie.get('primaryTitle', 'Unknown')
        year = movie.get('startYear', 'N/A')
        score = movie[score_column]
        tier = movie.get('score_tier', 'N/A')

        print(f"\n{i:2d}. {title} ({year})")
        print(f"    Score: {score:.1f} | Tier: {tier}")

        # Rating and votes are reported independently, so a frame holding
        # only one of the two no longer raises a KeyError
        details = []
        if 'averageRating' in movie.index:
            details.append(f"Rating: {movie['averageRating']:.1f}")
        if 'numVotes' in movie.index:
            details.append(f"Votes: {movie['numVotes']:,}")
        if details:
            print("    " + " | ".join(details))

    return top_movies

In [43]:
def run_complete_scoring_pipeline(movies_df, actor_metrics_df=None, principals_df=None, 
                                 scoring_method='percentile', validate=True):
    """
    Run every scoring step on a copy of ``movies_df`` and return the result.

    Steps, in order: critical reception, commercial proxy, audience
    engagement, actor popularity (raw aggregates, then normalisation) and
    the final AAA score; the ten best movies are printed at the end.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Movies to score; it is copied, not modified.
    actor_metrics_df, principals_df : pandas.DataFrame, optional
        Inputs for the actor popularity step. If either is None the raw
        actor popularity score is set to 0 for every movie.
    scoring_method : str
        Normalisation method handed to each score function.
    validate : bool
        When True, ``validate_score_distribution`` is run after each step.

    Returns
    -------
    pandas.DataFrame
        The scored copy of ``movies_df``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("🎬 STARTING MOVIE AAA SCORING PIPELINE")
    print(rule)
    print(f"📊 Processing {len(movies_df):,} movies")
    print(f"📈 Using {scoring_method} normalization method")

    # Make a copy to avoid modifying original
    df = movies_df.copy()

    # Steps 1-3: component scores derived from the movie table alone
    movie_level_steps = (
        (calculate_critical_reception_score, 'critical_reception_score'),
        (calculate_commercial_proxy_score, 'commercial_proxy_score'),
        (calculate_audience_engagement_score, 'audience_engagement_score'),
    )
    for compute_score, produced_column in movie_level_steps:
        df = compute_score(df, method=scoring_method)
        if validate:
            validate_score_distribution(df, produced_column)

    # Step 4: raw actor popularity, only possible with both cast inputs
    cast_inputs_missing = actor_metrics_df is None or principals_df is None
    if cast_inputs_missing:
        print("\n⚠️ Actor metrics or principals data not provided")
        print("   Using default actor popularity score of 0")
        df['actor_popularity_score'] = 0
    else:
        df = calculate_actor_popularity_score(
            df,
            actor_metrics_df,
            principals_df,
            verbose=True
        )

    # Step 5: normalized actor popularity
    df = normalize_actor_popularity_score(df, method=scoring_method)
    if validate:
        validate_score_distribution(df, 'actor_popularity_score_norm')

    # Step 6: final AAA score
    df = calculate_aaa_score(df)
    if validate:
        validate_score_distribution(df, 'aaa_score')

    # Step 7: show the best-scoring movies
    get_top_movies(df, n=10, score_column='aaa_score')

    print("\n" + rule)
    print("✅ SCORING PIPELINE COMPLETE!")
    print(rule)

    return df


In [44]:
# Score every movie. The normalisation strategy is taken from the
# SCORING_METHOD configuration constant, so changing the configuration cell
# is enough to switch methods (the call used to hard-code 'percentile').
scored_movies = run_complete_scoring_pipeline(
    movies_metrics_df,
    actor_metrics_df=actors_metrics_df,  # Optional
    principals_df=principals_df,  # Optional
    scoring_method=SCORING_METHOD,  # Options: 'percentile', 'robust', 'zscore'
    validate=True
)


🎬 STARTING MOVIE AAA SCORING PIPELINE
📊 Processing 152,447 movies
📈 Using percentile normalization method

📊 CALCULATING CRITICAL RECEPTION SCORE

✅ Critical Reception Score Statistics:
   Mean: 50.0
   Median: 54.8
   Std Dev: 19.4
   Min: 5.8
   Max: 81.8

🔍 Validating critical_reception_score distribution...
Distribution Health Check:
  ✅ mean_near_50
  ✅ median_near_50
  ❌ good_spread
  ✅ full_range
  ✅ min_reasonable
  ✅ max_reasonable
⚠️ critical_reception_score distribution needs adjustment

💰 CALCULATING COMMERCIAL PROXY SCORE

✅ Commercial Proxy Score Statistics:
   Mean: 50.0
   Median: 49.1
   Std Dev: 28.2
   Min: 0.2
   Max: 100.0

🔍 Validating commercial_proxy_score distribution...
Distribution Health Check:
  ✅ mean_near_50
  ✅ median_near_50
  ✅ good_spread
  ✅ full_range
  ✅ min_reasonable
  ✅ max_reasonable
✅ commercial_proxy_score distribution is healthy!

👥 CALCULATING AUDIENCE ENGAGEMENT SCORE

✅ Audience Engagement Score Statistics:
   Mean: 50.0
   Median: 53.0


In [45]:
# Persist the scored movies next to the other processed data sets.
# to_csv is called with its defaults, so the DataFrame index is written as
# an unnamed first column of the file.
scores_csv_path = f'{data_path}/Processed/movie_metrics_scores.csv'
scored_movies.to_csv(scores_csv_path)
