In [2]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# analysis cells below — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [5]:
# Global plotting defaults; these mutate matplotlib/seaborn state for the whole session.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs matplotlib >= 3.6
# (where the bundled seaborn styles were renamed) — confirm against the environment.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")
# Folder holding the CSV exports loaded in the next cell.
# Set the MOVIES_DATA_PATH environment variable to run the notebook on another
# machine; the original author's local folder stays as the fallback so existing
# runs behave exactly as before.
data_path = os.environ.get(
    "MOVIES_DATA_PATH",
    "C:/Users/russe/OneDrive/Desktop/Portfolio/Next AAA Title",
)

In [19]:
# Load the five comma-separated exports found under `data_path`,
# one DataFrame per file, in the same order as before.
movies_ratings_df = pd.read_csv(f'{data_path}/movies_ratings.csv', sep=',')
movies_df = pd.read_csv(f'{data_path}/movies.csv', sep=',')
ratings_df = pd.read_csv(f'{data_path}/ratings.csv', sep=',')
names_df = pd.read_csv(f'{data_path}/names.csv', sep=',')
principals_df = pd.read_csv(f'{data_path}/principals.csv', sep=',')

In [21]:
def analyze_2020_highest_rated_movies(movies_df, min_votes_threshold=1000):
    """
    BUSINESS QUESTION 1: What is the highest rated movie in 2020?
    How do we balance ratings and number of votes?

    BUSINESS CONTEXT: Studios need to understand what constitutes "quality"
    - Simple ratings can be misleading (few votes)
    - Need credible metrics for investment decisions
    - Balance critical acclaim with audience validation

    Four rankings of the 2020 titles are computed and their top 5 printed:
      1. raw 'averageRating'
      2. raw 'averageRating' among titles with 100+ votes
      3. IMDb-style weighted rating  WR = (v/(v+m)) * R + (m/(v+m)) * C
      4. lower bound of a 1.96-sigma interval around the rating
         (printed as "Bayesian Confidence"; the computation is a
         normal-approximation bound, not a Bayesian posterior)

    Parameters:
    movies_df: DataFrame with 'startYear', 'primaryTitle', 'averageRating'
               and 'numVotes' columns. It is not modified.
    min_votes_threshold: m in the weighted-rating formula (default: 1000)

    Returns:
    (movies_2020, weighted_top): the 2020 rows with added 'weighted_rating'
    and 'confidence_lower' columns, and the 5 rows with the highest
    'weighted_rating'.
    """

    print("\n" + "="*70)
    print("🏆 BUSINESS ANALYSIS 1: 2020 HIGHEST RATED MOVIES")
    print("="*70)

    # Work on a copy of the 2020 rows so the caller's frame never gains columns.
    movies_2020 = movies_df[movies_df['startYear'] == 2020].copy()

    # RATING METHODOLOGY COMPARISON
    print("\n🎯 Rating Methodology Analysis:")

    rating = movies_2020['averageRating']
    votes = movies_2020['numVotes']

    # Method 1: Simple Average (baseline)
    simple_top = movies_2020.nlargest(5, 'averageRating')

    # Method 2: Credibility Filter (100+ votes)
    credible_movies = movies_2020[votes >= 100].copy()
    credible_top = credible_movies.nlargest(5, 'averageRating') if len(credible_movies) > 0 else pd.DataFrame()

    # Method 3: IMDb Weighted Rating Formula
    # WR = (v/(v+m)) * R + (m/(v+m)) * C
    # v = votes, m = minimum threshold, R = rating, C = average rating
    m = min_votes_threshold
    C = rating.mean()

    movies_2020['weighted_rating'] = (
        (votes / (votes + m)) * rating +
        (m / (votes + m)) * C
    )

    weighted_top = movies_2020.nlargest(5, 'weighted_rating')

    # Method 4: conservative lower bound, rating - 1.96 * sqrt(R * (10 - R) / v).
    # Computed column-wise instead of with a row-wise apply: same arithmetic,
    # far cheaper on large frames. Votes are floored at 1 to avoid dividing by
    # zero, and the bound is floored at 0. A missing rating now stays NaN
    # instead of being reported as a bound of 0.
    margin = 1.96 * np.sqrt((rating * (10 - rating)) / votes.clip(lower=1))
    movies_2020['confidence_lower'] = (rating - margin).clip(lower=0)

    confidence_top = movies_2020.nlargest(5, 'confidence_lower')

    # RESULTS PRESENTATION
    print("\n📋 METHODOLOGY COMPARISON RESULTS:")

    print("\n1️⃣ Simple Average Rating (Top 5):")
    for i, (_, movie) in enumerate(simple_top.iterrows(), 1):
        print(f"   {i}. {movie['primaryTitle']}: {movie['averageRating']:.1f}/10 ({movie['numVotes']:,} votes)")

    if len(credible_top) > 0:
        print("\n2️⃣ Credible Movies Only (100+ votes):")
        for i, (_, movie) in enumerate(credible_top.iterrows(), 1):
            print(f"   {i}. {movie['primaryTitle']}: {movie['averageRating']:.1f}/10 ({movie['numVotes']:,} votes)")

    print("\n3️⃣ IMDb Weighted Rating (Recommended):")
    for i, (_, movie) in enumerate(weighted_top.iterrows(), 1):
        print(f"   {i}. {movie['primaryTitle']}: {movie['weighted_rating']:.2f} ({movie['numVotes']:,} votes)")

    print("\n4️⃣ Bayesian Confidence (Conservative):")
    for i, (_, movie) in enumerate(confidence_top.iterrows(), 1):
        print(f"   {i}. {movie['primaryTitle']}: {movie['confidence_lower']:.2f} confidence ({movie['numVotes']:,} votes)")

    return movies_2020, weighted_top

# Execute 2020 analysis
# Keeps both return values: the 2020 rows with their added score columns,
# and the five rows ranked highest by weighted rating.
movies_2020_df, top_2020_movies = analyze_2020_highest_rated_movies(movies_ratings_df)



🏆 BUSINESS ANALYSIS 1: 2020 HIGHEST RATED MOVIES

🎯 Rating Methodology Analysis:

📋 METHODOLOGY COMPARISON RESULTS:

1️⃣ Simple Average Rating (Top 5):
   1. A Better Life: 9.8/10 (66 votes)
   2. The Choice of Staying: 9.8/10 (60 votes)
   3. Floripes: 9.5/10 (86 votes)
   4. ForeverMoore; The Angelo project: 9.5/10 (60 votes)
   5. Qawwali - Music of the Mystics: 9.5/10 (52 votes)

2️⃣ Credible Movies Only (100+ votes):
   1. Joshua: 9.4/10 (490 votes)
   2. Impionçable: 9.3/10 (102 votes)
   3. A Worm in the Heart: 9.1/10 (114 votes)
   4. Maduve Madri Sari Hogtane: 9.1/10 (139 votes)
   5. Moscow. Dormitory area: 9.0/10 (144 votes)

3️⃣ IMDb Weighted Rating (Recommended):
   1. Soorarai Pottru: 8.58 (129,501 votes)
   2. Demon Slayer: Kimetsu no Yaiba - Mt. Natagumo Arc: 8.54 (17,831 votes)
   3. Dil Bechara: 8.28 (136,657 votes)
   4. Hamilton: 8.28 (132,450 votes)
   5. The Father: 8.19 (219,125 votes)

4️⃣ Bayesian Confidence (Conservative):
   1. A Better Life: 9.46 confidence

In [22]:
def add_rating_methodologies(movies_df, min_votes_threshold=1000):
    """
    Add comprehensive rating methodology columns to movies DataFrame

    The input is copied, 17 derived columns are added to the copy, and a
    progress report is printed along the way.

    Parameters:
    movies_df: DataFrame with 'averageRating' and 'numVotes' columns
    min_votes_threshold: Minimum votes for weighted rating calculation (default: 1000).
        Also the vote count at which 'high_credibility' becomes True and
        'credibility_score' saturates at 100.

    Returns:
    DataFrame with added rating methodology columns. If a required column is
    missing, a message is printed and movies_df itself is returned unchanged.
    """

    print("🎯 Adding Rating Methodology Columns to Movies Table")
    print("=" * 60)

    # Verify required columns exist
    required_cols = ['averageRating', 'numVotes']
    missing_cols = [col for col in required_cols if col not in movies_df.columns]

    if missing_cols:
        print(f"❌ Missing required columns: {missing_cols}")
        return movies_df

    print(f"✅ Processing {len(movies_df):,} movies...")

    # Create a copy to avoid modifying original
    df = movies_df.copy()
    rating = df['averageRating']
    votes = df['numVotes']

    # ========================================================================
    # METHOD 1: CREDIBILITY FLAG
    # ========================================================================

    print("\n📊 Method 1: Credibility Assessment")

    # High credibility threshold (min_votes_threshold+ votes)
    df['high_credibility'] = votes >= min_votes_threshold

    # Medium credibility threshold (100+ votes)
    df['medium_credibility'] = votes >= 100

    # Basic credibility threshold (10+ votes)
    df['basic_credibility'] = votes >= 10

    # Credibility score (0-100): share of the high-credibility threshold reached,
    # capped at 100. Tied to min_votes_threshold so it agrees with the
    # 'high_credibility' flag instead of always saturating at 1000 votes.
    df['credibility_score'] = np.minimum(100, (votes / min_votes_threshold) * 100)

    high_cred_count = df['high_credibility'].sum()
    medium_cred_count = df['medium_credibility'].sum()

    print(f"  High credibility ({min_votes_threshold}+ votes): {high_cred_count:,} movies ({high_cred_count/len(df)*100:.1f}%)")
    print(f"  Medium credibility (100+ votes): {medium_cred_count:,} movies ({medium_cred_count/len(df)*100:.1f}%)")

    # ========================================================================
    # METHOD 2: IMDB WEIGHTED RATING
    # ========================================================================

    print(f"\n⚖️ Method 2: IMDb Weighted Rating (threshold: {min_votes_threshold})")

    # IMDb Weighted Rating Formula: WR = (v/(v+m)) * R + (m/(v+m)) * C
    # v = votes, m = minimum threshold, R = rating, C = average rating

    m = min_votes_threshold
    C = rating.mean()

    df['weighted_rating'] = (
        (votes / (votes + m)) * rating +
        (m / (votes + m)) * C
    ).round(2)

    print(f"  Average rating (C): {C:.2f}")
    print(f"  Weighted rating range: {df['weighted_rating'].min():.2f} - {df['weighted_rating'].max():.2f}")

    # ========================================================================
    # METHOD 3: BAYESIAN CONFIDENCE INTERVALS
    # ========================================================================
    # The printed label says "Bayesian", but the interval is the normal
    # approximation R +/- 1.96 * sqrt(R * (10 - R) / v), clipped to [0, 10].

    print("\n📊 Method 3: Bayesian Confidence Analysis")

    # Computed column-wise (same arithmetic as a per-row calculation, without
    # the per-row Python overhead). Votes are floored at 1 to avoid dividing
    # by zero. A missing rating yields NaN bounds rather than a [0, 10] interval.
    margin = 1.96 * np.sqrt((rating * (10 - rating)) / votes.clip(lower=1))

    # Lower confidence bound (conservative estimate)
    df['confidence_lower'] = (rating - margin).clip(lower=0).round(2)

    # Upper confidence bound (optimistic estimate)
    df['confidence_upper'] = (rating + margin).clip(upper=10).round(2)

    # Confidence interval width (uncertainty measure)
    df['confidence_width'] = (df['confidence_upper'] - df['confidence_lower']).round(2)

    print(f"  Average confidence width: {df['confidence_width'].mean():.2f}")
    print(f"  Confidence range: {df['confidence_lower'].min():.2f} - {df['confidence_upper'].max():.2f}")

    # ========================================================================
    # METHOD 4: NORMALIZED RATING SCORES
    # ========================================================================

    print("\n📈 Method 4: Normalized Rating Scores")

    # Percentile ranking (0-100)
    df['rating_percentile'] = (rating.rank(pct=True) * 100).round(1)

    # Vote-weighted percentile (considering popularity)
    df['weighted_percentile'] = (df['weighted_rating'].rank(pct=True) * 100).round(1)

    # Z-score normalization
    rating_mean = rating.mean()
    rating_std = rating.std()
    df['rating_zscore'] = ((rating - rating_mean) / rating_std).round(2)

    print("  Rating percentiles: 0-100")
    print(f"  Z-score range: {df['rating_zscore'].min():.2f} - {df['rating_zscore'].max():.2f}")

    # ========================================================================
    # METHOD 5: COMPOSITE QUALITY SCORES
    # ========================================================================

    print("\n🏆 Method 5: Composite Quality Scores")

    # Quality Score: Combines rating quality with vote credibility
    # Formula: 70% weighted rating + 30% credibility score
    df['quality_score'] = (
        0.7 * (df['weighted_rating'] / 10 * 100) +
        0.3 * df['credibility_score']
    ).round(1)

    # Excellence Score: High bar for truly exceptional movies
    # Requires both high rating AND high confidence
    df['excellence_score'] = (
        df['confidence_lower'] * df['credibility_score'] / 100
    ).round(2)

    # Popularity-Adjusted Score: Balances critical acclaim with mass appeal.
    # Votes are log-scaled and normalised by the largest value in the frame.
    log_votes = np.log10(votes + 1)
    max_log_votes = log_votes.max()
    if max_log_votes > 0:
        popularity_factor = log_votes / max_log_votes
    else:
        # No title has any votes (or the frame is empty): no popularity bonus,
        # rather than a 0/0 that would turn every score into NaN.
        popularity_factor = 0.0

    df['popularity_adjusted_score'] = (
        rating * (0.7 + 0.3 * popularity_factor)
    ).round(2)

    print(f"  Quality score range: {df['quality_score'].min():.1f} - {df['quality_score'].max():.1f}")
    print(f"  Excellence score range: {df['excellence_score'].min():.2f} - {df['excellence_score'].max():.2f}")

    # ========================================================================
    # METHOD 6: TIER CLASSIFICATIONS
    # ========================================================================

    print("\n🏅 Method 6: Tier Classifications")

    # Rating Tier (based on weighted rating)
    df['rating_tier'] = pd.cut(
        df['weighted_rating'],
        bins=[0, 5.0, 6.5, 7.5, 8.5, 10.0],
        labels=['Poor', 'Average', 'Good', 'Excellent', 'Masterpiece'],
        include_lowest=True
    )

    # Credibility Tier (based on vote count)
    df['credibility_tier'] = pd.cut(
        votes,
        bins=[0, 100, 1000, 10000, 100000, float('inf')],
        labels=['Minimal', 'Basic', 'Solid', 'High', 'Massive'],
        include_lowest=True
    )

    # Overall Tier (composite assessment)
    df['overall_tier'] = pd.cut(
        df['quality_score'],
        bins=[0, 40, 60, 75, 85, 100],
        labels=['Below Average', 'Average', 'Good', 'Excellent', 'Elite'],
        include_lowest=True
    )

    # Show tier distributions
    print("  Rating tier distribution:")
    print(df['rating_tier'].value_counts().sort_index())

    # ========================================================================
    # SUMMARY STATISTICS
    # ========================================================================

    print("\n📊 SUMMARY: New Columns Added")
    print("=" * 40)

    new_columns = [
        'high_credibility', 'medium_credibility', 'basic_credibility', 'credibility_score',
        'weighted_rating', 'confidence_lower', 'confidence_upper', 'confidence_width',
        'rating_percentile', 'weighted_percentile', 'rating_zscore',
        'quality_score', 'excellence_score', 'popularity_adjusted_score',
        'rating_tier', 'credibility_tier', 'overall_tier'
    ]

    for i, col in enumerate(new_columns, 1):
        print(f"  {i:2d}. {col}")

    print(f"\n✅ Enhanced dataset ready with {len(new_columns)} new rating columns!")

    return df

In [24]:
# Score the full movies_ratings table using the default 1000-vote threshold.
rating_methods = add_rating_methodologies(movies_ratings_df)

🎯 Adding Rating Methodology Columns to Movies Table
✅ Processing 83,583 movies...

📊 Method 1: Credibility Assessment
  High credibility (1000+ votes): 22,331 movies (26.7%)
  Medium credibility (100+ votes): 65,190 movies (78.0%)

⚖️ Method 2: IMDb Weighted Rating (threshold: 1000)
  Average rating (C): 5.90
  Weighted rating range: 1.06 - 9.09

📊 Method 3: Bayesian Confidence Analysis
  Average confidence width: 1.16
  Confidence range: 0.33 - 10.00

📈 Method 4: Normalized Rating Scores
  Rating percentiles: 0-100
  Z-score range: -3.46 - 2.83

🏆 Method 5: Composite Quality Scores
  Quality score range: 37.4 - 93.6
  Excellence score range: 0.02 - 9.72

🏅 Method 6: Tier Classifications
  Rating tier distribution:
rating_tier
Poor            4055
Average        71070
Good            7429
Excellent       1006
Masterpiece       23
Name: count, dtype: int64

📊 SUMMARY: New Columns Added
   1. high_credibility
   2. medium_credibility
   3. basic_credibility
   4. credibility_score
   5. 

In [25]:
# Persist the enriched table to the current working directory.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first
# column by default; pass index=False if downstream readers should not see it.
rating_methods.to_csv("rating_methods.csv")

In [26]:
def analyze_popular_actors_2020(movies_df, principals_df, names_df, target_years=(2020,)):
    """
    Business Question 2: Most popular 2020 actors

    Joins the people credited in principals_df to the movies released in
    target_years, aggregates per person, and ranks people credited on at
    least two of those movies by a popularity score:

        score = 100 * (0.4 * total_votes / max(total_votes)
                       + 0.3 * avg_rating / 10
                       + 0.3 * movie_count / max(movie_count))

    The maxima are taken over every credited person, before the
    two-movie filter is applied.

    Parameters:
    movies_df: DataFrame with 'tconst', 'startYear', 'primaryTitle',
               'averageRating' and 'numVotes' columns
    principals_df: DataFrame with 'tconst' and 'nconst' columns. Every row is
               counted as one credit. NOTE(review): if this table also lists
               non-acting roles, filter it before calling — confirm its schema.
    names_df: DataFrame with 'nconst' and 'primaryName' columns
    target_years: year or iterable of years to analyse (default: 2020 only)

    Returns:
    DataFrame with the top 10 people by 'popularity_score', or an empty
    DataFrame when there is no data to analyse.
    """

    # Accept a single year as well as an iterable of years.
    if np.isscalar(target_years):
        target_years = [target_years]
    years = sorted(set(target_years))
    # The label is derived from the years actually analysed, so the printed
    # headers can never disagree with the filter below.
    year_label = ", ".join(str(year) for year in years)

    print("\n" + "="*50)
    print(f"🌟 ANALYSIS 2: MOST POPULAR {year_label} ACTORS")
    print("="*50)

    if len(principals_df) == 0 or len(names_df) == 0:
        print("⚠️ Insufficient actor data for analysis")
        return pd.DataFrame()

    # Get target year movies
    target_movies = movies_df[movies_df['startYear'].isin(years)]

    # Merge actor data with movie performance (inner join: credits on
    # movies outside the target years are dropped)
    actor_movies = principals_df.merge(
        target_movies[['tconst', 'primaryTitle', 'averageRating', 'numVotes']],
        on='tconst'
    )

    if actor_movies.empty:
        # Nothing matched the requested years; bail out before aggregating.
        print("⚠️ Insufficient actor data for analysis")
        return pd.DataFrame()

    # Calculate actor metrics. Named aggregation keeps each output column
    # next to its definition instead of relying on positional renaming.
    actor_stats = (
        actor_movies.groupby('nconst')
        .agg(
            movie_count=('tconst', 'count'),
            avg_rating=('averageRating', 'mean'),
            total_votes=('numVotes', 'sum'),
            avg_votes_per_movie=('numVotes', 'mean'),
            movies=('primaryTitle', lambda titles: list(titles)),
        )
        .round(2)
        .reset_index()
    )

    # Merge with names (left join keeps people without a names_df entry)
    actor_stats = actor_stats.merge(names_df, on='nconst', how='left')

    # Calculate simple popularity score
    actor_stats['popularity_score'] = (
        0.4 * (actor_stats['total_votes'] / actor_stats['total_votes'].max()) +
        0.3 * (actor_stats['avg_rating'] / 10) +
        0.3 * (actor_stats['movie_count'] / actor_stats['movie_count'].max())
    ) * 100

    # Filter for actors with multiple movies
    popular_actors = actor_stats[actor_stats['movie_count'] >= 2].copy()
    top_actors = popular_actors.nlargest(10, 'popularity_score')

    print(f"🏅 TOP 10 MOST POPULAR ACTORS ({year_label}):")
    print("-" * 50)

    for i, (_, actor) in enumerate(top_actors.iterrows(), 1):
        print(f"{i:2d}. {actor['primaryName']}")
        print(f"    Score: {actor['popularity_score']:.1f} | Movies: {actor['movie_count']}")
        print(f"    Avg Rating: {actor['avg_rating']:.1f} | Total Votes: {actor['total_votes']:,}")
        print()

    return top_actors

In [28]:
# Rank the people credited on 2020 titles; the result holds at most ten rows.
top_actors = analyze_popular_actors_2020(movies_ratings_df, principals_df, names_df)


🌟 ANALYSIS 2: MOST POPULAR 2020 ACTORS
🏅 TOP 10 MOST POPULAR ACTORS (2019-2021):
--------------------------------------------------
 1. Rich Ceraulo Ko
    Score: 67.8 | Movies: 3
    Avg Rating: 6.8 | Total Votes: 1,291,491

 2. Okieriete Onaodowan
    Score: 49.1 | Movies: 3
    Avg Rating: 7.9 | Total Votes: 575,332

 3. Ivo Uukkivi
    Score: 46.8 | Movies: 3
    Avg Rating: 6.5 | Total Votes: 643,158

 4. Phylicia Rashad
    Score: 44.6 | Movies: 4
    Avg Rating: 6.6 | Total Votes: 474,506

 5. Alice Braga
    Score: 44.1 | Movies: 3
    Avg Rating: 6.9 | Total Votes: 516,189

 6. Aarti Kulkarni
    Score: 42.9 | Movies: 2
    Avg Rating: 6.8 | Total Votes: 565,364

 7. Andrew Howard
    Score: 42.5 | Movies: 2
    Avg Rating: 5.8 | Total Votes: 642,638

 8. Sergey A.
    Score: 42.4 | Movies: 10
    Avg Rating: 5.7 | Total Votes: 9,908

 9. Ali Wong
    Score: 42.4 | Movies: 3
    Avg Rating: 6.8 | Total Votes: 465,225

10. Eric Roberts
    Score: 42.4 | Movies: 12
    Avg Rati