In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Kaggle Hub for MovieLens data
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [2]:
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
data_path="C:/Users/russe/OneDrive/Desktop/Portfolio/Next AAA Title"



In [36]:
movies_ratings_df = pd.read_csv(
        f'{data_path}/Processed/movies_ratings.csv',
        sep=','
    )

movies_df = pd.read_csv(
        f'{data_path}/Raw/movies.csv',
        sep=','
    )

ratings_df = pd.read_csv(
        f'{data_path}/Raw/ratings.csv',
        sep=','
    )

names_df = pd.read_csv(
        f'{data_path}/Raw/actors.csv',
        sep=','
    )

principals_df = pd.read_csv(
        f'{data_path}/Raw/principals.csv',
        sep=','
    )

movies_metrics_df = pd.read_csv(
        f'{data_path}/Processed/movies_metrics.csv',
        sep=','
    )



In [47]:
movies_ratings_df.info()
movies_metrics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83583 entries, 0 to 83582
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      83583 non-null  int64  
 1   tconst          83583 non-null  object 
 2   titleType       83583 non-null  object 
 3   primaryTitle    83583 non-null  object 
 4   startYear       83583 non-null  float64
 5   runtimeMinutes  81019 non-null  float64
 6   genres          83222 non-null  object 
 7   averageRating   83583 non-null  float64
 8   numVotes        83583 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 5.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152447 entries, 0 to 152446
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             152447 non-null  int64  
 1   tconst                 152447 non-null  object 
 2   titleTy

In [63]:
def analyze_2020_highest_rated_movies(movies_df):
    """
    BUSINESS QUESTION 1: What is the highest rated movie in 2020? 
    How do we balance ratings and number of votes?
    
    BUSINESS CONTEXT: Studios need to understand what constitutes "quality"
    - Simple ratings can be misleading (few votes)
    - Need credible metrics for investment decisions
    - Balance critical acclaim with audience validation
    """
    
    print("\n" + "="*70)
    print("🏆 BUSINESS ANALYSIS 1: 2020 HIGHEST RATED MOVIES")
    print("="*70)
    
    # Filter 2020 movies
    movies_2020 = movies_df[movies_df['startYear'] == 2020].copy()
    
    
    # RATING METHODOLOGY COMPARISON
    print(f"\n🎯 Rating Methodology Analysis:")
    
    # Method 1: Simple Average (baseline)
    simple_top = movies_2020.nlargest(5, 'averageRating')
    
    # Method 2: Credibility Filter (100+ votes)
    credible_movies = movies_2020[movies_2020['numVotes'] >= 100].copy()
    credible_top = credible_movies.nlargest(5, 'averageRating') if len(credible_movies) > 0 else pd.DataFrame()
    
    # Method 3: IMDb Weighted Rating Formula
    # WR = (v/(v+m)) * R + (m/(v+m)) * C
    # v = votes, m = minimum threshold, R = rating, C = average rating
    
    m = 1000  # minimum votes threshold (industry standard)
    C = movies_2020['averageRating'].mean()
    
    movies_2020['weighted_rating'] = (
        (movies_2020['numVotes'] / (movies_2020['numVotes'] + m)) * movies_2020['averageRating'] +
        (m / (movies_2020['numVotes'] + m)) * C
    )
    
    weighted_top = movies_2020.nlargest(5, 'weighted_rating')
    
    
    # RESULTS PRESENTATION
    print(f"\n📋 METHODOLOGY COMPARISON RESULTS:")
    
    print(f"\n1️⃣ Simple Average Rating (Top 5):")
    for i, (_, movie) in enumerate(simple_top.iterrows(), 1):
        print(f"   {i}. {movie['primaryTitle']}: {movie['averageRating']:.1f}/10 ({movie['numVotes']:,} votes)")
    
    if len(credible_top) > 0:
        print(f"\n2️⃣ Credible Movies Only (100+ votes):")
        for i, (_, movie) in enumerate(credible_top.iterrows(), 1):
            print(f"   {i}. {movie['primaryTitle']}: {movie['averageRating']:.1f}/10 ({movie['numVotes']:,} votes)")
    
    print(f"\n3️⃣ IMDb Weighted Rating (Recommended):")
    for i, (_, movie) in enumerate(weighted_top.iterrows(), 1):
        print(f"   {i}. {movie['primaryTitle']}: {movie['weighted_rating']:.2f} ({movie['numVotes']:,} votes)")
    
    
    return movies_2020, weighted_top

# Execute 2020 analysis
movies_2020_df, top_2020_movies = analyze_2020_highest_rated_movies(movies_metrics_df)



🏆 BUSINESS ANALYSIS 1: 2020 HIGHEST RATED MOVIES

🎯 Rating Methodology Analysis:

📋 METHODOLOGY COMPARISON RESULTS:

1️⃣ Simple Average Rating (Top 5):
   1. A Better Life: 9.8/10 (66 votes)
   2. Sarasate: The Spanish Spirit: 9.8/10 (21 votes)
   3. The Choice of Staying: 9.8/10 (60 votes)
   4. Babymetal: Legend - Metal Galaxy: 9.8/10 (10 votes)
   5. I Chose Life: Stories of Suicide and Survival: 9.8/10 (13 votes)

2️⃣ Credible Movies Only (100+ votes):
   1. Joshua: 9.4/10 (490 votes)
   2. Impionçable: 9.3/10 (102 votes)
   3. A Worm in the Heart: 9.1/10 (114 votes)
   4. Maduve Madri Sari Hogtane: 9.1/10 (139 votes)
   5. Moscow. Dormitory area: 9.0/10 (144 votes)

3️⃣ IMDb Weighted Rating (Recommended):
   1. Soorarai Pottru: 8.58 (129,501 votes)
   2. Demon Slayer: Kimetsu no Yaiba - Mt. Natagumo Arc: 8.56 (17,831 votes)
   3. Dil Bechara: 8.28 (136,657 votes)
   4. Hamilton: 8.28 (132,450 votes)
   5. The Father: 8.19 (219,125 votes)


In [58]:
movies_2020_df = movies_metrics_df[movies_metrics_df["startYear"] == 2020.0]
movies_2020_df

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,weighted_rating,high_credibility,recent_movie,genre_count,runtimeMinutes_filled,runtime_category
0,0,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,2020.0,70.0,Drama,6.4,249,6.295510,False,True,1.0,70.0,Short
3,3,tt0093119,movie,Grizzly II: Revenge,2020.0,74.0,"Horror,Music,Thriller",2.7,2029,3.878439,True,True,3.0,74.0,Short
121,121,tt0438755,movie,About Face: The Story of the Jewish Refugee So...,2020.0,94.0,Documentary,8.2,53,6.366659,False,True,1.0,94.0,Standard
135,135,tt0446792,movie,Surviving in L.A.,2020.0,,"Comedy,Drama,Romance",7.4,33,6.305607,False,True,3.0,120.0,
303,303,tt0805647,movie,The Witches,2020.0,106.0,"Adventure,Comedy,Family",5.4,49733,5.417139,True,True,3.0,106.0,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152420,152420,tt9911196,movie,The Marriage Escape,2020.0,103.0,"Comedy,Drama",7.4,3494,7.148441,True,True,2.0,103.0,Standard
152428,152428,tt9914192,movie,No Gogó do Paulinho,2020.0,98.0,Comedy,5.3,328,6.030039,False,True,1.0,98.0,Standard
152440,152440,tt9916190,movie,Safeguard,2020.0,95.0,"Action,Adventure,Thriller",3.6,265,5.710270,False,True,3.0,95.0,Standard
152441,152441,tt9916270,movie,Il talento del calabrone,2020.0,84.0,Thriller,5.8,1518,5.986454,True,True,1.0,84.0,Short


In [61]:
def analyze_popular_actors_2020(movies_2020_df, principals_df, names_df):
    """
    Business Question 2: Most popular 2020 actors
    Enhanced to use weighted ratings and movie count focus
    
    Parameters:
    movies_2020_df: DataFrame with 2020 movies INCLUDING 'weighted_rating' column
    principals_df: Actor-movie connections  
    names_df: Actor names database
    """
    
    print("\n" + "="*60)
    print("🌟 ANALYSIS 2: MOST POPULAR 2020 ACTORS (Enhanced)")
    print("="*60)
    

    print(f"📊 Analyzing actors from {len(movies_2020_df):,} movies in 2020")
    
    # Filter principals for actors/actresses only and target movies
    actor_roles = principals_df[
        (principals_df['tconst'].isin(movies_2020_df['tconst'])) &
        (principals_df['category'].isin(['actor', 'actress']))
    ].copy()
    
    print(f"👥 Found {len(actor_roles):,} actor-movie connections")
    
    if len(actor_roles) == 0:
        print("⚠️ No actor connections found")
        return pd.DataFrame()
    
    # Merge actor data with 2020 movie performance (including weighted_rating)
    actor_movies = actor_roles.merge(
        movies_2020_df[['tconst', 'primaryTitle', 'averageRating', 'numVotes', 'weighted_rating']], 
        on='tconst',
        how='inner'
    )
    
    print(f"🎬 Actor-movie dataset: {len(actor_movies):,} records")
    
    # Calculate enhanced actor metrics
    print("\n🧮 Calculating enhanced actor popularity metrics...")
    
    actor_stats = actor_movies.groupby('nconst').agg({
        'tconst': 'count',                              # Number of movies (KEY METRIC)
        'weighted_rating': ['mean', 'max'],             # Weighted rating performance (KEY METRIC)
        'averageRating': 'mean',                        # Simple rating (for reference)
        'numVotes': ['sum', 'mean'],                    # Vote metrics
        'primaryTitle': lambda x: list(x)               # Movie list
    }).round(2)
    
    # Flatten column names
    actor_stats.columns = [
        'movie_count',                    # Number of 2020 movies
        'avg_weighted_rating',            # Average weighted rating across their movies  
        'max_weighted_rating',            # Best weighted rating from their movies
        'avg_simple_rating',              # Average simple rating (for comparison)
        'total_votes',                    # Total votes across all their movies
        'avg_votes_per_movie',            # Average votes per movie
        'movies'                          # List of movie titles
    ]
    
    actor_stats = actor_stats.reset_index()
    
    # Merge with actor names
    actor_stats = actor_stats.merge(
        names_df[['nconst', 'primaryName']], 
        on='nconst', 
        how='left'
    )
    
    print(f"✅ Processed {len(actor_stats):,} unique actors")
    
    # ENHANCED POPULARITY SCORING SYSTEM
    print("\n🏆 Calculating Enhanced Popularity Scores...")
    
    print("""
    📊 NEW SCORING METHODOLOGY:
    • Movie Count (50%): More 2020 movies = higher visibility/demand
    • Weighted Rating Performance (35%): Quality of their 2020 movies  
    • Vote Volume (15%): Audience reach and recognition
    """)
    
    # Normalize metrics for scoring (0-100 scale each)
    max_movie_count = actor_stats['movie_count'].max()
    max_total_votes = actor_stats['total_votes'].max()
    
    # Component 1: Movie Count Score (50% weight)
    # More movies in 2020 = higher demand/popularity
    actor_stats['movie_count_score'] = (
        actor_stats['movie_count'] / max_movie_count * 100
    )
    
    # Component 2: Weighted Rating Performance Score (35% weight)  
    # Higher weighted ratings = better quality performance
    actor_stats['rating_performance_score'] = (
        actor_stats['avg_weighted_rating'] / 10 * 100
    )
    
    # Component 3: Vote Volume Score (15% weight)
    # Higher vote volumes = broader audience reach
    actor_stats['vote_volume_score'] = (
        actor_stats['total_votes'] / max_total_votes * 100
    )
    
    # FINAL POPULARITY SCORE
    actor_stats['popularity_score'] = (
        0.50 * actor_stats['movie_count_score'] +           # 50% - Productivity/Demand
        0.15 * actor_stats['rating_performance_score'] +    # 35% - Quality
        0.35 * actor_stats['vote_volume_score']             # 15% - Reach
    ).round(1)
    
    # Filter for actors with meaningful presence (at least 1 movie, reasonable data)
    credible_actors = actor_stats[
        (actor_stats['movie_count'] >= 1) &
        (actor_stats['avg_weighted_rating'] > 0) &
        (actor_stats['primaryName'].notna())
    ].copy()
    
    # Get top performers
    top_actors = credible_actors.nlargest(15, 'popularity_score')
    
    print(f"\n🏅 TOP 15 MOST POPULAR 2020 ACTORS:")
    print("=" * 70)
    print(f"{'Rank':<4} {'Actor':<25} {'Score':<7} {'Movies':<7} {'Avg W.Rating':<12} {'Total Votes':<12}")
    print("-" * 70)
    
    for i, (_, actor) in enumerate(top_actors.iterrows(), 1):
        name = actor['primaryName'][:24] if pd.notna(actor['primaryName']) else 'Unknown'
        score = actor['popularity_score']
        movies = actor['movie_count']
        rating = actor['avg_weighted_rating']
        votes = actor['total_votes']
        
        print(f"{i:<4} {name:<25} {score:<7.1f} {movies:<7} {rating:<12.2f} {votes:<12,.0f}")
    
    # Detailed breakdown for top 5
    print(f"\n📊 DETAILED BREAKDOWN - TOP 5 ACTORS:")
    print("=" * 50)
    
    for i, (_, actor) in enumerate(top_actors.head(5).iterrows(), 1):
        print(f"\n{i}. {actor['primaryName']} (Popularity Score: {actor['popularity_score']:.1f})")
        print(f"   🎬 Movies in 2020: {actor['movie_count']}")
        print(f"   ⭐ Average Weighted Rating: {actor['avg_weighted_rating']:.2f}")
        print(f"   📊 Best Movie Rating: {actor['max_weighted_rating']:.2f}")
        print(f"   👥 Total Audience Reach: {actor['total_votes']:,.0f} votes")
        print(f"   🎭 Films: {', '.join(actor['movies'][:3])}{'...' if len(actor['movies']) > 3 else ''}")
        
        # Component breakdown
        movie_score = 0.50 * actor['movie_count_score']
        rating_score = 0.15 * actor['rating_performance_score'] 
        vote_score = 0.35 * actor['vote_volume_score']
        
        print(f"   📈 Score Breakdown:")
        print(f"      • Movie Count (50%): {movie_score:.1f}")
        print(f"      • Rating Quality (35%): {rating_score:.1f}")
        print(f"      • Audience Reach (15%): {vote_score:.1f}")
    
    # Business insights
    print(f"\n💡 BUSINESS INSIGHTS:")
    
    # Most productive actors
    most_productive = credible_actors.nlargest(3, 'movie_count')
    print(f"📈 Most Productive (most 2020 movies):")
    for _, actor in most_productive.iterrows():
        print(f"   • {actor['primaryName']}: {actor['movie_count']} movies")
    
    # Highest quality performers  
    highest_quality = credible_actors.nlargest(3, 'avg_weighted_rating')
    print(f"\n⭐ Highest Quality (best weighted ratings):")
    for _, actor in highest_quality.iterrows():
        print(f"   • {actor['primaryName']}: {actor['avg_weighted_rating']:.2f} avg weighted rating")
    
    # Biggest audience reach
    biggest_reach = credible_actors.nlargest(3, 'total_votes')
    print(f"\n👥 Biggest Audience Reach (most total votes):")
    for _, actor in biggest_reach.iterrows():
        print(f"   • {actor['primaryName']}: {actor['total_votes']:,.0f} total votes")
    
    # Summary statistics
    print(f"\n📊 SUMMARY STATISTICS:")
    print(f"   • Total actors analyzed: {len(credible_actors):,}")
    print(f"   • Average movies per actor: {credible_actors['movie_count'].mean():.1f}")
    print(f"   • Average weighted rating: {credible_actors['avg_weighted_rating'].mean():.2f}")
    print(f"   • Actors with 2+ movies: {len(credible_actors[credible_actors['movie_count'] >= 2]):,}")
    print(f"   • Actors with 8.0+ rating: {len(credible_actors[credible_actors['avg_weighted_rating'] >= 8.0]):,}")
    
    return top_actors, actor_stats

In [None]:
# Change first parameter to movies_2020_df to get 2020 data only movies_metrics to get overall data
top_actors, actors_metrics = analyze_popular_actors_2020(movies_2020_df, principals_df, names_df)


🌟 ANALYSIS 2: MOST POPULAR 2020 ACTORS (Enhanced)
📊 Analyzing actors from 8,982 movies in 2020
👥 Found 57,586 actor-movie connections
🎬 Actor-movie dataset: 57,586 records

🧮 Calculating enhanced actor popularity metrics...
✅ Processed 49,302 unique actors

🏆 Calculating Enhanced Popularity Scores...

    📊 NEW SCORING METHODOLOGY:
    • Movie Count (50%): More 2020 movies = higher visibility/demand
    • Weighted Rating Performance (35%): Quality of their 2020 movies  
    • Vote Volume (15%): Audience reach and recognition
    

🏅 TOP 15 MOST POPULAR 2020 ACTORS:
Rank Actor                     Score   Movies  Avg W.Rating Total Votes 
----------------------------------------------------------------------
1    Simon Hill                59.3    35      6.21         848         
2    Rich Ceraulo Ko           49.5    3       6.79         1,291,491   
3    Kj Schrock                40.3    22      5.82         5,081       
4    Kelsey Painter            37.9    20      6.23         454 

In [53]:
actors_metrics.to_csv(f'{data_path}/Processed/actors_metrics.csv')

In [46]:
def analyze_user_preference_trends(movies_df):
    """
    BUSINESS QUESTION 3: What are the trends in user-movie preferences over the years?
    
    BUSINESS CONTEXT: Market evolution drives content strategy
    - Identify emerging vs declining genres
    - Understand rating behavior changes
    - Guide content development pipeline
    """
    
    print("\n" + "="*70)
    print("📈 BUSINESS ANALYSIS 3: USER PREFERENCE TRENDS")
    print("="*70)
    
    # Load MovieLens data for user behavior insights
    print("🔄 Loading MovieLens data for user behavior analysis...")
    
    
    # TREND ANALYSIS 1: Genre Evolution (IMDb Data)
    print(f"\n🎭 Genre Trend Analysis (2010-2023)...")
    
    recent_movies = movies_df[movies_df['startYear'] >= 2010].copy()
    
    # Extract and analyze genre trends
    genre_yearly_data = []
    for _, movie in recent_movies.iterrows():
        if pd.notna(movie['genres']):
            year = movie['startYear']
            for genre in movie['genres'].split(','):
                genre_yearly_data.append({
                    'year': year,
                    'genre': genre.strip(),
                    'rating': movie['averageRating'],
                    'votes': movie['numVotes']
                })
    
    genre_trends_df = pd.DataFrame(genre_yearly_data)
    
    # Calculate genre metrics by year
    genre_yearly_stats = genre_trends_df.groupby(['year', 'genre']).agg({
        'rating': 'mean',
        'votes': ['sum', 'count']
    }).round(2)
    
    genre_yearly_stats.columns = ['avg_rating', 'total_votes', 'movie_count']
    genre_yearly_stats = genre_yearly_stats.reset_index()
    
    # Focus on major genres
    major_genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Romance', 'Thriller', 'Adventure', 'Sci-Fi', 'Animation']
    major_genre_trends = genre_yearly_stats[genre_yearly_stats['genre'].isin(major_genres)]
    
    # Calculate growth trends (2015-2023 vs 2010-2014)
    early_period = major_genre_trends[major_genre_trends['year'].between(2010, 2016)]
    recent_period = major_genre_trends[major_genre_trends['year'].between(2017, 2023)]
    
    early_avg = early_period.groupby('genre')['movie_count'].mean()
    recent_avg = recent_period.groupby('genre')['movie_count'].mean()
    
    genre_growth = ((recent_avg - early_avg) / early_avg * 100).round(1)
    genre_growth = genre_growth.sort_values(ascending=False)
    
    print(f"\n📊 Genre Growth Trends (2017-2023 vs 2010-2016):")
    print("Growing Genres:")
    for genre, growth in genre_growth.head(3).items():
        print(f"  🟢 {genre}: +{growth:.1f}% growth")
    
    print("Declining Genres:")
    for genre, growth in genre_growth.tail(3).items():
        print(f"  🔴 {genre}: {growth:.1f}% change")
    
    # KEY INSIGHTS SUMMARY
    print(f"\n💡 STRATEGIC INSIGHTS FOR AAA DEVELOPMENT:")
    
    # Top trending genres for investment
    trending_genres = genre_growth.head(3).index.tolist()
    print(f"1️⃣ INVEST IN: {', '.join(trending_genres)} - showing strongest growth")
    
    # Quality vs quantity balance
    recent_quality = recent_period.groupby('genre')['avg_rating'].mean().sort_values(ascending=False)
    print(f"2️⃣ QUALITY LEADERS: {', '.join(recent_quality.head(3).index)} - highest rated genres")
    
    # Market saturation analysis
    high_volume_genres = recent_period.groupby('genre')['movie_count'].sum().sort_values(ascending=False)
    print(f"3️⃣ SATURATED MARKETS: {', '.join(high_volume_genres.head(3).index)} - high competition")
    
    return major_genre_trends, genre_growth
# Execute trend analysis
genre_trends, genre_growth_analysis = analyze_user_preference_trends(movies_metrics_df)



📈 BUSINESS ANALYSIS 3: USER PREFERENCE TRENDS
🔄 Loading MovieLens data for user behavior analysis...

🎭 Genre Trend Analysis (2010-2023)...

📊 Genre Growth Trends (2017-2023 vs 2010-2016):
Growing Genres:
  🟢 Animation: +29.3% growth
  🟢 Thriller: +29.2% growth
  🟢 Horror: +25.5% growth
Declining Genres:
  🔴 Comedy: 3.7% change
  🔴 Adventure: 1.1% change
  🔴 Romance: -2.4% change

💡 STRATEGIC INSIGHTS FOR AAA DEVELOPMENT:
1️⃣ INVEST IN: Animation, Thriller, Horror - showing strongest growth
2️⃣ QUALITY LEADERS: Animation, Drama, Romance - highest rated genres
3️⃣ SATURATED MARKETS: Drama, Comedy, Thriller - high competition
