In [23]:
import os
import pandas as pd
# import visualize
# Import our local modules
# import data_preprocesso
from prep import data_preprocess
from prep.data_preprocess import load_data, preprocess_data, CSV_PATH
import analysis.analysis
import visual.visual

def main():
    # 1. Load Data
    # Ensure 'movies_metadata.csv' is inside a 'data' folder in this directory
    if not os.path.exists(CSV_PATH):
        print(f"CRITICAL ERROR: Could not find file at {CSV_PATH}")
        print("Please create a folder named 'data' and put 'movies_metadata.csv' inside it.")
        return

    df_raw = load_data(CSV_PATH)
    if df_raw is None:
        return

    # 2. Clean Data
    df = preprocess_data(df_raw)
    
    # 3. Analyze Data
    genre_metrics = analysis.analysis.get_genre_metrics(df)
    yearly_trends = analysis.analysis.get_yearly_trends(df)
    correlations = analysis.analysis.get_correlation_matrix(df)
    
    # Output some insights to console
    print("\n--- Top 5 Most Profitable Genres (ROI) ---")
    print(genre_metrics[['primary_genre', 'median_roi', 'count']].head(5))
    
    print("\n--- Correlation Matrix (Budget vs Revenue) ---")
    print(correlations.loc[['budget'], ['revenue']])

    # 4. Visualize Data
    visual.visual.plot_genre_roi(genre_metrics)
    visual.visual.plot_budget_vs_revenue(df)
    visual.visual.plot_yearly_trends(yearly_trends)
    
    print("\nAnalysis complete! Check the 'results' folder for plots.")

if __name__ == "__main__":
    main()

Loading data from: data/movies_metadata.csv...
Data loaded successfully: 45466 rows.
Starting data preprocessing...
Parsing JSON columns...
Preprocessing complete. 5369 entries remain after cleaning.
Calculating genre success metrics...

--- Top 5 Most Profitable Genres (ROI) ---
      primary_genre  median_roi  count
11           Horror    2.027030    325
7            Family    1.829940     55
15  Science Fiction    1.579513    104
2         Animation    1.452970    146
1         Adventure    1.327112    416

--- Correlation Matrix (Budget vs Revenue) ---
         revenue
budget  0.730151
Saved plot: results/genre_roi.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=genre_df.head(10), x='primary_genre', y='median_roi', palette='viridis')


Saved plot: results/budget_vs_revenue.png
Saved plot: results/yearly_budget_trend.png

Analysis complete! Check the 'results' folder for plots.


In [32]:
"""
Module: main.py
Description: Entry point for the CineMetrics analysis pipeline.
"""

import os
import sys
import pandas as pd

# --- Import Handling ---
# Ensures that local modules can be imported even if running from a different context
try:
    if os.getcwd() not in sys.path:
        sys.path.append(os.getcwd())
    
    # from data_cleaner import load_data, preprocess_data, CSV_PATH
    from prep.data_preprocess import load_data, preprocess_data, CSV_PATH
    import analysis
    import visualize 
    
    print("[SYSTEM] Modules loaded successfully.\n")

except ImportError as e:
    print(f"\n[CRITICAL ERROR] Module import failed: {e}")
    print("Ensure 'data_cleaner.py', 'analysis.py', and 'visualize.py' are in the same directory.")
    sys.exit(1)

def main():
    print("==========================================")
    print("   CineMetrics: Blockbuster Analytics     ")
    print("==========================================\n")

    # 1. Data Ingestion
    if not os.path.exists(CSV_PATH):
        print(f"[ERROR] Dataset not found at: {CSV_PATH}")
        print("Please create a 'data' folder and place 'movies_metadata.csv' inside it.")
        return

    # df_raw = load_data()
    df_raw = load_data(path=CSV_PATH)
    if df_raw is None:
        return

    # 2. Data Preprocessing
    df = preprocess_data(df_raw)
    if df.empty:
        print("[ERROR] Dataset is empty after preprocessing. Check input file integrity.")
        return
    
    # 3. Statistical Analysis
    print("\n[INFO] Performing statistical analysis...")
    
    # General Metrics
    genre_metrics = analysis.analysis.get_genre_metrics(df)
    yearly_trends = analysis.analysis.get_yearly_trends(df)
    correlations = analysis.analysis.get_correlation_matrix(df)
    
    # Deep Mining Metrics
    seasonal_stats = analysis.analysis.get_seasonal_stats(df)
    top_studios = analysis.analysis.get_top_studios(df)
    
    # --- Console Insights ---
    print("\n--- ðŸ’° Top 5 Most Profitable Genres (Median ROI) ---")
    print(genre_metrics[['primary_genre', 'median_roi', 'count']].head(5).to_string(index=False))
    
    print("\n--- ðŸ“… Best Month for Revenue ---")
    best_month = seasonal_stats.sort_values(by='median_revenue', ascending=False).iloc[0]
    print(f"Month: {best_month['month']}, Median Revenue: ${best_month['median_revenue']:,.2f}")

    print("\n--- ðŸŽ¬ Top Studio by Revenue ---")
    best_studio = top_studios.iloc[0]
    print(f"Studio: {best_studio['lead_studio']}, Median Revenue: ${best_studio['median_revenue']:,.2f}")

    print("\n--- ðŸ”— Key Correlations ---")
    print(f"Budget vs Revenue: {correlations.loc['budget', 'revenue']:.4f}")
    print(f"Runtime vs Revenue: {correlations.loc['runtime', 'revenue']:.4f}")

    # 4. Visualization
    print("\n[INFO] Generating visualizations...")
    try:
        visualize.plot_genre_roi(genre_metrics)
        visualize.plot_budget_vs_revenue(df)
        visualize.plot_yearly_trends(yearly_trends)
        visualize.plot_seasonal_revenue(seasonal_stats)
        visualize.plot_top_studios(top_studios)
        print("\n[SUCCESS] Analysis complete! Results saved to the 'results' directory.")
    except Exception as e:
        print(f"[ERROR] Visualization failed: {e}")

if __name__ == "__main__":
    main()

[SYSTEM] Modules loaded successfully.

   CineMetrics: Blockbuster Analytics     

Loading data from: data/movies_metadata.csv...
Data loaded successfully: 45466 rows.
Starting data preprocessing...
Parsing JSON columns...
Preprocessing complete. 5369 entries remain after cleaning.

[INFO] Performing statistical analysis...
Calculating genre success metrics...


AttributeError: module 'analysis.analysis' has no attribute 'get_seasonal_stats'