In [None]:
import pandas as pd
from ydata_profiling import ProfileReport

In [None]:
# Read the IMDB Top 1000 CSV (path is relative to the notebook's working
# directory) and preview the first rows as the cell output.
csv_path = "imdb_top_1000.csv"
df = pd.read_csv(csv_path)
df.head()

## Section 1 - Demo

In [None]:
# Build a profiling report over the full frame with the explorative option
# switched on, then render it inline through to_notebook_iframe().
report_title = "IMDB Top 1000 – Profiling Report"
profile = ProfileReport(
    df,
    title=report_title,
    explorative=True,
)
profile.to_notebook_iframe()

## Quick Demo
What does this report instantly tell us about:

- Missing data?
- Categorical columns with few unique values?
- Distribution skews and standout ratings?

Take 1–2 minutes to observe before we move on.

## Section 2 - Independent Exploration

#### Challenge 1: Missing Values
Find the five columns with the most missing values and calculate the percentage of missing values in each.

In [None]:
# Count nulls per column once, ordered from most to least missing. A stable
# sort makes the order of tied columns (e.g. all the columns with zero
# missing values) deterministic.
missing_summary = df.isnull().sum().sort_values(ascending=False, kind="stable")

# Derive the percentage from the already-sorted counts instead of scanning
# and sorting a second time. Both Series then share exactly the same index
# order; if two independent sorts ordered ties differently, the DataFrame
# constructor would align on a re-sorted union index and head(5) would no
# longer be the five columns with the most missing values.
missing_percent = missing_summary / len(df) * 100

# Side-by-side table of absolute and relative missingness, top five rows only.
pd.DataFrame({"Missing Count": missing_summary, "% Missing": missing_percent}).head(5)

#### Challenge 2: Categorical Cardinality
Identify columns with 10 or fewer unique values.

In [None]:
# Count the distinct values of every column in one pass, then keep the
# labels of the columns whose count does not exceed the threshold.
max_unique = 10
unique_counts = df.nunique()
low_card_cols = unique_counts[unique_counts <= max_unique].index.tolist()

# Show the distinct-value count for just the selected columns.
df[low_card_cols].nunique()

#### Challenge 3: Outliers
Pick a numeric column (e.g. `IMDB_Rating` or `Gross`) and find the rows whose values lie more than 3 standard deviations from the mean.

In [None]:
col = "IMDB_Rating"  # Try changing this to 'Gross' or others

# Work on a numeric view of the chosen column. .mean()/.std() raise on a
# text column, so text columns are converted first: thousands separators are
# stripped and anything that still cannot be parsed becomes NaN.
# NOTE(review): presumably 'Gross' is stored as comma-formatted text in this
# CSV — confirm with df.dtypes.
values = df[col]
if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
    values = pd.to_numeric(
        values.astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

mean = values.mean()
std = values.std()

# Keep the rows lying more than three standard deviations from the mean.
# NaN compares False on both sides, so rows with missing or unparseable
# values are never flagged as outliers.
outliers = df[(values < mean - 3 * std) | (values > mean + 3 * std)]
outliers