<a href="https://www.kaggle.com/code/violetmakena/what-makes-a-youtube-video-go-viral?scriptVersionId=249035003" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# The DATA

In [None]:
# Some imports to get us started
import pandas as pd #working and manupulating datasets
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

%matplotlib inline

In [None]:
import seaborn as sns
#visualisations
from scipy.stats import norm
# normal (Gaussian) distribution object
from sklearn.preprocessing import StandardScaler
#standardize data
from scipy import stats
#statistical insights
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the YouTube dataset CSV from the Kaggle input directory into a DataFrame.
yt_data = pd.read_csv('/kaggle/input/youtube-data-for-analytics-600-rows/youtube_data.csv')

In [None]:
# Preview the first three rows of the dataset.
yt_data.head(n=3)

In [None]:
# Descriptive statistics of the dataset, as produced by DataFrame.describe().
yt_data.describe()

In [None]:
# Print the column names, dtypes and non-null counts of the dataset.
yt_data.info()

In [None]:
# Number of distinct values in every column of yt_data
# (a missing value counts as one distinct value).
for column_name in yt_data.columns:
    distinct_values = yt_data[column_name].unique()
    print(column_name, len(distinct_values))

In [None]:
# Smallest and largest values of the view and like counts.
views = yt_data['view_count']
likes = yt_data['like_count']

view_count_min, view_count_max = views.min(), views.max()
like_count_min, like_count_max = likes.min(), likes.max()

# Report both ranges
print(f"View Count Range: {view_count_min} to {view_count_max}")
print(f"Like Count Range: {like_count_min} to {like_count_max}")

## View and like count 

View Count Range: 312.0 to 343758991.0  Like Count Range: 0.0 to 4421091.0

# WHICH?

# Which Category has the highest view count and likes 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.font_manager as fm

# Set Thai-compatible font globally (update to a font installed on your system)
plt.rcParams['font.family'] = 'Noto Sans Thai'  # or 'TH Sarabun', 'Tahoma', etc.


# Columns that each get their own count plot
columns_to_countplot = ['category_id', 'duration']

# Set up the plot grid
num_plots = len(columns_to_countplot)
num_cols = 2  # Number of columns in the plot grid
# Ceiling division, so there are enough rows for every plot whatever
# num_cols is set to (the previous "+ 1" form was only correct for 2 columns).
num_rows = (num_plots + num_cols - 1) // num_cols

In [None]:
# One figure holding a count plot for each column of interest.
plt.figure(figsize=(15, 5 * num_rows))

for position, column in enumerate(columns_to_countplot, start=1):
    ax = plt.subplot(num_rows, num_cols, position)

    # Keep only the 20 most frequent values so the x-axis stays readable.
    frequent_values = yt_data[column].value_counts().nlargest(20).index
    subset = yt_data.loc[yt_data[column].isin(frequent_values)]

    sns.countplot(data=subset, x=column, ax=ax)
    ax.set(title=f'Count Plot for {column}', xlabel=column, ylabel='Count')
    ax.tick_params(axis='x', rotation=45)  # slanted labels are easier to read

    # Write each bar's count just above the bar.
    for bar in ax.patches:
        count = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2.
        ax.annotate(f'{count:.0f}', (label_x, count), ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Total views and likes per category, largest view totals first.
category_stats = (
    yt_data.groupby('category_id')[['view_count', 'like_count']]
    .sum()
    .sort_values('view_count', ascending=False)
    .reset_index()
)

# One colour per metric, shared by its bars AND its y-axis, so each axis can
# be matched to its bars (the likes axis used to be labelled in lightblue
# while its bars were red).
views_color = 'pink'
likes_color = 'red'

# Set up the figure and axes
fig, ax1 = plt.subplots(figsize=(12, 6))

# Bar width and positions
bar_width = 0.4
x = range(len(category_stats))

# Views on the left axis
bars1 = ax1.bar(x, category_stats['view_count'], width=bar_width, label='Views', color=views_color)
ax1.set_ylabel('Total Views', color=views_color)
ax1.tick_params(axis='y', labelcolor=views_color)

# Likes on a twin right-hand axis, shifted by one bar width
ax2 = ax1.twinx()
bars2 = ax2.bar([i + bar_width for i in x], category_stats['like_count'], width=bar_width, label='Likes', color=likes_color)
ax2.set_ylabel('Total Likes', color=likes_color)
ax2.tick_params(axis='y', labelcolor=likes_color)

# X-axis labels: set them explicitly on ax1, whose x-axis is the visible one,
# so the rotation is applied to the labels that are actually drawn.
ax1.set_xticks([i + bar_width / 2 for i in x])
ax1.set_xticklabels(category_stats['category_id'], rotation=45)
ax1.set_xlabel('Category ID')
ax1.set_title('Total Views and Likes per Category (Dual-Axis Bar Chart)')

# Legend covering both axes
fig.legend([bars1, bars2], ['Views', 'Likes'], loc='upper right')

plt.tight_layout()
plt.show()


In [None]:
# Keep only the videos whose category_id is 28.
cat_28_data = yt_data.loc[yt_data['category_id'] == 28]

# Per-channel totals of views, likes and comments within that category,
# ordered from the most viewed channel down.
channel_sums = (
    cat_28_data
    .groupby('channel_title')[['view_count', 'like_count', 'comment_count']]
    .sum()
    .sort_values('view_count', ascending=False)
)

# Display the result
print(channel_sums)


### Top YT category 28 based on view count
* Mrwhosetheboss        
* Immersed              
* Tech Master Shorts    
* VMK Technical Power  
* Techवाला       

TOP VIEWS

In [None]:
# Totals of views, likes and comments for every (channel, category) pair,
# ordered by total views, highest first.
engagement_columns = ['view_count', 'like_count', 'comment_count']
all_channel_sums = (
    yt_data
    .groupby(['channel_title', 'category_id'])[engagement_columns]
    .sum()
    .sort_values('view_count', ascending=False)
)
print(all_channel_sums)

TOP LIKES

In [None]:
# Totals of views, likes and comments for every (channel, category) pair,
# this time ordered by total likes, highest first.
engagement_columns = ['view_count', 'like_count', 'comment_count']
all_channel_sums = (
    yt_data
    .groupby(['channel_title', 'category_id'])[engagement_columns]
    .sum()
    .sort_values('like_count', ascending=False)
)
print(all_channel_sums)

### Mrwhosetheboss leads with the **highest views** and the **highest number of likes**

# Investigating whether time and date of video posted influences the views and likes

In [None]:
# Inspect the raw 'published_date' column before converting it.
yt_data['published_date']

In [None]:
# Parse timestamps shaped like 2024-01-31T12:00:00Z into datetime values.
published = pd.to_datetime(yt_data['published_date'], format='%Y-%m-%dT%H:%M:%SZ')
yt_data['published_date'] = published

# Derive calendar features from the parsed timestamps.
stamp = published.dt
yt_data['published_hour'] = stamp.hour           # hour of day
yt_data['published_day'] = stamp.date            # calendar date, time removed
yt_data['published_month'] = stamp.month         # month number
yt_data['published_weekday'] = stamp.day_name()  # weekday name, e.g. 'Monday'


In [None]:
# Preview the first five rows, now including the derived publish-time columns.
yt_data.head(5)

In [None]:
# Total views, likes and comments per publishing hour, ordered by hour.
by_hour = yt_data.groupby('published_hour')
hourly_views = by_hour['view_count'].sum().sort_index()
hourly_likes = by_hour['like_count'].sum().sort_index()
hourly_comments = by_hour['comment_count'].sum().sort_index()

# Every bar is light grey except the peak of each metric, which is
# highlighted: red for views, yellow for likes, orange for comments.
highlight_color_views, default_color_views = 'red', 'lightgrey'
highlight_color_likes, default_color_likes = 'yellow', 'lightgrey'
highlight_color_comments, default_color_comments = 'orange', 'lightgrey'

peak_views = hourly_views.max()
peak_likes = hourly_likes.max()
peak_comments = hourly_comments.max()

colors_views = [
    highlight_color_views if val == peak_views else default_color_views
    for val in hourly_views.values
]
colors_likes = [
    highlight_color_likes if val == peak_likes else default_color_likes
    for val in hourly_likes.values
]
colors_comments = [
    highlight_color_comments if val == peak_comments else default_color_comments
    for val in hourly_comments.values
]

# One row of three panels sharing the hour axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharex=True)

panels = [
    (hourly_views, colors_views, 'Total Views by Hour of Day', 'Total Views'),
    (hourly_likes, colors_likes, 'Total Likes by Hour of Day', 'Total Likes'),
    (hourly_comments, colors_comments, 'Total Comments by Hour of Day', 'Total Comments'),
]
for panel_ax, (series, bar_colors, panel_title, y_label) in zip(axes, panels):
    sns.barplot(x=series.index, y=series.values, palette=bar_colors, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Hour (0–23)')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


Key Insight:
Hour 5 (5 AM) is the most effective upload time in terms of accumulated views. This might indicate:

Global scheduling behavior (e.g., aligning with certain time zones).

Algorithmic preference or content targeting audiences who are online early.

In [None]:
# Calendar order for the x-axis (groupby would otherwise sort alphabetically).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Total views, likes and comments per publishing weekday, in calendar order.
by_weekday = yt_data.groupby('published_weekday')
weekday_views = by_weekday['view_count'].sum().reindex(weekday_order)
weekday_likes = by_weekday['like_count'].sum().reindex(weekday_order)
weekday_comments = by_weekday['comment_count'].sum().reindex(weekday_order)

# Light grey bars with the peak of each metric highlighted:
# red for views, yellow for likes, orange for comments.
highlight_color_views, default_color_views = 'red', 'lightgrey'
highlight_color_likes, default_color_likes = 'yellow', 'lightgrey'
highlight_color_comments, default_color_comments = 'orange', 'lightgrey'

peak_views = weekday_views.max()
peak_likes = weekday_likes.max()
peak_comments = weekday_comments.max()

colors_views = [
    highlight_color_views if val == peak_views else default_color_views
    for val in weekday_views.values
]
colors_likes = [
    highlight_color_likes if val == peak_likes else default_color_likes
    for val in weekday_likes.values
]
colors_comments = [
    highlight_color_comments if val == peak_comments else default_color_comments
    for val in weekday_comments.values
]

# One row of three independent panels.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharex=False)

panels = [
    (weekday_views, colors_views, 'Total Views by Weekday', 'Views'),
    (weekday_likes, colors_likes, 'Total Likes by Weekday', 'Likes'),
    (weekday_comments, colors_comments, 'Total Comments by Weekday', 'Comments'),
]
for panel_ax, (series, bar_colors, panel_title, y_label) in zip(axes, panels):
    sns.barplot(x=series.index, y=series.values, palette=bar_colors, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Weekday')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
# Total views, likes and comments per publishing month, ordered by month.
by_month = yt_data.groupby('published_month')
monthly_views = by_month['view_count'].sum().sort_index()
monthly_likes = by_month['like_count'].sum().sort_index()
monthly_comments = by_month['comment_count'].sum().sort_index()

# Light grey bars with the peak of each metric highlighted:
# red for views, orange for likes, yellow for comments.
highlight_color_views, default_color_views = 'red', 'lightgrey'
highlight_color_likes, default_color_likes = 'orange', 'lightgrey'
highlight_color_comments, default_color_comments = 'yellow', 'lightgrey'

peak_views = monthly_views.max()
peak_likes = monthly_likes.max()
peak_comments = monthly_comments.max()

colors_views = [
    highlight_color_views if val == peak_views else default_color_views
    for val in monthly_views.values
]
colors_likes = [
    highlight_color_likes if val == peak_likes else default_color_likes
    for val in monthly_likes.values
]
colors_comments = [
    highlight_color_comments if val == peak_comments else default_color_comments
    for val in monthly_comments.values
]

# One row of three panels sharing the month axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharex=True)

panels = [
    (monthly_views, colors_views, 'Total Views by Month', 'Total Views'),
    (monthly_likes, colors_likes, 'Total Likes by Month', 'Total Likes'),
    (monthly_comments, colors_comments, 'Total Comments by Month', 'Total Comments'),
]
for panel_ax, (series, bar_colors, panel_title, y_label) in zip(axes, panels):
    sns.barplot(x=series.index, y=series.values, palette=bar_colors, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Month (1–12)')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
# Look at the first five rows again before building the yearly views.
yt_data.head(5)

# Plotting View Count Trend Over Time by Category:

### Updating Your Dashboard to Use Only Valid Columns  

In [None]:
# Ensure 'published_date' is datetime; values that cannot be parsed become NaT.
yt_data['published_date'] = pd.to_datetime(yt_data['published_date'], errors='coerce')

# Drop rows with a missing 'published_date' or 'view_count'.
# .copy() makes yt_clean an independent DataFrame, so adding a column below
# does not raise SettingWithCopyWarning or depend on whether dropna returned
# a view of yt_data.
yt_clean = yt_data.dropna(subset=['published_date', 'view_count']).copy()

# Extract the publication year
yt_clean['published_year'] = yt_clean['published_date'].dt.year

# Yearly totals, ordered by year; min_count=1 leaves a total as NaN when a
# year has no non-missing values for that metric instead of reporting 0.
yearly_data = (
    yt_clean.groupby('published_year')[['view_count', 'like_count', 'comment_count']]
    .sum(min_count=1)
    .sort_index()
)


### Safe Plotting – Avoid Errors from Missing Columns

In [None]:
# Print the column names, dtypes and non-null counts of the cleaned data.
yt_clean.info()

In [None]:
from matplotlib.ticker import MaxNLocator

In [None]:
# Metrics to chart, one figure per metric
metrics = ['view_count', 'like_count', 'comment_count']

# Bar colour for each metric
colors = {
    'view_count': '#1f77b4',     # Blue
    'like_count': '#2ca02c',     # Green
    'comment_count': '#ff7f0e'   # Orange
}

# Chart title for each metric
titles = {
    'view_count': 'Total Views per Year',
    'like_count': 'Total Likes per Year',
    'comment_count': 'Total Comments per Year'
}

# Only chart columns that exist and have more than 100 non-missing values
available_metrics = [col for col in metrics if col in yt_clean.columns and yt_clean[col].notna().sum() > 100]

# Loop through each available metric and plot
for metric in available_metrics:
    fig, ax = plt.subplots(figsize=(8, 5))

    sns.barplot(
        x=yearly_data.index,
        y=yearly_data[metric],
        color=colors.get(metric, 'grey'),  # Use defined color or fallback to grey
        ax=ax
    )

    ax.set_title(titles.get(metric, metric.replace('_', ' ').title()), fontsize=13, weight='bold')
    ax.set_xlabel("Year", fontsize=11)
    ax.set_ylabel(metric.replace('_', ' ').title(), fontsize=11)
    ax.grid(axis='y', linestyle='--', alpha=0.5)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))

    # Annotate peak value (only when there is more than one year to compare)
    if yearly_data[metric].notna().sum() > 1:
        peak_year = yearly_data[metric].idxmax()
        peak_value = yearly_data[metric].max()
        # seaborn draws the bars at categorical positions 0..n-1, not at the
        # year values themselves, so the annotation must point at the bar's
        # position; using the year as x would place it far off the axis.
        peak_pos = yearly_data.index.get_loc(peak_year)
        ax.annotate(
            f'{int(peak_value):,}',
            xy=(peak_pos, peak_value),
            xytext=(peak_pos, peak_value * 1.05),
            ha='center',
            fontsize=9,
            color='black',
            arrowprops=dict(arrowstyle='->', color='black')
        )

    plt.tight_layout()
    plt.show()


# Overall Observations:
* 2024 was your channel’s strongest year for all three metrics: views, likes, and comments.
* There's a clear growth trend from 2021 to 2024, suggesting improved content strategy, more frequent uploads, or viral videos.
* 2025 shows a dip, which may indicate fewer uploads, lower-performing videos, or a shift in content engagement.

###   Normalize Per Upload

In [None]:
# Group the cleaned videos by publication year once and reuse the grouping.
videos_by_year = yt_clean.groupby('published_year')

# Yearly totals of views, likes and comments across all videos.
yearly_sums = videos_by_year[['view_count', 'like_count', 'comment_count']].sum()

# Number of videos (rows) uploaded in each year.
upload_counts = videos_by_year.size()

# Average per video: each yearly total divided by that year's upload count.
avg_per_video = yearly_sums.div(upload_counts, axis=0)


In [None]:
# Show the average views, likes and comments per video for each year.
avg_per_video

In [None]:
# Show how many videos were uploaded in each year.
upload_counts 

In [None]:
# Show the yearly totals of views, likes and comments.
yearly_sums

In [None]:
# Bar charts of the per-video averages, one panel per metric.
# Titles and colors
titles = {
    'view_count': 'Avg Views per Video by Year',
    'like_count': 'Avg Likes per Video by Year',
    'comment_count': 'Avg Comments per Video by Year'
}
colors = {
    'view_count': 'mediumseagreen',
    'like_count': 'goldenrod',
    'comment_count': 'cornflowerblue'
}

# 2x2 grid flattened to a 1-D array; only the first three panels are used
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for i, metric in enumerate(['view_count', 'like_count', 'comment_count']):
    # Plot a metric only when it exists and has at least two years of data
    if metric in avg_per_video.columns and avg_per_video[metric].notna().sum() >= 2:
        sns.barplot(
            x=avg_per_video.index,
            y=avg_per_video[metric],
            ax=axes[i],
            color=colors[metric]
        )
        axes[i].set_title(titles[metric], fontsize=13, weight='bold')
        axes[i].set_xlabel("Year", fontsize=11)
        axes[i].set_ylabel("Average per Video", fontsize=11)
        axes[i].yaxis.set_major_locator(MaxNLocator(nbins=5))
        axes[i].grid(axis='y', linestyle='--', alpha=0.5)

        # Annotate peak
        peak_year = avg_per_video[metric].idxmax()
        peak_value = avg_per_video[metric].max()
        # seaborn draws the bars at categorical positions 0..n-1, not at the
        # year values themselves, so the annotation must point at the bar's
        # position; using the year as x would place it far off the axis.
        peak_pos = avg_per_video.index.get_loc(peak_year)
        axes[i].annotate(
            f'{int(peak_value):,}',
            xy=(peak_pos, peak_value),
            xytext=(peak_pos, peak_value * 1.05),
            ha='center',
            fontsize=9,
            color='black',
            arrowprops=dict(arrowstyle='->', color='black')
        )
    else:
        axes[i].axis('off')

# The fourth panel of the 2x2 grid is never used
axes[3].axis('off')

plt.tight_layout()
plt.show()

The charts above show:
* Which year gave you the most views per upload, i.e. the most effective year.
* Whether volume = success or whether strategy = results.
* Which years performed best with fewer videos.

# Key Trends
Growth in Total Upload Impact
* There’s explosive growth in total views and engagement, especially:
* 2022–2024: strong upward trend in all metrics.
* 2024 has the highest total views (2.06 billion), more than 3x 2023.

Drop in Per-Video Performance Despite Growth
From the earlier avg_per_video data:
In 2024, although total views are huge, the average views per video dropped to 12.7M (from 25.6M in 2022 and 20.4M in 2023).

In 2025, it dropped further to just 1.98M per video, despite uploading 354 videos!

This shows diminishing returns — more uploads but less impact per video.

# What does this mean for Strategy
2022:
High impact per video with low volume: Possibly the sweet spot — content resonated well with the audience.

**2024–2025:**
Volume skyrocketed, but quality/engagement seems diluted.

**Possible reasons:**

* Less engaging content.
* Shorter videos or less compelling titles/thumbnails.
* Algorithm shift.
* Uploading too frequently with little promotion.

# **Recommendations** 
* Audit 2022 videos: What made them so successful? Replicate that strategy.
* Reduce quantity, focus on quality: 2025 uploads are very high, but effectiveness is low.
* Improve titles, thumbnails, hooks: Test and optimize click-through and retention.
* Consider upload pacing: Fewer, more targeted uploads may perform better.
* Boost 2025 videos with SEO, paid ads, or re-edits if they underperformed but had potential.

# Summary Data

## Deeper statistical summary
YOY growth & CAGR


#### YoY Growth (Year-over-Year Growth)

Formula:

$$\text{YoY Growth} = \frac{\text{Value}_{\text{current}} - \text{Value}_{\text{previous}}}{\text{Value}_{\text{previous}}} \times 100$$


In [None]:
# Year-over-year change of every yearly total, as a percentage rounded to
# two decimals for readability.
yoy_growth = (yearly_sums.pct_change() * 100).round(2)

# Label each column as a growth rate.
yoy_growth = yoy_growth.rename(columns=lambda name: f"{name} YoY Growth (%)")

print(yoy_growth)

In [None]:
# Draw the three growth-rate series on a single chart.
plt.figure(figsize=(12, 6))

series_to_plot = (
    ('view_count', 'Views'),
    ('like_count', 'Likes'),
    ('comment_count', 'Comments'),
)
for column_prefix, legend_label in series_to_plot:
    growth_column = f'{column_prefix} YoY Growth (%)'
    plt.plot(yoy_growth.index, yoy_growth[growth_column], label=legend_label, marker='o')

# Dashed zero line separates growth from decline.
plt.axhline(0, color='grey', linestyle='--')
plt.title("📈 YoY Growth Rate of Engagement Metrics")
plt.xlabel("Published Year")
plt.ylabel("YoY Growth (%)")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


# Insights:
* 2016–2017 were very low-activity years.
* 2019–2021 was your golden growth period (triple-digit YoY).
* 2022 marked a high, but 2023 slowed down.
* 2024 brought strong comment growth — maybe more engaging content?
* 2025 saw a steep crash 

**need to investigate**:
1. Was content different?
2. Were videos shorter or less promoted?
3. Were there YouTube algorithm changes?



###  CAGR (2009 to 2025) 
### CAGR (Compound Annual Growth Rate) to assess long-term trend
Formula:
$$\text{CAGR} = \left(\frac{\text{End Value}}{\text{Start Value}}\right)^{1/n} - 1$$

In [None]:
def calculate_cagr(start_value, end_value, periods):
    """Return the compound annual growth rate as a fraction (0.10 means 10%).

    Computes ``(end_value / start_value) ** (1 / periods) - 1``.

    Args:
        start_value: Value at the start of the window; must be positive.
        end_value: Value at the end of the window; must not be negative.
        periods: Number of compounding periods (years); must be positive.

    Returns:
        The growth rate per period as a fraction.

    Raises:
        ValueError: If ``periods`` or ``start_value`` is not positive, or
            ``end_value`` is negative. CAGR is undefined for those inputs,
            and the bare formula would otherwise divide by zero or yield
            inf, nan or a complex number.
    """
    if periods <= 0:
        raise ValueError(f"periods must be positive, got {periods!r}")
    if start_value <= 0:
        raise ValueError(f"start_value must be positive, got {start_value!r}")
    if end_value < 0:
        raise ValueError(f"end_value must not be negative, got {end_value!r}")
    return (end_value / start_value) ** (1 / periods) - 1

# Endpoints of the long-run window; both years are looked up in yearly_sums.
start_year = 2009
end_year = 2025
n_years = end_year - start_year  # 16 compounding periods

# CAGR of each metric over the window, as a percentage rounded to 2 decimals.
cagr_results = {
    metric: round(
        calculate_cagr(
            yearly_sums.loc[start_year, metric],
            yearly_sums.loc[end_year, metric],
            n_years,
        ) * 100,
        2,
    )
    for metric in yearly_sums.columns
}

print("CAGR from 2009 to 2025 (%):")
for name, pct in cagr_results.items():
    print(f"{name}: {pct}%")


View count growing slowly at ~2.9% per year over 16 years — a modest steady increase.
Likes and comments growing much faster (~15% and ~12%), showing engagement has ramped up faster than just views.
This could mean people interact more actively with your content over time even if total views grow gradually.

In [None]:
# Reuse the CAGR percentages computed from yearly_sums in the previous cell
# instead of retyping them, so the chart cannot drift away from the data.
cagr_2009_2025 = dict(cagr_results)

plt.figure(figsize=(7,5))
plt.bar(cagr_2009_2025.keys(), cagr_2009_2025.values(), color=['#1f77b4', '#2ca02c', '#ff7f0e'])
plt.title("CAGR (2009–2025) of YouTube Metrics (%)")
plt.ylabel("CAGR %")
# Leave 20% headroom above the tallest bar for the value labels
plt.ylim(0, max(cagr_2009_2025.values()) * 1.2)
# Write each percentage just above its bar
for i, v in enumerate(cagr_2009_2025.values()):
    plt.text(i, v + 0.5, f"{v:.2f}%", ha='center')
plt.show()


### Comparing CAGR for 2009–2025 vs 2019–2025 (Before and After COVID-19)

In [None]:
# calculate_cagr() from the 2009–2025 CAGR cell is reused here rather than
# being defined a second time.
import pandas as pd

start_year_1 = 2009
start_year_2 = 2019
end_year = 2025

n_years_1 = end_year - start_year_1  # 16 years
n_years_2 = end_year - start_year_2  # 6 years

# CAGR (in %, rounded to 2 decimals) of each metric over 2019–2025.
cagr_2019_2025 = {}
for metric in yearly_sums.columns:
    start_val = yearly_sums.loc[start_year_2, metric]
    end_val = yearly_sums.loc[end_year, metric]
    cagr = calculate_cagr(start_val, end_val, n_years_2)
    cagr_2019_2025[metric] = round(cagr * 100, 2)  # convert to %

# Backward-compatible alias: these 2019–2025 figures used to be stored under
# this misleading name.
cagr_2016_2025 = cagr_2019_2025

# Side-by-side table of the long-run and the 2019–2025 growth rates
cagr_compare = pd.DataFrame({
    '2009-2025': cagr_2009_2025,
    '2019-2025': cagr_2019_2025
})

print(cagr_compare)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Take the CAGR figures from the comparison table computed in the previous
# cell instead of retyping them, so the chart always matches the data.
# to_dict() gives {period: {metric: value}}.
cagr_data = cagr_compare.to_dict()

# Convert to DataFrame (metrics as rows, periods as columns)
cagr_df = pd.DataFrame(cagr_data)

# Grouped bar chart: one group per metric, one bar per period
ax = cagr_df.plot(kind='bar', figsize=(8,5), color=['#1f77b4', '#ff7f0e'])
plt.title('CAGR Comparison: Before and After COVID-19')
plt.ylabel('CAGR (%)')
plt.xticks(rotation=0)
# Leave 20% headroom above the tallest bar for the value labels
plt.ylim(0, cagr_df.values.max() * 1.2)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.legend(title='Period')
plt.tight_layout()

# Write each percentage just above its bar
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2, height),
                ha='center', va='bottom', fontsize=9)

plt.show()


Massive surge in growth since 2019 across all metrics.
Engagement exploded post-COVID, likely due to increased online content consumption.

The very high engagement numbers in 2009 like the huge average views per video are most likely caused by a small sample size and outliers:

In 2009, you had only 3 uploads (from your upload counts).
If even one video went viral or gained massive views/likes/comments, it would skew the averages and totals enormously.
This kind of data distortion happens often when early dataset years have very few entries.
What this means practically:
2009 numbers aren’t reliable for trend analysis because they don't represent a stable or mature dataset.
> 
This can exaggerate engagement metrics, making that year look like an outlier.

Between 2019 and 2024, your data shows massive CAGR jumps across views, likes, and comments, which fits perfectly with the COVID-19 pandemic timeline and its social impact:
* Lockdowns and social distancing meant people spent more time online, consuming and interacting with digital content.
* Platforms like YouTube saw surges in both content creation (uploads) and user engagement (views, likes, comments).
* This is why the post-2019 growth rates are dramatically higher than the longer-term average from 2009 onward.

# Engagement Trends by Channel or Content Type

In [None]:
# Print the column names, dtypes and non-null counts of the cleaned data.
yt_clean.info()

In [None]:
# Mean views, likes and comments per video for each category (the grouping
# key is category_id), highest average views first.
channel_summary = (
    yt_clean
    .groupby('category_id')[['view_count', 'like_count', 'comment_count']]
    .mean()
    .sort_values(by='view_count', ascending=False)
)
print(channel_summary.head(5))  # top 5 categories by average views

In [None]:
import matplotlib.pyplot as plt

# The five categories with the highest average views
# (channel_summary is already sorted by view_count).
top5 = channel_summary.head(5)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# One pie per metric, showing how that average splits across the top 5.
pie_specs = (
    ('view_count', 'Top 5 Average View Count by Category'),
    ('like_count', 'Top 5 Average Like Count by Category'),
    ('comment_count', 'Top 5 Average Comment Count by Category'),
)
for pie_ax, (column, pie_title) in zip(axes, pie_specs):
    pie_ax.pie(top5[column], labels=top5.index, autopct='%1.1f%%', startangle=140)
    pie_ax.set_title(pie_title)

plt.tight_layout()
plt.show()

In [None]:
# Group the view counts by category once and derive both aggregates from it.
views_by_category = yt_clean.groupby('category_id')['view_count']
total_views = views_by_category.sum()   # total views per category
avg_views = views_by_category.mean()    # average views per video per category

# Put both in one table, ordered by total views, highest first.
views_compare = pd.DataFrame({
    'Total Views': total_views,
    'Average Views': avg_views,
})
views_compare = views_compare.sort_values(by='Total Views', ascending=False)

print(views_compare.head(10))


In [None]:
# Total and average views per category, ordered by total views
views_by_category = yt_clean.groupby('category_id')['view_count']
total_views = views_by_category.sum()
avg_views = views_by_category.mean()

views_compare = pd.DataFrame({
    'Total Views': total_views,
    'Average Views': avg_views
}).sort_values(by='Total Views', ascending=False)

# Select top 10 categories by total views
top10 = views_compare.head(10)

x = np.arange(len(top10))
# Each category gets two bars of this width, placed symmetrically around its
# tick so the tick label sits centred under the pair.
pair_width = 0.4


def _with_thousands_separator(value, _pos):
    """Format an axis tick value as an integer with thousands separators."""
    return format(int(value), ',')


fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot total views on left y-axis
color1 = 'tab:blue'
ax1.set_xlabel('Category ID')
ax1.set_ylabel('Total Views', color=color1)
bars1 = ax1.bar(x - pair_width / 2, top10['Total Views'], width=pair_width, label='Total Views', color=color1)
ax1.tick_params(axis='y', labelcolor=color1)
ax1.set_xticks(x)
ax1.set_xticklabels(top10.index.astype(str))
ax1.get_yaxis().set_major_formatter(plt.FuncFormatter(_with_thousands_separator))

# Create second y-axis for average views
ax2 = ax1.twinx()
color2 = 'tab:orange'
ax2.set_ylabel('Average Views', color=color2)
bars2 = ax2.bar(x + pair_width / 2, top10['Average Views'], width=pair_width, label='Average Views', color=color2)
ax2.tick_params(axis='y', labelcolor=color2)
ax2.get_yaxis().set_major_formatter(plt.FuncFormatter(_with_thousands_separator))

# Title and layout
plt.title('Total vs Average Views by Category (Top 10)')
fig.tight_layout()

# One legend for both axes, built from the two bar containers
ax1.legend(handles=[bars1, bars2], loc='upper right')

plt.show()

# The difference between Average views and Total Views 
* Total Views is the sum of views across all videos in that category.
Categories with many videos tend to have very large total views, even if each video individually doesn’t get massive views.
i.e. If category 28 has thousands of videos, even average views of 8.8 million per video sum up to billions in total views.

* Average Views is the mean per video — it reflects how popular an average single video is in that category.
If a category has few but highly popular videos, average views can be very high even if total views are smaller.
i.e. Category 1 has average views ~64 million but total views ~257 million, meaning it has relatively fewer videos but those are very popular.

The number of videos per category varies a lot: categories with many videos will have higher totals but possibly lower averages. Conversely, categories with fewer but highly viewed videos have high averages but lower totals.

In [None]:
# Number of videos per category, attached to the comparison table
# (the assignment aligns the counts on the category_id index).
video_counts = yt_clean.groupby('category_id').size()
views_compare['Video Count'] = video_counts

summary_columns = ['Total Views', 'Average Views', 'Video Count']
print(views_compare[summary_columns])

In [None]:
# The log scale helps visualize categories with very different magnitudes.
# Total Views (left axis) and Average Views (right axis) are both log-scaled.
# Video Count is drawn on the right axis as well, so it shares that log scale.
views_compare.index = views_compare.index.astype(str)

x = np.arange(len(views_compare))

fig, ax1 = plt.subplots(figsize=(14, 7))

color1 = 'tab:blue'
color2 = 'tab:orange'
color3 = 'tab:green'

# Bar plot for Total Views with log scale
bars_total = ax1.bar(x - 0.2, views_compare['Total Views'], width=0.4, label='Total Views', color=color1)
ax1.set_xlabel('Category ID')
ax1.set_ylabel('Total Views (log scale)', color=color1)
ax1.set_yscale('log')
ax1.tick_params(axis='y', labelcolor=color1)
ax1.set_xticks(x)
ax1.set_xticklabels(views_compare.index, rotation=45, ha='right')

# Second y-axis for Average Views with log scale
ax2 = ax1.twinx()
bars_avg = ax2.bar(x + 0.2, views_compare['Average Views'], width=0.4, label='Average Views', color=color2, alpha=0.7)
ax2.set_ylabel('Average Views (log scale)', color=color2)
ax2.set_yscale('log')
ax2.tick_params(axis='y', labelcolor=color2)

# Video Count as a line on the same right-hand axis; keep the line handle so
# it can be passed to the legend explicitly.
(count_line,) = ax2.plot(x, views_compare['Video Count'], label='Video Count', color=color3, marker='o', linewidth=2)

# Title and combined legend. The handles are listed explicitly because
# ax2.get_legend_handles_labels() also returns the Average Views bars, which
# made that entry appear twice.
plt.title('Total Views, Average Views (log scale) and Video Count by Category')
legend_handles = [bars_total, bars_avg, count_line]
legend_labels = [handle.get_label() for handle in legend_handles]
ax1.legend(legend_handles, legend_labels, loc='upper left')

plt.tight_layout()
plt.show()


What this plot shows:
Blue bars: Total Views (scaled to billions on left axis)
Orange bars: Average Views per video (scaled to millions on right axis)
Green line + dots: Number of videos (on right axis)

# Summary
Final Insights on Video Engagement by Category
*Diverse Engagement Across Categories*
Engagement metrics reveal significant variation in both total and average views, reflecting differences in audience size and content appeal across categories.

*Total Views vs. Average Views*

* Categories with many videos (e.g., 28 and 22) achieve very high total views, though their average views per video are moderated by volume.
* Categories with fewer videos (e.g., 1 and 10) show higher average views, indicating stronger individual video performance.

*Value of Log Scale Visualization*
* Applying a log scale uncovers underlying patterns: some categories dominate total views through volume, while others excel in average views despite fewer videos.

*Video Count Impact*
* While more videos generally correlate with higher total views, this does not guarantee high average views. Some smaller categories produce highly engaging content with strong per-video impact.

## Strategic Implications

* Categories with high average views but fewer videos are ideal for targeted, high-impact content strategies focusing on niche audiences.
* Categories with high total views from volume support broad audience reach and quantity-driven growth.
* Balanced growth may be achieved by improving content quality in high-volume categories or increasing volume in high-impact categories.

**Post-COVID Growth Influence**
* Engagement growth post-2019, influenced by pandemic-driven changes in viewer behavior, has amplified views across categories, marking this period as crucial for trend analysis.