# Netflix Subscriptions Data Analysis 2025

This analysis examines Netflix subscription pricing and library sizes across 128 countries in 2025.

## Key Questions
1. How does Netflix pricing vary globally?
2. Which countries offer the best value for money?
3. How does the new Ads tier compare to other plans?
4. What regional patterns exist in pricing and content?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry_convert as pc
import os
%matplotlib inline

# Plot appearance: seaborn's white-grid theme plus a wide default canvas.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Every figure in this notebook is written under docs/images, so create the
# folder up front (no error if it already exists).
os.makedirs('docs/images', exist_ok=True)

In [None]:
# Read the per-country subscription dataset and report its row count.
df = pd.read_csv('netflix-2025.csv')
print(f"Total countries: {df.shape[0]}")

# Trailing expression so the notebook renders a preview of the first rows.
df.head()

In [None]:
# Normalise headers: stray surrounding whitespace would break the column
# lookups performed below.
df.columns = df.columns.str.strip()

# Discard placeholder columns ("Unnamed..." fillers and the stray 'X.1').
junk_columns = [name for name in df.columns if name == 'X.1' or 'Unnamed' in name]
df = df.drop(columns=junk_columns, errors='ignore')

# Recover the monthly plan price from the cost-per-title columns:
#   cost_per_title = subscription_cost / library_size
#   => subscription_cost = cost_per_title * library_size
for tier in ('Basic with Ads', 'Basic', 'Standard', 'Premium'):
    tier_cost_per_title = df[f'Cost per Title - {tier} ($)']
    df[f'Cost - {tier} ($)'] = (tier_cost_per_title * df['Total Library Size']).round(2)

df.head()

## Add Regional Classification

In [None]:
# Names whose continent is assigned by hand instead of being looked up through
# pycountry_convert. Built once at import time rather than on every call.
_SPECIAL_CASE_CONTINENTS = {
    'Côte d\'Ivoire': 'Africa',
    # Mis-encoded spelling (UTF-8 bytes read as Latin-1); kept so data files
    # that carry the garbled name still resolve.
    'CÃ´te d\'Ivoire': 'Africa',
    'Democratic Republic of the Congo': 'Africa',
    'Trinidad and Tobago': 'North America',
    'Antigua & Barbuda': 'North America',
    'St. Lucia': 'North America',
    'Turks & Caicos Islands': 'North America',
    'Bosnia & Herzegovina': 'Europe',
    'Palestine': 'Asia',
    'Guernsey': 'Europe',
    'French Guiana': 'South America',
    'French Polynesia': 'Oceania',
}


def country_to_continent(country_name):
    """Return the continent name for a country name.

    Args:
        country_name: Country name as it appears in the dataset. Leading and
            trailing whitespace on string values is ignored.

    Returns:
        The continent name (e.g. ``'Europe'``). Hand-mapped special cases are
        checked first; everything else goes through ``pycountry_convert``.
        ``'Unknown'`` is returned when the library lookup fails.
    """
    if isinstance(country_name, str):
        country_name = country_name.strip()

    if country_name in _SPECIAL_CASE_CONTINENTS:
        return _SPECIAL_CASE_CONTINENTS[country_name]

    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(country_continent_code)
    except Exception:
        # Best-effort lookup: any failure inside pycountry_convert maps to
        # 'Unknown'. Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return 'Unknown'

# Tag every row with its continent, then show how many countries fall in each.
df['Region'] = df['Country'].map(country_to_continent)
df['Region'].value_counts()

## Library Size Analysis

In [None]:
# Descriptive statistics for catalogue size across countries.
library_sizes = df['Total Library Size']
print("Library Size Statistics:")
print(library_sizes.describe())

# Histogram (with KDE overlay) of catalogue sizes, one count per country.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Total Library Size', kde=True, bins=30, color='#E50914', ax=ax)
ax.set_title('Netflix Library Size Distribution (2025)', fontsize=16, fontweight='bold')
ax.set_xlabel('Total Library Size')
ax.set_ylabel('Number of Countries')
fig.tight_layout()
fig.savefig('docs/images/library_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# The 15 largest and 15 smallest catalogues, drawn side by side.
top_lib = df.nlargest(15, 'Total Library Size')
bot_lib = df.nsmallest(15, 'Total Library Size')

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# (data, palette, title) for the left and right panel respectively.
library_panels = (
    (top_lib, 'Reds_r', 'Top 15 Countries by Library Size'),
    (bot_lib, 'Blues_r', 'Bottom 15 Countries by Library Size'),
)
for panel_ax, (subset, palette_name, heading) in zip(axes, library_panels):
    sns.barplot(data=subset, y='Country', x='Total Library Size', ax=panel_ax, palette=palette_name)
    panel_ax.set_title(heading, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Library Size')

fig.tight_layout()
fig.savefig('docs/images/library_top_bottom.png', dpi=150, bbox_inches='tight')
plt.show()

## Subscription Cost Analysis

In [None]:
# Derived monthly-price columns, one per plan tier (cheapest plan first).
# `cost_cols` is reused by the distribution plots in the next cell.
cost_cols = [
    f'Cost - {tier} ($)'
    for tier in ('Basic with Ads', 'Basic', 'Standard', 'Premium')
]

print("\nSubscription Cost Statistics:")
cost_summary = df[cost_cols].describe()
print(cost_summary)

In [None]:
# One histogram per plan tier, arranged in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
colors = ['#564d4d', '#831010', '#E50914', '#ff4d4d']
titles = ['Basic with Ads', 'Basic', 'Standard', 'Premium']

# axes.flat walks the grid row by row, matching the tier order in cost_cols.
for ax, col, color, title in zip(axes.flat, cost_cols, colors, titles):
    data = df[col].dropna()
    tier_mean = data.mean()
    sns.histplot(data=data, kde=True, ax=ax, color=color, bins=20)
    ax.set_title(f'{title} Tier Cost Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel('Monthly Cost ($)')
    # Dashed marker at the tier's mean price, labelled in the legend.
    ax.axvline(tier_mean, color='black', linestyle='--', label=f'Mean: ${tier_mean:.2f}')
    ax.legend()

plt.suptitle('Netflix Subscription Cost Distribution by Tier (2025)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('docs/images/cost_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Premium-tier price extremes: 15 priciest vs. 15 cheapest countries.
top_cost = df.nlargest(15, 'Cost - Premium ($)')
# Rows with a non-positive Premium price are excluded from the "cheapest" list.
has_premium_price = df['Cost - Premium ($)'] > 0
bot_cost = df[has_premium_price].nsmallest(15, 'Cost - Premium ($)')

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# (data, palette, title) for the left and right panel respectively.
cost_panels = (
    (top_cost, 'Reds_r', 'Most Expensive Countries (Premium Tier)'),
    (bot_cost, 'Greens_r', 'Cheapest Countries (Premium Tier)'),
)
for panel_ax, (subset, palette_name, heading) in zip(axes, cost_panels):
    sns.barplot(data=subset, y='Country', x='Cost - Premium ($)', ax=panel_ax, palette=palette_name)
    panel_ax.set_title(heading, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Monthly Cost ($)')

fig.tight_layout()
fig.savefig('docs/images/cost_top_bottom.png', dpi=150, bbox_inches='tight')
plt.show()

## Cost Per Title Analysis (Value for Money)

In [None]:
# Value for money on the Premium tier, measured as cost per title:
# the lowest values are the best deals, the highest the worst.
has_cost_per_title = df['Cost per Title - Premium ($)'] > 0
best_value = df[has_cost_per_title].nsmallest(15, 'Cost per Title - Premium ($)')
worst_value = df.nlargest(15, 'Cost per Title - Premium ($)')

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# (data, palette, title) for the left and right panel respectively.
value_panels = (
    (best_value, 'Greens_r', 'Best Value Countries (Lowest Cost per Title)'),
    (worst_value, 'Reds_r', 'Worst Value Countries (Highest Cost per Title)'),
)
for panel_ax, (subset, palette_name, heading) in zip(axes, value_panels):
    sns.barplot(data=subset, y='Country', x='Cost per Title - Premium ($)', ax=panel_ax, palette=palette_name)
    panel_ax.set_title(heading, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Cost per Title ($)')

fig.tight_layout()
fig.savefig('docs/images/value_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## Ads Tier Analysis

In [None]:
# Countries with Ads tier available (a positive, non-missing Ads price).
# `ads_df` keeps every such country; the summary cell reports its length.
ads_df = df[df['Cost - Basic with Ads ($)'].notna() & (df['Cost - Basic with Ads ($)'] > 0)].copy()
print(f"Countries with Ads tier: {len(ads_df)} out of {len(df)}")

# Savings from Ads tier vs Basic.
# A Basic price that is missing or not positive cannot serve as a baseline:
# dividing by 0 would give +/-inf percentages. Mask those prices to NaN so the
# savings columns are NaN for such countries instead of a bogus number.
basic_cost = ads_df['Cost - Basic ($)'].where(ads_df['Cost - Basic ($)'] > 0)
ads_df['Ads Savings ($)'] = basic_cost - ads_df['Cost - Basic with Ads ($)']
ads_df['Ads Savings (%)'] = ((ads_df['Ads Savings ($)'] / basic_cost) * 100).round(1)

fig, ax = plt.subplots(figsize=(12, 8))
# Only countries with a computable savings figure are charted.
ads_sorted = ads_df.dropna(subset=['Ads Savings (%)']).sort_values('Ads Savings (%)', ascending=True)
# Highlight countries saving more than 40% in the brighter red.
colors = ['#E50914' if x > 40 else '#831010' for x in ads_sorted['Ads Savings (%)']]
sns.barplot(data=ads_sorted, y='Country', x='Ads Savings (%)', palette=colors)
plt.title('Savings with Ads Tier vs Basic Tier', fontsize=16, fontweight='bold')
plt.xlabel('Savings (%)')
plt.tight_layout()
plt.savefig('docs/images/ads_savings.png', dpi=150, bbox_inches='tight')
plt.show()

## Regional Analysis

In [None]:
# Per-region averages plus the number of countries in each region, rounded to
# two decimals and ordered from the most to the least represented region.
region_stats = (
    df.groupby('Region')
    .agg(**{
        'Avg Library Size': ('Total Library Size', 'mean'),
        'Avg Basic ($)': ('Cost - Basic ($)', 'mean'),
        'Avg Standard ($)': ('Cost - Standard ($)', 'mean'),
        'Avg Premium ($)': ('Cost - Premium ($)', 'mean'),
        'Avg Cost per Title': ('Cost per Title - Premium ($)', 'mean'),
        'Countries': ('Country', 'count'),
    })
    .round(2)
    .sort_values('Countries', ascending=False)
)
print(region_stats)

In [None]:
# 2x2 dashboard: coverage pie plus three per-region average bar charts.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Top-left: share of countries per region.
region_counts = df['Region'].value_counts()
coverage_ax = axes[0, 0]
coverage_ax.pie(region_counts.values, labels=region_counts.index, autopct='%1.0f%%',
                colors=sns.color_palette('Reds_r', len(region_counts)))
coverage_ax.set_title('Netflix Coverage by Region', fontsize=12, fontweight='bold')

# Per-region means, each sorted ascending so bars grow down the axis.
by_region = df.groupby('Region')
region_lib = by_region['Total Library Size'].mean().sort_values(ascending=True)
region_cost = by_region['Cost - Premium ($)'].mean().sort_values(ascending=True)
region_value = by_region['Cost per Title - Premium ($)'].mean().sort_values(ascending=True)

# (axis, series, palette, title, x-label) for the remaining three panels.
bar_panels = (
    (axes[0, 1], region_lib, 'Reds_r', 'Average Library Size by Region', 'Library Size'),
    (axes[1, 0], region_cost, 'Blues_r', 'Average Premium Cost by Region', 'Monthly Cost ($)'),
    (axes[1, 1], region_value, 'Greens_r', 'Average Cost per Title by Region (Value)', 'Cost per Title ($)'),
)
for panel_ax, series, palette_name, heading, x_label in bar_panels:
    sns.barplot(x=series.values, y=series.index, ax=panel_ax, palette=palette_name)
    panel_ax.set_title(heading, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel(x_label)

plt.suptitle('Regional Analysis of Netflix Subscriptions (2025)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('docs/images/regional_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## Library Size vs Cost Correlation

In [None]:
# Scatter plot: Library Size vs Premium Cost
fig, ax = plt.subplots(figsize=(14, 8))

# Color by region: one distinct hue per region value.
regions = df['Region'].unique()
colors = dict(zip(regions, sns.color_palette('husl', len(regions))))

for region in regions:
    region_data = df[df['Region'] == region]
    ax.scatter(region_data['Total Library Size'], region_data['Cost - Premium ($)'],
               label=region, alpha=0.7, s=80, c=[colors[region]])

# Add trend line (degree-1 least-squares fit).
# np.polyfit cannot cope with NaN, so the fit uses only rows where both
# values are present. At least two points are needed to define a line; with
# fewer, the trend line is simply omitted.
trend_df = df[['Total Library Size', 'Cost - Premium ($)']].dropna()
if len(trend_df) >= 2:
    z = np.polyfit(trend_df['Total Library Size'], trend_df['Cost - Premium ($)'], 1)
    p = np.poly1d(z)
    x_sorted = trend_df['Total Library Size'].sort_values()
    ax.plot(x_sorted, p(x_sorted), 'r--', alpha=0.8, linewidth=2, label='Trend')

ax.set_xlabel('Library Size', fontsize=12)
ax.set_ylabel('Premium Cost ($)', fontsize=12)
ax.set_title('Library Size vs Premium Subscription Cost', fontsize=16, fontweight='bold')
# Legend sits outside the plot area, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.savefig('docs/images/correlation_scatter.png', dpi=150, bbox_inches='tight')
plt.show()

# Correlation (pandas' Series.corr skips missing pairs on its own).
corr = df['Total Library Size'].corr(df['Cost - Premium ($)'])
print(f"\nCorrelation between Library Size and Premium Cost: {corr:.3f}")

## Price Tier Spread Analysis

In [None]:
# Calculate tier spreads.
# Other cells in this notebook filter prices with `> 0`, i.e. a non-positive
# price is treated as "no usable price". Apply the same rule here by masking
# such prices to NaN: otherwise a Basic price of 0 would make the spread equal
# the full Premium price (and dominate the "largest spread" chart) and the
# markup infinite.
basic_price = df['Cost - Basic ($)'].where(df['Cost - Basic ($)'] > 0)
premium_price = df['Cost - Premium ($)'].where(df['Cost - Premium ($)'] > 0)
df['Basic to Premium Spread'] = premium_price - basic_price
df['Premium Markup (%)'] = ((premium_price / basic_price - 1) * 100).round(1)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Highest spreads (nlargest ignores rows whose spread is NaN).
top_spread = df.nlargest(15, 'Basic to Premium Spread')
sns.barplot(data=top_spread, y='Country', x='Basic to Premium Spread', ax=axes[0], palette='Reds_r')
axes[0].set_title('Largest Price Spread (Basic to Premium)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Price Difference ($)')

# Lowest spreads, restricted to countries where Premium costs more than Basic.
bot_spread = df[df['Basic to Premium Spread'] > 0].nsmallest(15, 'Basic to Premium Spread')
sns.barplot(data=bot_spread, y='Country', x='Basic to Premium Spread', ax=axes[1], palette='Greens_r')
axes[1].set_title('Smallest Price Spread (Basic to Premium)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Price Difference ($)')

plt.tight_layout()
plt.savefig('docs/images/price_spread.png', dpi=150, bbox_inches='tight')
plt.show()

## Summary Statistics for Dashboard

In [None]:
import json

# Columns the headline figures are drawn from. The "positive" variants drop
# rows with a non-positive value before looking for a minimum.
library_size = df['Total Library Size']
premium_cost = df['Cost - Premium ($)']
premium_cost_positive = premium_cost[premium_cost > 0]
premium_per_title = df['Cost per Title - Premium ($)']
premium_per_title_positive = premium_per_title[premium_per_title > 0]

# Headline figures for the dashboard. `ads_df` comes from the Ads tier cell.
summary = {
    'total_countries': len(df),
    'avg_library_size': int(library_size.mean()),
    'max_library_country': df.loc[library_size.idxmax(), 'Country'],
    'max_library_size': int(library_size.max()),
    'min_library_country': df.loc[library_size.idxmin(), 'Country'],
    'min_library_size': int(library_size.min()),
    'avg_premium_cost': premium_cost.mean().round(2),
    'most_expensive_country': df.loc[premium_cost.idxmax(), 'Country'],
    'most_expensive_cost': premium_cost.max().round(2),
    'cheapest_country': df.loc[premium_cost_positive.idxmin(), 'Country'],
    'cheapest_cost': premium_cost_positive.min().round(2),
    'best_value_country': df.loc[premium_per_title_positive.idxmin(), 'Country'],
    'countries_with_ads': len(ads_df)
}

print("Key Findings:")
print("\n".join(f"  {name}: {figure}" for name, figure in summary.items()))

# Write the headline figures where the dashboard reads them.
with open('docs/summary.json', 'w') as summary_file:
    summary_file.write(json.dumps(summary, indent=2))

# Export the per-country records the dashboard displays.
export_cols = [
    'Country',
    'Region',
    'Total Library Size',
    'Cost - Basic with Ads ($)',
    'Cost - Basic ($)',
    'Cost - Standard ($)',
    'Cost - Premium ($)',
    'Cost per Title - Premium ($)',
]
dashboard_records = df[export_cols]
dashboard_records.to_json('docs/data.json', orient='records', indent=2)

## Conclusions

### Key Findings from 2025 Data:

1. **Global Coverage**: The 2025 dataset covers Netflix pricing in 128 countries across every inhabited continent

2. **Library Size Variation**: Content libraries range significantly, with some countries having nearly double the content of others

3. **Pricing Disparities**: Premium subscription costs vary dramatically by country, with the most expensive markets paying several times more than the cheapest

4. **Ads Tier Adoption**: The ad-supported tier is available in select markets, offering significant savings (30-60% off basic tier)

5. **Regional Patterns**: 
   - Europe tends to have higher subscription costs
   - Developing markets often have better value (lower cost per title)
   - Library sizes are relatively consistent globally

6. **Best Value Markets**: Countries with low subscription costs but above-average libraries offer the best value for money