# Revenue por País - Treemap

Visualización de revenue total por país usando treemap (rectángulos proporcionales al revenue)

In [11]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Cargar dataset
df = pd.read_csv('../data/dataset.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nPrimeras filas:")
df.head()

Dataset shape: (20000, 11)

Primeras filas:


Unnamed: 0,user_id,country,country_region,source,platform,device_family,os_version,event_1,event_2,event_3,revenue
0,a62c86b4,cr,Provincia de San Jose,Organic,iOS,Apple iPhone,14.4,0,102,0.0,0.317348
1,1d5189a4,ie,Leinster,Organic,Android,Samsung Galaxy Phone,8.0.0,0,32,,0.105701
2,9a2ad63a,uy,Departamento de Montevideo,Organic,Android,samsung samsung SM-A107M,10,0,4,0.0,0.003268
3,e1b19bfd,ar,Buenos Aires,Organic,ios,Apple iPad,12.5.1,0,117,0.0,0.180275
4,312293d4,es,Valencia,Organic,iOS,Apple iPhone,12.5.1,0,57,1.0,0.133304


In [12]:
# Agregar revenue por país
country_revenue = df.groupby('country').agg({
    'revenue': 'sum',
    'user_id': 'count'  # Número de usuarios
}).reset_index()

country_revenue.columns = ['country', 'total_revenue', 'num_users']

# Calcular revenue promedio por usuario
country_revenue['avg_revenue_per_user'] = country_revenue['total_revenue'] / country_revenue['num_users']

# Ordenar por revenue total
country_revenue = country_revenue.sort_values('total_revenue', ascending=False)

print(f"Total países: {len(country_revenue)}")
print(f"\nTop 10 países por revenue:")
country_revenue.head(10)

Total países: 91

Top 10 países por revenue:


Unnamed: 0,country,total_revenue,num_users,avg_revenue_per_user
64,pe,589457.325292,130,4534.287118
29,es,1528.458848,12892,0.118559
31,fr,437.261839,3310,0.132103
85,us,128.967276,299,0.431329
56,mx,72.167796,840,0.085914
46,it,49.472192,253,0.195542
23,de,44.721057,168,0.266197
3,ar,35.062253,497,0.070548
15,ch,33.883927,87,0.38947
19,co,23.274629,254,0.091632


In [13]:
# Crear treemap interactivo
fig = px.treemap(
    country_revenue,
    path=['country'],
    values='total_revenue',
    color='avg_revenue_per_user',
    color_continuous_scale='RdYlGn',
    hover_data={
        'total_revenue': ':$,.2f',
        'num_users': ':,',
        'avg_revenue_per_user': ':$,.2f'
    },
    title='Revenue por País (tamaño = revenue total, color = revenue promedio por usuario)'
)

fig.update_traces(
    textposition='middle center',
    texttemplate='<b>%{label}</b><br>$%{value:,.0f}',
    marker=dict(line=dict(width=2, color='white'))
)

fig.update_layout(
    width=1200,
    height=800,
    font=dict(size=12)
)

fig.show()

In [14]:
# Estadísticas por país
print("="*80)
print("ESTADÍSTICAS POR PAÍS")
print("="*80)

total_revenue = country_revenue['total_revenue'].sum()
total_users = country_revenue['num_users'].sum()

print(f"\nRevenue total: ${total_revenue:,.2f}")
print(f"Total usuarios: {total_users:,}")
print(f"Revenue promedio global: ${total_revenue/total_users:,.2f}")

# Top 5 países por revenue total
print("\n" + "="*80)
print("TOP 5 PAÍSES POR REVENUE TOTAL")
print("="*80)
for idx, row in country_revenue.head(5).iterrows():
    pct = (row['total_revenue'] / total_revenue) * 100
    print(f"{row['country']:20s} | Revenue: ${row['total_revenue']:>12,.2f} ({pct:>5.1f}%) | Users: {row['num_users']:>6,} | Avg: ${row['avg_revenue_per_user']:>8,.2f}")

# Top 5 países por revenue promedio por usuario
country_revenue_sorted_avg = country_revenue.sort_values('avg_revenue_per_user', ascending=False)
print("\n" + "="*80)
print("TOP 5 PAÍSES POR REVENUE PROMEDIO POR USUARIO")
print("="*80)
for idx, row in country_revenue_sorted_avg.head(5).iterrows():
    print(f"{row['country']:20s} | Avg: ${row['avg_revenue_per_user']:>8,.2f} | Users: {row['num_users']:>6,} | Total: ${row['total_revenue']:>12,.2f}")

# Concentración de revenue
country_revenue['cumulative_pct'] = (country_revenue['total_revenue'].cumsum() / total_revenue) * 100
top_5_countries = country_revenue.head(5)['cumulative_pct'].iloc[-1]
top_10_countries = country_revenue.head(10)['cumulative_pct'].iloc[-1]

print("\n" + "="*80)
print("CONCENTRACIÓN DE REVENUE")
print("="*80)
print(f"Top 5 países concentran: {top_5_countries:.1f}% del revenue total")
print(f"Top 10 países concentran: {top_10_countries:.1f}% del revenue total")

ESTADÍSTICAS POR PAÍS

Revenue total: $591,926.61
Total usuarios: 20,000
Revenue promedio global: $29.60

TOP 5 PAÍSES POR REVENUE TOTAL
pe                   | Revenue: $  589,457.33 ( 99.6%) | Users:    130 | Avg: $4,534.29
es                   | Revenue: $    1,528.46 (  0.3%) | Users: 12,892 | Avg: $    0.12
fr                   | Revenue: $      437.26 (  0.1%) | Users:  3,310 | Avg: $    0.13
us                   | Revenue: $      128.97 (  0.0%) | Users:    299 | Avg: $    0.43
mx                   | Revenue: $       72.17 (  0.0%) | Users:    840 | Avg: $    0.09

TOP 5 PAÍSES POR REVENUE PROMEDIO POR USUARIO
pe                   | Avg: $4,534.29 | Users:    130 | Total: $  589,457.33
gi                   | Avg: $    1.16 | Users:      2 | Total: $        2.33
za                   | Avg: $    0.77 | Users:      1 | Total: $        0.77
us                   | Avg: $    0.43 | Users:    299 | Total: $      128.97
nz                   | Avg: $    0.40 | Users:      3 | Total: $    

In [15]:
# Treemap solo con Top 20 países (más legible)
top_20_countries = country_revenue.head(20).copy()

fig2 = px.treemap(
    top_20_countries,
    path=['country'],
    values='total_revenue',
    color='avg_revenue_per_user',
    color_continuous_scale='RdYlGn',
    hover_data={
        'total_revenue': ':$,.2f',
        'num_users': ':,',
        'avg_revenue_per_user': ':$,.2f'
    },
    title='Top 20 Países por Revenue (tamaño = revenue total, color = revenue promedio por usuario)'
)

fig2.update_traces(
    textposition='middle center',
    texttemplate='<b>%{label}</b><br>$%{value:,.0f}<br>%{customdata[1]:,} users',
    marker=dict(line=dict(width=3, color='white'))
)

fig2.update_layout(
    width=1400,
    height=900,
    font=dict(size=14)
)

fig2.show()

In [16]:
# Análisis de whales por país
df['is_whale'] = df['revenue'] > 1000

whale_analysis = df.groupby('country').agg({
    'revenue': ['sum', 'mean', 'count'],
    'is_whale': 'sum'
}).reset_index()

whale_analysis.columns = ['country', 'total_revenue', 'avg_revenue', 'num_users', 'num_whales']
whale_analysis['whale_revenue_pct'] = (whale_analysis['num_whales'] / whale_analysis['num_users']) * 100

# Ordenar por número de whales
whale_analysis = whale_analysis.sort_values('num_whales', ascending=False)

print("="*80)
print("TOP 10 PAÍSES POR NÚMERO DE WHALES (revenue > $1000)")
print("="*80)
for idx, row in whale_analysis.head(10).iterrows():
    print(f"{row['country']:20s} | Whales: {int(row['num_whales']):>3} ({row['whale_revenue_pct']:>5.1f}%) | Users: {int(row['num_users']):>6,} | Total Revenue: ${row['total_revenue']:>12,.2f}")

TOP 10 PAÍSES POR NÚMERO DE WHALES (revenue > $1000)
pe                   | Whales:  54 ( 41.5%) | Users:    130 | Total Revenue: $  589,457.33
ad                   | Whales:   0 (  0.0%) | Users:     12 | Total Revenue: $        0.68
pt                   | Whales:   0 (  0.0%) | Users:     82 | Total Revenue: $       11.98
pl                   | Whales:   0 (  0.0%) | Users:      2 | Total Revenue: $        0.00
pf                   | Whales:   0 (  0.0%) | Users:      1 | Total Revenue: $        0.20
pa                   | Whales:   0 (  0.0%) | Users:     16 | Total Revenue: $        0.22
nz                   | Whales:   0 (  0.0%) | Users:      3 | Total Revenue: $        1.19
no                   | Whales:   0 (  0.0%) | Users:      2 | Total Revenue: $        0.28
nl                   | Whales:   0 (  0.0%) | Users:     28 | Total Revenue: $        3.75
ni                   | Whales:   0 (  0.0%) | Users:      3 | Total Revenue: $        0.00
