In [1]:
import numpy as np
import pandas as pd

from thesis_tools.utils.data import read_billionaires_data
from thesis_tools.utils.latex import dataframe_to_latex_table

In [2]:
# Load the billionaires data via the project helper, called here with its
# default arguments (a later cell passes only_years to restrict the years).
df = read_billionaires_data()

In [3]:
# Quick look at the loaded frame (the rendered output is truncated to the
# first and last rows).
df

Unnamed: 0,year,rank,net_worth,full_name,self_made,country_of_citizenship,region,sub_region,log_net_worth
0,1997-01-01,,2.0,Chatri Sophonpanich & family,False,Thailand,East Asia,Southeast Asia,0.693147
1,1997-01-01,,1.8,King Bhumibol Adulyadej,False,Thailand,East Asia,Southeast Asia,0.587787
2,1998-01-01,,3.3,Edmond Safra,True,Lebanon,Middle East,Not a sub-region,1.193922
3,1999-01-01,,1.0,Srichand & Gopichand Hinduja,False,India,India,India,0.000000
4,1999-01-01,,7.1,Dieter Schwarz,True,Germany,Europe,Germany,1.960095
...,...,...,...,...,...,...,...,...,...
31727,2023-01-01,2540.0,1.0,Yu Rong,True,China,China,China,0.000000
31728,2023-01-01,2540.0,1.0,"Richard Yuengling, Jr.",False,United States,North America,U.S.,0.000000
31729,2023-01-01,2540.0,1.0,Zhang Gongyun,True,China,China,China,0.000000
31730,2023-01-01,2540.0,1.0,Zhang Guiping & family,True,China,China,China,0.000000


In [4]:
# List the available column names.
df.columns

Index(['year', 'rank', 'net_worth', 'full_name', 'self_made',
       'country_of_citizenship', 'region', 'sub_region', 'log_net_worth'],
      dtype='object')

In [5]:
# NOTE(review): `df` has not been modified since it was last displayed, so
# this cell repeats an earlier output — consider removing it.
df

Unnamed: 0,year,rank,net_worth,full_name,self_made,country_of_citizenship,region,sub_region,log_net_worth
0,1997-01-01,,2.0,Chatri Sophonpanich & family,False,Thailand,East Asia,Southeast Asia,0.693147
1,1997-01-01,,1.8,King Bhumibol Adulyadej,False,Thailand,East Asia,Southeast Asia,0.587787
2,1998-01-01,,3.3,Edmond Safra,True,Lebanon,Middle East,Not a sub-region,1.193922
3,1999-01-01,,1.0,Srichand & Gopichand Hinduja,False,India,India,India,0.000000
4,1999-01-01,,7.1,Dieter Schwarz,True,Germany,Europe,Germany,1.960095
...,...,...,...,...,...,...,...,...,...
31727,2023-01-01,2540.0,1.0,Yu Rong,True,China,China,China,0.000000
31728,2023-01-01,2540.0,1.0,"Richard Yuengling, Jr.",False,United States,North America,U.S.,0.000000
31729,2023-01-01,2540.0,1.0,Zhang Gongyun,True,China,China,China,0.000000
31730,2023-01-01,2540.0,1.0,Zhang Guiping & family,True,China,China,China,0.000000


In [6]:
# Stage 1: one row per (region, year) holding the number of non-null names
# and the mean (log) net worth of that region's billionaires in that year.
grouped = df.groupby(['region', 'year']).agg(
    num_billionaires=('full_name', 'count'),
    avg_net_worth=('net_worth', 'mean'),
    avg_log_net_worth=('log_net_worth', 'mean')
).reset_index()

# Stage 2: collapse the yearly rows into one row per region.
# The averages are unweighted means over the yearly values, i.e. every year
# counts equally regardless of how many billionaires it had.
# groupby('region') already returns a frame indexed by 'region', so no
# reset_index()/set_index() round trip is needed afterwards.
summary_df = grouped.groupby('region').agg(
    avg_num_billionaires=('num_billionaires', 'mean'),
    avg_net_worth=('avg_net_worth', 'mean'),
    avg_log_net_worth=('avg_log_net_worth', 'mean'),
    min_num_billionaires=('num_billionaires', 'min'),
    max_num_billionaires=('num_billionaires', 'max')
)

summary_df


Unnamed: 0_level_0,avg_num_billionaires,avg_net_worth,avg_log_net_worth,min_num_billionaires,max_num_billionaires
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Central Eurasia,78.217391,3.717604,0.934771,5,132
China,211.375,3.233832,0.793005,1,698
East Asia,133.28,2.717816,0.762768,2,306
Europe,248.88,4.114906,1.056303,3,509
India,61.875,3.958797,0.940472,1,169
Middle East,53.2,2.944171,0.806056,1,90
North America,471.956522,4.231217,0.966241,176,800
Rest of World,10.73913,3.543045,1.021715,2,23
South America,55.52,4.016755,0.914527,1,113


In [7]:
# Render the per-region summary as a LaTeX table.
# NOTE(review): dataframe_to_latex_table is a project helper not shown here;
# judging by the emitted table, n_decimals=1 fixes the decimals of the float
# columns while integer columns are printed unchanged — confirm in the helper.
dataframe_to_latex_table(
    summary_df, 
    caption='Summary statistics of Forbes billionaires data grouped by region', 
    label='tab:table_summary_statistics_billionaires',
    n_decimals=1
)

\begin{table}
\caption{Summary statistics of Forbes billionaires data grouped by region}
\label{tab:table_summary_statistics_billionaires}
\begin{tabular}{lrrrrr}
\toprule
 & avg\_num\_billionaires & avg\_net\_worth & avg\_log\_net\_worth & min\_num\_billionaires & max\_num\_billionaires \\
region &  &  &  &  &  \\
\midrule
Central Eurasia & 78.2 & 3.7 & 0.9 & 5 & 132 \\
China & 211.4 & 3.2 & 0.8 & 1 & 698 \\
East Asia & 133.3 & 2.7 & 0.8 & 2 & 306 \\
Europe & 248.9 & 4.1 & 1.1 & 3 & 509 \\
India & 61.9 & 4.0 & 0.9 & 1 & 169 \\
Middle East & 53.2 & 2.9 & 0.8 & 1 & 90 \\
North America & 472.0 & 4.2 & 1.0 & 176 & 800 \\
Rest of World & 10.7 & 3.5 & 1.0 & 2 & 23 \\
South America & 55.5 & 4.0 & 0.9 & 1 & 113 \\
\bottomrule
\end{tabular}
\end{table}



In [8]:
def billionaires_stats(df):
    """Summarise the year-over-year turnover of the billionaires list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``year`` column that ``pd.to_datetime`` can convert and
        a ``full_name`` column. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by calendar ``year`` (ascending), with columns

        - ``N``: number of non-null ``full_name`` rows in that year.
        - ``change``: ``N`` minus the ``N`` of the previous year in the data
          (0 for the first year).
        - ``new``: distinct names present this year but not the previous year.
        - ``left_list``: distinct names present the previous year but not
          this year.
        - ``remained``: distinct names present in both years.
        - ``share_remained``: ``remained`` divided by the *current* year's
          ``N`` (0 when ``N`` is 0).

        ``new``, ``left_list`` and ``remained`` are 0 for the first year.

    Notes
    -----
    "Previous year" always means the closest earlier year that is present in
    ``df``. This keeps ``new``/``left_list``/``remained`` consistent with
    ``change`` when the data has a gap in years.

    ``N`` counts rows whereas ``new``/``left_list``/``remained`` count
    distinct names, so ``change`` can differ from ``new - left_list`` when a
    name occurs more than once within a year.
    """
    # Copy only the two columns that are needed, so the input is left untouched.
    names = df[['year', 'full_name']].copy()
    names['year'] = pd.to_datetime(names['year'], unit='ns').dt.year

    # groupby sorts its keys, so everything derived from it is in year order.
    by_year = names.groupby('year')['full_name']
    yearly_counts = by_year.count()
    yearly_diff = yearly_counts.diff().fillna(0).astype(int)

    # Build each year's set of names once instead of re-filtering the frame
    # twice per loop iteration.
    names_by_year = {year: set(group) for year, group in by_year}

    new_billionaires = []
    left_billionaires = []
    remained_billionaires = []

    previous_names = None
    for year in yearly_counts.index:
        current_names = names_by_year[year]
        if previous_names is None:
            # First year in the data: nothing to compare against.
            new_billionaires.append(0)
            left_billionaires.append(0)
            remained_billionaires.append(0)
        else:
            new_billionaires.append(len(current_names - previous_names))
            left_billionaires.append(len(previous_names - current_names))
            remained_billionaires.append(len(current_names & previous_names))
        previous_names = current_names

    # Share of the current year's rows that were already listed the year before.
    proportion_remained = [
        rem / total if total > 0 else 0
        for rem, total in zip(remained_billionaires, yearly_counts)
    ]

    stats_df = pd.DataFrame({
        'year': yearly_counts.index,
        'N': yearly_counts.values,
        'change': yearly_diff.values,
        'new': new_billionaires,
        'left_list': left_billionaires,
        'remained': remained_billionaires,
        'share_remained': proportion_remained
    })

    return stats_df.set_index('year')


In [9]:
# Preview the yearly turnover table computed from the current `df`.
billionaires_stats(df)

Unnamed: 0_level_0,N,change,new,left_list,remained,share_remained
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1997,2,0,0,0,0,0.0
1998,1,-1,1,2,0,0.0
1999,8,7,7,0,1,0.125
2000,8,0,5,5,3,0.375
2001,335,327,332,5,3,0.008955
2002,333,-2,27,29,306,0.918919
2003,332,-1,38,39,294,0.885542
2004,432,100,101,1,331,0.766204
2005,530,98,101,3,429,0.809434
2006,628,98,108,10,520,0.828025


In [10]:
# Render the yearly turnover table as LaTeX. billionaires_stats(df) is
# recomputed here rather than reusing a stored result.
# NOTE(review): dataframe_to_latex_table is a project helper not shown here;
# judging by the emitted table, n_decimals=2 only affects the float column
# (share_remained) — confirm in the helper.
dataframe_to_latex_table(
    billionaires_stats(df), 
    caption='Changes in the number of billionaires in the Forbes dataset over time', 
    label='tab:table_billionaires_changes_over_time',
    n_decimals=2
)

\begin{table}
\caption{Changes in the number of billionaires in the Forbes dataset over time}
\label{tab:table_billionaires_changes_over_time}
\begin{tabular}{lrrrrrr}
\toprule
 & N & change & new & left\_list & remained & share\_remained \\
year &  &  &  &  &  &  \\
\midrule
1997 & 2 & 0 & 0 & 0 & 0 & 0.00 \\
1998 & 1 & -1 & 1 & 2 & 0 & 0.00 \\
1999 & 8 & 7 & 7 & 0 & 1 & 0.12 \\
2000 & 8 & 0 & 5 & 5 & 3 & 0.38 \\
2001 & 335 & 327 & 332 & 5 & 3 & 0.01 \\
2002 & 333 & -2 & 27 & 29 & 306 & 0.92 \\
2003 & 332 & -1 & 38 & 39 & 294 & 0.89 \\
2004 & 432 & 100 & 101 & 1 & 331 & 0.77 \\
2005 & 530 & 98 & 101 & 3 & 429 & 0.81 \\
2006 & 628 & 98 & 108 & 10 & 520 & 0.83 \\
2007 & 761 & 133 & 147 & 14 & 614 & 0.81 \\
2008 & 908 & 147 & 223 & 76 & 685 & 0.75 \\
2009 & 738 & -170 & 39 & 209 & 699 & 0.95 \\
2010 & 1011 & 273 & 354 & 81 & 657 & 0.65 \\
2011 & 1209 & 198 & 255 & 58 & 953 & 0.79 \\
2012 & 1226 & 17 & 212 & 195 & 1013 & 0.83 \\
2013 & 1426 & 200 & 304 & 104 & 1121 & 0.79 \\
2014 & 1645 &

In [11]:
# Reload the data restricted to the 2023 list.
# NOTE(review): this rebinds `df`, which the cells above use for the full
# data set; re-running those cells after this one would silently operate on
# the 2023 subset. A distinct name (e.g. df_2023) would avoid that.
df = read_billionaires_data(only_years=['2023'])

In [12]:
# Inspect the reloaded frame.
df

Unnamed: 0,year,rank,net_worth,full_name,self_made,country_of_citizenship,region,sub_region,log_net_worth
0,2023-01-01,1,211.0,Bernard Arnault & family,False,France,Europe,France,5.351858
1,2023-01-01,2,180.0,Elon Musk,True,United States,North America,U.S.,5.192957
2,2023-01-01,3,114.0,Jeff Bezos,True,United States,North America,U.S.,4.736198
3,2023-01-01,4,107.0,Larry Ellison,True,United States,North America,U.S.,4.672829
4,2023-01-01,5,106.0,Warren Buffett,True,United States,North America,U.S.,4.663439
...,...,...,...,...,...,...,...,...,...
2635,2023-01-01,2540,1.0,Yu Rong,True,China,China,China,0.000000
2636,2023-01-01,2540,1.0,"Richard Yuengling, Jr.",False,United States,North America,U.S.,0.000000
2637,2023-01-01,2540,1.0,Zhang Gongyun,True,China,China,China,0.000000
2638,2023-01-01,2540,1.0,Zhang Guiping & family,True,China,China,China,0.000000


In [18]:
# Share of self-made billionaires per sub-region: the mean of the
# 'self_made' flag within each sub-region.
# `sub_regions` is not needed for the computation below; it is kept because
# other cells may rely on it.
sub_regions = df['sub_region'].unique()

# One groupby pass instead of re-filtering the whole frame once per
# sub-region. sort=False keeps first-appearance order (the order of
# `sub_regions`); observed=True only matters if 'sub_region' is categorical.
# Rows with a missing sub_region are left out instead of yielding a NaN entry.
self_made_share = (
    df.groupby('sub_region', sort=False, observed=True)['self_made']
    .mean()
    .to_dict()
)

# One row per sub-region, highest self-made share first.
self_made_share_df = (
    pd.DataFrame(self_made_share, index=['self_made_share'])
    .T
    .sort_values(by='self_made_share', ascending=False)
)

In [19]:
# Show the sub-regions ranked by their self-made share (descending).
self_made_share_df

Unnamed: 0,self_made_share
Russia,0.990385
China,0.934046
British Islands,0.885246
Japan,0.775
Canada,0.761905
U.S.,0.717007
Australia,0.680851
Israel + Turkey,0.642857
Not a sub-region,0.614583
Asian Islands,0.589474
