In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_bill = pd.read_csv("./bill_stats.csv")

In [14]:
pd.set_option("display.max_columns", None)

In [5]:
df_bill.dtypes

rank                                                   int64
finalWorth                                             int64
category                                            category
personName                                            object
age                                                    int64
country                                               object
city                                                  object
source                                                object
industries                                            object
countryOfCitizenship                                  object
organization                                          object
selfMade                                                bool
status                                              category
gender                                              category
birthDate                                     datetime64[ns]
lastName                                              object
firstName               

In [3]:
df_bill.replace(
    {
        np.nan : 0
    }
    , inplace=True
)

In [4]:
df_bill = df_bill.astype(
    {
    'category' : 'category',
    'status' : 'category',
    'gender' : 'category',
    'birthDate' : 'datetime64[ns]',
    'date' : 'datetime64[ns]',
    'age' : 'int',
    'birthYear' : 'int',
    'birthMonth' : 'int',
    'birthDay' : 'int',
    'population_country' : 'int'
    }
)

In [6]:
df_bill['gdp_country'] = df_bill['gdp_country'] \
        .str.replace('$', '').str.replace(',', '').str.strip()
df_bill['gdp_country'] = pd.to_numeric(df_bill['gdp_country'])
df_bill['gdp_country'] = df_bill['gdp_country'].fillna(0)
df_bill['gdp_country'] = df_bill['gdp_country'].astype('int')
df_bill['nBillionairesCtr'] = df_bill.groupby(
    'country')['country'].transform('count')
df_bill['totalWealthCtr'] = df_bill.groupby('country')['finalWorth'].transform('sum')
df_bill['meanWealthCtr'] = df_bill.groupby('country')['finalWorth'].transform('mean')
df_bill['nBillionairesIndy'] = df_bill.groupby(
    'industries')['industries'].transform('count')
df_bill['totalWealthCtrIndy'] = df_bill.groupby(
    'industries')['finalWorth'].transform('sum')
df_bill['meanWealthCtrIndy'] = df_bill.groupby(
    'industries')['finalWorth'].transform('mean')
df_bill['birthDecade'] = pd.cut(df_bill['birthYear'], bins=range(
    1920, 2005, 10), include_lowest=True, right=False)
df_bill['birthDecade'] = df_bill['birthDecade'].astype('str')
df_bill['birthDecade'] = df_bill['birthDecade'].str.replace(
    '[', '').str.replace(',', ' -').str.replace(')', '')
df_bill['birthDecade'] = df_bill['birthDecade'].replace('2000+', np.nan)
df_bill = df_bill.drop(columns=["category", "organization",
                 "status", "lastName", "firstName", "title"], axis=1)

In [8]:
df_bill.dtypes

rank                                                   int64
finalWorth                                             int64
category                                            category
personName                                            object
age                                                    int64
country                                               object
city                                                  object
source                                                object
industries                                            object
countryOfCitizenship                                  object
organization                                          object
selfMade                                                bool
status                                              category
gender                                              category
birthDate                                     datetime64[ns]
lastName                                              object
firstName               

In [9]:
df_bill['birthMonth'].unique()

array([ 3,  6,  1,  8, 10,  2,  4,  7, 12,  5, 11,  9,  0])

In [10]:
birth_month_names = {
    0 : "NA",
    1 : "January",
    2 : "February",
    3 : "March", 
    4 : "April", 
    5 : "May",
    6 : "June", 
    7 : "July",
    8 : "August",
    9 : "September",
    10 : "October",
    11 : "November",
    12 : "December"
}

In [53]:
list(df_bill['category'].unique())

['Fashion & Retail',
 'Automotive',
 'Technology',
 'Finance & Investments',
 'Media & Entertainment',
 'Telecom',
 'Diversified',
 'Food & Beverage',
 'Logistics',
 'Gambling & Casinos',
 'Manufacturing',
 'Real Estate',
 'Metals & Mining',
 'Energy',
 'Healthcare',
 'Service',
 'Construction & Engineering',
 'Sports']

In [16]:
df_bill['country'].nunique()

79

In [33]:
for value in df_bill['category'].unique():
    print(f"'{value}'" + ",")

'Fashion & Retail',
'Automotive',
'Technology',
'Finance & Investments',
'Media & Entertainment',
'Telecom',
'Diversified',
'Food & Beverage',
'Logistics',
'Gambling & Casinos',
'Manufacturing',
'Real Estate',
'Metals & Mining',
'Energy',
'Healthcare',
'Service',
'Construction & Engineering',
'Sports',


In [35]:
df_bill.groupby('country').size().reset_index(name='count')

Unnamed: 0,country,count
0,0,38
1,Algeria,1
2,Andorra,1
3,Argentina,4
4,Armenia,1
...,...,...
74,United Kingdom,82
75,United States,754
76,Uruguay,1
77,Uzbekistan,1


In [57]:
df_bill['country'].unique()

array(['France', 'United States', 'Mexico', 'India', 'Spain', 'China',
       'Canada', 'Germany', 'Switzerland', 'Belgium', 'Hong Kong',
       'Austria', 'Japan', 'United Kingdom', 'Australia', 'Indonesia',
       'United Arab Emirates', 'Russia', 'Chile', 'Monaco',
       'Czech Republic', 'Sweden', 0, 'Thailand', 'Uzbekistan',
       'Singapore', 'Nigeria', 'Israel', 'Italy', 'South Africa',
       'Brazil', 'Malaysia', 'South Korea', 'New Zealand', 'Philippines',
       'Taiwan', 'Norway', 'Egypt', 'Denmark', 'Eswatini (Swaziland)',
       'Colombia', 'Netherlands', 'Poland', 'Bahamas', 'Ukraine',
       'Cayman Islands', 'Greece', 'Turkey', 'Argentina', 'Georgia',
       'Portugal', 'Kazakhstan', 'Algeria', 'Vietnam', 'Latvia',
       'Finland', 'Bermuda', 'Luxembourg', 'British Virgin Islands',
       'Cambodia', 'Lebanon', 'Oman', 'Ireland', 'Cyprus', 'Guernsey',
       'Liechtenstein', 'Turks and Caicos Islands', 'Romania', 'Qatar',
       'Uruguay', 'Nepal', 'Slovakia', 'Moro

In [43]:
df_bill.groupby('country')[['latitude_country', 'longitude_country']].size().reset_index()

Unnamed: 0,country,0
0,0,38
1,Algeria,1
2,Andorra,1
3,Argentina,4
4,Armenia,1
...,...,...
74,United Kingdom,82
75,United States,754
76,Uruguay,1
77,Uzbekistan,1


In [50]:
df_bill_countries = df_bill[df_bill['country'].isin(df_bill['country'].unique())]

In [51]:
df_bill_countries

Unnamed: 0,rank,finalWorth,category,personName,age,country,city,source,industries,countryOfCitizenship,...,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211000,Fashion & Retail,Bernard Arnault & family,74,France,Paris,LVMH,Fashion & Retail,France,...,1.1,"$2,715,518,274,227",65.6,102.5,82.5,24.2,60.7,67059887,46.227638,2.213749
1,2,180000,Automotive,Elon Musk,51,United States,Austin,"Tesla, SpaceX",Automotive,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523,37.090240,-95.712891
2,3,114000,Technology,Jeff Bezos,59,United States,Medina,Amazon,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523,37.090240,-95.712891
3,4,107000,Technology,Larry Ellison,78,United States,Lanai,Oracle,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523,37.090240,-95.712891
4,5,106000,Finance & Investments,Warren Buffett,92,United States,Omaha,Berkshire Hathaway,Finance & Investments,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523,37.090240,-95.712891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,2540,1000,Healthcare,Yu Rong,51,China,Shanghai,Health clinics,Healthcare,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1397715000,35.861660,104.195397
2636,2540,1000,Food & Beverage,"Richard Yuengling, Jr.",80,United States,Pottsville,Beer,Food & Beverage,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523,37.090240,-95.712891
2637,2540,1000,Manufacturing,Zhang Gongyun,60,China,Gaomi,Tyre manufacturing machinery,Manufacturing,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1397715000,35.861660,104.195397
2638,2540,1000,Real Estate,Zhang Guiping & family,71,China,Nanjing,Real estate,Real Estate,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1397715000,35.861660,104.195397


In [2]:
from faicons import icon_svg

In [7]:
df_bill.dtypes

rank                                                   int64
finalWorth                                             int64
personName                                            object
age                                                    int64
country                                               object
city                                                  object
source                                                object
industries                                            object
countryOfCitizenship                                  object
selfMade                                                bool
gender                                              category
birthDate                                     datetime64[ns]
date                                          datetime64[ns]
state                                                 object
residenceStateRegion                                  object
birthYear                                              int64
birthMonth              

In [8]:
df_bill_num = df_bill[df_bill.select_dtypes(include=['number', 'bool']).columns].copy()
df_bill_str = df_bill[df_bill.select_dtypes(include=['object', 'bool']).columns].copy()

In [9]:
df_bill_num.dtypes

rank                                            int64
finalWorth                                      int64
age                                             int64
selfMade                                         bool
birthYear                                       int64
birthMonth                                      int64
birthDay                                        int64
cpi_country                                   float64
cpi_change_country                            float64
gdp_country                                     int64
gross_tertiary_education_enrollment           float64
gross_primary_education_enrollment_country    float64
life_expectancy_country                       float64
tax_revenue_country_country                   float64
total_tax_rate_country                        float64
population_country                              int64
latitude_country                              float64
longitude_country                             float64
nBillionairesCtr            

In [12]:
df_bill_num.columns

Index(['rank', 'finalWorth', 'age', 'selfMade', 'birthYear', 'birthMonth',
       'birthDay', 'cpi_country', 'cpi_change_country', 'gdp_country',
       'gross_tertiary_education_enrollment',
       'gross_primary_education_enrollment_country', 'life_expectancy_country',
       'tax_revenue_country_country', 'total_tax_rate_country',
       'population_country', 'latitude_country', 'longitude_country',
       'nBillionairesCtr', 'totalWealthCtr', 'meanWealthCtr',
       'nBillionairesIndy', 'totalWealthCtrIndy', 'meanWealthCtrIndy'],
      dtype='object')

In [15]:
df_bill_num.head()

Unnamed: 0,rank,finalWorth,age,selfMade,birthYear,birthMonth,birthDay,cpi_country,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country,nBillionairesCtr,totalWealthCtr,meanWealthCtr,nBillionairesIndy,totalWealthCtrIndy,meanWealthCtrIndy
0,1,211000,74,False,1949,3,5,110.05,1.1,2715518274227,65.6,102.5,82.5,24.2,60.7,67059887,46.227638,2.213749,35,499500,14271.428571,266,1698800,6386.466165
1,2,180000,51,True,1971,6,28,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,754,4575100,6067.771883,73,525300,7195.890411
2,3,114000,59,True,1964,1,12,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,754,4575100,6067.771883,314,1877900,5980.573248
3,4,107000,78,True,1944,8,17,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,754,4575100,6067.771883,314,1877900,5980.573248
4,5,106000,92,True,1930,8,30,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,754,4575100,6067.771883,372,1605100,4314.784946


In [10]:
df_bill_str.dtypes

personName              object
country                 object
city                    object
source                  object
industries              object
countryOfCitizenship    object
selfMade                  bool
state                   object
residenceStateRegion    object
birthDecade             object
dtype: object