### Table of contents
# 1. Exploratory data analysis
### 1.1 Import assets
### 1.2 Data consistency and wrangling steps
### 1.3 Data cleaning
### 1.4 Descriptive analysis
### 1.5 Exporting the cleaned data set

# 1.1 Import assets

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Import datasets
path = r'C:\Users\raque\Documents\Achievement 6\Forbes billionaires 1997-2023 analysis'
df_billio = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'all_billionaires_1997_2023.csv'), index_col = False)

# 1.2 Data consistency and wrangling steps

In [3]:
df_billio.shape

(31732, 19)

In [4]:
df_billio.head()

Unnamed: 0,year,month,rank,net_worth,last_name,first_name,full_name,birth_date,age,gender,country_of_citizenship,country_of_residence,city_of_residence,business_category,business_industries,organization_name,position_in_organization,self_made,wealth_status
0,1997,7,1,2.0 B,Sophonpanich,Chatri,Chatri Sophonpanich & family,28/02/1934,73.0,Male,Thailand,Thailand,Bangkok,Finance and Investments,['Finance and Investments'],,,False,
1,1997,7,2,1.8 B,Adulyadej,King Bhumibol,King Bhumibol Adulyadej,05/12/1927,69.0,Male,Thailand,,,,,,,False,
2,1998,7,1,3.3 B,Safra,Edmond,Edmond Safra,06/08/1932,65.0,Male,Lebanon,Lebanon,,,,,,True,
3,1999,7,1,7.1 B,Schwarz,Dieter,Dieter Schwarz,24/09/1939,59.0,Male,Germany,Germany,Neckarsulm,Fashion & Retail,['Fashion & Retail'],,,True,
4,1999,7,2,2.9 B,Ebner,Martin,Martin Ebner,01/08/1945,53.0,Male,Switzerland,Switzerland,Wilen,Finance and Investments,['Finance and Investments'],,,False,


In [5]:
df_billio.tail()

Unnamed: 0,year,month,rank,net_worth,last_name,first_name,full_name,birth_date,age,gender,country_of_citizenship,country_of_residence,city_of_residence,business_category,business_industries,organization_name,position_in_organization,self_made,wealth_status
31727,2023,4,2540,1.0 B,Yu,Rong,Yu Rong,14/12/1971,51.0,Male,China,China,Shanghai,Healthcare,['Healthcare'],,,True,Decreased
31728,2023,4,2540,1.0 B,Yuengling,Richard,"Richard Yuengling, Jr.",10/03/1943,80.0,Male,United States,United States,Pottsville,Food & Beverage,['Food & Beverage'],,,False,Remained Even
31729,2023,4,2540,1.0 B,Zhang,Gongyun,Zhang Gongyun,18/12/1962,60.0,Male,China,China,Gaomi,Manufacturing,['Manufacturing'],,,True,Returned to List
31730,2023,4,2540,1.0 B,Zhang,Guiping,Zhang Guiping & family,21/08/1951,71.0,Male,China,China,Nanjing,Real Estate,['Real Estate'],,,True,Decreased
31731,2023,4,2540,1.0 B,Zobel,Inigo,Inigo Zobel,01/11/1956,66.0,Male,Philippines,Philippines,Makati,Diversified,['Diversified'],,,False,Returned to List


In [6]:
df_billio['net_worth'] = df_billio['net_worth'].str.rstrip(' B')

In [7]:
df_billio.rename(columns={"net_worth": "net_worth_billions"}, inplace=True)

In [8]:
df_billio.head(2)

Unnamed: 0,year,month,rank,net_worth_billions,last_name,first_name,full_name,birth_date,age,gender,country_of_citizenship,country_of_residence,city_of_residence,business_category,business_industries,organization_name,position_in_organization,self_made,wealth_status
0,1997,7,1,2.0,Sophonpanich,Chatri,Chatri Sophonpanich & family,28/02/1934,73.0,Male,Thailand,Thailand,Bangkok,Finance and Investments,['Finance and Investments'],,,False,
1,1997,7,2,1.8,Adulyadej,King Bhumibol,King Bhumibol Adulyadej,05/12/1927,69.0,Male,Thailand,,,,,,,False,


In [9]:
# Checking missing data 
df_billio.isnull().sum()

year                            0
month                           0
rank                            0
net_worth_billions              0
last_name                    3689
first_name                   3747
full_name                       0
birth_date                   4404
age                           675
gender                       3829
country_of_citizenship          9
country_of_residence          702
city_of_residence             935
business_category            5843
business_industries           990
organization_name           27344
position_in_organization    27040
self_made                    3689
wealth_status                7146
dtype: int64

In [10]:
# Check for mixed types
for col in df_billio.columns.tolist():
  weird = (df_billio[[col]].applymap(type) != df_billio[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_billio[weird]) > 0:
    print (col)

last_name
first_name
birth_date
gender
country_of_citizenship
country_of_residence
city_of_residence
business_category
business_industries
organization_name
position_in_organization
self_made
wealth_status


In [11]:
# Checking data types
df_billio.dtypes

year                          int64
month                         int64
rank                          int64
net_worth_billions           object
last_name                    object
first_name                   object
full_name                    object
birth_date                   object
age                         float64
gender                       object
country_of_citizenship       object
country_of_residence         object
city_of_residence            object
business_category            object
business_industries          object
organization_name            object
position_in_organization     object
self_made                    object
wealth_status                object
dtype: object

In [13]:
# Changing data types
df_billio['rank'] = df_billio['rank'].astype('str')
df_billio['net_worth_billions'] = df_billio['net_worth_billions'].astype('float64')
df_billio['last_name'] = df_billio['last_name'].astype('str')
df_billio['first_name'] = df_billio['first_name'].astype('str')
df_billio['birth_date'] = df_billio['birth_date'].astype('str')
df_billio['full_name'] = df_billio['full_name'].astype('str')
df_billio['age'] = df_billio['age'].astype('float')
df_billio['gender'] = df_billio['gender'].astype('str')
df_billio['country_of_citizenship'] = df_billio['country_of_citizenship'].astype('str')
df_billio['country_of_residence'] = df_billio['country_of_residence'].astype('str')
df_billio['city_of_residence'] = df_billio['city_of_residence'].astype('str')
df_billio['business_category'] = df_billio['business_category'].astype('str')
df_billio['business_industries'] = df_billio['business_industries'].astype('str')
df_billio['organization_name'] = df_billio['organization_name'].astype('str')
df_billio['position_in_organization'] = df_billio['position_in_organization'].astype('str')
df_billio['self_made'] = df_billio['self_made'].astype('bool')
df_billio['wealth_status'] = df_billio['wealth_status'].astype('str')

# 1.3 Data cleaning

In [14]:
# Removing columns
df_billio = df_billio.drop(columns = ['month', 'birth_date', 'business_category', 'last_name', 'first_name'])

In [15]:
df_billio.head()

Unnamed: 0,year,rank,net_worth_billions,full_name,age,gender,country_of_citizenship,country_of_residence,city_of_residence,business_industries,organization_name,position_in_organization,self_made,wealth_status
0,1997,1,2.0,Chatri Sophonpanich & family,73.0,Male,Thailand,Thailand,Bangkok,['Finance and Investments'],,,False,
1,1997,2,1.8,King Bhumibol Adulyadej,69.0,Male,Thailand,,,,,,False,
2,1998,1,3.3,Edmond Safra,65.0,Male,Lebanon,Lebanon,,,,,True,
3,1999,1,7.1,Dieter Schwarz,59.0,Male,Germany,Germany,Neckarsulm,['Fashion & Retail'],,,True,
4,1999,2,2.9,Martin Ebner,53.0,Male,Switzerland,Switzerland,Wilen,['Finance and Investments'],,,False,


In [16]:
# Storing the duplicated values in df_dups
df_dups = df_billio[df_billio.duplicated()]

In [17]:
# Checking the duplicates 
df_dups

Unnamed: 0,year,rank,net_worth_billions,full_name,age,gender,country_of_citizenship,country_of_residence,city_of_residence,business_industries,organization_name,position_in_organization,self_made,wealth_status
14219,2016,854,2.1,Oleg Deripaska,48.0,Male,Russia,Russia,Moscow,['Metals & Mining'],,,True,Decreased


In [18]:
# Dropping the duplicate row
df_billio_no_dups = df_billio.drop_duplicates()

# 1.4 Descriptive analysis

In [19]:
df_billio_no_dups.head()

Unnamed: 0,year,rank,net_worth_billions,full_name,age,gender,country_of_citizenship,country_of_residence,city_of_residence,business_industries,organization_name,position_in_organization,self_made,wealth_status
0,1997,1,2.0,Chatri Sophonpanich & family,73.0,Male,Thailand,Thailand,Bangkok,['Finance and Investments'],,,False,
1,1997,2,1.8,King Bhumibol Adulyadej,69.0,Male,Thailand,,,,,,False,
2,1998,1,3.3,Edmond Safra,65.0,Male,Lebanon,Lebanon,,,,,True,
3,1999,1,7.1,Dieter Schwarz,59.0,Male,Germany,Germany,Neckarsulm,['Fashion & Retail'],,,True,
4,1999,2,2.9,Martin Ebner,53.0,Male,Switzerland,Switzerland,Wilen,['Finance and Investments'],,,False,


In [20]:
df_billio_no_dups.tail()

Unnamed: 0,year,rank,net_worth_billions,full_name,age,gender,country_of_citizenship,country_of_residence,city_of_residence,business_industries,organization_name,position_in_organization,self_made,wealth_status
31727,2023,2540,1.0,Yu Rong,51.0,Male,China,China,Shanghai,['Healthcare'],,,True,Decreased
31728,2023,2540,1.0,"Richard Yuengling, Jr.",80.0,Male,United States,United States,Pottsville,['Food & Beverage'],,,False,Remained Even
31729,2023,2540,1.0,Zhang Gongyun,60.0,Male,China,China,Gaomi,['Manufacturing'],,,True,Returned to List
31730,2023,2540,1.0,Zhang Guiping & family,71.0,Male,China,China,Nanjing,['Real Estate'],,,True,Decreased
31731,2023,2540,1.0,Inigo Zobel,66.0,Male,Philippines,Philippines,Makati,['Diversified'],,,False,Returned to List


In [21]:
df_billio_no_dups.shape

(31731, 14)

In [22]:
df_billio_no_dups.columns

Index(['year', 'rank', 'net_worth_billions', 'full_name', 'age', 'gender',
       'country_of_citizenship', 'country_of_residence', 'city_of_residence',
       'business_industries', 'organization_name', 'position_in_organization',
       'self_made', 'wealth_status'],
      dtype='object')

In [23]:
df_billio_no_dups.info

<bound method DataFrame.info of        year  rank  net_worth_billions                     full_name   age  \
0      1997     1                 2.0  Chatri Sophonpanich & family  73.0   
1      1997     2                 1.8       King Bhumibol Adulyadej  69.0   
2      1998     1                 3.3                  Edmond Safra  65.0   
3      1999     1                 7.1                Dieter Schwarz  59.0   
4      1999     2                 2.9                  Martin Ebner  53.0   
...     ...   ...                 ...                           ...   ...   
31727  2023  2540                 1.0                       Yu Rong  51.0   
31728  2023  2540                 1.0        Richard Yuengling, Jr.  80.0   
31729  2023  2540                 1.0                 Zhang Gongyun  60.0   
31730  2023  2540                 1.0        Zhang Guiping & family  71.0   
31731  2023  2540                 1.0                   Inigo Zobel  66.0   

      gender country_of_citizenship country

In [24]:
df_billio_no_dups.describe(include = 'object')

Unnamed: 0,rank,full_name,gender,country_of_citizenship,country_of_residence,city_of_residence,business_industries,organization_name,position_in_organization,wealth_status
count,31731,31731,31731,31731,31731,31731,31731,31731.0,31731.0,31731
unique,839,4982,3,89,95,1139,68,568.0,165.0,5
top,1818,Robert Miller,Male,United States,United States,New York,['Manufacturing'],,,Increased
freq,234,27,24867,10142,10058,1368,2818,27343.0,27039.0,12367


In [25]:
df_billio_no_dups.describe()

Unnamed: 0,year,net_worth_billions,age
count,31731.0,31731.0,31056.0
mean,2015.755948,4.022348,62.835555
std,5.644283,7.10559,14.175046
min,1997.0,0.6,0.0
25%,2012.0,1.4,54.0
50%,2017.0,2.2,63.0
75%,2021.0,3.9,73.0
max,2023.0,219.0,101.0


## 1.5 Exporting the clean data set

In [26]:
# Exporting df_billio_no_dups
df_billio_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'all_billionaires_1997_2023_clean.csv'), index=False)