In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, fname) for fname in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
# Core data handling and plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import plotly as py
import plotly.graph_objs as go
import os
# Put plotly into offline notebook mode so its figures display in the notebook.
py.offline.init_notebook_mode(connected = True)
import datetime as dt
import missingno as msno
from wordcloud import WordCloud
import random
import matplotlib
import plotly.express as px
# Default resolution for every matplotlib figure created below.
plt.rcParams['figure.dpi'] = 140

# 1. Import Data

In [None]:
# Read the billionaires CSV into `data` and preview the first rows.
csv_path = "../input/forbes-billionaires-of-2021-20/forbes_billionaires.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = data.shape
print("There are {} row and {} columns in this data.".format(n_rows, n_cols))

### 1.1. Missing Data

In [None]:
# Count the missing values in each column.
data.isna().sum()

As can be seen, there are null values in 7 columns. Let's calculate the null ratio of each of those columns below.

In [None]:
# Print the percentage of missing values for every column that has any.
total_rows = len(data)
for column in data.columns:
    missing_pct = data[column].isna().sum() / total_rows * 100
    if not missing_pct > 0:
        continue
    print("{} null rate: {}%".format(column, round(missing_pct, 2)))

**Age**: We will find the average age and write it instead of the null values.

**Residence**: Write No Data instead of NaN

**Citizenship**: Fill the missing values from the Country column, because Country has no null values and is almost identical to Citizenship.

**Status**: We'll drop it down to two options. Married or Not Married.

**Children**: Write average children rate instead of NaN

**Education**: Write No Data instead of NaN

**Self_made**: The null rate in this column is very small, so we can either drop those rows with dropna or fill them with the most common Self_made value.

In [None]:
# Replace the missing values column by column.
#
# Each result is assigned back onto `data` rather than calling an in-place
# method on a column selection: that pattern is chained assignment and does
# not update the frame when pandas Copy-on-Write is active.

# Numeric columns: use the mean of the known values. Series.mean() skips NaN,
# so no hand-typed null percentages are needed to build the denominator.
data["Age"] = data["Age"].fillna(data["Age"].mean())
data["Children"] = data["Children"].fillna(data["Children"].mean())

# Text columns: mark missing entries explicitly.
for column in ["Residence", "Status", "Education"]:
    data[column] = data[column].fillna("No Data")

# Citizenship: fall back to the Country of the same row (Country has no
# nulls), instead of stamping one global most-frequent country on every gap.
data["Citizenship"] = data["Citizenship"].fillna(data["Country"])

# Self_made: only a handful of rows are missing, so use the most common value.
data["Self_made"] = data["Self_made"].fillna(data["Self_made"].mode()[0])

In [None]:
# Re-count the missing values per column after imputation.
data.isna().sum()

Now there are no null values left.

# 2. Let's Visualize

Forbes Color Palette

https://www.color-hex.com/color-palette/3133

In [None]:
# Preview the four greys used throughout the charts.
forbes_palette = ['#333333', '#565656', '#626262', '#eeeeee']
sns.palplot(forbes_palette)

title_style = dict(loc='left', fontfamily='serif', fontsize=15, y=1.2)
plt.title("Forbes Color Palette ", **title_style)
plt.show()

### 2.1. Status

In [None]:
# One-row frame: each column is a marital status, the single row its count.
x = data["Status"].value_counts()
x = pd.DataFrame(x).T

# Use the real row count as the denominator rather than a typed-in constant,
# so the percentages stay right if the dataset changes.
total_people = len(data)
for status in x.columns:
    share = x[status].sum() / total_people * 100
    if share > 0:
        print("{} y: {}%".format(status, round(share, 2)))

In [None]:
# Collapse the detailed statuses into "Married" and "Not Married".
# `x` is the one-row status-count frame; take its row explicitly with .iloc
# instead of relying on `series[0]` falling back to positional access.
status_counts = x.iloc[0]

# Statuses grouped under "Not Married".
not_married_statuses = ["Divorced", "Widowed", "Single", "In Relationship",
                        "Separated", "Widowed, Remarried", "Engaged"]

no_data = status_counts["No Data"]
married = status_counts["Married"]
# reindex() makes a status that is absent from the data count as 0 rather
# than raising KeyError.
not_married = status_counts.reindex(not_married_statuses, fill_value=0).sum()

married = pd.DataFrame(married, index = ["Status"], columns = ["Married"])
not_married = pd.DataFrame(not_married, index = ["Status"], columns = ["Not Married"])
x2 = pd.concat([married, not_married], axis = 1)
x2

The table above does not yet include the "No Data" entries. First, let's find the married / not married ratio, and then distribute the "No Data" entries proportionally.

In [None]:
# Percentage of married / not married among the rows with a known status.
# The denominator comes from `x2` itself, so the result does not depend on
# the current contents of the `married` / `not_married` globals.
known_total = x2.iloc[0].sum()
for status in x2.columns:
    share = x2[status].iloc[0] / known_total * 100
    if share > 0:
        print("{} y: {}%".format(status, round(share, 2)))

In [None]:
# Spread the "No Data" rows over Married / Not Married in proportion to the
# known counts. The share is computed from `x2`, not typed in from printed
# output.
known_counts = x2.iloc[0]
no_data_count = x["No Data"].iloc[0]
married_share = known_counts["Married"] / known_counts.sum()

# Round once and give the remainder to the other group so the two parts
# always add up to the number of "No Data" rows.
no_data_married = round(no_data_count * married_share)
no_data_not_married = no_data_count - no_data_married

married = known_counts["Married"] + no_data_married
not_married = known_counts["Not Married"] + no_data_not_married

married = pd.DataFrame(married, index = ["Status"], columns = ["Married"])
not_married = pd.DataFrame(not_married, index = ["Status"], columns = ["Not Married"])
x3 = pd.concat([married, not_married], axis = 1)
x3

Now let's check again Marriage rate...

In [None]:
# Percentage of married / not married after "No Data" has been distributed.
# The total is computed once from `x3` and rows are read with .iloc instead
# of positional `series[0]` access.
adjusted_total = x3.iloc[0].sum()
for status in x3.columns:
    share = x3[status].iloc[0] / adjusted_total * 100
    if share > 0:
        print("{} : {}%".format(status, round(share, 2)))

In [None]:
# Donut chart of the married / not married counts held in `x2`.
fig = px.pie(x2.T, values="Status", names=["Married", "Not Married"],
             width=800, height=500)

# Wedge appearance: two palette greys separated by a white outline.
wedge_style = dict(colors=["#333333", "#565656"],
                   line=dict(color="white", width=3))
fig.update_traces(textposition="inside",
                  textinfo="percent + label",
                  hole=0.3,
                  marker=wedge_style)

# Caption placed in the middle of the donut hole.
centre_caption = dict(text="Status",
                      x=0.5, y=0.5,
                      font_size=24, showarrow=False,
                      font_family="Verdana",
                      font_color="black")
fig.update_layout(annotations=[centre_caption], showlegend=False)

fig.show()

### 2.2. Self Made or Not

In [None]:
# Share of self-made vs. not self-made billionaires as fractions of all rows.
# Dedicated names are used so the `x` frame built in the Status section is
# not overwritten.
self_counts = data.groupby(['Self_made'])['Self_made'].count()
self_share = (self_counts / len(data)).round(2)

self_ratio = pd.DataFrame(self_share).T
# groupby sorts its keys, so the columns arrive in the order False, True.
self_ratio.columns = ["Not Self Made", "Self Made"]
# Swap the columns so "Self Made" is drawn first.
self_ratio = self_ratio[["Self Made", "Not Self Made"]]

fig, ax = plt.subplots(1,1,figsize=(6.5, 2.5))

# A single stacked horizontal bar: self made on the left, the rest after it.
ax.barh(self_ratio.index, self_ratio['Self Made'],
        color='#333333', alpha=0.9, label='Self Made')
ax.barh(self_ratio.index, self_ratio['Not Self Made'], left=self_ratio['Self Made'],
        color='#565656', alpha=0.9, label='Not Self Made')

ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])

for row in self_ratio.index:
    made_share = self_ratio.loc[row, 'Self Made']
    not_made_share = self_ratio.loc[row, 'Not Self Made']
    # (caption, share, x position of the segment centre)
    segments = [("Self Made", made_share, made_share / 2),
                ("Not Self Made", not_made_share, made_share + not_made_share / 2)]
    for caption, share, centre in segments:
        # The ".0%" format rounds to the nearest percent. int(share * 100)
        # truncates, so e.g. 0.29 * 100 == 28.999999999999996 would show 28%.
        ax.annotate(f"{share:.0%}",
                    xy=(centre, row),
                    va = 'center', ha='center',fontsize=40, fontweight='light', fontfamily='serif',
                    color='white')
        ax.annotate(caption,
                    xy=(centre, -0.25),
                    va = 'center', ha='center',fontsize=13, fontweight='light', fontfamily='serif',
                    color='white')

# Title & Subtitle
fig.text(0.125,1.03,'Self Made or Not Self Made', fontfamily='serif',fontsize=15, fontweight='bold')
fig.text(0.125,0.92,'We can see that their fortune vastly self made',fontfamily='serif',fontsize=12)

for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)

# The segments are labelled directly, so the legend is hidden.
ax.legend().set_visible(False)
plt.show()

### 2.3. Citizenship

In [None]:
# Number of billionaires per citizenship, most common first.
x4 = data["Citizenship"].value_counts()
top_citizenships = x4[0:10]

fig, ax = plt.subplots(1,1,figsize=(8.5, 5.5))
sns.barplot(y=top_citizenships.index, x=top_citizenships,
            linewidth=1.5, facecolor=(0, 0, 0, 1), edgecolor=".5")

# Figure title, followed by the "Insight" side note.
fig.text(0.26, 0.95, 'Top 10 Citizenships of Richest Billionaires',
         fontfamily='serif', fontsize=12, fontweight='bold', alpha=0.9)
fig.text(0.94, 0.8, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
insight_text = '''
                     There are more Forbes 
                     billionaires in US and 
                     China than any other 
                     countries. This chart 
                     shows us the billionaire 
                     citizenships.
                     
                     '''
fig.text(0.81, 0.5, insight_text, fontfamily='serif', fontsize=12)

# Vertical grid lines drawn behind the bars, one every 50 billionaires.
ax.grid(axis='x', linestyle='-', alpha=0.4)
grid_x_ticks = np.arange(0, 800, 50)  # x ticks: min, max, then step
ax.set_xticks(grid_x_ticks)
ax.set_axisbelow(True)

plt.xlabel("Billionaires", fontsize=10, fontweight='bold', alpha=0.9)
plt.yticks(fontsize=8, fontweight='bold', alpha=0.9)
plt.xticks(fontsize=6)
plt.show()

### 2.4. Country

In [None]:
# Number of billionaires per country, most common first.
x5 = data["Country"].value_counts()
top_country_counts = x5[0:10]

fig, ax = plt.subplots(1,1,figsize=(6.5, 3.5))
# Plot the Country counts computed above (x5), not the Citizenship counts
# left over from the previous section.
sns.barplot(y=top_country_counts.index, x=top_country_counts,
            linewidth=1.5, facecolor=(0, 0, 0, 1), edgecolor=".5")

fig.text(0.2,0.95,'Top 10 Countries of Richest Billionaires', fontfamily='serif',fontsize=12, fontweight='bold', alpha = 0.9)

# Vertical grid lines drawn behind the bars, one every 50 billionaires.
ax.grid(axis='x', linestyle='-', alpha=0.4)

grid_x_ticks = np.arange(0, 800, 50) # x ticks: min, max, then step
ax.set_xticks(grid_x_ticks)
ax.set_axisbelow(True)

plt.xlabel("Billionaires",fontsize=10,fontweight='bold', alpha = 0.9)
plt.yticks(fontsize=8,fontweight='bold', alpha = 0.9)
plt.xticks(fontsize=6)
plt.show()

### 2.5. Total Worth

In [None]:
# Total net worth of everyone on the list.
# NOTE(review): NetWorth appears to be in billions of dollars (other cells
# label its sums "Billion $"); dividing by 1000 then gives trillions, so the
# "Trillion $" column really holds trillions.
networth = data["NetWorth"].sum().round() / 1000
networth = pd.DataFrame(networth, index = range(1), columns=["Trillion $"])

fig, ax = plt.subplots(1,1,figsize=(8.5, 4.5))

ax.barh(networth.index, networth['Trillion $'],
        color='#333333', alpha=1, label='Total Worth')

# Axis spans 0-20 trillion, leaving empty space to the right of the bar.
ax.set_xlim(0, 20)
ax.set_xticks([])
ax.set_yticks([])

# Write the whole-trillion figure and its unit inside the bar.
for row in networth.index:
    trillions = networth.loc[row, 'Trillion $']
    ax.annotate(f"{int(trillions)}",
                   xy=(trillions/2, row),
                   va = 'center', ha='center',fontsize=80, fontweight='light', fontfamily='serif',
                   color='white')

    ax.annotate("Trillion $",
                   xy=(trillions/2, -0.25),
                   va = 'center', ha='center',fontsize=40, fontweight='light', fontfamily='serif',
                   color='white')

# Title & Subtitle (the head count is taken from the data, not typed in)
fig.text(0.125,1.03,'Total Worth', fontfamily='serif',fontsize=14, fontweight='bold')
fig.text(0.125,0.92,f'The total worth of the richest {len(data)} person.',fontfamily='serif',fontsize=11)

for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)

# The bar is labelled directly, so the legend is hidden.
ax.legend().set_visible(False)
plt.show()

### 2.6. Total Worth by Top 10 Countries

In [None]:
# Total net worth for the (up to) ten countries with the most billionaires.
# value_counts() is computed once, and slicing with [:10] also works when the
# data holds fewer than ten countries.
top_countries = list(data["Country"].value_counts().index[:10])

# Sum of NetWorth over the rows of each of those countries, in the same order.
country_totals = [data.loc[data["Country"] == country_name, "NetWorth"].sum()
                  for country_name in top_countries]

worth_sum = sum(total.round() for total in country_totals)
print("Top 10 Total Worth: {} billion $".format(worth_sum))

# x6: one row per country (index "Country"), rounded totals in "Total Worth".
x6 = pd.DataFrame({"Country": top_countries,
                   "Total Worth": [round(total) for total in country_totals]})
x6 = x6.set_index("Country")

In [None]:
# Bar chart of total net worth for the top countries held in `x6`.
fig, ax = plt.subplots(1,1,figsize=(7.5, 8.5))
sns.barplot(x=x6.index, y=x6["Total Worth"],linewidth=1.5, facecolor=(0, 0, 0, 1),edgecolor=".5")

fig.text(0.27,0.90,'Top 10 Countries of Richest Billionaires', fontfamily='serif',fontsize=12, fontweight='bold', alpha = 0.9)

# Hand-tuned figure coordinates that put each value near the top of its bar.
# The values are paired with the positions by order, so no integer lookup is
# made on the country-name index of x6 (x6["Total Worth"][0] depends on a
# deprecated positional fallback).
value_label_positions = [(0.138, 0.825), (0.217, 0.520), (0.30, 0.205),
                         (0.38, 0.208), (0.455, 0.203), (0.53, 0.180),
                         (0.61, 0.142), (0.689, 0.145), (0.765, 0.143),
                         (0.843, 0.142)]
for (label_x, label_y), total in zip(value_label_positions, x6["Total Worth"]):
    fig.text(label_x, label_y, total, fontweight='bold', fontsize=10, color = "white")

fig.text(0.921, 0.80, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.8, 0.43,
         '''
                     
                     In the Country title, USA has  
                     the highest total value with 
                     724 people. The total wealth of 
                     these rich people is 4398 B $.
                     
                     In second place is China, which
                     put 626 people on the list. The
                     sum of their wealth is 2532 B $.
                     In turn, 140 people from India, 
                     136 from Germany and 118 from 
                     Russia entered the list.
                     
                     The total wealth on the list is 
                     13 Trillion $, but the wealth of 
                     those who enter this chart is 
                     10 Trillion $. The vast majority 
                     is in the hands of those entering 
                     the list from these 10 countries.
                      
                     ''',
         
         fontfamily='serif',fontsize=10)

# Horizontal grid lines drawn behind the bars, one every 200 billion $.
ax.grid(axis='y', linestyle='-', alpha=0.6)

grid_y_ticks = np.arange(0, 4500, 200) # y ticks: min, max, then step
ax.set_yticks(grid_y_ticks)
ax.set_axisbelow(True)

plt.xlabel("Country",fontsize=10,fontweight='bold', alpha = 0) # alpha = 0 : invisible
plt.ylabel("Billion $",fontsize=12,fontweight='bold', alpha = 0.9)
plt.yticks(fontsize=8,fontweight='bold', alpha = 0.9)
plt.xticks(fontsize=7, rotation = 60, fontweight='bold')
plt.show()

In [None]:
# Short aliases, one Series per column of `data`.
(name, net_worth, country, source, rank, age,
 residence, citizenship, status, children, education, self_made) = (
    data[column_label]
    for column_label in ("Name", "NetWorth", "Country", "Source", "Rank", "Age",
                         "Residence", "Citizenship", "Status", "Children",
                         "Education", "Self_made")
)

In [None]:
# Name, net worth and source side by side; show the first ten rows.
selected_columns = [name, net_worth, source]
data2 = pd.concat(selected_columns, axis="columns")
data2.head(10)

In [None]:
def plot_pie_charts(x, y, title):
    """Draw a shadowed pie chart of `y`, labelled with `x`, and show it.

    x: labels, one per wedge.
    y: wedge sizes.
    title: text placed above the chart.

    The palette holds eleven colours: three dark greys followed by eight
    light greys.
    """
    dark_greys = ['#333333', '#565656', '#626262']
    wedge_colors = dark_greys + ['#eeeeee'] * 8
    plt.figure(figsize=(8,6))
    plt.title(title, size=18)
    plt.pie(y, colors=wedge_colors, shadow=True, labels=x)
    plt.show()

In [None]:
# Pie chart of the first ten rows: names as labels, net worth as wedge size.
first_ten_names = name[:10]
first_ten_worths = net_worth[:10]
plot_pie_charts(first_ten_names, first_ten_worths, 'Top Richest People')

In [None]:
# Rebuild the name / net worth / source frame and keep its first five rows,
# indexed by name, as x7.
selected_columns = [name, net_worth, source]
data2 = pd.concat(selected_columns, axis="columns")
x7 = data2.head(5).set_index("Name")

In [None]:
# Donut chart of the net worth of the five people in x7.
fig = px.pie(x7, values="NetWorth", names=x7.index, width=800, height=500)

# Every wedge uses the same colour, separated by a white outline; the label
# font shrinks from the first wedge to the last.
wedge_style = dict(colors=["#565646"] * 5,
                   line=dict(color="white", width=3))
label_font = {'family': "Serif", 'size': [30, 26, 20, 18, 16]}
fig.update_traces(textposition="inside",
                  textinfo="value , label , text",
                  textfont=label_font,
                  hole=0.3,
                  marker=wedge_style)

# Caption in the donut hole, plus the overall title and font settings.
centre_caption = dict(text="Values",
                      x=0.5, y=0.5,
                      font_size=30, showarrow=False,
                      font_family="serif",
                      font_color="#565646")
fig.update_layout(annotations=[centre_caption],
                  showlegend=False,
                  title_font_size=30,
                  font_family="serif",
                  font_color="black",
                  title_text="Top Richest People")

fig.show()

In [None]:
# Donut chart of the five people in x7, each wedge labelled with the person's
# name and the Source column.
fig = px.pie(x7, values="NetWorth", names=x7.index, width=800, height=500)

# Every wedge uses the same colour, separated by a white outline; the label
# font shrinks from the first wedge to the last.
wedge_style = dict(colors=['#221f1f'] * 5,
                   line=dict(color="white", width=3))
label_font = {'family': "Serif", 'size': [30, 26, 20, 18, 16]}
fig.update_traces(title_font_size=10,
                  textposition="inside",
                  text=x7["Source"],
                  textinfo="label + text",
                  textfont=label_font,
                  hole=0.3,
                  marker=wedge_style)

# Caption in the donut hole, plus the overall title and font settings.
centre_caption = dict(text="Source",
                      x=0.5, y=0.5,
                      font_size=30, showarrow=False,
                      font_family="serif",
                      font_color="black")
fig.update_layout(annotations=[centre_caption],
                  showlegend=False,
                  title_text="Top Richest People & Their Sources",
                  title_font_size=30,
                  font_family="serif",
                  font_color="black")

fig.show()

In [None]:
# Turn the first 50 names into one string: take the list's text form and
# delete the punctuation characters , [ ' ] and .
strip_punctuation = str.maketrans('', '', ",['].")
text = str(list(name[0:50])).translate(strip_punctuation)

# Colour map running from near-black through olive grey to light grey.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "", ['#221f1f', '#565646', "#eeeeee"])

cloud_builder = WordCloud(background_color='white', width=800, height=500,
                          colormap=cmap, max_words=100)
wordcloud = cloud_builder.generate(text)

plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.title('Wordclouds of Richest Names', fontfamily='serif', fontsize=30,
          fontweight='bold', alpha=0.7, color="Red")
plt.show()

### 2.7. Self Made & Not Self Made Fortunes

In [None]:
# Compare the summed net worth of self-made and not self-made billionaires.
data_self_made = data[data["Self_made"] == True]
data_not_self_made = data[data["Self_made"] == False]

self_made_total = data_self_made["NetWorth"].sum()
not_self_made_total = data_not_self_made["NetWorth"].sum()

# x8: one row per group, total net worth in the single column.
x8 = pd.DataFrame({"Self Made or Not": [self_made_total, not_self_made_total]},
                  index=["Self Made", "Not Self Made"])

fig, ax = plt.subplots(1,1,figsize=(7, 3))
ax = sns.barplot(x=x8.index, y=x8["Self Made or Not"],linewidth=1.5, facecolor=(0, 0, 0, 1),edgecolor=".5")

fig.text(0.20,0.95,'Compare Self Made / Not Self Made Fortune', fontfamily='serif',fontsize=12, fontweight='bold', alpha = 0.9)
# Format the sums to one decimal place: str() on a float sum can expose
# floating-point noise such as 9186.099999999999 in the label.
fig.text(0.250, 0.720, f"{self_made_total:.1f}B $", fontweight='bold', fontsize=15, color = "white")
fig.text(0.63, 0.32, f"{not_self_made_total:.1f}B $", fontweight='bold', fontsize=15, color = "white")
fig.text(0.93, 0.80, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.8, 0.43,
         '''
                     
                     Total self made fortune is vastly  
                     high than total not self made value. 
                     Although the self made ones are 72%,
                     the value of their wealth is 70%.
                      
                     ''',
         
         fontfamily='serif',fontsize=10)

# Horizontal grid lines drawn behind the bars, one every 1000 billion $.
ax.grid(axis='y', linestyle='-', alpha=0.6)

grid_y_ticks = np.arange(0, 11000, 1000) # y ticks: min, max, then step
ax.set_yticks(grid_y_ticks)
ax.set_axisbelow(True)

plt.ylabel("Billion $",fontsize=12,fontweight='bold', alpha = 0.9)
plt.yticks(fontsize=8,fontweight='bold', alpha = 0.9)

plt.show()