In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
import seaborn as sns

In [None]:
# Load the full dataset, then draw a fixed-seed random subset of it so the
# same rows are sampled on every run.
DATA_PATH = 'billionaires.csv'
SAMPLE_SIZE = 500
SAMPLE_SEED = 27

df = pd.read_csv(DATA_PATH)
sample = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [None]:
# Count billionaires per wealth source, split by the `selfMade` flag, and
# chart the sources with the most self-made billionaires.
# The explicit == True / == False comparisons mean rows whose flag is neither
# value (e.g. missing) land in neither group.
selfmades = df[df['selfMade'] == True]
not_selfmades = df[df['selfMade'] == False]
sources_selfmade = selfmades['source'].value_counts()
sources_not_selfmade = not_selfmades['source'].value_counts()

# Name the index and both count columns explicitly instead of relying on the
# labels value_counts() assigns by default (those changed in pandas 2.0), so
# the join key and the column names used below do not depend on the pandas
# version. The default inner merge keeps only sources present in both groups.
merged = pd.merge(
    sources_selfmade.rename_axis('source').rename('Selfmade'),
    sources_not_selfmade.rename_axis('source').rename('Not Selfmade'),
    on='source',
).sort_values(by='Selfmade', ascending=False)

# Keep only the sources whose self-made count lies above the 0.7 quantile.
quantile = merged['Selfmade'].quantile(q=0.7, interpolation='linear')
tops = merged[merged['Selfmade'] > quantile]

sources_ax = tops.plot(kind='bar', figsize=(20, 5))
sources_ax.set_title('Number of Billionaires by Source')
sources_ax.set_xlabel('Source')
sources_ax.set_ylabel('Number of Billionaires')
plt.setp(sources_ax.get_xticklabels(), rotation=90, ha='right')

plt.show()

In [None]:
# Side-by-side bar charts of billionaires per industry: the full dataset on
# the left, the sampled subset on the right.
industry_counts = df['industries'].value_counts()
industry_counts_sample = sample['industries'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(20, 5))
panels = (
    (industry_counts, 'green',
     'Number of Billionaires by Industries (Original Dataset)'),
    (industry_counts_sample, 'orange',
     'Number of Billionaires by Industries (Sample Dataset)'),
)
for panel_ax, (counts, bar_color, panel_title) in zip(axes, panels):
    counts.plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Industries')
    panel_ax.set_ylabel('Number of Billionaires')
    # Vertical tick labels so the category names do not overlap.
    plt.setp(panel_ax.get_xticklabels(), rotation=90, ha='right')

plt.show()

In [None]:
# Side-by-side bar charts of the most frequent countries in the full dataset
# and in the sample. Each panel keeps only the countries whose count is above
# a quantile of that panel's own counts.
# NOTE(review): the cut-off quantile differs between the panels (0.7 for the
# full dataset, 0.5 for the sample) — confirm this is intentional.
country_counts = df['country'].value_counts()
quantile = country_counts.quantile(0.7)
tops = country_counts[country_counts > quantile]

country_counts_sample = sample['country'].value_counts()
quantile = country_counts_sample.quantile(0.5)
tops_sample = country_counts_sample[country_counts_sample > quantile]

fig, axes = plt.subplots(1, 2, figsize=(20, 5))
panels = (
    (tops, 'green',
     'Number of Billionaires by Country (Original Dataset)'),
    (tops_sample, 'orange',
     'Number of Billionaires by Country (Sample Dataset)'),
)
for panel_ax, (counts, bar_color, panel_title) in zip(axes, panels):
    counts.plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Country')
    panel_ax.set_ylabel('Number of Billionaires')
    # Vertical tick labels so the country names do not overlap.
    plt.setp(panel_ax.get_xticklabels(), rotation=90, ha='right')

plt.show()

In [None]:
# Histogram of billionaire ages (left axis, counts) with a normal curve fitted
# to the same ages drawn on a twin axis (right axis, probability density).
fig, ax = plt.subplots(figsize=(15, 5))

ages = df['age']
# Drop missing ages explicitly so the histogram uses exactly the observations
# that pandas' NaN-skipping mean()/std() below are computed from.
valid_ages = ages.dropna()
bar_heights, bin_edges, _patches = ax.hist(valid_ages, bins=20, color='purple')
ax.set_title('Number of Billionaires by Age')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Billionaires')

mean = ages.mean()
std_dev = ages.std()
# Evaluate the fitted normal density over mean +/- 3 standard deviations.
x = np.linspace(mean - 3*std_dev, mean + 3*std_dev, 100)
pdf = norm.pdf(x, mean, std_dev)

normal_ax = ax.twinx()
normal_ax.plot(x, pdf, color='green')
normal_ax.set_ylabel('Probability Density')

# A density d corresponds to an expected bar height of d * N * bin_width
# (bins=20 gives equal-width bins). Lock both y-axes to that ratio, starting
# at 0, so the curve can be read directly against the bars instead of being
# autoscaled independently of them.
counts_per_density = len(valid_ages) * (bin_edges[1] - bin_edges[0])
if counts_per_density > 0:
    # Leave headroom for whichever is taller: the bars or the scaled curve.
    count_top = max(ax.get_ylim()[1], 1.05 * pdf.max() * counts_per_density)
    ax.set_ylim(0, count_top)
    normal_ax.set_ylim(0, count_top / counts_per_density)
plt.show()

In [None]:
# Pairwise scatter plots across every numeric column of the dataset.
# NOTE(review): a 90x90 inch figure is very large to render — confirm the
# size is intended.
numeric_columns = df.select_dtypes(include='number')
pd.plotting.scatter_matrix(numeric_columns, figsize=(90, 90))
plt.show()

In [None]:
# One point per row: country CPI on x, country life expectancy on y. The low
# alpha lets overlapping points build up into darker spots.
cpi_fig, cpi_ax = plt.subplots()
cpi_ax.scatter(
    df['cpi_country'],
    df['life_expectancy_country'],
    s=64,
    c='green',
    alpha=0.1,
)
cpi_ax.set(
    xlabel='Consumer Price Index',
    ylabel='Life Expectancy',
    title='Relationship between CPI and Life Expectancy',
)
cpi_ax.grid(True)
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns, with each
# coefficient written into its cell.
corr_fig, corr_ax = plt.subplots(figsize=(16, 16))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
plt.show()

In [None]:
# One point per row: country population on x, country total tax rate on y.
population = df['population_country']
tax = df['total_tax_rate_country']

pop_fig, pop_ax = plt.subplots()
pop_ax.scatter(population, tax, s=64, c='green', alpha=0.1)
pop_ax.set(
    xlabel='Country Population',
    ylabel='Tax Rate',
    title='Relationship between Country Population and Tax Rate',
)
pop_ax.grid(True)
plt.show()