## Importing libraries and data 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
%matplotlib inline

In [None]:
# Load the three Excel workbooks used throughout the notebook.
# NOTE(review): the variable names look swapped relative to the file names
# (GDP2020 <- 'GDPCAP.xlsx', GDPCAP <- 'GDP2020.xlsx') — confirm which workbook
# holds total GDP and which holds GDP per capita before trusting later labels.
GDP2020 = pd.read_excel('GDPCAP.xlsx')
GDPCAP = pd.read_excel ('GDP2020.xlsx')
Bnaires = pd.read_excel('2022_forbes_billionaires.xlsx')

## Cleaning the data


Cleaning GDP2020 Per Capita data 

1. Overview
2. Drop columns .drop("column_name", axis = 1, inplace = True)
3. Renaming columns by creating a new list and assigning to the columns
4. Dropping rows where there's no information on GDP dropna( subset = ["column_name"], inplace = True )


In [None]:
# Preview the first rows of the raw GDPCAP frame before it is cleaned.
GDPCAP.head()

In [None]:
# Drop the country-code column, give the remaining columns stable names and
# discard rows with no 2020 value.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError on
# the already-removed "Country Code" column.
GDPCAP = GDPCAP.drop(columns="Country Code", errors="ignore")
columns_rename = ['CountryName', '2020GDP']
# assumes exactly two columns remain after the drop — TODO confirm against the workbook
GDPCAP.columns = columns_rename
GDPCAP = GDPCAP.dropna(subset=['2020GDP'])

GDP 2020 table clean
1. removing null rows for GDP 
2. reformatting GDP

In [None]:
# Standardise the value column name to "GDP", then keep only the rows that
# actually carry a value.
GDP2020 = (
    GDP2020
    .rename(columns={"GDPPERCAP": "GDP"})
    .dropna(subset=["GDP"])
)

In [None]:
# Render floats with two decimals in every DataFrame shown from here on,
# then list the 20 rows with the largest GDP value.
pd.set_option('display.float_format', '{:.2f}'.format)
GDP2020.nlargest(20, columns='GDP')

Cleaning the Billionaire table
1. dropping an unnecessary column
2. stripping the "$" and "B" characters from networth to convert it to float

In [None]:
# Remove the "Column1" column from the billionaires frame.
# errors="ignore" plus reassignment makes the cell re-runnable: running it
# twice no longer raises KeyError because the column is already gone.
Bnaires = Bnaires.drop(columns="Column1", errors="ignore")

In [None]:
# Strip the literal "B" and "$" characters from networth and convert the
# remainder to float.
# presumably the raw values look like "$219 B" (billions) — confirm against the workbook
# regex=False forces literal matching: "$" is an end-of-string anchor when
# treated as a regex, and the default of `regex` differs between pandas
# versions, so being explicit keeps the result stable.
Bnaires["networth"] = (
    Bnaires["networth"]
    .str.replace("B", "", regex=False)
    .str.replace("$", "", regex=False)
    .astype(float)
)




In [None]:
# Inspect the first 50 rows of the billionaires frame after the networth conversion.
Bnaires.head(50)

In the Billionaires data, some of the country names do not match the names listed in the GDP data.

Finding unmatched country names from Billionaire data to map it on GDP data
1. saving unique country names from each data frame as lists
2. using for loop, append country names (from Billionaire list) that do not exist in GDP data
3. check the data quality & confirm what needs to be fixed for a good quality mapping
4. after mapping recheck the data for both GDP & GDP Per capita country names 


In [None]:
# Distinct country labels found in each source; these feed the name-matching checks.
Bnairecountry = pd.unique(Bnaires["country"])
GDPcountry = pd.unique(GDP2020["Country"])

In [None]:
# Billionaire country labels that have no exact match among the GDP2020 country names.
Missingcountry = [label for label in Bnairecountry if label not in GDPcountry]
    

In [None]:
# Show the billionaire country labels that could not be matched.
Missingcountry

In [None]:

# Show the GDP2020 rows whose Country contains any of the unmatched billionaire
# country labels, to discover how the GDP data spells them.
# Each label is matched literally (regex=False) so characters such as "." or
# "(" in a country name are not interpreted as regex syntax, and an empty
# Missingcountry list selects no rows instead of every row.
name_hits = pd.Series(False, index=GDP2020.index)
for missing_label in Missingcountry:
    # na=False: rows with a missing Country never count as a hit.
    name_hits |= GDP2020["Country"].str.contains(missing_label, regex=False, na=False)
GDP2020.loc[name_hits]

In [None]:
# Rewrite billionaire country labels to the spelling used on the GDP side.
# Keys are substring patterns looked up in Bnaires['country']; values are the
# replacement labels. Insertion order is the order the rewrites are applied in.
# NOTE(review): Guernsey is folded into 'United Kingdom' and Taiwan into
# 'China' — confirm these groupings are intended for the analysis.
country_aliases = {
    'Hong Kong': 'Hong Kong SAR, China',
    'Russia': 'Russian Federation',
    'Czechia': 'Czech Republic',
    'South Korea': 'Korea, Rep.',
    'Slovakia': 'Slovak Republic',
    'Venezuela': 'Venezuela, RB',
    'Guernsey': 'United Kingdom',
    'Macau': 'Macao SAR, China',
    'Taiwan': 'China',
    'Egypt': 'Egypt, Arab Rep.',
}
for pattern, gdp_label in country_aliases.items():
    matches = Bnaires['country'].str.contains(pattern)
    Bnaires.loc[matches, 'country'] = gdp_label


In [None]:
# Recompute the distinct billionaire country labels from the current frame.
Bnairecountry = pd.unique(Bnaires["country"])

In [None]:

# Billionaire country labels that are still not present in the GDP2020 country names.
maptest = [label for label in Bnairecountry if label not in GDPcountry]
    

In [None]:
# Show whatever billionaire country labels remain unmatched.
maptest

In [None]:
# Collapse every billionaire country label that contains "Eswatini" to exactly 'Eswatini'.
Bnaires.loc[Bnaires['country'].str.contains('Eswatini'), "country"] = 'Eswatini'

In [None]:
# Billionaire country labels that are absent from the GDPCAP country names.
# The GDPCAP names are collected once up front; previously .unique() was
# recomputed on every loop iteration.
gdpcap_names = GDPCAP['CountryName'].unique()
maptest2 = [
    label
    for label in Bnaires['country'].unique()
    if label not in gdpcap_names
]

In [None]:
# Show the billionaire country labels missing from GDPCAP.
maptest2

In [None]:
# Look up GDPCAP rows whose CountryName contains "Liechten", sorted by name.
GDPCAP.loc[GDPCAP['CountryName'].str.contains("Liechten")].sort_values(by = 'CountryName')

In [None]:

# Look up GDPCAP rows whose CountryName contains an upper-case "V", sorted by name.
GDPCAP.loc[GDPCAP['CountryName'].str.contains("V")].sort_values(by = 'CountryName')

# Exploring and Visualizing data

In [None]:
# Summarise the billionaires frame (columns, dtypes, non-null counts).
Bnaires.info()

1. Top 10 countries with the most billionaires

In [None]:
# Group the billionaires by country; this groupby object is reused by the cells that follow.
bnairecountry = Bnaires.groupby("country")

In [None]:
# Number of billionaire rows per country, limited to the 10 largest counts.
bnairecountry.size().nlargest(10)

2. Top 10 countries by total billionaire net worth

In [None]:
# Top 10 countries by combined billionaire net worth.
# Only "networth" is aggregated: summing every column also adds up unrelated
# columns such as age and, depending on the pandas version, either drops or
# concatenates the text columns. nlargest(10) delivers the "top 10" that the
# section heading promises.
bnairecountry["networth"].sum().nlargest(10)

3. Average age of billionaires per country (only countries with more than 15 billionaires), showing the 10 countries with the lowest average age

In [None]:
# Per-country descriptive statistics of billionaire age, as produced by describe().
agedistribution = bnairecountry["age"].describe()

In [None]:
# Store the per-country "count" as integers.
agedistribution = agedistribution.astype({"count": int})

In [None]:
# Keep countries whose age "count" exceeds 15, then show the ten with the
# lowest mean age together with that count.
has_enough_people = agedistribution["count"].gt(15)
countfilter = agedistribution[has_enough_people]

countfilter.nsmallest(10, "mean")[["mean", "count"]]

Using a lambda function to find out how many billionaires there are in the U.S.

In [None]:
# Count the billionaire rows whose country label contains "United States".
bnairecountries = Bnaires["country"]
sum(map(lambda label: "United States" in label, bnairecountries))

Joining GDP per capita and GDP for further analysis and visualization

In [None]:
# Inner-join the two GDP frames on country name; only countries present in both survive.
total_capita = pd.merge(
    GDP2020,
    GDPCAP,
    how='inner',
    left_on="Country",
    right_on="CountryName",
)

In [None]:
# Rename the value column that came from GDPCAP to "CAP", then display the joined frame.
total_capita = total_capita.rename(columns={"2020GDP": "CAP"})
total_capita

In [None]:
# Rank every country on both measures (rank 1 = largest value), then divide
# the GDP rank by the CAP rank to get one ratio per country.
total_capita = total_capita.assign(
    GDP_Rank=total_capita["GDP"].rank(ascending=False),
    CAP_Rank=total_capita["CAP"].rank(ascending=False),
)
total_capita["TotalGDP_standardliving_ratio"] = total_capita["GDP_Rank"].div(
    total_capita["CAP_Rank"]
)


In [None]:
# Display the joined frame ordered from the smallest to the largest rank ratio.
total_capita.sort_values(by = "TotalGDP_standardliving_ratio", ascending = True)

In [None]:
# Regression plot of GDP rank (y) against CAP rank (x) for every country.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=total_capita, x="CAP_Rank", y="GDP_Rank", color='c', ax=ax)
ax.set(title="Capita_Rank vs Total GDP rank")

In [None]:
# The 20 rows of total_capita with the largest "GDP" value.
top20GDP =total_capita.nlargest(20,["GDP"])

In [None]:
# The 80 and the 200 rows of total_capita with the largest "GDP" value
# (nlargest returns fewer rows when the frame is shorter than n).
top80GDP =total_capita.nlargest(80,["GDP"])
top200GDP = total_capita.nlargest(200,["GDP"])

In [None]:
# Regression plot of GDP rank (y) against CAP rank (x), restricted to top20GDP.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=top20GDP, x="CAP_Rank", y="GDP_Rank", color='c', ax=ax)
ax.set(title="Top20 Capita_Rank vs Total GDP rank")

In [None]:
# Joint plot (regression kind) of GDP rank against CAP rank for top20GDP.
sns.jointplot(data =top20GDP, y = "GDP_Rank", x = "CAP_Rank",kind ='reg')

In [None]:
# Regression plot of GDP rank (y) against CAP rank (x), restricted to top200GDP.
# The title previously read "Top20", copied from the top-20 cell, although the
# data plotted here is top200GDP.
plt.figure(figsize=(10, 6))
sns.regplot(data=top200GDP, y="GDP_Rank", x="CAP_Rank", color='c').set(title="Top200 Capita_Rank vs Total GDP rank")

In [None]:

# lmplot of GDP rank (y) against CAP rank (x) for top20GDP, coloured per CountryName.
# NOTE(review): with hue='CountryName' each hue level presumably holds a single
# point, so there is nothing to fit a line through per group — confirm the
# per-country hue is only wanted for colouring.
sns.lmplot(data =top20GDP, y= "GDP_Rank", x = "CAP_Rank", hue= 'CountryName',palette= 'Spectral').set(title = "Top20 Capita_Rank vs Total GDP rank")

def label_point(x, y, val, ax):
    """Write one text label per data point onto ``ax``.

    Parameters
    ----------
    x, y : pandas.Series
        Point coordinates; aligned with each other and with ``val`` on index.
    val : pandas.Series
        Value rendered (via ``str``) as the label of each point.
    ax : object exposing ``text(x, y, s)``
        Target axes. Each label is placed 0.02 to the right of its point.
    """
    labelled = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for row in labelled.itertuples(index=False):
        ax.text(row.x + .02, row.y, str(row.val))

# Annotate every top20GDP point with its country name, then enlarge the current figure.
label_point(
    x=top20GDP["CAP_Rank"],
    y=top20GDP["GDP_Rank"],
    val=top20GDP["CountryName"],
    ax=plt.gca(),
)
current_fig = plt.gcf()
current_fig.set_size_inches(15, 8)

In [None]:

# lmplot of GDP rank (y) against CAP rank (x) for top80GDP, coloured per CountryName.
# NOTE(review): with hue='CountryName' each hue level presumably holds a single
# point, and 80 hue levels make a very long legend — confirm this is intended.
sns.lmplot(data =top80GDP, y= "GDP_Rank", x = "CAP_Rank", hue= 'CountryName',palette= 'viridis').set(title = "Top80 Capita_Rank vs Total GDP rank")

def label_point(x, y, val, ax):
    """Write one text label per data point onto ``ax``.

    Parameters
    ----------
    x, y : pandas.Series
        Point coordinates; aligned with each other and with ``val`` on index.
    val : pandas.Series
        Value rendered (via ``str``) as the label of each point.
    ax : object exposing ``text(x, y, s)``
        Target axes. Each label is placed 0.02 to the right of its point.
    """
    labelled = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for row in labelled.itertuples(index=False):
        ax.text(row.x + .02, row.y, str(row.val))

# Annotate every top80GDP point with its country name, then enlarge the current figure.
label_point(
    x=top80GDP["CAP_Rank"],
    y=top80GDP["GDP_Rank"],
    val=top80GDP["CountryName"],
    ax=plt.gca(),
)
current_fig = plt.gcf()
current_fig.set_size_inches(15, 8)

In [None]:
# Distribution of the "CAP" column across all joined countries, in 10 bins.
sns.displot(total_capita['CAP'], bins = 10)

Sampling data for plots 

In [None]:
# Regression plot of GDP against CAP on a random sample of 20% of the rows.
# random_state pins the sample so the figure is identical on every
# Restart & Run All; without it each run drew a different subset.
sampling = total_capita.sample(int(0.2*len(total_capita)), random_state=42)

plt.figure(figsize=(10, 6))
sns.regplot(data=sampling, y="GDP", x="CAP", color='c').set(title="GDP per Capita vs Total GDP")

Plotting Billionaires networth and age 

In [None]:
# Regression plot of billionaire net worth (y) against age (x).
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=Bnaires, x="age", y="networth", color='g', ax=ax)

Merging total capita and billionaires data

In [None]:
# Attach the country-level GDP columns to every billionaire row (inner join on
# country name, so billionaires whose country has no match are dropped).
Bnaires_GDP =Bnaires.merge(total_capita,how= "inner", left_on = "country", right_on="Country")

In [None]:
# One row per country: total billionaire net worth plus the country-level GDP
# figures.
# Only "networth" is summed. GDP, CAP, the two ranks and the rank ratio are
# constant within a country (they come from a single total_capita row that the
# merge repeated for every billionaire), so they are taken once with "first".
# A blanket .sum() multiplied them by the number of billionaires, which skewed
# every downstream plot that uses these columns.
countrygroup = Bnaires_GDP.groupby('country').agg(
    networth=('networth', 'sum'),
    GDP=('GDP', 'first'),
    CAP=('CAP', 'first'),
    GDP_Rank=('GDP_Rank', 'first'),
    CAP_Rank=('CAP_Rank', 'first'),
    TotalGDP_standardliving_ratio=('TotalGDP_standardliving_ratio', 'first'),
)

In [None]:
# Preview the first rows of the per-country aggregate.
countrygroup.head()

In [None]:
# Joint plot (regression kind) of per-country "networth" (y) against "CAP" (x).
sns.jointplot(x= 'CAP', y = 'networth', data= countrygroup, kind='reg')

In [None]:
# Joint plot (regression kind) of per-country "GDP" (y) against "networth" (x).
sns.jointplot(x= 'networth', y = 'GDP', data= countrygroup, kind='reg')

The plot below of net worth against the total GDP / GDP per capita rank ratio does not provide good insights.

In [None]:
# Regression plot of per-country net worth (y) against the GDP-rank /
# CAP-rank ratio (x).
# The title previously read "GDP per Capita vs Total GDP", copied from an
# earlier cell; it now names the two quantities actually plotted.
plt.figure(figsize=(10, 6))
sns.regplot(data=countrygroup, y="networth", x="TotalGDP_standardliving_ratio", color='c').set(title="Billionaire net worth vs GDP rank / GDP per capita rank ratio")

Ranking the data by the GDP / GDP per capita ratio to see whether this gives better information on the relationship between billionaire countries and the GDP / GDP per capita gap.

In [None]:
# Rank countries by the rank ratio (rank 1 = smallest ratio) and by total
# billionaire net worth (rank 1 = largest net worth).
countrygroup = countrygroup.assign(
    gdp_gdpcap_gap_rank=countrygroup["TotalGDP_standardliving_ratio"].rank(ascending=True),
    networth_rank=countrygroup["networth"].rank(ascending=False),
)

In [None]:
# Display the per-country aggregate ordered from the largest to the smallest gap rank.
countrygroup.sort_values(by = "gdp_gdpcap_gap_rank", ascending= False)

In [None]:
# Move the group key out of the index into a regular column so it can be
# referenced by name when plotting.
countrygroup = countrygroup.reset_index()

The line plot below of gdp_gdpcap_gap_rank vs networth_rank shows a negative relationship, which indicates that the countries with the most billionaire wealth tend to have a higher GDP / GDP per capita ratio.

In [None]:
# lmplot of net-worth rank (y) against gap rank (x) across all countries.
sns.lmplot(data =countrygroup, y = "networth_rank", x ='gdp_gdpcap_gap_rank' )

In [None]:
# The 30 rows of countrygroup with the largest "networth" value.
top30networth= countrygroup.nlargest(30,["networth"])

In [None]:
# lmplot of gap rank (y) against net-worth rank (x) for top30networth, coloured per country.
# NOTE(review): with hue='country' each hue level presumably holds a single
# point, so there is nothing to fit a line through per group — confirm this is intended.
sns.lmplot(data =top30networth, x = "networth_rank", y ='gdp_gdpcap_gap_rank', hue = 'country')


def label_point(x, y, val, ax):
    """Write one text label per data point onto ``ax``.

    Parameters
    ----------
    x, y : pandas.Series
        Point coordinates; aligned with each other and with ``val`` on index.
    val : pandas.Series
        Value rendered (via ``str``) as the label of each point.
    ax : object exposing ``text(x, y, s)``
        Target axes. Each label is placed 0.02 to the right of its point.
    """
    labelled = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for row in labelled.itertuples(index=False):
        ax.text(row.x + .02, row.y, str(row.val))

# Annotate every top30networth point with its country name, then enlarge the current figure.
label_point(
    x=top30networth["networth_rank"],
    y=top30networth["gdp_gdpcap_gap_rank"],
    val=top30networth["country"],
    ax=plt.gca(),
)
current_fig = plt.gcf()
current_fig.set_size_inches(20, 15)

