In [1]:
import json
import requests
import pandas as pd

In [2]:
# Wikipedia page that is scraped below for its R&D spending table.
url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_countries_by_research_and_development_spending'
)

In [3]:
# read_html returns a list with one DataFrame per table it finds at the URL;
# only the first one is kept.
tables = pd.read_html(url)
rnd_spending = tables[0]

In [12]:
# Display the R&D spending table.
# NOTE(review): this cell's execution count (12) is higher than the rename cell's (11),
# so the recorded output already shows the renamed columns — re-run in order to refresh it.
rnd_spending

1,country,r&dExpense_billions,%GDP,r&dExpensePerCapita,Year,Source
0,China,553.4,2.19%,388,2018,[2]
1,United States,511.1,2.744%,1586.35,2016,[3]
2,European Union*,379.0,1.64%,658.94,2016,[3]
3,Japan,165.7,3.147%,1297.39,2016,[3]
4,Germany,118.8,2.94%,1450.17,2016,[3]
...,...,...,...,...,...,...
86,Moldova,0.06,0.35%,16.44,2013,[7]
87,Bahrain,0.06,0.100%,45.2,2014,[6]
88,Paraguay,0.06,0.13%,9.13,2015,[3][8]
89,Georgia,0.05,0.16%,14.73,2014,[7]


In [5]:
# Use the values of the row at position 1 as the column labels.
header_row = rnd_spending.iloc[1]
rnd_spending.columns = header_row

In [7]:
# Remove the first two rows (positions 0 and 1); row 1 was already used as the header.
# NOTE: re-running this cell removes two more rows each time.
rnd_spending = rnd_spending.drop(index=rnd_spending.index[:2])

In [9]:
# Renumber the rows from 0 after the drop, discarding the old index.
rnd_spending = rnd_spending.reset_index(drop=True)

In [11]:
# Map the long scraped headers to the short names used in the rest of the notebook.
# The keys (including their trailing commas) must match the scraped labels exactly.
short_names = {
    'Country/Region': 'country',
    'Expenditures on R&D (billions of US$, PPP),': 'r&dExpense_billions',
    '% of GDP PPP': '%GDP',
    'Expenditures on R&D per capita (US$ PPP),': 'r&dExpensePerCapita',
}
rnd_spending = rnd_spending.rename(columns=short_names)

In [18]:
def lowercase_feature(df, col):
    """Return column ``col`` of ``df`` with its strings lower-cased.

    Parameters
    ----------
    df : object indexable by column label (e.g. a pandas DataFrame).
    col : label of the string column to convert.

    Returns
    -------
    The result of ``.str.lower()`` on that column. ``df`` is not modified;
    the caller assigns the result back if it wants to overwrite the column.
    """
    column = df[col]
    return column.str.lower()

In [20]:
# Overwrite the country column with its lower-cased version. The merge further down
# joins on 'country', and old_df's country values are lower case.
rnd_spending['country'] = lowercase_feature(rnd_spending, 'country')

In [21]:
# Display the table after the clean-up steps above.
rnd_spending

1,country,r&dExpense_billions,%GDP,r&dExpensePerCapita,Year,Source
0,china,553.40,2.190,388.00,2018,[2]
1,united states,511.10,2.744,1586.35,2016,[3]
2,european union*,379.00,1.640,658.94,2016,[3]
3,japan,165.70,3.147,1297.39,2016,[3]
4,germany,118.80,2.940,1450.17,2016,[3]
...,...,...,...,...,...,...
86,moldova,0.06,0.350,16.44,2013,[7]
87,bahrain,0.06,0.100,45.20,2014,[6]
88,paraguay,0.06,0.130,9.13,2015,[3][8]
89,georgia,0.05,0.160,14.73,2014,[7]


In [14]:
def extract_float(df, col, regex):
    """Extract a numeric substring from a text column and cast it to float64.

    Parameters
    ----------
    df : object indexable by column label (e.g. a pandas DataFrame).
    col : label of the string column to parse.
    regex : pattern passed to ``Series.str.extract``; it must contain at least
        one capture group, and every captured value must be parseable as a float.

    Returns
    -------
    Whatever ``str.extract`` returns for ``regex`` (by default a DataFrame with
    one column per capture group), converted to ``float64``.
    """
    captured = df[col].str.extract(regex)
    return captured.astype('float64')

In [15]:
# Turn strings such as "2.19%" into floats by capturing the leading number.
# The dot is escaped so it matches only a literal decimal point. Unescaped, `.?`
# matches any character, so an integer value like "3%" would capture "3%" and
# make the float64 cast fail.
rnd_spending['%GDP'] = extract_float(rnd_spending, '%GDP', r"(\d+\.?\d*)")

In [17]:
# Cast both expenditure columns to float64 in one step.
expense_cols = ['r&dExpense_billions', 'r&dExpensePerCapita']
rnd_spending[expense_cols] = rnd_spending[expense_cols].astype('float64')

In [68]:
# Attach the three R&D columns to each row of old_df by country name.
# how='left' keeps every old_df row; countries missing from the R&D table get NaN.
# NOTE: old_df is loaded by the read_csv cell further down (In [52]), so that cell
# has to run before this one.
rnd_columns = ['country', 'r&dExpense_billions', '%GDP', 'r&dExpensePerCapita']
merged_df = old_df.merge(rnd_spending[rnd_columns], on='country', how='left')

In [70]:
# Replace missing R&D values (rows whose country had no match in the left join) with 0.
# NOTE(review): after this step "no data" cannot be told apart from a real zero — confirm
# that is acceptable for the analysis.
rnd_metrics = ['r&dExpense_billions', '%GDP', 'r&dExpensePerCapita']
merged_df[rnd_metrics] = merged_df[rnd_metrics].fillna(0)

In [71]:
# Check the column dtypes of the merged frame after the fill.
merged_df.dtypes

id                       int64
fullName                object
position               float64
lastName                object
age                    float64
gender                 float64
country                 object
image                   object
source                  object
worth                  float64
worthChange            float64
realTimePosition       float64
sourceDetails           object
r&dExpense_billions    float64
%GDP                   float64
r&dExpensePerCapita    float64
dtype: object

In [117]:
# Download the 2018 Forbes billionaires list as JSON and load it into a DataFrame.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of failing later inside
# .json() with a harder-to-read decode error.
response = requests.get(
    'https://www.forbes.com/ajax/list/data?year=2018&uri=billionaires&type=person',
    timeout=30,
)
response.raise_for_status()
results = response.json()
new_df = pd.DataFrame(results)

In [133]:
# Number of distinct names; dropna=False counts a missing value as its own entry,
# matching the length of .unique().
new_df['name'].nunique(dropna=False)

2475

In [139]:
# Boolean mask: True for every repeat of a name after its first occurrence.
duplicate = new_df['name'].duplicated(keep='first')

In [147]:
# Display the name column.
# NOTE(review): the recorded output below is a boolean Series, which looks like the
# `duplicate` mask rather than the names themselves — confirm and re-run this cell.
new_df['name']

0       False
1       False
2       False
3       False
4       False
        ...  
2473    False
2474    False
2475    False
2476    False
2477    False
Name: name, Length: 2478, dtype: bool

In [141]:
# Show the rows flagged by the duplicate-name mask.
new_df.loc[duplicate]

Unnamed: 0,name,lastName,uri,imageUri,worthChange,age,source,industry,gender,country,...,state,headquarters,position,rank,worth,title,government,pay,managementAssets,salary
1131,jim davis,Davis,jim-davis-1,no-pic,0.0,59.0,staffing & recruiting,Service,M,United States,...,Maryland,MD,862.0,859.0,2800.0,,,,,
1214,robert miller,Miller,robert-miller,robert-miller,0.0,74.0,electronics components,Technology,M,Canada,...,,,945.0,924.0,2600.0,"President and CEO, Future Electronics",False,,,
1786,li li,Li,li-li-2,li-li-2,14.602,55.0,pharmaceuticals,Healthcare,M,China,...,,,1517.0,1477.0,1600.0,,,,,


In [100]:
# Missing-value count for each column of the Forbes data.
new_df.isna().sum()

name                   0
lastName               0
uri                    0
imageUri               0
worthChange          291
age                   63
source                 0
industry               6
gender                15
country                0
timestamp              0
realTimeWorth        291
realTimeRank         291
realTimePosition     291
squareImage          406
state               1753
headquarters        1817
position             271
rank                 271
worth                271
title               2135
government          2135
pay                 2456
managementAssets    2478
salary              2478
dtype: int64

In [52]:
# Load the previously cleaned billionaire data (semicolon-separated).
# keep_default_na=False turns off pandas' default NaN markers, so an empty field
# such as a blank country is read as the empty string '' instead of NaN.
old_df = pd.read_csv(
    '../data/processed/cleaned_output_1.csv',
    sep=';',
    keep_default_na=False,
)

In [133]:
# List the distinct country values in alphabetical order; the empty string '' in the
# recorded output marks rows with no country.
old_df['country'].sort_values().unique()

array(['', 'angola', 'argentina', 'australia', 'austria', 'belgium',
       'brazil', 'canada', 'chile', 'china', 'colombia', 'cyprus',
       'czech republic', 'denmark', 'egypt', 'finland', 'france',
       'georgia', 'germany', 'greece', 'hong kong', 'iceland', 'india',
       'indonesia', 'ireland', 'israel', 'italy', 'japan', 'kazakhstan',
       'kuwait', 'lebanon', 'liechtenstein', 'macau', 'malaysia',
       'mexico', 'monaco', 'morocco', 'netherlands', 'new zealand',
       'norway', 'oman', 'peru', 'philippines', 'poland', 'qatar',
       'russia', 'singapore', 'south africa', 'south korea', 'spain',
       'st. kitts and nevis', 'swaziland', 'sweden', 'switzerland',
       'taiwan', 'thailand', 'turkey', 'ukraine', 'united arab emirates',
       'united kingdom', 'united states', 'vietnam'], dtype=object)

In [130]:
def get_null(x):
    """Count the missing values in ``x``.

    Parameters
    ----------
    x : pandas object exposing ``isna()`` (e.g. a DataFrame or Series).

    Returns
    -------
    ``x.isna().sum()`` — per-column counts for a DataFrame, a single count
    for a Series.
    """
    missing_mask = x.isna()
    return missing_mask.sum()

In [134]:
# Missing-value counts per column of old_df.
# NOTE: old_df is read with keep_default_na=False, so blank fields are '' rather than
# NaN and are not counted here (the country column does contain '').
get_null(old_df)

id                  0
fullName            0
position            0
lastName            0
age                 0
gender              0
country             0
image               0
source              0
worth               0
worthChange         0
realTimePosition    0
sourceDetails       0
dtype: int64

In [34]:
def lowercase_feature(df, col):
    """Return column ``col`` of ``df`` with its strings lower-cased.

    This repeats the definition given earlier in the notebook with the same
    behaviour; ``df`` is left unmodified and the lower-cased column is returned.
    """
    selected = df[col]
    return selected.str.lower()

#new_df['name'] = lowercase_feature(new_df, 'name')

In [50]:
# Check the column dtypes of the merged frame.
# NOTE(review): this recorded output lists gender as object while the In [71] output
# lists it as float64 — the two outputs come from different states of merged_df.
merged_df.dtypes

id                       int64
fullName                object
position               float64
lastName                object
age                    float64
gender                  object
country                 object
image                   object
source                  object
worth                  float64
worthChange            float64
realTimePosition       float64
sourceDetails           object
r&dExpense_billions    float64
%GDP                   float64
r&dExpensePerCapita    float64
dtype: object

In [72]:
# Per-country summary of the merged billionaire / R&D data, ordered by number of
# billionaires (largest first).
#
# Rows without a country carry the empty string '' (old_df is read with
# keep_default_na=False), so that group is removed by label. The earlier positional
# `[1:]` slice only removed it while the blank group happened to sort first; otherwise
# it would silently have dropped a real country.
#
# The R&D columns are joined on country, so 'max' simply picks the per-country value.
table_analysis = (
    merged_df.groupby('country')
    .agg(
        billionaires=('id', 'count'),
        age=('age', 'mean'),
        gender=('gender', 'mean'),
        mean_billionaire_pos=('position', 'mean'),
        median_billionaire_pos=('position', 'median'),
        total_billionaire_worth=('worth', 'sum'),
        mean_billionaire_worth=('worth', 'mean'),
        median_billionaire_worth=('worth', 'median'),
        rd_expense_billions=('r&dExpense_billions', 'max'),
        rd_expense_capita=('r&dExpensePerCapita', 'max'),
        percent_GDP=('%GDP', 'max'),
    )
    .sort_values(by='billionaires', ascending=False)
    .drop(index='', errors='ignore')  # no-op when there is no blank-country group
    .reset_index()
)

In [75]:
# List the countries left in the summary, alphabetically; the recorded output no longer
# contains the blank '' entry.
table_analysis['country'].sort_values().unique()

array(['angola', 'argentina', 'australia', 'austria', 'belgium', 'brazil',
       'canada', 'chile', 'china', 'colombia', 'cyprus', 'czech republic',
       'denmark', 'egypt', 'finland', 'france', 'georgia', 'germany',
       'greece', 'hong kong', 'iceland', 'india', 'indonesia', 'ireland',
       'israel', 'italy', 'japan', 'kazakhstan', 'kuwait', 'lebanon',
       'liechtenstein', 'macau', 'malaysia', 'mexico', 'monaco',
       'morocco', 'netherlands', 'new zealand', 'norway', 'oman', 'peru',
       'philippines', 'poland', 'qatar', 'russia', 'singapore',
       'south africa', 'south korea', 'spain', 'st. kitts and nevis',
       'swaziland', 'sweden', 'switzerland', 'taiwan', 'thailand',
       'turkey', 'ukraine', 'united arab emirates', 'united kingdom',
       'united states', 'vietnam'], dtype=object)

In [167]:
# NOTE(review): `null_table` is not defined in any cell shown in this notebook, so this
# cell fails on a fresh kernel. Judging by the recorded output it holds the countries
# with no R&D figures — confirm and add the defining cell.
null_table

Unnamed: 0,country,billionaires,mean_billionaire_pos,median_billionaire_pos,mean_billionaire_worth,median_billionaire_worth,rdExpense,rdExpensePerCapita,percent_GDP
28,monaco,4,1463.0,1459.5,1.725,1.7,,,
35,st. kitts and nevis,2,1834.0,1834.0,1.35,1.35,,,
43,lebanon,2,1344.5,1344.5,2.0,2.0,,,
53,swaziland,1,505.0,505.0,4.3,4.3,,,
55,angola,1,933.0,933.0,2.6,2.6,,,
60,liechtenstein,1,587.0,587.0,3.9,3.9,,,


In [73]:
# NOTE(review): `rich_per_country` is not defined in any cell shown in this notebook, so
# this cell fails on a fresh kernel. The recorded output lists both 'united states' and
# 'usa' as separate countries, which suggests the names were not normalised — confirm.
rich_per_country

country
united states    201
china            158
usa               69
germany           45
india             41
                ... 
oman               1
qatar              1
south africa       1
swaziland          1
angola             1
Name: id, Length: 63, dtype: int64