# Financial Data

In [1]:
# Importing pathlib for output folders and Pandas to clean the dataset
from pathlib import Path

import pandas as pd

In [2]:
# The country/year template was created separately; it is the base table
# that all the other data frames are merged onto.
country_year = pd.read_csv(filepath_or_buffer='raw_data/country_template_year.csv')
country_year

Unnamed: 0,LOCATION,TIME,"Population, total"
0,AFG,1960,8.996973e+06
1,AFG,1961,9.169410e+06
2,AFG,1962,9.351441e+06
3,AFG,1963,9.543205e+06
4,AFG,1964,9.744781e+06
...,...,...,...
16099,WLD,2016,7.424286e+09
16100,WLD,2017,7.509074e+09
16101,WLD,2018,7.591945e+09
16102,WLD,2019,7.673534e+09


In [3]:
# Confirming the data type of the year column (TIME is used as a merge key below)
country_year['TIME'].dtype

dtype('int64')

In [4]:
# Loading the automated teller machines (ATMs) dataset
tellers = pd.read_csv(filepath_or_buffer='raw_data/automated_teller_machines.csv')
tellers

Unnamed: 0,LOCATION,TIME,"Automated teller machines (ATMs) (per 100,000 adults)"
0,AFG,1960,0.000000
1,AFG,1961,0.000000
2,AFG,1962,0.000000
3,AFG,1963,0.000000
4,AFG,1964,0.000000
...,...,...,...
16099,WLD,2016,38.330143
16100,WLD,2017,38.034455
16101,WLD,2018,40.949620
16102,WLD,2019,49.615022


In [5]:
# Confirming the year column has the same data type as in the template
tellers['TIME'].dtype

dtype('int64')

In [6]:
# Left-joining the ATM data onto the country/year template so that every
# template row is kept
merge = country_year.merge(tellers, how='left', on=['LOCATION', 'TIME']).copy()
merge

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)"
0,AFG,1960,8.996973e+06,0.000000
1,AFG,1961,9.169410e+06,0.000000
2,AFG,1962,9.351441e+06,0.000000
3,AFG,1963,9.543205e+06,0.000000
4,AFG,1964,9.744781e+06,0.000000
...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143
16100,WLD,2017,7.509074e+09,38.034455
16101,WLD,2018,7.591945e+09,40.949620
16102,WLD,2019,7.673534e+09,49.615022


In [7]:
# Loading the borrowers-from-commercial-banks dataset
bank_loans = pd.read_csv(filepath_or_buffer='raw_data/borrowersfrombanks.csv')
bank_loans

Unnamed: 0,LOCATION,TIME,"Borrowers from commercial banks (per 1,000 adults)"
0,AFG,1960,0.000000
1,AFG,1961,0.000000
2,AFG,1962,0.000000
3,AFG,1963,0.000000
4,AFG,1964,0.000000
...,...,...,...
16099,WLD,2016,164.010187
16100,WLD,2017,164.077303
16101,WLD,2018,0.000000
16102,WLD,2019,0.000000


In [8]:
# Left-joining the bank loan data onto the running merged frame
merge1 = merge.merge(bank_loans, how='left', on=['TIME', 'LOCATION'])
merge1

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)"
0,AFG,1960,8.996973e+06,0.000000,0.000000
1,AFG,1961,9.169410e+06,0.000000,0.000000
2,AFG,1962,9.351441e+06,0.000000,0.000000
3,AFG,1963,9.543205e+06,0.000000,0.000000
4,AFG,1964,9.744781e+06,0.000000,0.000000
...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187
16100,WLD,2017,7.509074e+09,38.034455,164.077303
16101,WLD,2018,7.591945e+09,40.949620,0.000000
16102,WLD,2019,7.673534e+09,49.615022,0.000000


In [9]:
# Loading the broad money growth dataset
broad_money_growth = pd.read_csv(filepath_or_buffer='raw_data/broadmoneygrowth.csv')
broad_money_growth

Unnamed: 0,LOCATION,TIME,broad_money_growth_%
0,AFG,1960,0.000000
1,AFG,1961,10.291859
2,AFG,1962,17.827298
3,AFG,1963,15.579196
4,AFG,1964,22.888116
...,...,...,...
16099,WLD,2016,0.000000
16100,WLD,2017,0.000000
16101,WLD,2018,0.000000
16102,WLD,2019,0.000000


In [10]:
# Left-joining broad money growth onto the running merged frame
merge2 = merge1.merge(broad_money_growth, how='left', on=['TIME', 'LOCATION'])
merge2

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%
0,AFG,1960,8.996973e+06,0.000000,0.000000,0.000000
1,AFG,1961,9.169410e+06,0.000000,0.000000,10.291859
2,AFG,1962,9.351441e+06,0.000000,0.000000,17.827298
3,AFG,1963,9.543205e+06,0.000000,0.000000,15.579196
4,AFG,1964,9.744781e+06,0.000000,0.000000,22.888116
...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187,0.000000
16100,WLD,2017,7.509074e+09,38.034455,164.077303,0.000000
16101,WLD,2018,7.591945e+09,40.949620,0.000000,0.000000
16102,WLD,2019,7.673534e+09,49.615022,0.000000,0.000000


In [11]:
# Loading the listed domestic companies dataset
domestic_companies = pd.read_csv(filepath_or_buffer='raw_data/domesticcompanieslist.csv')
domestic_companies

Unnamed: 0,LOCATION,TIME,"Listed domestic companies, total"
0,AFG,1960,0
1,AFG,1961,0
2,AFG,1962,0
3,AFG,1963,0
4,AFG,1964,0
...,...,...,...
16099,WLD,2016,43522
16100,WLD,2017,43444
16101,WLD,2018,43342
16102,WLD,2019,0


In [12]:
# Left-joining the listed domestic companies onto the running merged frame
merge3 = merge2.merge(domestic_companies, how='left', on=['TIME', 'LOCATION'])
merge3

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total"
0,AFG,1960,8.996973e+06,0.000000,0.000000,0.000000,0
1,AFG,1961,9.169410e+06,0.000000,0.000000,10.291859,0
2,AFG,1962,9.351441e+06,0.000000,0.000000,17.827298,0
3,AFG,1963,9.543205e+06,0.000000,0.000000,15.579196,0
4,AFG,1964,9.744781e+06,0.000000,0.000000,22.888116,0
...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187,0.000000,43522
16100,WLD,2017,7.509074e+09,38.034455,164.077303,0.000000,43444
16101,WLD,2018,7.591945e+09,40.949620,0.000000,0.000000,43342
16102,WLD,2019,7.673534e+09,49.615022,0.000000,0.000000,0


In [13]:
# Loading the foreign direct investment dataset; the displayed frame also
# carries many empty "Unnamed: N" columns, which are dropped in the next cell
foreign_investments = pd.read_csv(filepath_or_buffer='raw_data/foreigninvestment.csv')
foreign_investments

Unnamed: 0,LOCATION,TIME,"Foreign direct investment, net inflows (% of GDP)",Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 174,Unnamed: 175,Unnamed: 176,Unnamed: 177,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,Unnamed: 182,Unnamed: 183
0,AFG,1960,0.000000,,,,,,,,...,,,,,,,,,,
1,AFG,1961,0.000000,,,,,,,,...,,,,,,,,,,
2,AFG,1962,0.000000,,,,,,,,...,,,,,,,,,,
3,AFG,1963,0.000000,,,,,,,,...,,,,,,,,,,
4,AFG,1964,0.000000,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,3.473328,,,,,,,,...,,,,,,,,,,
16100,WLD,2017,2.666773,,,,,,,,...,,,,,,,,,,
16101,WLD,2018,1.320038,,,,,,,,...,,,,,,,,,,
16102,WLD,2019,1.767214,,,,,,,,...,,,,,,,,,,


In [14]:
# Keeping only the merge keys and the indicator column, as an independent copy
foreign_investments1 = foreign_investments.loc[:, [
    'LOCATION',
    'TIME',
    'Foreign direct investment, net inflows (% of GDP)',
]].copy()
foreign_investments1

Unnamed: 0,LOCATION,TIME,"Foreign direct investment, net inflows (% of GDP)"
0,AFG,1960,0.000000
1,AFG,1961,0.000000
2,AFG,1962,0.000000
3,AFG,1963,0.000000
4,AFG,1964,0.000000
...,...,...,...
16099,WLD,2016,3.473328
16100,WLD,2017,2.666773
16101,WLD,2018,1.320038
16102,WLD,2019,1.767214


In [15]:
# Left-joining foreign direct investment onto the running merged frame
merge4 = merge3.merge(foreign_investments1, how='left', on=['TIME', 'LOCATION'])
merge4

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total","Foreign direct investment, net inflows (% of GDP)"
0,AFG,1960,8.996973e+06,0.000000,0.000000,0.000000,0,0.000000
1,AFG,1961,9.169410e+06,0.000000,0.000000,10.291859,0,0.000000
2,AFG,1962,9.351441e+06,0.000000,0.000000,17.827298,0,0.000000
3,AFG,1963,9.543205e+06,0.000000,0.000000,15.579196,0,0.000000
4,AFG,1964,9.744781e+06,0.000000,0.000000,22.888116,0,0.000000
...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187,0.000000,43522,3.473328
16100,WLD,2017,7.509074e+09,38.034455,164.077303,0.000000,43444,2.666773
16101,WLD,2018,7.591945e+09,40.949620,0.000000,0.000000,43342,1.320038
16102,WLD,2019,7.673534e+09,49.615022,0.000000,0.000000,0,1.767214


In [16]:
# Loading the consumer price inflation dataset; the displayed frame also
# carries many empty "Unnamed: N" columns, which are dropped in the next cell
inflation_rates = pd.read_csv(filepath_or_buffer='raw_data/inflationrates.csv')
inflation_rates

Unnamed: 0,LOCATION,TIME,"Inflation, consumer prices (annual %)",Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70
0,AFG,1960,0.000000,,,,,,,,...,,,,,,,,,,
1,AFG,1961,0.000000,,,,,,,,...,,,,,,,,,,
2,AFG,1962,0.000000,,,,,,,,...,,,,,,,,,,
3,AFG,1963,0.000000,,,,,,,,...,,,,,,,,,,
4,AFG,1964,0.000000,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,1.457383,,,,,,,,...,,,,,,,,,,
16100,WLD,2017,2.182718,,,,,,,,...,,,,,,,,,,
16101,WLD,2018,2.457965,,,,,,,,...,,,,,,,,,,
16102,WLD,2019,2.301233,,,,,,,,...,,,,,,,,,,


In [17]:
# Keeping only the merge keys and the indicator column
inflation_rates1 = inflation_rates.loc[:, [
    'LOCATION',
    'TIME',
    'Inflation, consumer prices (annual %)',
]]
inflation_rates1

Unnamed: 0,LOCATION,TIME,"Inflation, consumer prices (annual %)"
0,AFG,1960,0.000000
1,AFG,1961,0.000000
2,AFG,1962,0.000000
3,AFG,1963,0.000000
4,AFG,1964,0.000000
...,...,...,...
16099,WLD,2016,1.457383
16100,WLD,2017,2.182718
16101,WLD,2018,2.457965
16102,WLD,2019,2.301233


In [18]:
# Left join, like every other merge in this section, so that all rows of merge4
# are kept even when a country/year has no match in the inflation data
# (pd.merge performs an inner join when `how` is omitted).
merge5 = pd.merge(merge4, inflation_rates1, on=['TIME','LOCATION'], how='left').copy()
merge5

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total","Foreign direct investment, net inflows (% of GDP)","Inflation, consumer prices (annual %)"
0,AFG,1960,8.996973e+06,0.000000,0.000000,0.000000,0,0.000000,0.000000
1,AFG,1961,9.169410e+06,0.000000,0.000000,10.291859,0,0.000000,0.000000
2,AFG,1962,9.351441e+06,0.000000,0.000000,17.827298,0,0.000000,0.000000
3,AFG,1963,9.543205e+06,0.000000,0.000000,15.579196,0,0.000000,0.000000
4,AFG,1964,9.744781e+06,0.000000,0.000000,22.888116,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187,0.000000,43522,3.473328,1.457383
16100,WLD,2017,7.509074e+09,38.034455,164.077303,0.000000,43444,2.666773,2.182718
16101,WLD,2018,7.591945e+09,40.949620,0.000000,0.000000,43342,1.320038,2.457965
16102,WLD,2019,7.673534e+09,49.615022,0.000000,0.000000,0,1.767214,2.301233


In [19]:
# Loading the stocks traded dataset
stock_trading = pd.read_csv(filepath_or_buffer='raw_data/stockstraded.csv')
stock_trading

Unnamed: 0,LOCATION,TIME,"Stocks traded, total value (% of GDP)"
0,AFG,1960,0.000000
1,AFG,1961,0.000000
2,AFG,1962,0.000000
3,AFG,1963,0.000000
4,AFG,1964,0.000000
...,...,...,...
16099,WLD,2016,124.014981
16100,WLD,2017,116.874198
16101,WLD,2018,96.488669
16102,WLD,2019,87.492619


In [20]:
# Left-joining stocks traded onto the running merged frame
merge6 = merge5.merge(stock_trading, how='left', on=['TIME', 'LOCATION']).copy()
merge6

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total","Foreign direct investment, net inflows (% of GDP)","Inflation, consumer prices (annual %)","Stocks traded, total value (% of GDP)"
0,AFG,1960,8.996973e+06,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000
1,AFG,1961,9.169410e+06,0.000000,0.000000,10.291859,0,0.000000,0.000000,0.000000
2,AFG,1962,9.351441e+06,0.000000,0.000000,17.827298,0,0.000000,0.000000,0.000000
3,AFG,1963,9.543205e+06,0.000000,0.000000,15.579196,0,0.000000,0.000000,0.000000
4,AFG,1964,9.744781e+06,0.000000,0.000000,22.888116,0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187,0.000000,43522,3.473328,1.457383,124.014981
16100,WLD,2017,7.509074e+09,38.034455,164.077303,0.000000,43444,2.666773,2.182718,116.874198
16101,WLD,2018,7.591945e+09,40.949620,0.000000,0.000000,43342,1.320038,2.457965,96.488669
16102,WLD,2019,7.673534e+09,49.615022,0.000000,0.000000,0,1.767214,2.301233,87.492619


In [21]:
# Loading the total reserves dataset
total_reserves = pd.read_csv(filepath_or_buffer='raw_data/totalreserves.csv')
total_reserves

Unnamed: 0,LOCATION,TIME,"Total reserves (includes gold, current US$)"
0,AFG,1960,50690800.0
1,AFG,1961,42444500.0
2,AFG,1962,40592100.0
3,AFG,1963,45547800.0
4,AFG,1964,44619680.0
...,...,...,...
16099,WLD,2016,0.0
16100,WLD,2017,0.0
16101,WLD,2018,0.0
16102,WLD,2019,0.0


In [22]:
# Left-joining total reserves onto the running merged frame
merge7 = merge6.merge(total_reserves, how='left', on=['TIME', 'LOCATION']).copy()
merge7

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total","Foreign direct investment, net inflows (% of GDP)","Inflation, consumer prices (annual %)","Stocks traded, total value (% of GDP)","Total reserves (includes gold, current US$)"
0,AFG,1960,8.996973e+06,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,50690800.0
1,AFG,1961,9.169410e+06,0.000000,0.000000,10.291859,0,0.000000,0.000000,0.000000,42444500.0
2,AFG,1962,9.351441e+06,0.000000,0.000000,17.827298,0,0.000000,0.000000,0.000000,40592100.0
3,AFG,1963,9.543205e+06,0.000000,0.000000,15.579196,0,0.000000,0.000000,0.000000,45547800.0
4,AFG,1964,9.744781e+06,0.000000,0.000000,22.888116,0,0.000000,0.000000,0.000000,44619680.0
...,...,...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187,0.000000,43522,3.473328,1.457383,124.014981,0.0
16100,WLD,2017,7.509074e+09,38.034455,164.077303,0.000000,43444,2.666773,2.182718,116.874198,0.0
16101,WLD,2018,7.591945e+09,40.949620,0.000000,0.000000,43342,1.320038,2.457965,96.488669,0.0
16102,WLD,2019,7.673534e+09,49.615022,0.000000,0.000000,0,1.767214,2.301233,87.492619,0.0


In [23]:
# Giving the fully merged frame a descriptive name. This is a second name for
# the same object as merge7, not a copy.
finance_data = merge7
finance_data

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total","Foreign direct investment, net inflows (% of GDP)","Inflation, consumer prices (annual %)","Stocks traded, total value (% of GDP)","Total reserves (includes gold, current US$)"
0,AFG,1960,8.996973e+06,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,50690800.0
1,AFG,1961,9.169410e+06,0.000000,0.000000,10.291859,0,0.000000,0.000000,0.000000,42444500.0
2,AFG,1962,9.351441e+06,0.000000,0.000000,17.827298,0,0.000000,0.000000,0.000000,40592100.0
3,AFG,1963,9.543205e+06,0.000000,0.000000,15.579196,0,0.000000,0.000000,0.000000,45547800.0
4,AFG,1964,9.744781e+06,0.000000,0.000000,22.888116,0,0.000000,0.000000,0.000000,44619680.0
...,...,...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,38.330143,164.010187,0.000000,43522,3.473328,1.457383,124.014981,0.0
16100,WLD,2017,7.509074e+09,38.034455,164.077303,0.000000,43444,2.666773,2.182718,116.874198,0.0
16101,WLD,2018,7.591945e+09,40.949620,0.000000,0.000000,43342,1.320038,2.457965,96.488669,0.0
16102,WLD,2019,7.673534e+09,49.615022,0.000000,0.000000,0,1.767214,2.301233,87.492619,0.0


In [24]:
# Scraping the tables of countries and their ISO codes from countrycode.org.
# This cell needs network access when the notebook is re-run.
countries = pd.read_html(io='https://countrycode.org/')

In [25]:
# Viewing the scraped result (a list of DataFrames rather than a single DataFrame)
countries

[               COUNTRY COUNTRY CODE ISO CODES  POPULATION  AREA KM2  \
 0          Afghanistan           93  AF / AFG    29121286    647500   
 1              Albania          355  AL / ALB     2986952     28748   
 2              Algeria          213  DZ / DZA    34586184   2381740   
 3       American Samoa        1-684  AS / ASM       57881       199   
 4              Andorra          376  AD / AND       84000       468   
 ..                 ...          ...       ...         ...       ...   
 235  Wallis and Futuna          681  WF / WLF       16025       274   
 236     Western Sahara          212  EH / ESH      273008    266000   
 237              Yemen          967  YE / YEM    23495361    527970   
 238             Zambia          260  ZM / ZMB    13460305    752614   
 239           Zimbabwe          263  ZW / ZWE    11651858    390580   
 
           GDP $USD  
 0    20.65 Billion  
 1     12.8 Billion  
 2    215.7 Billion  
 3    462.2 Million  
 4      4.8 Billion  
 .

In [26]:
# Determining the type of the countries variable
type(countries)

list

In [27]:
# Taking the first element of the list, which is the DataFrame of countries
countries_list = countries[0]
countries_list

Unnamed: 0,COUNTRY,COUNTRY CODE,ISO CODES,POPULATION,AREA KM2,GDP $USD
0,Afghanistan,93,AF / AFG,29121286,647500,20.65 Billion
1,Albania,355,AL / ALB,2986952,28748,12.8 Billion
2,Algeria,213,DZ / DZA,34586184,2381740,215.7 Billion
3,American Samoa,1-684,AS / ASM,57881,199,462.2 Million
4,Andorra,376,AD / AND,84000,468,4.8 Billion
...,...,...,...,...,...,...
235,Wallis and Futuna,681,WF / WLF,16025,274,
236,Western Sahara,212,EH / ESH,273008,266000,
237,Yemen,967,YE / YEM,23495361,527970,43.89 Billion
238,Zambia,260,ZM / ZMB,13460305,752614,22.24 Billion


In [28]:
# Splitting "ISO CODES" (e.g. "AF / AFG") at the slash into two columns:
# column 0 holds the two-letter code and column 1 the three-letter code.
# The surrounding spaces are still present here and are stripped later.
iso_code_split = countries_list['ISO CODES'].str.split(pat="/", n=1, expand=True)
iso_code_split

Unnamed: 0,0,1
0,AF,AFG
1,AL,ALB
2,DZ,DZA
3,AS,ASM
4,AD,AND
...,...,...
235,WF,WLF
236,EH,ESH
237,YE,YEM
238,ZM,ZMB


In [29]:
# Adding the two-letter codes (first split column) back to the dataframe
countries_list['ISO CODE 2L'] = iso_code_split.iloc[:, 0]
countries_list

Unnamed: 0,COUNTRY,COUNTRY CODE,ISO CODES,POPULATION,AREA KM2,GDP $USD,ISO CODE 2L
0,Afghanistan,93,AF / AFG,29121286,647500,20.65 Billion,AF
1,Albania,355,AL / ALB,2986952,28748,12.8 Billion,AL
2,Algeria,213,DZ / DZA,34586184,2381740,215.7 Billion,DZ
3,American Samoa,1-684,AS / ASM,57881,199,462.2 Million,AS
4,Andorra,376,AD / AND,84000,468,4.8 Billion,AD
...,...,...,...,...,...,...,...
235,Wallis and Futuna,681,WF / WLF,16025,274,,WF
236,Western Sahara,212,EH / ESH,273008,266000,,EH
237,Yemen,967,YE / YEM,23495361,527970,43.89 Billion,YE
238,Zambia,260,ZM / ZMB,13460305,752614,22.24 Billion,ZM


In [30]:
# Adding the three-letter codes (second split column) back to the dataframe
countries_list['ISO CODE 3L'] = iso_code_split.iloc[:, 1]
countries_list

Unnamed: 0,COUNTRY,COUNTRY CODE,ISO CODES,POPULATION,AREA KM2,GDP $USD,ISO CODE 2L,ISO CODE 3L
0,Afghanistan,93,AF / AFG,29121286,647500,20.65 Billion,AF,AFG
1,Albania,355,AL / ALB,2986952,28748,12.8 Billion,AL,ALB
2,Algeria,213,DZ / DZA,34586184,2381740,215.7 Billion,DZ,DZA
3,American Samoa,1-684,AS / ASM,57881,199,462.2 Million,AS,ASM
4,Andorra,376,AD / AND,84000,468,4.8 Billion,AD,AND
...,...,...,...,...,...,...,...,...
235,Wallis and Futuna,681,WF / WLF,16025,274,,WF,WLF
236,Western Sahara,212,EH / ESH,273008,266000,,EH,ESH
237,Yemen,967,YE / YEM,23495361,527970,43.89 Billion,YE,YEM
238,Zambia,260,ZM / ZMB,13460305,752614,22.24 Billion,ZM,ZMB


In [31]:
# Country names as a plain Python list; previewing the first five
list_of_countries = countries_list['COUNTRY'].to_list()
list_of_countries[0:5]

['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra']

In [32]:
# Two-letter ISO codes as a plain Python list; previewing the first five
_2l_ISO_CODES = countries_list['ISO CODE 2L'].to_list()
_2l_ISO_CODES[0:5]

['AF ', 'AL ', 'DZ ', 'AS ', 'AD ']

In [33]:
# Stripping the leading/trailing spaces left over from the split
_2l_ISO_CODES = [code.strip(' ') for code in _2l_ISO_CODES]
_2l_ISO_CODES[0:5]

['AF', 'AL', 'DZ', 'AS', 'AD']

In [34]:
# Three-letter ISO codes as a plain Python list; previewing the first five
_3l_ISO_CODES = countries_list['ISO CODE 3L'].to_list()
_3l_ISO_CODES[0:5]

[' AFG', ' ALB', ' DZA', ' ASM', ' AND']

In [35]:
# Stripping the leading/trailing spaces left over from the split
_3l_ISO_CODES = [code.strip(' ') for code in _3l_ISO_CODES]
_3l_ISO_CODES[0:5]

['AFG', 'ALB', 'DZA', 'ASM', 'AND']

In [36]:
# Collecting the country names and their codes in a dictionary keyed by the
# future column names; the three-letter code is stored as LOCATION so it
# matches the merge key used by the data frames above
countries_dict = {}
countries_dict["Country"] = list_of_countries
countries_dict["LOCATION"] = _3l_ISO_CODES
countries_dict["ISO CODE 2 letter"] = _2l_ISO_CODES

In [37]:
# Building the country lookup dataframe from the dictionary
country_df = pd.DataFrame(data=countries_dict)
country_df

Unnamed: 0,Country,LOCATION,ISO CODE 2 letter
0,Afghanistan,AFG,AF
1,Albania,ALB,AL
2,Algeria,DZA,DZ
3,American Samoa,ASM,AS
4,Andorra,AND,AD
...,...,...,...
235,Wallis and Futuna,WLF,WF
236,Western Sahara,ESH,EH
237,Yemen,YEM,YE
238,Zambia,ZMB,ZM


In [38]:
# Attaching the country name and two-letter code. The inner join keeps only
# LOCATION codes that also appear in country_df, so rows for any other code
# are dropped (the output is shorter than finance_data).
finance_data_final = finance_data.merge(country_df, how='inner', on='LOCATION')
finance_data_final

Unnamed: 0,LOCATION,TIME,"Population, total","Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total","Foreign direct investment, net inflows (% of GDP)","Inflation, consumer prices (annual %)","Stocks traded, total value (% of GDP)","Total reserves (includes gold, current US$)",Country,ISO CODE 2 letter
0,AFG,1960,8996973.0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.0,5.069080e+07,Afghanistan,AF
1,AFG,1961,9169410.0,0.000000,0.000000,10.291859,0,0.000000,0.000000,0.0,4.244450e+07,Afghanistan,AF
2,AFG,1962,9351441.0,0.000000,0.000000,17.827298,0,0.000000,0.000000,0.0,4.059210e+07,Afghanistan,AF
3,AFG,1963,9543205.0,0.000000,0.000000,15.579196,0,0.000000,0.000000,0.0,4.554780e+07,Afghanistan,AF
4,AFG,1964,9744781.0,0.000000,0.000000,22.888116,0,0.000000,0.000000,0.0,4.461968e+07,Afghanistan,AF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13171,ZWE,2016,14030390.0,7.073321,40.671346,19.048048,0,1.669274,-1.566413,0.0,4.071932e+08,Zimbabwe,ZW
13172,ZWE,2017,14236745.0,6.889214,40.956438,38.646489,0,1.121496,0.909733,0.0,2.926212e+08,Zimbabwe,ZW
13173,ZWE,2018,14439018.0,6.625208,142.527944,28.048464,0,3.062893,0.000000,0.0,8.695109e+07,Zimbabwe,ZW
13174,ZWE,2019,14645468.0,6.398139,90.239497,249.835278,0,1.305924,0.000000,0.0,1.512405e+08,Zimbabwe,ZW


In [39]:
# Listing the column names of the merged finance frame
finance_data_final.columns

Index(['LOCATION', 'TIME', 'Population, total',
       'Automated teller machines (ATMs) (per 100,000 adults)',
       'Borrowers from commercial banks (per 1,000 adults)',
       'broad_money_growth_%', 'Listed domestic companies, total',
       'Foreign direct investment, net inflows (% of GDP)',
       'Inflation, consumer prices (annual %)',
       'Stocks traded, total value (% of GDP)',
       'Total reserves (includes gold, current US$)', 'Country',
       'ISO CODE 2 letter'],
      dtype='object')

In [40]:
# Reordering so the country name comes first, and leaving out the population
# and two-letter ISO code columns
finance_data_final1 = finance_data_final.loc[:, [
    'Country',
    'LOCATION',
    'TIME',
    'Automated teller machines (ATMs) (per 100,000 adults)',
    'Borrowers from commercial banks (per 1,000 adults)',
    'broad_money_growth_%',
    'Listed domestic companies, total',
    'Foreign direct investment, net inflows (% of GDP)',
    'Inflation, consumer prices (annual %)',
    'Stocks traded, total value (% of GDP)',
    'Total reserves (includes gold, current US$)',
]]

In [41]:
# Viewing the final finance frame before exporting it
finance_data_final1

Unnamed: 0,Country,LOCATION,TIME,"Automated teller machines (ATMs) (per 100,000 adults)","Borrowers from commercial banks (per 1,000 adults)",broad_money_growth_%,"Listed domestic companies, total","Foreign direct investment, net inflows (% of GDP)","Inflation, consumer prices (annual %)","Stocks traded, total value (% of GDP)","Total reserves (includes gold, current US$)"
0,Afghanistan,AFG,1960,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.0,5.069080e+07
1,Afghanistan,AFG,1961,0.000000,0.000000,10.291859,0,0.000000,0.000000,0.0,4.244450e+07
2,Afghanistan,AFG,1962,0.000000,0.000000,17.827298,0,0.000000,0.000000,0.0,4.059210e+07
3,Afghanistan,AFG,1963,0.000000,0.000000,15.579196,0,0.000000,0.000000,0.0,4.554780e+07
4,Afghanistan,AFG,1964,0.000000,0.000000,22.888116,0,0.000000,0.000000,0.0,4.461968e+07
...,...,...,...,...,...,...,...,...,...,...,...
13171,Zimbabwe,ZWE,2016,7.073321,40.671346,19.048048,0,1.669274,-1.566413,0.0,4.071932e+08
13172,Zimbabwe,ZWE,2017,6.889214,40.956438,38.646489,0,1.121496,0.909733,0.0,2.926212e+08
13173,Zimbabwe,ZWE,2018,6.625208,142.527944,28.048464,0,3.062893,0.000000,0.0,8.695109e+07
13174,Zimbabwe,ZWE,2019,6.398139,90.239497,249.835278,0,1.305924,0.000000,0.0,1.512405e+08


In [42]:
# Exporting Dataframe to CSV
# The output folder is created first: to_csv does not create missing parent
# directories, so the export would otherwise fail on a fresh checkout.
Path('cleaned_data').mkdir(parents=True, exist_ok=True)
finance_data_final1.to_csv('cleaned_data/financeData.csv', index=False)

# Health Data

In [43]:
# Displaying the country/year template loaded at the start of the notebook;
# it is reused as the base table for the health data
country_year

Unnamed: 0,LOCATION,TIME,"Population, total"
0,AFG,1960,8.996973e+06
1,AFG,1961,9.169410e+06
2,AFG,1962,9.351441e+06
3,AFG,1963,9.543205e+06
4,AFG,1964,9.744781e+06
...,...,...,...
16099,WLD,2016,7.424286e+09
16100,WLD,2017,7.509074e+09
16101,WLD,2018,7.591945e+09
16102,WLD,2019,7.673534e+09


In [44]:
# Loading the share-of-population-with-cancer dataset
cancer_incidence = pd.read_csv(
    filepath_or_buffer='raw_data/health_data_raw/share-of-population-with-cancer.csv'
)
cancer_incidence

Unnamed: 0,Entity,Code,Year,Prevalence - Neoplasms - Sex: Both - Age: Age-standardized (Percent)
0,Afghanistan,AFG,1990,0.476867
1,Afghanistan,AFG,1991,0.476258
2,Afghanistan,AFG,1992,0.475649
3,Afghanistan,AFG,1993,0.475640
4,Afghanistan,AFG,1994,0.480281
...,...,...,...,...
6463,Zimbabwe,ZWE,2013,0.569652
6464,Zimbabwe,ZWE,2014,0.568133
6465,Zimbabwe,ZWE,2015,0.566873
6466,Zimbabwe,ZWE,2016,0.567007


In [45]:
# Dropping the country name and renaming the key columns to LOCATION/TIME so
# they match the country/year template
cancer_incidence1 = (
    cancer_incidence
    .drop(labels=['Entity'], axis='columns')
    .rename({'Code':'LOCATION','Year':'TIME'}, axis='columns')
)
cancer_incidence1

Unnamed: 0,LOCATION,TIME,Prevalence - Neoplasms - Sex: Both - Age: Age-standardized (Percent)
0,AFG,1990,0.476867
1,AFG,1991,0.476258
2,AFG,1992,0.475649
3,AFG,1993,0.475640
4,AFG,1994,0.480281
...,...,...,...
6463,ZWE,2013,0.569652
6464,ZWE,2014,0.568133
6465,ZWE,2015,0.566873
6466,ZWE,2016,0.567007


In [46]:
# Left-joining cancer prevalence onto the country/year template; template rows
# without a match get NaN in the prevalence column
health_merge = country_year.merge(cancer_incidence1, how='left', on=['LOCATION', 'TIME'])
health_merge

Unnamed: 0,LOCATION,TIME,"Population, total",Prevalence - Neoplasms - Sex: Both - Age: Age-standardized (Percent)
0,AFG,1960,8.996973e+06,
1,AFG,1961,9.169410e+06,
2,AFG,1962,9.351441e+06,
3,AFG,1963,9.543205e+06,
4,AFG,1964,9.744781e+06,
...,...,...,...,...
16099,WLD,2016,7.424286e+09,
16100,WLD,2017,7.509074e+09,
16101,WLD,2018,7.591945e+09,
16102,WLD,2019,7.673534e+09,


In [47]:
# Replacing missing values with 0.
# NOTE(review): after this step a 0 can mean "no data" as well as a true zero.
health_merge1 = health_merge.fillna(value=0)
health_merge1

Unnamed: 0,LOCATION,TIME,"Population, total",Prevalence - Neoplasms - Sex: Both - Age: Age-standardized (Percent)
0,AFG,1960,8.996973e+06,0.0
1,AFG,1961,9.169410e+06,0.0
2,AFG,1962,9.351441e+06,0.0
3,AFG,1963,9.543205e+06,0.0
4,AFG,1964,9.744781e+06,0.0
...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0
16100,WLD,2017,7.509074e+09,0.0
16101,WLD,2018,7.591945e+09,0.0
16102,WLD,2019,7.673534e+09,0.0


In [48]:
# Converting the prevalence percentage into a head count:
# (prevalence % / 100) * total population.
# NOTE(review): the prevalence column is labelled age-standardized; confirm
# that multiplying it by the total population is the intended estimate.
health_merge1['total_cancer_cases'] = (
    health_merge1['Prevalence - Neoplasms - Sex: Both - Age: Age-standardized (Percent)']
    .div(100)
    .mul(health_merge1['Population, total'])
)
health_merge1

Unnamed: 0,LOCATION,TIME,"Population, total",Prevalence - Neoplasms - Sex: Both - Age: Age-standardized (Percent),total_cancer_cases
0,AFG,1960,8.996973e+06,0.0,0.0
1,AFG,1961,9.169410e+06,0.0,0.0
2,AFG,1962,9.351441e+06,0.0,0.0
3,AFG,1963,9.543205e+06,0.0,0.0
4,AFG,1964,9.744781e+06,0.0,0.0
...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0
16100,WLD,2017,7.509074e+09,0.0,0.0
16101,WLD,2018,7.591945e+09,0.0,0.0
16102,WLD,2019,7.673534e+09,0.0,0.0


In [49]:
# The percentage column is no longer needed once the head count is computed
health_merge2 = health_merge1.drop(
    labels=['Prevalence - Neoplasms - Sex: Both - Age: Age-standardized (Percent)'],
    axis='columns',
).copy()

In [50]:
# Viewing the frame after dropping the prevalence percentage column
health_merge2

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases
0,AFG,1960,8.996973e+06,0.0
1,AFG,1961,9.169410e+06,0.0
2,AFG,1962,9.351441e+06,0.0
3,AFG,1963,9.543205e+06,0.0
4,AFG,1964,9.744781e+06,0.0
...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0
16100,WLD,2017,7.509074e+09,0.0
16101,WLD,2018,7.591945e+09,0.0
16102,WLD,2019,7.673534e+09,0.0


In [51]:
# Loading the total cancer deaths dataset
cancer_deaths = pd.read_csv(
    filepath_or_buffer='raw_data/health_data_raw/total-cancer-deaths-by-type.csv'
)
cancer_deaths

Unnamed: 0,Code,Year,total_all_cancers
0,AFG,1990,9361.156720
1,AFG,1991,9462.417022
2,AFG,1992,9843.319829
3,AFG,1993,10321.108650
4,AFG,1994,10639.991520
...,...,...,...
6463,ZWE,2013,9304.297265
6464,ZWE,2014,9371.946138
6465,ZWE,2015,9499.036133
6466,ZWE,2016,9698.880172


In [52]:
# Renaming the key columns to LOCATION/TIME so they match the template
cancer_deaths1 = cancer_deaths.rename({'Code':'LOCATION','Year':'TIME'}, axis='columns')
cancer_deaths1

Unnamed: 0,LOCATION,TIME,total_all_cancers
0,AFG,1990,9361.156720
1,AFG,1991,9462.417022
2,AFG,1992,9843.319829
3,AFG,1993,10321.108650
4,AFG,1994,10639.991520
...,...,...,...
6463,ZWE,2013,9304.297265
6464,ZWE,2014,9371.946138
6465,ZWE,2015,9499.036133
6466,ZWE,2016,9698.880172


In [53]:
# Left-joining total cancer deaths onto the running health frame
health_merge3 = health_merge2.merge(cancer_deaths1, how='left', on=['TIME', 'LOCATION'])
health_merge3

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_all_cancers
0,AFG,1960,8.996973e+06,0.0,
1,AFG,1961,9.169410e+06,0.0,
2,AFG,1962,9.351441e+06,0.0,
3,AFG,1963,9.543205e+06,0.0,
4,AFG,1964,9.744781e+06,0.0,
...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,
16100,WLD,2017,7.509074e+09,0.0,
16101,WLD,2018,7.591945e+09,0.0,
16102,WLD,2019,7.673534e+09,0.0,


In [54]:
# Replacing the NaN values introduced by the left join with 0
health_merge4 = health_merge3.fillna(value=0)

In [55]:
# Viewing the frame after filling the missing values
health_merge4

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_all_cancers
0,AFG,1960,8.996973e+06,0.0,0.0
1,AFG,1961,9.169410e+06,0.0,0.0
2,AFG,1962,9.351441e+06,0.0,0.0
3,AFG,1963,9.543205e+06,0.0,0.0
4,AFG,1964,9.744781e+06,0.0,0.0
...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0
16100,WLD,2017,7.509074e+09,0.0,0.0
16101,WLD,2018,7.591945e+09,0.0,0.0
16102,WLD,2019,7.673534e+09,0.0,0.0


In [56]:
# Loading the share-of-adults-defined-as-obese dataset
obesity = pd.read_csv(
    filepath_or_buffer='raw_data/health_data_raw/share-of-adults-defined-as-obese.csv'
)
obesity

Unnamed: 0,Code,Year,Prevalence of obesity (both sexes) - WHO (2019)
0,AFG,1975,0.5
1,AFG,1976,0.5
2,AFG,1977,0.6
3,AFG,1978,0.6
4,AFG,1979,0.6
...,...,...,...
8311,ZWE,2012,14.3
8312,ZWE,2013,14.6
8313,ZWE,2014,14.9
8314,ZWE,2015,15.2


In [57]:
# Renaming the key columns to LOCATION/TIME so they match the template
obesity1 = obesity.rename({'Code':'LOCATION','Year':'TIME'}, axis='columns')
obesity1

Unnamed: 0,LOCATION,TIME,Prevalence of obesity (both sexes) - WHO (2019)
0,AFG,1975,0.5
1,AFG,1976,0.5
2,AFG,1977,0.6
3,AFG,1978,0.6
4,AFG,1979,0.6
...,...,...,...
8311,ZWE,2012,14.3
8312,ZWE,2013,14.6
8313,ZWE,2014,14.9
8314,ZWE,2015,15.2


In [58]:
# Left-joining obesity prevalence onto the running health frame
health_merge5 = health_merge4.merge(obesity1, how='left', on=['TIME', 'LOCATION'])
health_merge5

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_all_cancers,Prevalence of obesity (both sexes) - WHO (2019)
0,AFG,1960,8.996973e+06,0.0,0.0,
1,AFG,1961,9.169410e+06,0.0,0.0,
2,AFG,1962,9.351441e+06,0.0,0.0,
3,AFG,1963,9.543205e+06,0.0,0.0,
4,AFG,1964,9.744781e+06,0.0,0.0,
...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,
16100,WLD,2017,7.509074e+09,0.0,0.0,
16101,WLD,2018,7.591945e+09,0.0,0.0,
16102,WLD,2019,7.673534e+09,0.0,0.0,


In [59]:
# Replacing the NaN values introduced by the left join with 0
health_merge6 = health_merge5.fillna(value=0)

In [60]:
# Converting the obesity percentage into a head count:
# (prevalence % / 100) * total population.
# NOTE(review): the source file is named "share of adults", while the
# multiplier here is the total population; confirm this is intended.
health_merge6['total_obesity_numbers'] = (
    health_merge6['Prevalence of obesity (both sexes) - WHO (2019)']
    .div(100)
    .mul(health_merge6['Population, total'])
)
health_merge6

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_all_cancers,Prevalence of obesity (both sexes) - WHO (2019),total_obesity_numbers
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,0.0
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,0.0
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,0.0
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,0.0
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,0.0
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,0.0
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,0.0
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.0


In [61]:
# The percentage column is no longer needed once the head count is computed
health_merge7 = health_merge6.drop(
    labels=['Prevalence of obesity (both sexes) - WHO (2019)'],
    axis='columns',
)
health_merge7

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_all_cancers,total_obesity_numbers
0,AFG,1960,8.996973e+06,0.0,0.0,0.0
1,AFG,1961,9.169410e+06,0.0,0.0,0.0
2,AFG,1962,9.351441e+06,0.0,0.0,0.0
3,AFG,1963,9.543205e+06,0.0,0.0,0.0
4,AFG,1964,9.744781e+06,0.0,0.0,0.0
...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0


In [62]:
# Giving the cancer deaths column a clearer name
health_merge8 = health_merge7.rename(
    {'total_all_cancers':'total_cancer_deaths'},
    axis='columns',
)
health_merge8

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers
0,AFG,1960,8.996973e+06,0.0,0.0,0.0
1,AFG,1961,9.169410e+06,0.0,0.0,0.0
2,AFG,1962,9.351441e+06,0.0,0.0,0.0
3,AFG,1963,9.543205e+06,0.0,0.0,0.0
4,AFG,1964,9.744781e+06,0.0,0.0,0.0
...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0


In [63]:
# Birth rate: load the crude birth rate (per 1,000 people) file
birthrate = pd.read_csv(filepath_or_buffer='raw_data/health_data_raw/birthrate.csv')
birthrate

Unnamed: 0,LOCATION,TIME,"Birth rate, crude (per 1,000 people)"
0,AFG,1960,51.279000
1,AFG,1961,51.373000
2,AFG,1962,51.457000
3,AFG,1963,51.530000
4,AFG,1964,51.589000
...,...,...,...
16099,WLD,2016,18.962718
16100,WLD,2017,18.633493
16101,WLD,2018,18.174850
16102,WLD,2019,0.000000


In [64]:
# Left-join the birth rate onto the health frame, matching on country code and year
health_merge9 = health_merge8.merge(birthrate, how='left', on=['LOCATION','TIME'])
health_merge9

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,"Birth rate, crude (per 1,000 people)"
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,51.279000
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,51.373000
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,51.457000
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,51.530000
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,51.589000
...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,18.962718
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,18.633493
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,18.174850
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000


In [65]:
# Scale the per-1,000 crude birth rate by (population / 1000).
# NOTE: despite the column name, the stored value is therefore an absolute count, not a rate.
health_merge9['birth_rate'] = (
    health_merge9['Population, total']
    .div(1000)
    .mul(health_merge9['Birth rate, crude (per 1,000 people)'])
)
health_merge9

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,"Birth rate, crude (per 1,000 people)",birth_rate
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,51.279000,4.613558e+05
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,51.373000,4.710601e+05
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,51.457000,4.811971e+05
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,51.530000,4.917614e+05
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,51.589000,5.027235e+05
...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,18.962718,1.407846e+08
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,18.633493,1.399203e+08
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,18.174850,1.379825e+08
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000,0.000000e+00


In [66]:
# Remove the per-1,000 rate now that the scaled 'birth_rate' column exists
health_merge10 = health_merge9.drop('Birth rate, crude (per 1,000 people)', axis=1)
health_merge10

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,4.613558e+05
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,4.710601e+05
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,4.811971e+05
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,4.917614e+05
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,5.027235e+05
...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,1.407846e+08
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,1.399203e+08
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,1.379825e+08
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000e+00


In [67]:
# Death rates: load the crude death rate (per 1,000 people) file
deahtrates = pd.read_csv(filepath_or_buffer='raw_data/health_data_raw/deathrate.csv')
deahtrates

Unnamed: 0,LOCATION,TIME,"Death rate, crude (per 1,000 people)"
0,AFG,1960,32.219000
1,AFG,1961,31.649000
2,AFG,1962,31.093000
3,AFG,1963,30.551000
4,AFG,1964,30.022000
...,...,...,...
16099,WLD,2016,7.544484
16100,WLD,2017,7.541172
16101,WLD,2018,7.528823
16102,WLD,2019,0.000000


In [68]:
# Left-join the death rate onto the health frame, matching on country code and year
health_merge11 = health_merge10.merge(deahtrates, how='left', on=['LOCATION','TIME'])
health_merge11

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate,"Death rate, crude (per 1,000 people)"
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,4.613558e+05,32.219000
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,4.710601e+05,31.649000
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,4.811971e+05,31.093000
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,4.917614e+05,30.551000
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,5.027235e+05,30.022000
...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,1.407846e+08,7.544484
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,1.399203e+08,7.541172
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,1.379825e+08,7.528823
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000e+00,0.000000


In [69]:
# Scale the per-1,000 crude death rate by (population / 1000).
# NOTE: despite the column name, the stored value is therefore an absolute count, not a rate.
health_merge11['death_rate'] = (
    health_merge11['Population, total']
    .div(1000)
    .mul(health_merge11['Death rate, crude (per 1,000 people)'])
)
health_merge11

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate,"Death rate, crude (per 1,000 people)",death_rate
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,4.613558e+05,32.219000,2.898735e+05
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,4.710601e+05,31.649000,2.902027e+05
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,4.811971e+05,31.093000,2.907644e+05
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,4.917614e+05,30.551000,2.915545e+05
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,5.027235e+05,30.022000,2.925578e+05
...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,1.407846e+08,7.544484,5.601241e+07
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,1.399203e+08,7.541172,5.662722e+07
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,1.379825e+08,7.528823,5.715841e+07
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000e+00,0.000000,0.000000e+00


In [70]:
# Remove the per-1,000 rate now that the scaled 'death_rate' column exists
health_merge12 = health_merge11.drop('Death rate, crude (per 1,000 people)', axis=1)
health_merge12

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate,death_rate
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,4.613558e+05,2.898735e+05
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,4.710601e+05,2.902027e+05
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,4.811971e+05,2.907644e+05
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,4.917614e+05,2.915545e+05
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,5.027235e+05,2.925578e+05
...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,1.407846e+08,5.601241e+07
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,1.399203e+08,5.662722e+07
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,1.379825e+08,5.715841e+07
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000e+00,0.000000e+00


In [71]:
# Life expectancy: load the life-expectancy-at-birth file
life_expectency = pd.read_csv(filepath_or_buffer='raw_data/health_data_raw/lifeexpectency.csv')
life_expectency

Unnamed: 0,LOCATION,TIME,"Life expectancy at birth, total (years)"
0,AFG,1960,32.446000
1,AFG,1961,32.962000
2,AFG,1962,33.471000
3,AFG,1963,33.971000
4,AFG,1964,34.463000
...,...,...,...
16099,WLD,2016,72.180761
16100,WLD,2017,72.385581
16101,WLD,2018,72.563282
16102,WLD,2019,0.000000


In [72]:
# Left-join life expectancy onto the health frame, matching on year and country code
health_merge13 = health_merge12.merge(life_expectency, how='left', on=['TIME','LOCATION'])
health_merge13

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate,death_rate,"Life expectancy at birth, total (years)"
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,4.613558e+05,2.898735e+05,32.446000
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,4.710601e+05,2.902027e+05,32.962000
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,4.811971e+05,2.907644e+05,33.471000
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,4.917614e+05,2.915545e+05,33.971000
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,5.027235e+05,2.925578e+05,34.463000
...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,1.407846e+08,5.601241e+07,72.180761
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,1.399203e+08,5.662722e+07,72.385581
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,1.379825e+08,5.715841e+07,72.563282
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000


In [73]:
# Immunisation (DPT): load the DPT immunisation coverage file
immunisation_DTP = pd.read_csv(filepath_or_buffer='raw_data/health_data_raw/immunisation_DTP.csv')
immunisation_DTP

Unnamed: 0,LOCATION,TIME,"Population, total","Immunization, DPT (% of children ages 12-23 months)"
0,AFG,1960,8.996973e+06,0.000000
1,AFG,1961,9.169410e+06,0.000000
2,AFG,1962,9.351441e+06,0.000000
3,AFG,1963,9.543205e+06,0.000000
4,AFG,1964,9.744781e+06,0.000000
...,...,...,...,...
16099,WLD,2016,7.424286e+09,85.827988
16100,WLD,2017,7.509074e+09,85.781897
16101,WLD,2018,7.591945e+09,85.603028
16102,WLD,2019,7.673534e+09,85.677425


In [74]:
# Both frames have a 'Population, total' column, so the merge suffixes them with _x (left) and _y (right).
health_merge14 = health_merge13.merge(immunisation_DTP, how='left', on=['TIME','LOCATION'])
# Discard the copy that came from the immunisation file ...
health_merge15 = health_merge14.drop('Population, total_y', axis=1)
# ... and give the remaining one its original name back.
health_merge16 = health_merge15.rename({'Population, total_x': 'Population, total'}, axis='columns')
health_merge16

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate,death_rate,"Life expectancy at birth, total (years)","Immunization, DPT (% of children ages 12-23 months)"
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,4.613558e+05,2.898735e+05,32.446000,0.000000
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,4.710601e+05,2.902027e+05,32.962000,0.000000
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,4.811971e+05,2.907644e+05,33.471000,0.000000
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,4.917614e+05,2.915545e+05,33.971000,0.000000
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,5.027235e+05,2.925578e+05,34.463000,0.000000
...,...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,1.407846e+08,5.601241e+07,72.180761,85.827988
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,1.399203e+08,5.662722e+07,72.385581,85.781897
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,1.379825e+08,5.715841e+07,72.563282,85.603028
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000,85.677425


In [75]:
# Immunisation (measles): load the measles immunisation coverage file
immunisation_Measles = pd.read_csv(filepath_or_buffer='raw_data/health_data_raw/immunisation_Measles.csv')
immunisation_Measles

Unnamed: 0,LOCATION,TIME,"Immunization, measles (% of children ages 12-23 months)"
0,AFG,1960,0.000000
1,AFG,1961,0.000000
2,AFG,1962,0.000000
3,AFG,1963,0.000000
4,AFG,1964,0.000000
...,...,...,...
16099,WLD,2016,85.189899
16100,WLD,2017,85.243187
16101,WLD,2018,85.634452
16102,WLD,2019,85.682657


In [76]:
# Left-join measles coverage onto the health frame, matching on country code and year
health_merge17 = health_merge16.merge(immunisation_Measles, how='left', on=['LOCATION','TIME'])
health_merge17

Unnamed: 0,LOCATION,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate,death_rate,"Life expectancy at birth, total (years)","Immunization, DPT (% of children ages 12-23 months)","Immunization, measles (% of children ages 12-23 months)"
0,AFG,1960,8.996973e+06,0.0,0.0,0.0,4.613558e+05,2.898735e+05,32.446000,0.000000,0.000000
1,AFG,1961,9.169410e+06,0.0,0.0,0.0,4.710601e+05,2.902027e+05,32.962000,0.000000,0.000000
2,AFG,1962,9.351441e+06,0.0,0.0,0.0,4.811971e+05,2.907644e+05,33.471000,0.000000,0.000000
3,AFG,1963,9.543205e+06,0.0,0.0,0.0,4.917614e+05,2.915545e+05,33.971000,0.000000,0.000000
4,AFG,1964,9.744781e+06,0.0,0.0,0.0,5.027235e+05,2.925578e+05,34.463000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,0.0,0.0,0.0,1.407846e+08,5.601241e+07,72.180761,85.827988,85.189899
16100,WLD,2017,7.509074e+09,0.0,0.0,0.0,1.399203e+08,5.662722e+07,72.385581,85.781897,85.243187
16101,WLD,2018,7.591945e+09,0.0,0.0,0.0,1.379825e+08,5.715841e+07,72.563282,85.603028,85.634452
16102,WLD,2019,7.673534e+09,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000,85.677425,85.682657


In [77]:
# Attach the country list to the health data.
# The inner join on LOCATION keeps only codes present in both frames.
health_merge18 = country_df.merge(health_merge17, how='inner', on=['LOCATION'])
health_merge18

Unnamed: 0,Country,LOCATION,ISO CODE 2 letter,TIME,"Population, total",total_cancer_cases,total_cancer_deaths,total_obesity_numbers,birth_rate,death_rate,"Life expectancy at birth, total (years)","Immunization, DPT (% of children ages 12-23 months)","Immunization, measles (% of children ages 12-23 months)"
0,Afghanistan,AFG,AF,1960,8996973.0,0.000000,0.000000,0.00,461355.778467,289873.473087,32.446,0.0,0.0
1,Afghanistan,AFG,AF,1961,9169410.0,0.000000,0.000000,0.00,471060.099930,290202.657090,32.962,0.0,0.0
2,Afghanistan,AFG,AF,1962,9351441.0,0.000000,0.000000,0.00,481197.099537,290764.355013,33.471,0.0,0.0
3,Afghanistan,AFG,AF,1963,9543205.0,0.000000,0.000000,0.00,491761.353650,291554.455955,33.971,0.0,0.0
4,Afghanistan,AFG,AF,1964,9744781.0,0.000000,0.000000,0.00,502723.507009,292557.815182,34.463,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13171,Zimbabwe,ZWE,ZW,2016,14030390.0,79553.232681,9698.880172,2174710.45,461094.736960,116255.811540,60.294,90.0,95.0
13172,Zimbabwe,ZWE,ZW,2017,14236745.0,80654.107052,9871.972337,0.00,451760.392340,114520.376780,60.812,89.0,90.0
13173,Zimbabwe,ZWE,ZW,2018,14439018.0,0.000000,0.000000,0.00,442931.316168,113822.778894,61.195,89.0,88.0
13174,Zimbabwe,ZWE,ZW,2019,14645468.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000,90.0,85.0


In [78]:
# Write the finished health dataset to CSV, leaving out the DataFrame index
health_merge18.to_csv(path_or_buf='cleaned_data/healthData.csv', index=False)

# Agriculture

In [79]:
# Load the agricultural dataset
agriculture = pd.read_csv(filepath_or_buffer='raw_data/agriculture_data_raw/arableland.csv')
agriculture

Unnamed: 0,LOCATION,TIME,Land area (hectare),Agricultural land (% of land area),Forest area (% of land area),Cereal yield (kg per hectare),Cash Crop yield(kg per hectare),Employment in agriculture (% of total employment) (modeled ILO estimate),Livestock production index (2004-2006 = 100)
0,AFG,1960,0.000000e+00,0.000000,6.857399,0.000000,0.000000,77.550771,42.936300
1,AFG,1961,6.528600e+07,57.745918,6.171659,1115.100000,223.020000,78.334112,43.370000
2,AFG,1962,6.528600e+07,57.837821,6.233999,1079.000000,215.800000,87.037902,43.990000
3,AFG,1963,6.528600e+07,57.914407,6.296968,985.800000,197.160000,78.334112,47.030000
4,AFG,1964,6.528600e+07,58.010906,6.360574,1082.800000,216.560000,79.125366,48.560000
...,...,...,...,...,...,...,...,...,...
16099,WLD,2016,1.273546e+10,37.430740,30.716421,3967.029197,793.405839,28.401490,118.883783
16100,WLD,2017,1.273546e+10,0.000000,0.000000,4074.175586,814.835117,27.812793,118.883664
16101,WLD,2018,1.273432e+10,0.000000,0.000000,0.000000,0.000000,27.265629,118.883545
16102,WLD,2019,0.000000e+00,0.000000,0.000000,0.000000,0.000000,26.855347,118.883426


In [80]:
# Attach the country list to the agriculture data (inner join on LOCATION),
# then write the result to CSV without the DataFrame index
agriculture_data_final = country_df.merge(agriculture, how='inner', on=['LOCATION'])
agriculture_data_final.to_csv(path_or_buf='cleaned_data/agricultureData.csv', index=False)
agriculture_data_final

Unnamed: 0,Country,LOCATION,ISO CODE 2 letter,TIME,Land area (hectare),Agricultural land (% of land area),Forest area (% of land area),Cereal yield (kg per hectare),Cash Crop yield(kg per hectare),Employment in agriculture (% of total employment) (modeled ILO estimate),Livestock production index (2004-2006 = 100)
0,Afghanistan,AFG,AF,1960,0.0,0.000000,6.857399,0.0,0.00,77.550771,42.936300
1,Afghanistan,AFG,AF,1961,65286000.0,57.745918,6.171659,1115.1,223.02,78.334112,43.370000
2,Afghanistan,AFG,AF,1962,65286000.0,57.837821,6.233999,1079.0,215.80,87.037902,43.990000
3,Afghanistan,AFG,AF,1963,65286000.0,57.914407,6.296968,985.8,197.16,78.334112,47.030000
4,Afghanistan,AFG,AF,1964,65286000.0,58.010906,6.360574,1082.8,216.56,79.125366,48.560000
...,...,...,...,...,...,...,...,...,...,...,...
13171,Zimbabwe,ZWE,ZW,2016,38685000.0,41.876696,35.542457,581.8,116.36,67.066002,106.250000
13172,Zimbabwe,ZWE,ZW,2017,38685000.0,0.000000,0.000000,616.0,123.20,66.764999,106.249894
13173,Zimbabwe,ZWE,ZW,2018,38685000.0,0.000000,0.000000,0.0,0.00,66.306999,106.249787
13174,Zimbabwe,ZWE,ZW,2019,0.0,0.000000,0.000000,0.0,0.00,66.543999,106.249681


# Environment Data

In [81]:
# Load the environment dataset
environment = pd.read_csv(filepath_or_buffer='raw_data/environment_data_raw/environment_data.csv')
environment

Unnamed: 0,LOCATION,TIME,"PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",Access to electricity (% of population),Renewable electricity output (% of total electricity output),Urban population (% of total population),Electric power consumption (kWh per capita)
0,AFG,1960,64.951772,22.25299274,2.871174195,8.401000,138.332032
1,AFG,1961,64.958268,22.27526801,3.19019355,8.684000,138.470503
2,AFG,1962,64.964764,22.29756558,3.5446595,8.976000,138.609112
3,AFG,1963,64.971261,22.31988546,3.938510556,9.276000,138.747860
4,AFG,1964,64.977759,22.34222769,4.37612284,9.586000,138.886746
...,...,...,...,...,...,...,...
16099,WLD,2016,45.155516,87.95486844,22.62591152,54.370281,3139.062621
16100,WLD,2017,45.528389,88.8495377,22.39965241,54.823078,3142.204826
16101,WLD,2018,45.482860,89.58855123,22.17565588,55.270468,3145.350176
16102,WLD,2019,45.437378,89.58944712,21.95389932,55.714285,3148.498675


In [82]:
# Attach the country list to the environment data (inner join on LOCATION),
# then write the result to CSV without the DataFrame index
environment_data_final = country_df.merge(environment, how='inner', on=['LOCATION'])
environment_data_final.to_csv(path_or_buf='cleaned_data/environmentData.csv', index=False)
environment_data_final

Unnamed: 0,Country,LOCATION,ISO CODE 2 letter,TIME,"PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",Access to electricity (% of population),Renewable electricity output (% of total electricity output),Urban population (% of total population),Electric power consumption (kWh per capita)
0,Afghanistan,AFG,AF,1960,64.951772,22.25299274,2.871174195,8.401,138.332032
1,Afghanistan,AFG,AF,1961,64.958268,22.27526801,3.19019355,8.684,138.470503
2,Afghanistan,AFG,AF,1962,64.964764,22.29756558,3.5446595,8.976,138.609112
3,Afghanistan,AFG,AF,1963,64.971261,22.31988546,3.938510556,9.276,138.747860
4,Afghanistan,AFG,AF,1964,64.977759,22.34222769,4.37612284,9.586,138.886746
...,...,...,...,...,...,...,...,...,...
13171,Zimbabwe,ZWE,ZW,2016,21.726947,39.92399216,52.19703368,32.296,610.346034
13172,Zimbabwe,ZWE,ZW,2017,22.251671,40.48204803,51.67506334,32.237,610.956991
13173,Zimbabwe,ZWE,ZW,2018,22.229419,41.04158401,51.15831271,32.209,611.568559
13174,Zimbabwe,ZWE,ZW,2019,22.207190,41.04199443,50.64672958,32.210,612.180740


# Science and Technology

In [83]:
# Load the science and technology dataset
science_tech = pd.read_csv(filepath_or_buffer='raw_data/science_tech_data_raw/sci_tech.csv')
science_tech

Unnamed: 0,LOCATION,TIME,"Population, total",High-technology exports (current US$),"Patent applications, residents",Scientific and technical journal articles,"Technical cooperation grants (BoP, current US$)"
0,AFG,1960,8.996973e+06,0.000000e+00,0,0.000,1510000
1,AFG,1961,9.169410e+06,0.000000e+00,0,0.000,1620000
2,AFG,1962,9.351441e+06,0.000000e+00,0,0.000,1870000
3,AFG,1963,9.543205e+06,0.000000e+00,0,0.000,1990000
4,AFG,1964,9.744781e+06,0.000000e+00,0,0.000,2150000
...,...,...,...,...,...,...,...
16099,WLD,2016,7.424286e+09,2.258700e+12,2128683,2376419.914,0
16100,WLD,2017,7.509074e+09,2.685150e+12,2162897,2464455.025,0
16101,WLD,2018,7.591945e+09,2.927110e+12,2294847,2554373.362,0
16102,WLD,2019,7.673534e+09,0.000000e+00,0,0.000,0


In [84]:
# Drop the population column and keep an independent copy of the result
science_tech1 = science_tech.drop('Population, total', axis=1).copy()
science_tech1

Unnamed: 0,LOCATION,TIME,High-technology exports (current US$),"Patent applications, residents",Scientific and technical journal articles,"Technical cooperation grants (BoP, current US$)"
0,AFG,1960,0.000000e+00,0,0.000,1510000
1,AFG,1961,0.000000e+00,0,0.000,1620000
2,AFG,1962,0.000000e+00,0,0.000,1870000
3,AFG,1963,0.000000e+00,0,0.000,1990000
4,AFG,1964,0.000000e+00,0,0.000,2150000
...,...,...,...,...,...,...
16099,WLD,2016,2.258700e+12,2128683,2376419.914,0
16100,WLD,2017,2.685150e+12,2162897,2464455.025,0
16101,WLD,2018,2.927110e+12,2294847,2554373.362,0
16102,WLD,2019,0.000000e+00,0,0.000,0


In [85]:
# Attach the country list to the science/technology data (inner join on LOCATION),
# then write the result to CSV without the DataFrame index
science_tech_final = country_df.merge(science_tech1, how='inner', on=['LOCATION'])
science_tech_final.to_csv(path_or_buf='cleaned_data/scitechData.csv', index=False)
science_tech_final

Unnamed: 0,Country,LOCATION,ISO CODE 2 letter,TIME,High-technology exports (current US$),"Patent applications, residents",Scientific and technical journal articles,"Technical cooperation grants (BoP, current US$)"
0,Afghanistan,AFG,AF,1960,0.0,0,0.00,1510000
1,Afghanistan,AFG,AF,1961,0.0,0,0.00,1620000
2,Afghanistan,AFG,AF,1962,0.0,0,0.00,1870000
3,Afghanistan,AFG,AF,1963,0.0,0,0.00,1990000
4,Afghanistan,AFG,AF,1964,0.0,0,0.00,2150000
...,...,...,...,...,...,...,...,...
13171,Zimbabwe,ZWE,ZW,2016,13735165.0,8,313.80,59800000
13172,Zimbabwe,ZWE,ZW,2017,11206210.0,0,340.45,52490000
13173,Zimbabwe,ZWE,ZW,2018,9719357.0,0,359.33,54420000
13174,Zimbabwe,ZWE,ZW,2019,27810712.0,0,0.00,0
