In [31]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from functools import reduce

### Population Data

In [2]:
# Load population data
file_path = Path('resources/Population.csv')
population_df = pd.read_csv(file_path, header=2)
population_df=population_df.drop(columns=["Country Name", "Indicator Name", "Indicator Code"])
population_df.head()

Unnamed: 0,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,ABW,54208.0,55434.0,56234.0,56699.0,57029.0,57357.0,57702.0,58044.0,58377.0,...,103165.0,103776.0,104339.0,104865.0,105361.0,105846.0,106310.0,106766.0,,
1,AFE,130836765.0,134159786.0,137614644.0,141202036.0,144920186.0,148769974.0,152752671.0,156876454.0,161156430.0,...,562601578.0,578075373.0,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0,677243299.0,,
2,AFG,8996967.0,9169406.0,9351442.0,9543200.0,9744772.0,9956318.0,10174840.0,10399936.0,10637064.0,...,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0,38928341.0,,
3,AFW,96396419.0,98407221.0,100506960.0,102691339.0,104953470.0,107289875.0,109701811.0,112195950.0,114781116.0,...,380437896.0,390882979.0,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0,458803476.0,,
4,AGO,5454938.0,5531451.0,5608499.0,5679409.0,5734995.0,5770573.0,5781305.0,5774440.0,5771973.0,...,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0,32866268.0,,


In [3]:
# Re-oraganize the data
population_df2=population_df.set_index('Country Code').stack(dropna=False).reset_index().rename(columns={'Country Code':'Country_Code', 'level_1':'Year', 0:'Population'})
population_df2

Unnamed: 0,Country_Code,Year,Population
0,ABW,1960,54208.0
1,ABW,1961,55434.0
2,ABW,1962,56234.0
3,ABW,1963,56699.0
4,ABW,1964,57029.0
...,...,...,...
16753,ZWE,2018,14438812.0
16754,ZWE,2019,14645473.0
16755,ZWE,2020,14862927.0
16756,ZWE,2021,


In [4]:
# Order by Country_Code and Year, then calculate the year-over-year percetnage increase for the population for a particular country code 
population_df2.sort_values(by=['Country_Code', 'Year'])

population_df2 = population_df2.set_index(['Country_Code', 'Year']).pct_change().reset_index()
population_df2

Unnamed: 0,Country_Code,Year,Population
0,ABW,1960,
1,ABW,1961,0.022617
2,ABW,1962,0.014432
3,ABW,1963,0.008269
4,ABW,1964,0.005820
...,...,...,...
16753,ZWE,2018,0.014204
16754,ZWE,2019,0.014313
16755,ZWE,2020,0.014848
16756,ZWE,2021,0.000000


In [5]:
# Determine if previous row is the same country or not
population_df2['CC_Change'] = ((population_df2["Country_Code"] == population_df2["Country_Code"].shift(1))).astype(int)

# If the row above is the same country, then find the difference between the rows
population_df2['Population_Diff'] = np.where(population_df2['CC_Change']==1, (population_df2["Population"] - population_df2["Population"].shift(1)), np.NaN)

# Change to boolean
population_df2['Population_Change'] = population_df2['Population_Diff'].gt(0).astype(int).astype(int)

population_df2

Unnamed: 0,Country_Code,Year,Population,CC_Change,Population_Diff,Population_Change
0,ABW,1960,,0,,0
1,ABW,1961,0.022617,1,,0
2,ABW,1962,0.014432,1,-0.008185,0
3,ABW,1963,0.008269,1,-0.006163,0
4,ABW,1964,0.005820,1,-0.002449,0
...,...,...,...,...,...,...
16753,ZWE,2018,0.014204,1,-0.000497,0
16754,ZWE,2019,0.014313,1,0.000109,1
16755,ZWE,2020,0.014848,1,0.000535,1
16756,ZWE,2021,0.000000,1,-0.014848,0


In [6]:
# Remove Un-needed columns 
population_df3=population_df2.drop(columns=['Population', 'CC_Change', 'Population_Diff'])

population_df3

Unnamed: 0,Country_Code,Year,Population_Change
0,ABW,1960,0
1,ABW,1961,0
2,ABW,1962,0
3,ABW,1963,0
4,ABW,1964,0
...,...,...,...
16753,ZWE,2018,0
16754,ZWE,2019,1
16755,ZWE,2020,1
16756,ZWE,2021,0


### Inflation (annaul %)

In [7]:
# Load inflation data
file_path = Path('resources/Inflation.csv')
inflation_df = pd.read_csv(file_path, header=2)
inflation_df=inflation_df.drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
inflation_df.head()

Unnamed: 0,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,ABW,,,,,,,,,,...,-2.372065,0.421441,0.474764,-0.931196,-1.028282,3.626041,4.257462,,,
1,AFE,,,,,,,,,,...,5.750981,5.37029,5.250171,6.571396,6.399343,4.720811,4.120246,5.191456,,
2,AFG,,,,,,,,,,...,7.385772,4.673996,-0.661709,4.383892,4.975952,0.626149,2.302373,,,
3,AFW,,,,,,,,,,...,2.439201,1.758052,2.130268,1.494564,1.764635,1.78405,1.758565,2.437609,,
4,AGO,,,,,,,,,,...,8.777814,7.280387,9.150372,30.695313,29.843587,19.628608,17.081215,,,


In [8]:
# Re-oraganize the data
inflation_df2=inflation_df.set_index('Country Code').stack(dropna=False).reset_index().rename(columns={'Country Code':'Country_Code', 'level_1':'Year', 0:'Inflation'})
inflation_df2.head(n=10)

Unnamed: 0,Country_Code,Year,Inflation
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
5,ABW,1965,
6,ABW,1966,
7,ABW,1967,
8,ABW,1968,
9,ABW,1969,


In [9]:
# Order by Country_Code and Year, then set inflation column as an percentage and calculate the difference from one year to another
inflation_df2.sort_values(by=['Country_Code', 'Year'])

inflation_df2['Inflation']=inflation_df2['Inflation'].div(100)

# Determine if previous row is the same country or not
inflation_df2['CC_Change'] = ((inflation_df2['Country_Code'] == inflation_df2['Country_Code'].shift(1))).astype(int)

# If the row above is the same country, then find the difference between the rows
inflation_df2['Inflation_Diff'] = np.where(inflation_df2['CC_Change']==1, (inflation_df2['Inflation'] - inflation_df2['Inflation'].shift(1)), np.NaN)

inflation_df2

Unnamed: 0,Country_Code,Year,Inflation,CC_Change,Inflation_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16753,ZWE,2018,0.106189,1,0.097249
16754,ZWE,2019,2.553050,1,2.446861
16755,ZWE,2020,5.572018,1,3.018968
16756,ZWE,2021,,1,


In [10]:
# Remove Un-needed columns 
inflation_df2=inflation_df2.drop(columns=['Inflation', 'CC_Change'])

inflation_df2

Unnamed: 0,Country_Code,Year,Inflation_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16753,ZWE,2018,0.097249
16754,ZWE,2019,2.446861
16755,ZWE,2020,3.018968
16756,ZWE,2021,


### Military Expenditure (% of GDP)

In [14]:
# Load military data
file_path = Path('resources/Military_Expenditure.csv')
military_df = pd.read_csv(file_path, header=2)
military_df=military_df.drop(columns=["Country Name", "Indicator Name", "Indicator Code"])
military_df.head()

Unnamed: 0,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,ABW,,,,,,,,,,...,,,,,,,,,,
1,AFE,,,,,1.992565,2.379536,1.93983,2.501762,2.16698,...,1.785125,1.844388,1.644433,1.42043,1.376636,1.164407,1.116059,1.151457,,
2,AFG,,,,,,,,,,...,1.07695,1.298013,0.994576,0.956772,0.945227,1.006746,1.118231,1.369684,,
3,AFW,,,1.319932,1.316581,1.436651,1.471695,1.371958,,5.553581,...,0.824629,0.806598,0.72028,0.897858,0.88335,0.919133,0.871359,1.008386,,
4,AGO,,,,,,,,,,...,4.455239,4.698455,3.105426,2.733341,2.507985,1.871776,1.64374,1.615852,,


In [15]:
# Re-oraganize the data
military_df2=military_df.set_index('Country Code').stack(dropna=False).reset_index().rename(columns={'Country Code':'Country_Code', 'level_1':'Year', 0:'Military_Expenditure'})
military_df2.head(n=10)

Unnamed: 0,Country_Code,Year,Military_Expenditure
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
5,ABW,1965,
6,ABW,1966,
7,ABW,1967,
8,ABW,1968,
9,ABW,1969,


In [16]:
# Order by Country_Code and Year, then set Military Expenditure column as an percentage and calculate the difference from one year to another
military_df2.sort_values(by=['Country_Code', 'Year'])

military_df2['Military_Expenditure']=military_df2['Military_Expenditure'].div(100)

# Determine if previous row is the same country or not
military_df2['CC_Change'] = ((military_df2['Country_Code'] == military_df2['Country_Code'].shift(1))).astype(int)

# If the row above is the same country, then find the difference between the rows
military_df2['Military_Diff'] = np.where(military_df2['CC_Change']==1, (military_df2['Military_Expenditure'] - military_df2['Military_Expenditure'].shift(1)), np.NaN)

military_df2

Unnamed: 0,Country_Code,Year,Military_Expenditure,CC_Change,Military_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16753,ZWE,2018,0.012228,1,-0.003222
16754,ZWE,2019,0.006986,1,-0.005242
16755,ZWE,2020,,1,
16756,ZWE,2021,,1,


In [17]:
# Remove Un-needed columns 
military_df2=military_df2.drop(columns=['Military_Expenditure', 'CC_Change'])

military_df2

Unnamed: 0,Country_Code,Year,Military_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16753,ZWE,2018,-0.003222
16754,ZWE,2019,-0.005242
16755,ZWE,2020,
16756,ZWE,2021,


### Exports (% of GDP)

In [18]:
# Load export data
file_path = Path('resources/Exports.csv')
exports_df = pd.read_csv(file_path, header=2)
exports_df=exports_df.drop(columns=["Country Name", "Indicator Name", "Indicator Code"])
exports_df.head()

Unnamed: 0,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,ABW,,,,,,,,,,...,76.548312,78.275504,72.852959,71.820388,72.548665,73.799397,,,,
1,AFE,,,,,,,,,,...,29.414972,28.388673,23.891382,23.685841,23.576565,25.506451,24.137013,22.62033,,
2,AFG,4.132233,4.453443,4.878051,9.171601,8.888893,11.258279,8.571429,6.772908,8.899677,...,,,,,,,,,,
3,AFW,16.764087,17.220211,16.346545,16.455685,17.537095,17.944099,17.38133,18.600435,19.747851,...,23.666712,23.440559,17.459096,16.87923,20.816501,22.520886,21.60609,16.958756,,
4,AGO,,,,,,,,,,...,50.747084,44.695031,29.754599,28.124485,29.0041,40.83629,39.343826,37.788167,,


In [19]:
# Re-oraganize the data
exports_df2=exports_df.set_index('Country Code').stack(dropna=False).reset_index().rename(columns={'Country Code':'Country_Code', 'level_1':'Year', 0:'Exports'})
exports_df2.head(n=10)

Unnamed: 0,Country_Code,Year,Exports
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
5,ABW,1965,
6,ABW,1966,
7,ABW,1967,
8,ABW,1968,
9,ABW,1969,


In [20]:
# Order by Country_Code and Year, then set Exports column as an percentage and calculate the difference from one year to another
exports_df2.sort_values(by=['Country_Code', 'Year'])

exports_df2['Exports']=exports_df2['Exports'].div(100)

# Determine if previous row is the same country or not
exports_df2['CC_Change'] = ((exports_df2['Country_Code'] == exports_df2['Country_Code'].shift(1))).astype(int)

# If the row above is the same country, then find the difference between the rows
exports_df2['Export_Diff'] = np.where(exports_df2['CC_Change']==1, (exports_df2['Exports'] - exports_df2['Exports'].shift(1)), np.NaN)

exports_df2

Unnamed: 0,Country_Code,Year,Exports,CC_Change,Export_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16753,ZWE,2018,0.280498,1,0.083909
16754,ZWE,2019,0.312510,1,0.032013
16755,ZWE,2020,0.372022,1,0.059512
16756,ZWE,2021,,1,


In [21]:
# Remove Un-needed columns 
exports_df2=exports_df2.drop(columns=['Exports', 'CC_Change'])

exports_df2

Unnamed: 0,Country_Code,Year,Export_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16753,ZWE,2018,0.083909
16754,ZWE,2019,0.032013
16755,ZWE,2020,0.059512
16756,ZWE,2021,


### Life Expectancy

In [22]:
# Load life data
file_path = Path('resources/Life_Expectancy.csv')
life_df = pd.read_csv(file_path, header=4)
life_df=life_df.drop(columns=["Country Name", "Indicator Name", "Indicator Code"])
life_df.head()

Unnamed: 0,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,ABW,65.662,66.074,66.444,66.787,67.113,67.435,67.762,68.095,68.436,...,75.299,75.441,75.583,75.725,75.868,76.01,76.152,76.293,76.434,
1,AFE,42.716053,43.166935,43.60399,44.025617,44.432721,44.826919,45.213048,45.594294,45.974059,...,60.185561,60.953363,61.647367,62.259288,62.787681,63.246264,63.648988,64.005213,64.325702,
2,AFG,32.446,32.962,33.471,33.971,34.463,34.948,35.43,35.914,36.403,...,62.054,62.525,62.966,63.377,63.763,64.13,64.486,64.833,65.173,
3,AFW,37.20538,37.632546,38.052612,38.463746,38.867073,39.264841,39.662762,40.066408,40.482832,...,55.138944,55.618986,56.088269,56.542009,56.974761,57.382363,57.762347,58.115723,58.445953,
4,AGO,37.524,37.811,38.113,38.43,38.76,39.102,39.454,39.813,40.178,...,57.236,58.054,58.776,59.398,59.925,60.379,60.782,61.147,61.487,


In [23]:
# Re-oraganize the data
life_df2=life_df.set_index('Country Code').stack(dropna=False).reset_index().rename(columns={'Country Code':'Country_Code', 'level_1':'Year', 0:'Life_Expectancy'})
life_df2.head(n=10)

Unnamed: 0,Country_Code,Year,Life_Expectancy
0,ABW,1960,65.662
1,ABW,1961,66.074
2,ABW,1962,66.444
3,ABW,1963,66.787
4,ABW,1964,67.113
5,ABW,1965,67.435
6,ABW,1966,67.762
7,ABW,1967,68.095
8,ABW,1968,68.436
9,ABW,1969,68.784


In [24]:
# Order by Country_Code and Year, then calculate the percentage difference from one year to another
life_df2.sort_values(by=['Country_Code', 'Year'])

# Determine if previous row is the same country or not
life_df2['CC_Change'] = ((life_df2['Country_Code'] == life_df2['Country_Code'].shift(1))).astype(int)

# If the row above is the same country, then find the difference between the rows
life_df2['Life_Diff'] = np.where(life_df2['CC_Change']==1, (life_df2['Life_Expectancy'] / life_df2['Life_Expectancy'].shift(1) - 1), np.NaN)

life_df2

Unnamed: 0,Country_Code,Year,Life_Expectancy,CC_Change,Life_Diff
0,ABW,1960,65.662,0,
1,ABW,1961,66.074,1,0.006275
2,ABW,1962,66.444,1,0.005600
3,ABW,1963,66.787,1,0.005162
4,ABW,1964,67.113,1,0.004881
...,...,...,...,...,...
16487,ZWE,2017,60.812,1,0.008591
16488,ZWE,2018,61.195,1,0.006298
16489,ZWE,2019,61.490,1,0.004821
16490,ZWE,2020,61.738,1,0.004033


In [25]:
# Remove Un-needed columns 
life_df2=life_df2.drop(columns=['Life_Expectancy', 'CC_Change'])

life_df2

Unnamed: 0,Country_Code,Year,Life_Diff
0,ABW,1960,
1,ABW,1961,0.006275
2,ABW,1962,0.005600
3,ABW,1963,0.005162
4,ABW,1964,0.004881
...,...,...,...
16487,ZWE,2017,0.008591
16488,ZWE,2018,0.006298
16489,ZWE,2019,0.004821
16490,ZWE,2020,0.004033


### GDP

In [26]:
# Load GDP data
file_path = Path('resources/GDP.csv')
gdp_df = pd.read_csv(file_path, header=4)
gdp_df=gdp_df.drop(columns=["Country Name", "Indicator Name", "Indicator Code"])
gdp_df.head()

Unnamed: 0,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,ABW,,,,,,,,,,...,2534637000.0,2727850000.0,2790849000.0,2962905000.0,2983637000.0,3092430000.0,3202189000.0,,,
1,AFE,20082720000.0,20509450000.0,22350430000.0,26758660000.0,24464990000.0,27878940000.0,30313840000.0,31375550000.0,34187180000.0,...,950521000000.0,964242000000.0,984807000000.0,919930000000.0,873355000000.0,985356000000.0,1012850000000.0,1009910000000.0,920792000000.0,
2,AFG,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,1400000000.0,1673333000.0,1373333000.0,...,19907320000.0,20146400000.0,20497130000.0,19134210000.0,18116560000.0,18753470000.0,18053230000.0,18799450000.0,20116140000.0,
3,AFW,10404280000.0,11128050000.0,11943350000.0,12676520000.0,13838580000.0,14862470000.0,15832850000.0,14426430000.0,14880350000.0,...,727571000000.0,820788000000.0,864967000000.0,760730000000.0,690543000000.0,683742000000.0,741692000000.0,794572000000.0,784588000000.0,
4,AGO,,,,,,,,,,...,128053000000.0,136710000000.0,145712000000.0,116194000000.0,101124000000.0,122124000000.0,101353000000.0,89417190000.0,58375980000.0,


In [27]:
# Re-oraganize the data
gdp_df2=gdp_df.set_index('Country Code').stack(dropna=False).reset_index().rename(columns={'Country Code':'Country_Code', 'level_1':'Year', 0:'GDP'})
gdp_df2

Unnamed: 0,Country_Code,Year,GDP
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16487,ZWE,2017,1.758489e+10
16488,ZWE,2018,1.811554e+10
16489,ZWE,2019,1.928429e+10
16490,ZWE,2020,1.805117e+10


In [28]:
# Order by Country_Code and Year, then calculate the percentage difference from one year to another
gdp_df2.sort_values(by=['Country_Code', 'Year'])

# Determine if previous row is the same country or not
gdp_df2['CC_Change'] = ((gdp_df2['Country_Code'] == gdp_df2['Country_Code'].shift(1))).astype(int)

# If the row above is the same country, then find the difference between the rows
gdp_df2['GDP_Diff'] = np.where(gdp_df2['CC_Change']==1, (gdp_df2['GDP'] / gdp_df2['GDP'].shift(1) - 1), np.NaN)

gdp_df2

Unnamed: 0,Country_Code,Year,GDP,CC_Change,GDP_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16487,ZWE,2017,1.758489e+10,1,-0.144232
16488,ZWE,2018,1.811554e+10,1,0.030177
16489,ZWE,2019,1.928429e+10,1,0.064516
16490,ZWE,2020,1.805117e+10,1,-0.063944


In [29]:
# Remove Un-needed columns 
gdp_df2=gdp_df2.drop(columns=['GDP', 'CC_Change'])

gdp_df2

Unnamed: 0,Country_Code,Year,GDP_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16487,ZWE,2017,-0.144232
16488,ZWE,2018,0.030177
16489,ZWE,2019,0.064516
16490,ZWE,2020,-0.063944


### Combine Data and Export

In [32]:
data_frames = [population_df3, inflation_df2, military_df2, exports_df2, life_df2, gdp_df2]

machine_learning_df = reduce(lambda left,right: pd.merge(left,right, on=['Country_Code', 'Year'], how='outer'), data_frames)

machine_learning_df 

Unnamed: 0,Country_Code,Year,Population_Change,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff
0,ABW,1960,0,,,,,
1,ABW,1961,0,,,,0.006275,
2,ABW,1962,0,,,,0.005600,
3,ABW,1963,0,,,,0.005162,
4,ABW,1964,0,,,,0.004881,
...,...,...,...,...,...,...,...,...
16753,ZWE,2018,0,0.097249,-0.003222,0.083909,0.006298,0.030177
16754,ZWE,2019,1,2.446861,-0.005242,0.032013,0.004821,0.064516
16755,ZWE,2020,1,3.018968,,0.059512,0.004033,-0.063944
16756,ZWE,2021,0,,,,,


In [34]:
pd.DataFrame.to_csv(machine_learning_df, 'resources/machine_learning_df.txt', sep=',', index=False)