In [29]:
# Dependencies
import pandas as pd
import numpy as np

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [9]:
# for sqlite
# engine = create_engine("sqlite:///db.sqlite")

# for postgresql
# must create a new database "predicting_population" using pgAdmin first
from urllib.parse import quote

from config import db_password

# Percent-encode the password so that reserved URL characters in it (for example
# @, / or :) cannot be mistaken for URL delimiters when the connection string is
# parsed. Passwords made only of letters, digits and _.-~ are left unchanged.
# NOTE(review): assumes config.db_password holds the raw (not already URL-encoded) password.
encoded_password = quote(str(db_password), safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/predicting_population"
engine = create_engine(db_string)

In [10]:
# Create an automap base and let it generate mapped classes by reflecting
# the schema of the database behind `engine`.
Base = automap_base()
# NOTE(review): `reflect=True` is the legacy spelling; SQLAlchemy 1.4+ documents
# `Base.prepare(autoload_with=engine)` instead - confirm the installed version before switching.
Base.prepare(engine, reflect=True)

In [11]:
# Show the names of the mapped classes that automap produced.
Base.classes.keys()

['countries',
 'population',
 'inflation',
 'military',
 'exports',
 'life_expectancy',
 'gdp']

In [13]:
# Give each automap-generated class a readable module-level name.
_mapped = Base.classes
(
    Country,
    Population,
    Inflation,
    MilitaryExpenditure,
    Exports,
    LifeExpectancy,
    GDP,
) = (
    _mapped.countries,
    _mapped.population,
    _mapped.inflation,
    _mapped.military,
    _mapped.exports,
    _mapped.life_expectancy,
    _mapped.gdp,
)

# Open an ORM session bound to the engine.
session = Session(engine)

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from functools import reduce

### Population Data

In [26]:
# Load the population table ordered by country code and year.
# index_col='index' turns the stored 'index' column into the frame index; the
# set_index/reset_index round trip then discards it, leaving a fresh 0..n-1
# index with country_code3 as the first column.
population_df2 = (
    pd.read_sql(
        sql='SELECT * FROM population ORDER BY country_code3, year',
        con=engine,
        index_col='index',
    )
    .set_index('country_code3')
    .reset_index()
)

# Columns are renamed by position, so the frame must hold exactly three columns here.
population_df2.columns = ['Country_Code', 'Year', 'Population']
population_df2

Unnamed: 0,Country_Code,Year,Population
0,ABW,1960,54208.0
1,ABW,1961,55434.0
2,ABW,1962,56234.0
3,ABW,1963,56699.0
4,ABW,1964,57029.0
...,...,...,...
16221,ZWE,2016,14030338.0
16222,ZWE,2017,14236599.0
16223,ZWE,2018,14438812.0
16224,ZWE,2019,14645473.0


In [27]:
# Sort by Country_Code and Year, then replace Population with its year-over-year
# percentage change computed separately for each country code.
# Grouping matters: an ungrouped pct_change() compares the first year of a country
# with the last year of the country listed before it, which yields a meaningless
# value there instead of NaN.
# Re-running this cell without re-running the load cell applies the percentage
# change a second time, because the result is stored back under the same name.
population_df2 = (
    population_df2
    .sort_values(['Country_Code', 'Year'])
    .reset_index(drop=True)
    .assign(
        Population=lambda frame: frame.groupby('Country_Code')['Population'].pct_change()
    )
)
population_df2

Unnamed: 0,Country_Code,Year,Population
0,ABW,1960,
1,ABW,1961,0.022617
2,ABW,1962,0.014432
3,ABW,1963,0.008269
4,ABW,1964,0.005820
...,...,...,...
16221,ZWE,2016,0.015614
16222,ZWE,2017,0.014701
16223,ZWE,2018,0.014204
16224,ZWE,2019,0.014313


In [30]:
# Flag rows whose previous row belongs to the same country (1) or not (0)
population_df2['CC_Change'] = (
    population_df2['Country_Code'].eq(population_df2['Country_Code'].shift(1)).astype(int)
)

# If the row above is the same country, take the difference between the two rows;
# otherwise leave NaN.
# Series.where replaces np.where(..., np.NaN): the np.NaN alias was removed in NumPy 2.0.
population_df2['Population_Diff'] = (
    population_df2['Population'].diff().where(population_df2['CC_Change'].eq(1))
)

# Encode as a 0/1 flag: 1 when the difference is positive, otherwise 0.
# NaN differences compare as False, so rows without a usable difference also get 0.
population_df2['Population_Change'] = population_df2['Population_Diff'].gt(0).astype(int)

population_df2

Unnamed: 0,Country_Code,Year,Population,CC_Change,Population_Diff,Population_Change
0,ABW,1960,,0,,0
1,ABW,1961,0.022617,1,,0
2,ABW,1962,0.014432,1,-0.008185,0
3,ABW,1963,0.008269,1,-0.006163,0
4,ABW,1964,0.005820,1,-0.002449,0
...,...,...,...,...,...,...
16221,ZWE,2016,0.015614,1,-0.001163,0
16222,ZWE,2017,0.014701,1,-0.000913,0
16223,ZWE,2018,0.014204,1,-0.000497,0
16224,ZWE,2019,0.014313,1,0.000109,1


In [31]:
# Drop the intermediate columns; the result is stored under a new name
population_df3 = population_df2.drop(['Population', 'CC_Change', 'Population_Diff'], axis=1)

population_df3

Unnamed: 0,Country_Code,Year,Population_Change
0,ABW,1960,0
1,ABW,1961,0
2,ABW,1962,0
3,ABW,1963,0
4,ABW,1964,0
...,...,...,...
16221,ZWE,2016,0
16222,ZWE,2017,0
16223,ZWE,2018,0
16224,ZWE,2019,1


### Inflation (annual %)

In [34]:
# Load the inflation table ordered by country code and year.
# The set_index/reset_index round trip discards the stored 'index' column and
# leaves country_code3 as the first column of a freshly indexed frame.
inflation_df2 = (
    pd.read_sql(
        sql='SELECT * FROM inflation ORDER BY country_code3, year',
        con=engine,
        index_col='index',
    )
    .set_index('country_code3')
    .reset_index()
)

# Columns are renamed by position, so the frame must hold exactly three columns here.
inflation_df2.columns = ['Country_Code', 'Year', 'Inflation']
inflation_df2

Unnamed: 0,Country_Code,Year,Inflation
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16487,ZWE,2017,0.893962
16488,ZWE,2018,10.618866
16489,ZWE,2019,255.304991
16490,ZWE,2020,557.201817


In [36]:
# Express the inflation column as a fraction (value / 100), then calculate the
# difference from one year to the next.
# The column is overwritten in place, so re-running this cell alone divides it by 100 again.
inflation_df2['Inflation'] = inflation_df2['Inflation'].div(100)

# Flag rows whose previous row belongs to the same country (1) or not (0)
inflation_df2['CC_Change'] = (
    inflation_df2['Country_Code'].eq(inflation_df2['Country_Code'].shift(1)).astype(int)
)

# If the row above is the same country, take the difference between the two rows;
# otherwise leave NaN.
# Series.where replaces np.where(..., np.NaN): the np.NaN alias was removed in NumPy 2.0.
inflation_df2['Inflation_Diff'] = (
    inflation_df2['Inflation'].diff().where(inflation_df2['CC_Change'].eq(1))
)

inflation_df2

Unnamed: 0,Country_Code,Year,Inflation,CC_Change,Inflation_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16487,ZWE,2017,0.008940,1,0.024376
16488,ZWE,2018,0.106189,1,0.097249
16489,ZWE,2019,2.553050,1,2.446861
16490,ZWE,2020,5.572018,1,3.018968


In [37]:
# Drop the scaled level and the same-country flag now that Inflation_Diff is computed
inflation_df2 = inflation_df2.drop(['Inflation', 'CC_Change'], axis=1)

inflation_df2

Unnamed: 0,Country_Code,Year,Inflation_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16487,ZWE,2017,0.024376
16488,ZWE,2018,0.097249
16489,ZWE,2019,2.446861
16490,ZWE,2020,3.018968


### Military Expenditure (% of GDP)

In [38]:
# Load the military table ordered by country code and year.
# The set_index/reset_index round trip discards the stored 'index' column and
# leaves country_code3 as the first column of a freshly indexed frame.
military_df2 = (
    pd.read_sql(
        sql='SELECT * FROM military ORDER BY country_code3, year',
        con=engine,
        index_col='index',
    )
    .set_index('country_code3')
    .reset_index()
)

# Columns are renamed by position, so the frame must hold exactly three columns here.
military_df2.columns = ['Country_Code', 'Year', 'Military_Expenditure']
military_df2

Unnamed: 0,Country_Code,Year,Military_Expenditure
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16221,ZWE,2016,1.742494
16222,ZWE,2017,1.544948
16223,ZWE,2018,1.222795
16224,ZWE,2019,0.698601


In [39]:
# Express the Military_Expenditure column as a fraction (value / 100), then
# calculate the difference from one year to the next.
# The column is overwritten in place, so re-running this cell alone divides it by 100 again.
military_df2['Military_Expenditure'] = military_df2['Military_Expenditure'].div(100)

# Flag rows whose previous row belongs to the same country (1) or not (0)
military_df2['CC_Change'] = (
    military_df2['Country_Code'].eq(military_df2['Country_Code'].shift(1)).astype(int)
)

# If the row above is the same country, take the difference between the two rows;
# otherwise leave NaN.
# Series.where replaces np.where(..., np.NaN): the np.NaN alias was removed in NumPy 2.0.
military_df2['Military_Diff'] = (
    military_df2['Military_Expenditure'].diff().where(military_df2['CC_Change'].eq(1))
)

military_df2

Unnamed: 0,Country_Code,Year,Military_Expenditure,CC_Change,Military_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16221,ZWE,2016,0.017425,1,-0.001444
16222,ZWE,2017,0.015449,1,-0.001975
16223,ZWE,2018,0.012228,1,-0.003222
16224,ZWE,2019,0.006986,1,-0.005242


In [40]:
# Drop the scaled level and the same-country flag now that Military_Diff is computed
military_df2 = military_df2.drop(['Military_Expenditure', 'CC_Change'], axis=1)

military_df2

Unnamed: 0,Country_Code,Year,Military_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16221,ZWE,2016,-0.001444
16222,ZWE,2017,-0.001975
16223,ZWE,2018,-0.003222
16224,ZWE,2019,-0.005242


### Exports (% of GDP)

In [41]:
# Load the exports table ordered by country code and year.
# The set_index/reset_index round trip discards the stored 'index' column and
# leaves country_code3 as the first column of a freshly indexed frame.
exports_df2 = (
    pd.read_sql(
        sql='SELECT * FROM exports ORDER BY country_code3, year',
        con=engine,
        index_col='index',
    )
    .set_index('country_code3')
    .reset_index()
)

# Columns are renamed by position, so the frame must hold exactly three columns here.
exports_df2.columns = ['Country_Code', 'Year', 'Exports']
exports_df2

Unnamed: 0,Country_Code,Year,Exports
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16221,ZWE,2016,19.943532
16222,ZWE,2017,19.658905
16223,ZWE,2018,28.049757
16224,ZWE,2019,31.251040


In [42]:
# Express the Exports column as a fraction (value / 100), then calculate the
# difference from one year to the next.
# The column is overwritten in place, so re-running this cell alone divides it by 100 again.
exports_df2['Exports'] = exports_df2['Exports'].div(100)

# Flag rows whose previous row belongs to the same country (1) or not (0)
exports_df2['CC_Change'] = (
    exports_df2['Country_Code'].eq(exports_df2['Country_Code'].shift(1)).astype(int)
)

# If the row above is the same country, take the difference between the two rows;
# otherwise leave NaN.
# Series.where replaces np.where(..., np.NaN): the np.NaN alias was removed in NumPy 2.0.
exports_df2['Export_Diff'] = (
    exports_df2['Exports'].diff().where(exports_df2['CC_Change'].eq(1))
)

exports_df2

Unnamed: 0,Country_Code,Year,Exports,CC_Change,Export_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16221,ZWE,2016,0.199435,1,0.007834
16222,ZWE,2017,0.196589,1,-0.002846
16223,ZWE,2018,0.280498,1,0.083909
16224,ZWE,2019,0.312510,1,0.032013


In [43]:
# Drop the scaled level and the same-country flag now that Export_Diff is computed
exports_df2 = exports_df2.drop(['Exports', 'CC_Change'], axis=1)

exports_df2

Unnamed: 0,Country_Code,Year,Export_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16221,ZWE,2016,0.007834
16222,ZWE,2017,-0.002846
16223,ZWE,2018,0.083909
16224,ZWE,2019,0.032013


### Life Expectancy

In [44]:
# Load the life_expectancy table ordered by country code and year.
# The set_index/reset_index round trip discards the stored 'index' column and
# leaves country_code3 as the first column of a freshly indexed frame.
life_df2 = (
    pd.read_sql(
        sql='SELECT * FROM life_expectancy ORDER BY country_code3, year',
        con=engine,
        index_col='index',
    )
    .set_index('country_code3')
    .reset_index()
)

# Columns are renamed by position, so the frame must hold exactly three columns here.
life_df2.columns = ['Country_Code', 'Year', 'Life_Expectancy']
life_df2

Unnamed: 0,Country_Code,Year,Life_Expectancy
0,ABW,1960,65.662
1,ABW,1961,66.074
2,ABW,1962,66.444
3,ABW,1963,66.787
4,ABW,1964,67.113
...,...,...,...
16221,ZWE,2016,60.294
16222,ZWE,2017,60.812
16223,ZWE,2018,61.195
16224,ZWE,2019,61.490


In [45]:
# Calculate the relative change from one year to the next.
# Flag rows whose previous row belongs to the same country (1) or not (0)
life_df2['CC_Change'] = (
    life_df2['Country_Code'].eq(life_df2['Country_Code'].shift(1)).astype(int)
)

# If the row above is the same country, compute current / previous - 1;
# otherwise leave NaN.
# Series.where replaces np.where(..., np.NaN): the np.NaN alias was removed in NumPy 2.0.
life_df2['Life_Diff'] = (
    life_df2['Life_Expectancy']
    .div(life_df2['Life_Expectancy'].shift(1))
    .sub(1)
    .where(life_df2['CC_Change'].eq(1))
)

life_df2

Unnamed: 0,Country_Code,Year,Life_Expectancy,CC_Change,Life_Diff
0,ABW,1960,65.662,0,
1,ABW,1961,66.074,1,0.006275
2,ABW,1962,66.444,1,0.005600
3,ABW,1963,66.787,1,0.005162
4,ABW,1964,67.113,1,0.004881
...,...,...,...,...,...
16221,ZWE,2016,60.294,1,0.012766
16222,ZWE,2017,60.812,1,0.008591
16223,ZWE,2018,61.195,1,0.006298
16224,ZWE,2019,61.490,1,0.004821


In [46]:
# Drop the raw level and the same-country flag now that Life_Diff is computed
life_df2 = life_df2.drop(['Life_Expectancy', 'CC_Change'], axis=1)

life_df2

Unnamed: 0,Country_Code,Year,Life_Diff
0,ABW,1960,
1,ABW,1961,0.006275
2,ABW,1962,0.005600
3,ABW,1963,0.005162
4,ABW,1964,0.004881
...,...,...,...
16221,ZWE,2016,0.012766
16222,ZWE,2017,0.008591
16223,ZWE,2018,0.006298
16224,ZWE,2019,0.004821


### GDP

In [50]:
# Load the gdp table ordered by country code and year.
# The set_index/reset_index round trip discards the stored 'index' column and
# leaves country_code3 as the first column of a freshly indexed frame.
gdp_df2 = (
    pd.read_sql(
        sql='SELECT * FROM gdp ORDER BY country_code3, year',
        con=engine,
        index_col='index',
    )
    .set_index('country_code3')
    .reset_index()
)

# Columns are renamed by position, so the frame must hold exactly three columns here.
gdp_df2.columns = ['Country_Code', 'Year', 'GDP']
gdp_df2

Unnamed: 0,Country_Code,Year,GDP
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16221,ZWE,2016,2.054868e+10
16222,ZWE,2017,1.758489e+10
16223,ZWE,2018,1.811554e+10
16224,ZWE,2019,1.928429e+10


In [49]:
# Order by Country_Code and Year, then calculate the relative change from one year to the next.
# sort_values returns a new frame, so the result has to be assigned for the
# ordering to take effect; the index is rebuilt so it matches the new row order.
gdp_df2 = gdp_df2.sort_values(by=['Country_Code', 'Year']).reset_index(drop=True)

# Flag rows whose previous row belongs to the same country (1) or not (0)
gdp_df2['CC_Change'] = (
    gdp_df2['Country_Code'].eq(gdp_df2['Country_Code'].shift(1)).astype(int)
)

# If the row above is the same country, compute current / previous - 1;
# otherwise leave NaN.
# Series.where replaces np.where(..., np.NaN): the np.NaN alias was removed in NumPy 2.0.
gdp_df2['GDP_Diff'] = (
    gdp_df2['GDP']
    .div(gdp_df2['GDP'].shift(1))
    .sub(1)
    .where(gdp_df2['CC_Change'].eq(1))
)

gdp_df2

Unnamed: 0,Country_Code,Year,GDP,CC_Change,GDP_Diff
0,ABW,1960,,0,
1,ABW,1961,,1,
2,ABW,1962,,1,
3,ABW,1963,,1,
4,ABW,1964,,1,
...,...,...,...,...,...
16221,ZWE,2016,2.054868e+10,1,0.029332
16222,ZWE,2017,1.758489e+10,1,-0.144232
16223,ZWE,2018,1.811554e+10,1,0.030177
16224,ZWE,2019,1.928429e+10,1,0.064516


In [29]:
# Drop the raw level and the same-country flag now that GDP_Diff is computed
gdp_df2 = gdp_df2.drop(['GDP', 'CC_Change'], axis=1)

gdp_df2

Unnamed: 0,Country_Code,Year,GDP_Diff
0,ABW,1960,
1,ABW,1961,
2,ABW,1962,
3,ABW,1963,
4,ABW,1964,
...,...,...,...
16487,ZWE,2017,-0.144232
16488,ZWE,2018,0.030177
16489,ZWE,2019,0.064516
16490,ZWE,2020,-0.063944


### Combine Data and Export

In [32]:
# Frames to combine; they are joined on Country_Code and Year
data_frames = [population_df3, inflation_df2, military_df2, exports_df2, life_df2, gdp_df2]


def _outer_join(left, right):
    """Outer-merge two frames on the shared Country_Code / Year keys."""
    return pd.merge(left, right, on=['Country_Code', 'Year'], how='outer')


# Fold the list from left to right into a single wide frame
machine_learning_df = reduce(_outer_join, data_frames)

machine_learning_df

Unnamed: 0,Country_Code,Year,Population_Change,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff
0,ABW,1960,0,,,,,
1,ABW,1961,0,,,,0.006275,
2,ABW,1962,0,,,,0.005600,
3,ABW,1963,0,,,,0.005162,
4,ABW,1964,0,,,,0.004881,
...,...,...,...,...,...,...,...,...
16753,ZWE,2018,0,0.097249,-0.003222,0.083909,0.006298,0.030177
16754,ZWE,2019,1,2.446861,-0.005242,0.032013,0.004821,0.064516
16755,ZWE,2020,1,3.018968,,0.059512,0.004033,-0.063944
16756,ZWE,2021,0,,,,,


In [34]:
# Write the combined frame as comma-separated text, leaving out the index
machine_learning_df.to_csv('resources/machine_learning_df.txt', sep=',', index=False)