### Imports

In [5]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

In [6]:
# Load the population table from CSV, using its 'Country Code' column as the row index.
population_df = pd.read_csv('world_population.csv', index_col='Country Code')

In [7]:
population_df.head()

Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,54211.0,55438.0,56225.0,56695.0,57032.0,57360.0,57715.0,58055.0,58386.0,58726.0,...,101353.0,101453.0,101669.0,102053.0,102577.0,103187.0,103795.0,104341.0,104822.0,105264.0
AFG,8996351.0,9166764.0,9345868.0,9533954.0,9731361.0,9938414.0,10152331.0,10372630.0,10604346.0,10854428.0,...,27294031.0,28004331.0,28803167.0,29708599.0,30696958.0,31731688.0,32758020.0,33736494.0,34656032.0,35530081.0
AGO,5643182.0,5753024.0,5866061.0,5980417.0,6093321.0,6203299.0,6309770.0,6414995.0,6523791.0,6642632.0,...,21759420.0,22549547.0,23369131.0,24218565.0,25096150.0,25998340.0,26920466.0,27859305.0,28813463.0,29784193.0
ALB,1608800.0,1659800.0,1711319.0,1762621.0,1814135.0,1864791.0,1914573.0,1965598.0,2022272.0,2081695.0,...,2947314.0,2927519.0,2913021.0,2905195.0,2900401.0,2895092.0,2889104.0,2880703.0,2876101.0,2873457.0
AND,13411.0,14375.0,15370.0,16412.0,17469.0,18549.0,19647.0,20758.0,21890.0,23058.0,...,83861.0,84462.0,84449.0,83751.0,82431.0,80788.0,79223.0,78014.0,77281.0,76965.0


### Function to calculate population growth rate

In [8]:
def get_population_growth_rate_by_country_year(df, country_code):
    """Return the year-over-year population growth rate for one country.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table indexed by country code with one column per year. Column
        labels must be convertible to ``int`` (e.g. ``'1960'``) and appear in
        chronological order, because growth is computed between adjacent
        columns.
    country_code : str
        Index label of the country to extract.

    Returns
    -------
    numpy.ndarray
        Float array with one row per year after the first one. Column 0 is
        the year; column 1 is the fractional change relative to the previous
        year, rounded to 5 decimal places.

    Raises
    ------
    KeyError
        If ``country_code`` is not present in ``df.index``.
    """
    # Fail loudly: an empty selection would otherwise produce an array that
    # contains only the year column and no growth values.
    if country_code not in df.index:
        raise KeyError(f"country code {country_code!r} not found in index")

    # Double brackets keep the selection two-dimensional so pct_change can be
    # applied across the year columns.
    country_rows = df.loc[[country_code]]

    # The first year has no predecessor, so its change is NaN; drop it by
    # position instead of relying on a hard-coded column label.
    growth = country_rows.pct_change(axis=1).iloc[:, 1:].round(5)

    years = growth.columns.to_numpy().astype(int)
    return np.column_stack((years, growth.to_numpy().T))

In [9]:
get_population_growth_rate_by_country_year(population_df,'ABW')

array([[ 1.961e+03,  2.263e-02],
       [ 1.962e+03,  1.420e-02],
       [ 1.963e+03,  8.360e-03],
       [ 1.964e+03,  5.940e-03],
       [ 1.965e+03,  5.750e-03],
       [ 1.966e+03,  6.190e-03],
       [ 1.967e+03,  5.890e-03],
       [ 1.968e+03,  5.700e-03],
       [ 1.969e+03,  5.820e-03],
       [ 1.970e+03,  5.740e-03],
       [ 1.971e+03,  6.380e-03],
       [ 1.972e+03,  6.730e-03],
       [ 1.973e+03,  6.730e-03],
       [ 1.974e+03,  4.730e-03],
       [ 1.975e+03,  2.130e-03],
       [ 1.976e+03, -1.170e-03],
       [ 1.977e+03, -3.630e-03],
       [ 1.978e+03, -4.360e-03],
       [ 1.979e+03, -2.050e-03],
       [ 1.980e+03,  1.930e-03],
       [ 1.981e+03,  7.840e-03],
       [ 1.982e+03,  1.285e-02],
       [ 1.983e+03,  1.395e-02],
       [ 1.984e+03,  1.021e-02],
       [ 1.985e+03,  3.020e-03],
       [ 1.986e+03, -6.060e-03],
       [ 1.987e+03, -1.295e-02],
       [ 1.988e+03, -1.219e-02],
       [ 1.989e+03, -7.700e-04],
       [ 1.990e+03,  1.830e-02],
       [ 1

### Function to split the data into train/test sets by even/odd years

In [10]:
def feature_response_split(arr):
    """Split a (year, response) array into train and test sets by year parity.

    Rows whose year (column 0) is even form the training set; rows whose year
    is odd form the test set. The year value itself decides the split, not
    the row position, so the result does not depend on which year the series
    starts with.

    Parameters
    ----------
    arr : array-like of shape (n_rows, 2)
        Column 0 holds the year, column 1 the response value.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test))`` as four 1-D numpy arrays.
        A pair is empty when no row has the matching parity.

    Raises
    ------
    ValueError
        If ``arr`` is not two-dimensional with exactly two columns.
    """
    arr = np.asarray(arr)
    if arr.ndim != 2 or arr.shape[1] != 2:
        raise ValueError(
            "expected an array of shape (n_rows, 2), got shape {}".format(arr.shape)
        )

    years, response = arr[:, 0], arr[:, 1]
    # Parity is taken from the year values (works for float years such as
    # 1961.0), never from the row index.
    is_even_year = years % 2 == 0

    train = (years[is_even_year], response[is_even_year])
    test = (years[~is_even_year], response[~is_even_year])
    return train, test

In [11]:
# Build the (year, growth-rate) array for country code 'ABW' and unpack the
# train and test (X, y) pairs returned by the split helper.
data = get_population_growth_rate_by_country_year(population_df,'ABW');
(X_train, y_train), (X_test, y_test) = feature_response_split(data)

### Function to fit DTR

In [12]:
def train_model(X_train, y_train, MaxDepth):
    """Fit a decision-tree regressor on a single feature.

    Parameters
    ----------
    X_train : numpy.ndarray
        Feature values; reshaped into a single column before fitting.
    y_train : array-like
        Response values, one per entry of ``X_train``.
    MaxDepth : int
        Forwarded to the regressor as ``max_depth``.

    Returns
    -------
    DecisionTreeRegressor
        The fitted estimator (seeded with ``random_state=42``).
    """
    # scikit-learn expects a 2-D feature matrix, so turn the 1-D input into
    # one column.
    feature_column = X_train.reshape(-1, 1)
    tree = DecisionTreeRegressor(random_state=42, max_depth=MaxDepth)
    tree.fit(feature_column, y_train)
    return tree

In [13]:
# Rebuild the 'ABW' series and keep only the training pair from the split;
# the test pair is discarded here.
data = get_population_growth_rate_by_country_year(population_df,'ABW')
(X_train, y_train), _ = feature_response_split(data)

# Fit a tree with max depth 3 and ask it for a prediction at year 2017.
train_model(X_train, y_train,3).predict([[2017]])

array([0.00451333])

### Function to produce RMSLE

In [14]:
### START FUNCTION
def test_model(model, y_test, X_test):
    X_predict = model.predict(X_test.reshape(-1,1))[0]
    y_predict = y_test[0]
    rmsle = np.around(np.sqrt(((np.log(1 + X_predict) - np.log(1 + y_predict)) ** 2) / float(len(y_test))), 3)
    return rmsle
### END FUNCTION

In [15]:
# End-to-end run for 'ABW': build the growth-rate series, split it, fit a
# tree with max depth 3 on the training pair, then score it on the test pair.
data = get_population_growth_rate_by_country_year(population_df,'ABW')
(X_train, y_train), (X_test, y_test) = feature_response_split(data)
lm = train_model(X_train, y_train,3)
test_model(lm, y_test, X_test)

0.003