## Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the per-country yearly population table and the country-code lookup.
# The lookup is indexed by 'Country Code' so it can be joined against df_pop on that key.
df_pop = pd.read_csv('world_population.csv')
country_map_df = pd.read_csv('country_code_map.csv', index_col='Country Code')

In [3]:
# Preview the first rows of the population table (one row per country code, one column per year).
df_pop.head()

Unnamed: 0,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,ABW,54211.0,55438.0,56225.0,56695.0,57032.0,57360.0,57715.0,58055.0,58386.0,...,101353.0,101453.0,101669.0,102053.0,102577.0,103187.0,103795.0,104341.0,104822.0,105264.0
1,AFG,8996351.0,9166764.0,9345868.0,9533954.0,9731361.0,9938414.0,10152331.0,10372630.0,10604346.0,...,27294031.0,28004331.0,28803167.0,29708599.0,30696958.0,31731688.0,32758020.0,33736494.0,34656032.0,35530081.0
2,AGO,5643182.0,5753024.0,5866061.0,5980417.0,6093321.0,6203299.0,6309770.0,6414995.0,6523791.0,...,21759420.0,22549547.0,23369131.0,24218565.0,25096150.0,25998340.0,26920466.0,27859305.0,28813463.0,29784193.0
3,ALB,1608800.0,1659800.0,1711319.0,1762621.0,1814135.0,1864791.0,1914573.0,1965598.0,2022272.0,...,2947314.0,2927519.0,2913021.0,2905195.0,2900401.0,2895092.0,2889104.0,2880703.0,2876101.0,2873457.0
4,AND,13411.0,14375.0,15370.0,16412.0,17469.0,18549.0,19647.0,20758.0,21890.0,...,83861.0,84462.0,84449.0,83751.0,82431.0,80788.0,79223.0,78014.0,77281.0,76965.0


In [4]:
# Preview the first rows of the country-code -> country-name lookup.
country_map_df.head()

Unnamed: 0_level_0,Country Name
Country Code,Unnamed: 1_level_1
ABW,Aruba
AFG,Afghanistan
AGO,Angola
ALB,Albania
AND,Andorra


### Function to find population by country name

In [5]:
def get_year_pop(country_name):
    """Return the yearly population of a country as a 2-column integer array.

    Joins the module-level ``country_map_df`` (indexed by 'Country Code') with
    ``df_pop`` on 'Country Code', keeps the row(s) whose 'Country Name' equals
    ``country_name``, and pairs every year column with its population value.

    Args:
        country_name: value to match exactly against the 'Country Name' column.

    Returns:
        numpy array of shape (n_years, 2) for a single matching row; column 0
        is the year (taken from the column labels), column 1 the population.

    Raises:
        ValueError: if no row matches ``country_name``.
    """
    merged = pd.merge(country_map_df, df_pop, on='Country Code', how='inner')
    country_rows = merged.loc[merged['Country Name'] == country_name]
    if country_rows.empty:
        # Without this guard the function would return a malformed (n_years, 1) array.
        raise ValueError(f"No population data found for country name {country_name!r}")

    # After dropping the identifier columns only the year columns remain.
    yearly = country_rows.drop(['Country Code', 'Country Name'], axis=1).astype(int)
    dates = yearly.columns.values.astype(int)
    pop = yearly.to_numpy()
    # Stack years on top of the population row(s), then transpose to one row per year.
    output = np.vstack((dates, pop)).T
    return output

In [6]:
# Sanity check: Aruba should yield one (year, population) row per year column, 1960-2017 -> 58 rows.
get_year_pop('Aruba').shape == (58, 2)

True

### Function to split a 2D array into train/test sets

In [20]:
def feature_response_split(arr, test_size=0.2, random_state=42):
    """Split a two-column array into train and test (feature, response) pairs.

    Args:
        arr: 2-D array whose transpose unpacks into exactly two rows; the first
            column is used as the predictor and the second as the target.
        test_size: forwarded to ``train_test_split``; defaults to 0.2.
        random_state: seed forwarded to ``train_test_split``; defaults to 42.

    Returns:
        ``((X_train, y_train), (X_test, y_test))``.
    """
    predictors, target = arr.T
    # train_test_split returns its splits in the order X_train, X_test, y_train, y_test.
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=test_size, random_state=random_state
    )
    return (X_train, y_train), (X_test, y_test)

In [21]:
# Build Aruba's (year, population) array and split it into train and test pairs.
data = get_year_pop('Aruba')
(X_train, y_train), (X_test, y_test) = feature_response_split(data)

### Function to train model

In [32]:
def train_model(X_train, y_train):
    """Fit a default-configured Ridge regressor and return the result of ``fit``.

    Args:
        X_train: array of feature values; reshaped into a single-column matrix
            before fitting.
        y_train: target values aligned with ``X_train``.

    Returns:
        Whatever ``Ridge().fit`` returns for the reshaped features and targets.
    """
    estimator = Ridge()
    # One feature per sample: turn the values into an (n_samples, 1) matrix.
    feature_column = X_train.reshape(-1, 1)
    return estimator.fit(feature_column, y_train)

In [33]:
# Fit on Aruba's training pair only (the test pair is discarded) and predict the population for 2017.
data = get_year_pop('Aruba')
(X_train, y_train), _ = feature_response_split(data)

train_model(X_train, y_train).predict([[2017]])

array([104468.15547163])

### Function to compute mean squared error

In [36]:
def test_model(model, X_test, y_test):
    from sklearn import metrics
    ridge = model.predict(X_test.reshape(-1, 1))
    return round(metrics.mean_squared_error(y_test, ridge), 2)

In [37]:
# Train on Aruba's training pair, then report the mean squared error on the held-out test pair.
data = get_year_pop('Aruba')
(X_train, y_train), (X_test, y_test) = feature_response_split(data)
lm = train_model(X_train, y_train)

test_model(lm, X_test, y_test)

42483684.58