# 1. Data Preprocessing

## 1.1.  Reading the data from World Bank's API

In [1]:
!pip install pandas_datareader

Collecting pandas_datareader
  Downloading pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.5/109.5 kB[0m [31m769.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting lxml
  Downloading lxml-4.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting requests>=2.19.0
  Downloading requests-2.31.0-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m730.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.3/197.3 kB[0m [31m813.6 kB/s[0m eta [36m0:00:00[0ma [36m0:

In [1]:
from pandas_datareader import wb

Go to: <url>https://databank.worldbank.org/source/world-development-indicators</url>

In [2]:
# World Bank indicator code: CO2 emissions from residential buildings and
# commercial and public services (% of total fuel combustion).
code = "EN.CO2.BLDG.ZS"

CO$_{2}$ emissions from residential buildings and from commercial and public services (% of total fuel combustion).

Countries: 

In [3]:
# World Bank codes of the regions to download: Brazil, India, China, South Africa,
# United States, United Kingdom, World, European Union (in this order).
regions = ["BRA", "IND", "CHN", "ZAF", "USA", "GBR", "WLD", "EUU"]

Another interesting indicator: "EG.ELC.ACCS.ZS" electricity access

In [4]:
# Download the chosen indicator for every region, 1971 through 2014, from the
# World Bank API. The returned frame is indexed by (country, year).
df_indicator = wb.download(indicator=code, country=regions, start=1971, end=2014)



In [5]:
df_indicator

Unnamed: 0_level_0,Unnamed: 1_level_0,EN.CO2.BLDG.ZS
country,year,Unnamed: 2_level_1
Brazil,2014,4.289736
Brazil,2013,4.508896
Brazil,2012,4.746116
Brazil,2011,5.159270
Brazil,2010,5.346396
...,...,...
South Africa,1975,6.572400
South Africa,1974,6.815974
South Africa,1973,6.949115
South Africa,1972,7.742371


The data contains CO$_2$ emissions from buildings (residential, commercial, and public services). It is a time-series dataset: each observation is tied to a date, and here there is exactly one value per year for each region.

In [6]:
# Save the raw download (still indexed by country and year) to an Excel file.
df_indicator.to_excel("indicator_world_bank.xlsx")

These are small datasets; ideally, for machine learning we would like to have hundreds of data points.

**Allowing all rows to be seen**

In [7]:
import pandas as pd

In [8]:
# Remove the row limit so that displayed frames are never truncated.
pd.options.display.max_rows = None

In [9]:
df_indicator.head(9)

Unnamed: 0_level_0,Unnamed: 1_level_0,EN.CO2.BLDG.ZS
country,year,Unnamed: 2_level_1
Brazil,2014,4.289736
Brazil,2013,4.508896
Brazil,2012,4.746116
Brazil,2011,5.15927
Brazil,2010,5.346396
Brazil,2009,6.042296
Brazil,2008,5.728822
Brazil,2007,6.021721
Brazil,2006,6.104512


**Checking for NaN values**

In [10]:
# Count missing values per column (isna is the same check as isnull).
df_indicator.isna().sum()

EN.CO2.BLDG.ZS    0
dtype: int64

## 1.2. Converting the elements of the `year` column from strings to integers

In [11]:
df_indicator_backup1 = df_indicator.copy() # for later use

In [12]:
# Turn the (country, year) index levels into ordinary columns.
df_indicator = df_indicator.reset_index(drop=False)

In [13]:
df_indicator.head()

Unnamed: 0,country,year,EN.CO2.BLDG.ZS
0,Brazil,2014,4.289736
1,Brazil,2013,4.508896
2,Brazil,2012,4.746116
3,Brazil,2011,5.15927
4,Brazil,2010,5.346396


In [14]:
type(df_indicator["year"][0])

str

In [15]:
# The API returns years as strings; cast that one column to integers.
df_indicator = df_indicator.astype({"year": int})

In [16]:
type(df_indicator["year"][0])

numpy.int64

In [17]:
# Index the frame by the (now integer) year; reassigning instead of using
# inplace=True keeps the lineage of df_indicator explicit.
df_indicator = df_indicator.set_index("year")
df_indicator.head()

Unnamed: 0_level_0,country,EN.CO2.BLDG.ZS
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,Brazil,4.289736
2013,Brazil,4.508896
2012,Brazil,4.746116
2011,Brazil,5.15927
2010,Brazil,5.346396


## 1.3. Sorting the index (so that 1971 is on top)

In [18]:
# Order the rows chronologically (earliest year first).
df_indicator = df_indicator.sort_index(ascending=True)

In [19]:
df_indicator.head()

Unnamed: 0_level_0,country,EN.CO2.BLDG.ZS
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1971,United Kingdom,17.230274
1971,World,18.615556
1971,United States,19.0614
1971,India,15.087564
1971,European Union,21.950546


In [20]:
# Rename the indicator column by name rather than by position, so that a change
# in column order can never silently mislabel the data. "country" is left as is.
df_indicator = df_indicator.rename(columns={code: "Indicator"})

In [21]:
df_indicator.head()

Unnamed: 0_level_0,country,Indicator
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1971,United Kingdom,17.230274
1971,World,18.615556
1971,United States,19.0614
1971,India,15.087564
1971,European Union,21.950546


## 1.4. Columns are countries

In [22]:
df_indicator.head(10)

Unnamed: 0_level_0,country,Indicator
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1971,United Kingdom,17.230274
1971,World,18.615556
1971,United States,19.0614
1971,India,15.087564
1971,European Union,21.950546
1971,China,21.596575
1971,Brazil,7.044025
1971,South Africa,7.925393
1972,United States,18.390586
1972,Brazil,7.056473


In [23]:
# Reshape from long to wide: one row per year, one column per country.
df_indicator = df_indicator.pivot_table(
    index="year",
    columns="country",
    values="Indicator",
)

In [24]:
df_indicator

country,Brazil,China,European Union,India,South Africa,United Kingdom,United States,World
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1971,7.044025,21.596575,21.950546,15.087564,7.925393,17.230274,19.0614,18.615556
1972,7.056473,21.329853,22.129084,14.035274,7.742371,18.141505,18.390586,18.432397
1973,6.568496,20.831115,21.894976,14.183907,6.949115,17.421817,17.048219,17.565715
1974,6.133508,20.318703,20.240601,12.503688,6.815974,18.435426,16.687413,16.947899
1975,5.927298,20.465564,20.72844,12.209276,6.5724,17.964301,17.031575,17.173455
1976,5.767214,19.931858,20.164812,12.336263,6.429296,17.870337,17.26468,17.137039
1977,5.670138,19.983016,19.668188,12.602554,5.527939,18.214177,16.128702,16.512286
1978,5.635035,18.922859,19.879768,13.587028,6.764253,17.977528,16.105992,16.480944
1979,5.783516,19.026029,19.61691,13.698245,6.058166,18.140961,14.846802,15.885459
1980,6.153479,18.139023,18.604842,12.617357,4.856512,18.444117,13.948417,15.118228


In [25]:
regions # we want these codes to be the columns of df_indicator. This will facilitate for-loops later on.

['BRA', 'IND', 'CHN', 'ZAF', 'USA', 'GBR', 'WLD', 'EUU']

In [26]:
df_indicator.columns

Index(['Brazil', 'China', 'European Union', 'India', 'South Africa',
       'United Kingdom', 'United States', 'World'],
      dtype='object', name='country')

In [27]:
# Relabel the country-name columns with their World Bank codes through an
# explicit name -> code mapping. This does not depend on the alphabetical column
# order produced by the pivot, and an unexpected country name raises a KeyError
# instead of being silently mislabelled.
country_to_code = {
    "Brazil": "BRA",
    "China": "CHN",
    "European Union": "EUU",
    "India": "IND",
    "South Africa": "ZAF",
    "United Kingdom": "GBR",
    "United States": "USA",
    "World": "WLD",
}
df_indicator.columns = [country_to_code[name] for name in df_indicator.columns]

In [28]:
df_indicator

Unnamed: 0_level_0,BRA,CHN,EUU,IND,ZAF,GBR,USA,WLD
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1971,7.044025,21.596575,21.950546,15.087564,7.925393,17.230274,19.0614,18.615556
1972,7.056473,21.329853,22.129084,14.035274,7.742371,18.141505,18.390586,18.432397
1973,6.568496,20.831115,21.894976,14.183907,6.949115,17.421817,17.048219,17.565715
1974,6.133508,20.318703,20.240601,12.503688,6.815974,18.435426,16.687413,16.947899
1975,5.927298,20.465564,20.72844,12.209276,6.5724,17.964301,17.031575,17.173455
1976,5.767214,19.931858,20.164812,12.336263,6.429296,17.870337,17.26468,17.137039
1977,5.670138,19.983016,19.668188,12.602554,5.527939,18.214177,16.128702,16.512286
1978,5.635035,18.922859,19.879768,13.587028,6.764253,17.977528,16.105992,16.480944
1979,5.783516,19.026029,19.61691,13.698245,6.058166,18.140961,14.846802,15.885459
1980,6.153479,18.139023,18.604842,12.617357,4.856512,18.444117,13.948417,15.118228


In [29]:
# Save the pivoted table (years as rows, region codes as columns) to Excel.
df_indicator.to_excel("dfinc.xlsx")

# 2. Polynomial Feature

## 2.1. Adding polynomial features

In [30]:
degree = 3 # arbitrarily we select polynomial degree = 3

In [31]:
from sklearn.preprocessing import PolynomialFeatures

We import the PolynomialFeatures class from the scikit-learn package.

In [32]:
# Polynomial-features transformer of the chosen degree.
# include_bias=False leaves out the constant column of ones.
poly = PolynomialFeatures(degree=degree, include_bias=False)
poly

Polynomial features allow us to capture non-linear patterns in our data.

In [33]:
# The single feature is the year, shaped as a column vector (n_samples, 1).
t = df_indicator.index.to_numpy().reshape(-1, 1)

In [34]:
t.shape

(44, 1)

In [35]:
t

array([[1971],
       [1972],
       [1973],
       [1974],
       [1975],
       [1976],
       [1977],
       [1978],
       [1979],
       [1980],
       [1981],
       [1982],
       [1983],
       [1984],
       [1985],
       [1986],
       [1987],
       [1988],
       [1989],
       [1990],
       [1991],
       [1992],
       [1993],
       [1994],
       [1995],
       [1996],
       [1997],
       [1998],
       [1999],
       [2000],
       [2001],
       [2002],
       [2003],
       [2004],
       [2005],
       [2006],
       [2007],
       [2008],
       [2009],
       [2010],
       [2011],
       [2012],
       [2013],
       [2014]])

In [36]:
# Expand the year into the columns [t, t**2, t**3] (degree 3, no bias column).
X = poly.fit_transform(t)
X

array([[1.97100000e+03, 3.88484100e+06, 7.65702161e+09],
       [1.97200000e+03, 3.88878400e+06, 7.66868205e+09],
       [1.97300000e+03, 3.89272900e+06, 7.68035432e+09],
       [1.97400000e+03, 3.89667600e+06, 7.69203842e+09],
       [1.97500000e+03, 3.90062500e+06, 7.70373438e+09],
       [1.97600000e+03, 3.90457600e+06, 7.71544218e+09],
       [1.97700000e+03, 3.90852900e+06, 7.72716183e+09],
       [1.97800000e+03, 3.91248400e+06, 7.73889335e+09],
       [1.97900000e+03, 3.91644100e+06, 7.75063674e+09],
       [1.98000000e+03, 3.92040000e+06, 7.76239200e+09],
       [1.98100000e+03, 3.92436100e+06, 7.77415914e+09],
       [1.98200000e+03, 3.92832400e+06, 7.78593817e+09],
       [1.98300000e+03, 3.93228900e+06, 7.79772909e+09],
       [1.98400000e+03, 3.93625600e+06, 7.80953190e+09],
       [1.98500000e+03, 3.94022500e+06, 7.82134662e+09],
       [1.98600000e+03, 3.94419600e+06, 7.83317326e+09],
       [1.98700000e+03, 3.94816900e+06, 7.84501180e+09],
       [1.98800000e+03, 3.95214

In [37]:
X.shape

(44, 3)

In [38]:
import numpy as np

# Forecast horizon: the years 2015 through 2050 as a column vector.
t_future = np.arange(2015, 2051)[:, np.newaxis]
t_future

array([[2015],
       [2016],
       [2017],
       [2018],
       [2019],
       [2020],
       [2021],
       [2022],
       [2023],
       [2024],
       [2025],
       [2026],
       [2027],
       [2028],
       [2029],
       [2030],
       [2031],
       [2032],
       [2033],
       [2034],
       [2035],
       [2036],
       [2037],
       [2038],
       [2039],
       [2040],
       [2041],
       [2042],
       [2043],
       [2044],
       [2045],
       [2046],
       [2047],
       [2048],
       [2049],
       [2050]])

In [39]:
# Reuse the transformer that was already fitted on the historical years and only
# transform the future years; a fitted preprocessing step should not be re-fitted
# on data it is later used to predict.
X_future = poly.transform(t_future)
X_future

array([[2.01500000e+03, 4.06022500e+06, 8.18135338e+09],
       [2.01600000e+03, 4.06425600e+06, 8.19354010e+09],
       [2.01700000e+03, 4.06828900e+06, 8.20573891e+09],
       [2.01800000e+03, 4.07232400e+06, 8.21794983e+09],
       [2.01900000e+03, 4.07636100e+06, 8.23017286e+09],
       [2.02000000e+03, 4.08040000e+06, 8.24240800e+09],
       [2.02100000e+03, 4.08444100e+06, 8.25465526e+09],
       [2.02200000e+03, 4.08848400e+06, 8.26691465e+09],
       [2.02300000e+03, 4.09252900e+06, 8.27918617e+09],
       [2.02400000e+03, 4.09657600e+06, 8.29146982e+09],
       [2.02500000e+03, 4.10062500e+06, 8.30376562e+09],
       [2.02600000e+03, 4.10467600e+06, 8.31607358e+09],
       [2.02700000e+03, 4.10872900e+06, 8.32839368e+09],
       [2.02800000e+03, 4.11278400e+06, 8.34072595e+09],
       [2.02900000e+03, 4.11684100e+06, 8.35307039e+09],
       [2.03000000e+03, 4.12090000e+06, 8.36542700e+09],
       [2.03100000e+03, 4.12496100e+06, 8.37779579e+09],
       [2.03200000e+03, 4.12902

In [40]:
## example: with include_bias=True the transformer prepends a constant column of ones
polytest = PolynomialFeatures(degree=degree, include_bias=True)
t_test = df_indicator.index.values.reshape(-1,1) # the feature
X_test = polytest.fit_transform(t_test) # NOTE: demo only; X_test is reassigned to the held-out split in section 2.2
X_test
##

array([[1.00000000e+00, 1.97100000e+03, 3.88484100e+06, 7.65702161e+09],
       [1.00000000e+00, 1.97200000e+03, 3.88878400e+06, 7.66868205e+09],
       [1.00000000e+00, 1.97300000e+03, 3.89272900e+06, 7.68035432e+09],
       [1.00000000e+00, 1.97400000e+03, 3.89667600e+06, 7.69203842e+09],
       [1.00000000e+00, 1.97500000e+03, 3.90062500e+06, 7.70373438e+09],
       [1.00000000e+00, 1.97600000e+03, 3.90457600e+06, 7.71544218e+09],
       [1.00000000e+00, 1.97700000e+03, 3.90852900e+06, 7.72716183e+09],
       [1.00000000e+00, 1.97800000e+03, 3.91248400e+06, 7.73889335e+09],
       [1.00000000e+00, 1.97900000e+03, 3.91644100e+06, 7.75063674e+09],
       [1.00000000e+00, 1.98000000e+03, 3.92040000e+06, 7.76239200e+09],
       [1.00000000e+00, 1.98100000e+03, 3.92436100e+06, 7.77415914e+09],
       [1.00000000e+00, 1.98200000e+03, 3.92832400e+06, 7.78593817e+09],
       [1.00000000e+00, 1.98300000e+03, 3.93228900e+06, 7.79772909e+09],
       [1.00000000e+00, 1.98400000e+03, 3.93625600e

## 2.2 Define the targets & splitting into training/test sets

Target variables are the columns of df_indicator

In [41]:
df_indicator

Unnamed: 0_level_0,BRA,CHN,EUU,IND,ZAF,GBR,USA,WLD
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1971,7.044025,21.596575,21.950546,15.087564,7.925393,17.230274,19.0614,18.615556
1972,7.056473,21.329853,22.129084,14.035274,7.742371,18.141505,18.390586,18.432397
1973,6.568496,20.831115,21.894976,14.183907,6.949115,17.421817,17.048219,17.565715
1974,6.133508,20.318703,20.240601,12.503688,6.815974,18.435426,16.687413,16.947899
1975,5.927298,20.465564,20.72844,12.209276,6.5724,17.964301,17.031575,17.173455
1976,5.767214,19.931858,20.164812,12.336263,6.429296,17.870337,17.26468,17.137039
1977,5.670138,19.983016,19.668188,12.602554,5.527939,18.214177,16.128702,16.512286
1978,5.635035,18.922859,19.879768,13.587028,6.764253,17.977528,16.105992,16.480944
1979,5.783516,19.026029,19.61691,13.698245,6.058166,18.140961,14.846802,15.885459
1980,6.153479,18.139023,18.604842,12.617357,4.856512,18.444117,13.948417,15.118228


In [42]:
len(X)

44

In [43]:
# Use the first 80% of the rows for training; int() truncates 0.8 * 44 = 35.2 down to 35.
train_size = int(0.8 * len(X))

In [44]:
train_size

35

In [47]:
# Training features: the first train_size rows (the earliest years), all columns.
X_train = X[:train_size]
X_train

array([[1.97100000e+03, 3.88484100e+06, 7.65702161e+09],
       [1.97200000e+03, 3.88878400e+06, 7.66868205e+09],
       [1.97300000e+03, 3.89272900e+06, 7.68035432e+09],
       [1.97400000e+03, 3.89667600e+06, 7.69203842e+09],
       [1.97500000e+03, 3.90062500e+06, 7.70373438e+09],
       [1.97600000e+03, 3.90457600e+06, 7.71544218e+09],
       [1.97700000e+03, 3.90852900e+06, 7.72716183e+09],
       [1.97800000e+03, 3.91248400e+06, 7.73889335e+09],
       [1.97900000e+03, 3.91644100e+06, 7.75063674e+09],
       [1.98000000e+03, 3.92040000e+06, 7.76239200e+09],
       [1.98100000e+03, 3.92436100e+06, 7.77415914e+09],
       [1.98200000e+03, 3.92832400e+06, 7.78593817e+09],
       [1.98300000e+03, 3.93228900e+06, 7.79772909e+09],
       [1.98400000e+03, 3.93625600e+06, 7.80953190e+09],
       [1.98500000e+03, 3.94022500e+06, 7.82134662e+09],
       [1.98600000e+03, 3.94419600e+06, 7.83317326e+09],
       [1.98700000e+03, 3.94816900e+06, 7.84501180e+09],
       [1.98800000e+03, 3.95214

In [49]:
X_test = X[train_size:,:] # held-out rows from position train_size to the end (35 ... 43 here)
X_test

array([[2.00600000e+03, 4.02403600e+06, 8.07221622e+09],
       [2.00700000e+03, 4.02804900e+06, 8.08429434e+09],
       [2.00800000e+03, 4.03206400e+06, 8.09638451e+09],
       [2.00900000e+03, 4.03608100e+06, 8.10848673e+09],
       [2.01000000e+03, 4.04010000e+06, 8.12060100e+09],
       [2.01100000e+03, 4.04412100e+06, 8.13272733e+09],
       [2.01200000e+03, 4.04814400e+06, 8.14486573e+09],
       [2.01300000e+03, 4.05216900e+06, 8.15701620e+09],
       [2.01400000e+03, 4.05619600e+06, 8.16917874e+09]])

In [50]:
X_train.shape

(35, 3)

In [51]:
X_test.shape

(9, 3)

In [52]:
df_indicator.index

Int64Index([1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
            1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
            1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
            2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014],
           dtype='int64', name='year')

In [53]:
df_indicator.index[:train_size]

Int64Index([1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
            1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
            1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
            2004, 2005],
           dtype='int64', name='year')

In [55]:
# Empty frame whose index is the training years; one target column per region is added below.
y_train_indicator = pd.DataFrame(index=df_indicator.index[:train_size]) 
y_train_indicator

1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981


In [56]:
regions

['BRA', 'IND', 'CHN', 'ZAF', 'USA', 'GBR', 'WLD', 'EUU']

In [57]:
df_indicator.head()

Unnamed: 0_level_0,BRA,CHN,EUU,IND,ZAF,GBR,USA,WLD
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1971,7.044025,21.596575,21.950546,15.087564,7.925393,17.230274,19.0614,18.615556
1972,7.056473,21.329853,22.129084,14.035274,7.742371,18.141505,18.390586,18.432397
1973,6.568496,20.831115,21.894976,14.183907,6.949115,17.421817,17.048219,17.565715
1974,6.133508,20.318703,20.240601,12.503688,6.815974,18.435426,16.687413,16.947899
1975,5.927298,20.465564,20.72844,12.209276,6.5724,17.964301,17.031575,17.173455


In [58]:
df_indicator.loc[1971:1990, "BRA"]

year
1971    7.044025
1972    7.056473
1973    6.568496
1974    6.133508
1975    5.927298
1976    5.767214
1977    5.670138
1978    5.635035
1979    5.783516
1980    6.153479
1981    6.784000
1982    7.438704
1983    8.026440
1984    7.689197
1985    7.913945
1986    7.491558
1987    7.800390
1988    8.511921
1989    8.538303
1990    8.938942
Name: BRA, dtype: float64

In [59]:
# .loc slices by index label, not by position: no year label is <= train_size (35),
# so this selects nothing and returns an empty Series.
df_indicator.loc[:train_size, "BRA"]

Series([], Name: BRA, dtype: float64)

In [60]:
train_size

35

In [61]:
1971 + train_size

2006

In [62]:
df_indicator.index[0] + train_size

2006

In [63]:
df_indicator.loc[df_indicator.index[0] + train_size, "BRA"] # returns the value at 1971 + 35 = 2006

6.1045123077413

In [64]:
df_indicator.loc[:(df_indicator.index[0] + train_size), "BRA"] # label slices include the end label, so this returns 36 rows (1971-2006), one more than train_size

year
1971    7.044025
1972    7.056473
1973    6.568496
1974    6.133508
1975    5.927298
1976    5.767214
1977    5.670138
1978    5.635035
1979    5.783516
1980    6.153479
1981    6.784000
1982    7.438704
1983    8.026440
1984    7.689197
1985    7.913945
1986    7.491558
1987    7.800390
1988    8.511921
1989    8.538303
1990    8.938942
1991    8.735680
1992    8.814062
1993    8.748152
1994    8.940114
1995    8.629018
1996    8.034153
1997    7.635852
1998    7.568926
1999    7.695609
2000    7.320995
2001    7.230568
2002    7.184062
2003    6.573187
2004    6.369242
2005    6.177134
2006    6.104512
Name: BRA, dtype: float64

In [66]:
# Take exactly the first train_size rows by position. The previous label slice
# .loc[:index[0] + train_size] is end-inclusive, so it also selected the first
# test year and only produced the right result because the assignment re-aligns
# to y_train_indicator's index; it also assumed the years are contiguous.
for c in regions:
    y_train_indicator[c] = df_indicator[c].iloc[:train_size]

In [67]:
y_train_indicator

Unnamed: 0_level_0,BRA,IND,CHN,ZAF,USA,GBR,WLD,EUU
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1971,7.044025,15.087564,21.596575,7.925393,19.0614,17.230274,18.615556,21.950546
1972,7.056473,14.035274,21.329853,7.742371,18.390586,18.141505,18.432397,22.129084
1973,6.568496,14.183907,20.831115,6.949115,17.048219,17.421817,17.565715,21.894976
1974,6.133508,12.503688,20.318703,6.815974,16.687413,18.435426,16.947899,20.240601
1975,5.927298,12.209276,20.465564,6.5724,17.031575,17.964301,17.173455,20.72844
1976,5.767214,12.336263,19.931858,6.429296,17.26468,17.870337,17.137039,20.164812
1977,5.670138,12.602554,19.983016,5.527939,16.128702,18.214177,16.512286,19.668188
1978,5.635035,13.587028,18.922859,6.764253,16.105992,17.977528,16.480944,19.879768
1979,5.783516,13.698245,19.026029,6.058166,14.846802,18.140961,15.885459,19.61691
1980,6.153479,12.617357,18.139023,4.856512,13.948417,18.444117,15.118228,18.604842


In [68]:
# Empty frame whose index is the held-out years; one target column per region is added below.
y_test_indicator = pd.DataFrame(index=df_indicator.index[train_size:])
y_test_indicator

2006
2007
2008
2009
2010
2011
2012
2013
2014


In [69]:
# Take the rows from position train_size onward, mirroring X[train_size:].
# Positional slicing does not rely on the years being contiguous, unlike the
# label arithmetic index[0] + train_size.
for c in regions:
    y_test_indicator[c] = df_indicator[c].iloc[train_size:]

In [70]:
y_test_indicator

Unnamed: 0_level_0,BRA,IND,CHN,ZAF,USA,GBR,WLD,EUU
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006,6.104512,7.464892,6.495481,6.463258,9.236012,17.852515,9.862386,16.485827
2007,6.021721,6.926106,6.156229,7.648682,9.652542,17.425913,9.471573,14.44475
2008,5.728822,6.596951,5.798459,7.653411,10.089048,19.615369,9.636771,16.024392
2009,6.042296,6.272677,5.573417,7.821341,10.765703,20.262562,9.615836,16.880886
2010,5.346396,6.208523,5.401937,3.412499,10.190292,21.763386,9.204829,17.275371
2011,5.15927,6.17323,5.205434,4.123685,10.230125,19.065864,8.805714,15.673458
2012,4.746116,5.70149,5.302652,5.553239,9.712099,20.131135,8.63235,16.344142
2013,4.508896,5.704307,5.306225,5.591515,10.67944,20.827589,8.804332,17.060117
2014,4.289736,5.490006,5.358338,5.469054,11.011918,19.061397,8.59532,15.459913


In [71]:
y_train_indicator.shape

(35, 8)

In [72]:
y_test_indicator.shape

(9, 8)

## 2.3 Scaling the features and the target

We do not need to scale them when doing linear regression or ARIMA, so this section doesn't affect LR or ARIMA.

In [73]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [75]:
X_test.shape

(9, 3)