# Model Selection

### Import libreries

In [9]:
import os
import typing

from sklearn.gaussian_process.kernels import *
from sklearn.kernel_approximation import Nystroem
from sklearn.gaussian_process import GaussianProcessRegressor
from scipy.interpolate import griddata
from scipy.stats import norm
import numpy as np
import pandas as pd
import time

### Read data

In [3]:
X = pd.read_csv("./train_x.csv").to_numpy()
y = pd.read_csv("./train_y.csv").to_numpy()

In [4]:
grid_x, grid_y = np.mgrid[0:0.9988:50j, 0:0.9988:50j]

In [5]:
grid_z0 = griddata(X, y, (grid_x, grid_y), method='nearest')
interpolated_data_y = pd.DataFrame(grid_z0.ravel(), columns = ['pm25'])

In [6]:
grid_z0.ravel()

array([ 2.67844831,  2.21802735,  4.97206948, ..., 12.04067001,
       11.66635693,  7.68859294])

In [7]:
coor = np.array(list(zip(grid_x.ravel(), grid_y.ravel())))

In [12]:
interpolated_data_X = pd.DataFrame(coor, columns = ['lon', 'lat'])

In [13]:
interpolated_data_X.to_csv('interpolated_X_2500.csv', index=False)
interpolated_data_y.to_csv('interpolated_y_2500.csv', index=False)

### Toy example for RBF

In [14]:
kernel = RBF()
start = time.time()
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood()

75.33604407310486


In [25]:
kernel = RBF()
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

kernel = Matern(nu=0.5)
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())
      
kernel = Matern(nu=1.5)
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

kernel = Matern(nu=2.5)
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

kernel = RationalQuadratic()
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

kernel = ExpSineSquared()
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

kernel = DotProduct() + WhiteKernel()
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0, 
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

RBF(length_scale=0.0168)
-305565.38943646516


  K_gradient = K[..., np.newaxis] * D \


Matern(length_scale=0.036, nu=0.5)
-165616.9917406278
Matern(length_scale=0.0253, nu=1.5)
-205393.44635639447
Matern(length_scale=0.0223, nu=2.5)
-228279.36014855502
RationalQuadratic(alpha=0.522, length_scale=0.0106)
-131886.14096834682
ExpSineSquared(length_scale=0.000532, periodicity=198)
-305565.3582388254
DotProduct(sigma_0=18.2) + WhiteKernel(noise_level=223)
-10346.768229266692


In [27]:
kernel = DotProduct(sigma_0=18.2) + WhiteKernel(noise_level=223)
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0).fit(coor, grid_z0)
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

DotProduct(sigma_0=18.3) + WhiteKernel(noise_level=272)
-64436.42911961255


In [28]:
kernel = Matern(length_scale=0.0253, nu=1.5)
gpc = GaussianProcessRegressor(kernel=kernel, 
                               random_state=0).fit(coor, interpolated_data_y.to_numpy())

In [29]:
predictions = gpc.predict(X, return_std=True)

In [41]:
d = {'mean': predictions[0].ravel(), 'sd': np.sqrt(predictions[1].ravel())}

In [42]:
pred_df = pd.DataFrame(d)

In [43]:
pred_df['action'] = np.where(pred_df['mean'] >= 35.5,
                             pred_df['mean'] + pred_df['sd'] * norm.ppf(20/25),
                             pred_df['mean'] + pred_df['sd'] * norm.ppf(1/6))

In [44]:
pred_df

Unnamed: 0,mean,sd,action
0,34.430858,0.522925,33.924969
1,57.268516,0.447328,57.644996
2,11.061016,0.590040,10.490198
3,52.904142,0.526108,53.346925
4,12.442281,0.501976,11.956658
...,...,...,...
15184,29.910568,0.580684,29.348802
15185,27.877195,0.567593,27.328093
15186,37.410700,0.593769,37.910429
15187,19.579256,0.589772,19.008697


In [45]:
pred_df['action'].to_numpy()

array([33.9249686 , 57.64499608, 10.49019841, ..., 37.91042854,
       19.00869742, 39.16262533])