# Model Selection

### Import libraries

In [1]:
import os
import typing

from sklearn.gaussian_process.kernels import *
from sklearn.kernel_approximation import Nystroem
from sklearn.gaussian_process import GaussianProcessRegressor
from scipy.interpolate import griddata
from scipy.stats import norm
import numpy as np
import pandas as pd
import time

### Read data

In [2]:
# Load the training inputs and targets from the CSV files that sit next to
# this notebook and keep them as plain NumPy arrays.
X, y = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ("./train_x.csv", "./train_y.csv")
)

In [4]:
# Regular 50 x 50 mesh over [0, 0.9988] on both axes. A complex step in
# np.mgrid is read as "number of points" and makes the stop value inclusive.
mesh_axis = slice(0, 0.9988, 50j)
grid_x, grid_y = np.mgrid[mesh_axis, mesh_axis]

In [5]:
# Nearest-neighbour interpolation of the training targets onto the mesh,
# then flatten the interpolated values into a single-column frame ('pm25').
mesh_points = (grid_x, grid_y)
grid_z0 = griddata(X, y, mesh_points, method='nearest')
interpolated_data_y = pd.DataFrame({'pm25': grid_z0.ravel()})

In [6]:
# Display the interpolated values as one flat array (sanity check of the cell above).
grid_z0.ravel()

array([ 2.67844831,  2.21802735,  4.97206948, ..., 12.04067001,
       11.66635693,  7.68859294])

In [7]:
# One row per mesh point: column 0 holds the flattened grid_x values,
# column 1 the flattened grid_y values (same ordering as grid_z0.ravel()).
coor = np.column_stack((grid_x.ravel(), grid_y.ravel()))

In [8]:
# Wrap the mesh coordinates in a frame: first column -> 'lon', second -> 'lat'.
interpolated_data_X = pd.DataFrame({'lon': coor[:, 0], 'lat': coor[:, 1]})

In [None]:
# Persist the interpolated inputs and targets as CSV files without the index.
for frame, csv_name in (
    (interpolated_data_X, 'interpolated_X_2500.csv'),
    (interpolated_data_y, 'interpolated_y_2500.csv'),
):
    frame.to_csv(csv_name, index=False)

### Toy example for RBF

In [None]:
# Toy run: fit a GP with a default RBF kernel on the interpolated grid and
# report the optimised kernel, its log marginal likelihood and the wall-clock
# time of the fit. n_restarts_optimizer=100 restarts the hyperparameter
# optimiser 100 extra times, so this cell is slow.
kernel = RBF()
start = time.time()
gpc = GaussianProcessRegressor(kernel=kernel,
                               random_state=0,
                               n_restarts_optimizer=100).fit(coor, interpolated_data_y.to_numpy())
elapsed = time.time() - start
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())
print(f"fit time: {elapsed:.1f} s")

In [None]:
# Fit one GP per candidate kernel on the interpolated grid and print, for each,
# the optimised kernel followed by its log marginal likelihood. The kernels are
# tried in the order listed; `kernel` and `gpc` end up holding the last one.
candidate_kernels = [
    RBF(),
    Matern(nu=0.5),
    Matern(nu=1.5),
    Matern(nu=2.5),
    RationalQuadratic(),
    ExpSineSquared(),
    DotProduct() + WhiteKernel(),
]

for kernel in candidate_kernels:
    gpc = GaussianProcessRegressor(
        kernel=kernel,
        random_state=0,
        n_restarts_optimizer=100,
    )
    gpc.fit(coor, interpolated_data_y.to_numpy())
    print(gpc.kernel_)
    print(gpc.log_marginal_likelihood())

In [None]:
# GP with a linear (DotProduct) kernel plus white noise, started from fixed
# hyperparameters and without optimiser restarts.
kernel = DotProduct(sigma_0=18.2) + WhiteKernel(noise_level=223)
# grid_z0 keeps the mesh layout (leading axis of length 50) while coor has one
# row per mesh point (2500 rows), so the targets must be flattened to line up
# with the rows of coor; both use the same C-order flattening of the mesh.
gpc = GaussianProcessRegressor(kernel=kernel,
                               random_state=0).fit(coor, grid_z0.ravel())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

In [9]:
# Matern (nu=1.5) GP on the interpolated grid. alpha=0.1 is the value added to
# the diagonal of the kernel matrix during fitting; the length scale is
# re-optimised with 100 optimiser restarts.
kernel = Matern(length_scale=0.0253, nu=1.5)
gpc = GaussianProcessRegressor(
    kernel=kernel,
    alpha=0.1,
    n_restarts_optimizer=100,
    random_state=0,
)
gpc.fit(coor, interpolated_data_y.to_numpy())
print(gpc.kernel_)
print(gpc.log_marginal_likelihood())

Matern(length_scale=0.0297, nu=1.5)
-174273.92580377776


In [None]:
# Predict at the original training locations; with return_std=True the result
# is a (mean, standard deviation) pair.
pred_mean, pred_std = gpc.predict(X, return_std=True)
predictions = (pred_mean, pred_std)

In [None]:
# predictions comes from gpc.predict(..., return_std=True), whose second
# element is already the predictive standard deviation (not the variance),
# so it is stored as-is; taking another square root would distort 'sd'.
d = {'mean': predictions[0].ravel(), 'sd': predictions[1].ravel()}

In [None]:
# Tabulate the per-point predictive mean and sd, one column per dict key.
pred_df = pd.DataFrame.from_dict(d)

In [None]:
# Shift every predicted mean by a multiple of its sd. Rows whose mean is at
# or above 35.5 use the standard-normal 20/25 quantile as the multiplier,
# all other rows use the 1/6 quantile (which is negative).
# NOTE(review): the 35.5 threshold and both quantile levels are unexplained
# magic numbers here - confirm they match the intended cost function.
z_above_threshold = norm.ppf(20/25)
z_below_threshold = norm.ppf(1/6)
at_or_above = pred_df['mean'] >= 35.5
sd_multiplier = np.where(at_or_above, z_above_threshold, z_below_threshold)
pred_df['action'] = pred_df['mean'] + pred_df['sd'] * sd_multiplier

In [10]:
# Display the standard-normal quantile at probability 20/25 (= 0.8).
norm.ppf(20/25)

0.8416212335729143

### Nyström approximation


In [3]:
# Nystroem approximation of a Matern (nu=1.5) kernel using 300 components,
# fitted on the raw training inputs and applied to them in one step.
nystroem_kernel = Matern(length_scale=0.0253, nu=1.5)
feature_map_nystroem = Nystroem(
    kernel=nystroem_kernel,
    n_components=300,
    random_state=1,
)
data_transformed = feature_map_nystroem.fit_transform(X, y)

In [4]:
# Display (n_samples, n_components) of the Nystroem-transformed training inputs.
data_transformed.shape

(15189, 300)

In [None]:
# Fit a Matern (nu=1.5) GP on the Nystroem features of the full training set,
# re-optimising the length scale with 10 optimiser restarts.
# NOTE(review): this applies a Matern kernel on top of features that already
# approximate a Matern kernel, and an exact GP fit on every training row is
# likely very expensive in time and memory - confirm this is intended.
kernel = Matern(length_scale=0.0253, nu=1.5)
gpc = GaussianProcessRegressor(
    kernel=kernel,
    n_restarts_optimizer=10,
    random_state=0,
)
gpc.fit(data_transformed, y)

In [None]:
# Report the optimised kernel and the log marginal likelihood of the last fit,
# one per line.
print(gpc.kernel_, gpc.log_marginal_likelihood(), sep='\n')