# Model Selection

### Import libraries

In [1]:
import os
import typing
from sklearn.cluster import KMeans
from sklearn.gaussian_process.kernels import *
from sklearn.kernel_approximation import Nystroem
from sklearn.gaussian_process import GaussianProcessRegressor
from scipy.interpolate import griddata
from scipy.stats import norm
import numpy as np
import pandas as pd
import time

### Read data

In [3]:
# Parse each CSV exactly once and derive every object from the parsed frames,
# so the arrays and the combined frame are guaranteed to come from the same read.
train_x_df = pd.read_csv("./train_x.csv")
train_y_df = pd.read_csv("./train_y.csv")

# Plain NumPy versions of the features and the target.
X = train_x_df.to_numpy()
y = train_y_df.to_numpy()

# Feature columns plus the target stored in a 'pm25' column.
# Built on a copy so that adding the column never touches train_x_df.
X_y = train_x_df.copy()
X_y['pm25'] = y

### Interpolate data to reduced plane

In [146]:
# Regular 75 x 75 lattice spanning [0, 0.9988] on both axes. A complex "step"
# in np.mgrid is read as a point count, with both endpoints included.
axis_span = slice(0, 0.9988, 75j)
grid_x, grid_y = np.mgrid[axis_span, axis_span]

# Give every lattice node the target value of its nearest training sample.
grid_z0 = griddata(X, y, (grid_x, grid_y), method='nearest')

# Flatten the lattice: a one-column target frame and a matching two-column
# coordinate array (one row per lattice node, same ordering in both).
interpolated_data_y = pd.DataFrame({'pm25': grid_z0.ravel()})
coor = np.column_stack((grid_x.ravel(), grid_y.ravel()))

In [84]:
# Preview the feature columns with the target appended as the last column.
np.concatenate((X, y), axis=1)

array([[ 0.8575    ,  0.68625   , 36.20316838],
       [ 0.41125   ,  0.675     , 55.94634794],
       [ 0.8625    ,  0.90625   , 12.04206554],
       ...,
       [ 0.135     ,  0.64      , 36.64130863],
       [ 0.07125   ,  0.78875   , 18.66263411],
       [ 0.31625   ,  0.2125    , 38.37685245]])

In [34]:
# Flattened grid coordinates and interpolated values stacked column-wise,
# one row per grid node; these rows are later passed to KMeans as `init`.
init_k_means_centrodis = pd.DataFrame(
    np.column_stack((grid_x.ravel(), grid_y.ravel(), grid_z0.ravel())),
    columns=['x', 'y', 'z'],
)
init_k_means_centrodis.to_numpy()

array([[ 0.        ,  0.        ,  2.67844831],
       [ 0.        ,  0.02038367,  2.21802735],
       [ 0.        ,  0.04076735,  4.97206948],
       ...,
       [ 0.9988    ,  0.95803265, 12.04067001],
       [ 0.9988    ,  0.97841633, 11.66635693],
       [ 0.9988    ,  0.9988    ,  7.68859294]])

### Make K-means data

In [36]:
# Seed K-means with one starting centroid per row of the interpolated-grid table.
init_centers = init_k_means_centrodis.to_numpy()

# n_clusters has to equal the number of rows of an explicit `init` array, so it
# is derived from the array instead of being hard-coded: a fixed value raises a
# ValueError as soon as the grid resolution (and hence the row count) changes.
# n_init=1 is stated explicitly because only a single initialisation is possible
# with explicit centers; leaving the default makes scikit-learn emit a warning.
# NOTE(review): assumes X_y has at least as many rows as there are initial
# centers — confirm against the training-set size.
kmeans = KMeans(n_clusters=init_centers.shape[0],
                init=init_centers,
                n_init=1,
                random_state=0).fit(X_y)
kmeans.cluster_centers_

  kmeans = KMeans(n_clusters=2500,


array([[1.18750000e-02, 4.37500000e-03, 2.67990937e+00],
       [3.75000000e-03, 1.00000000e-02, 2.21802735e+00],
       [3.72500000e-02, 6.27500000e-02, 4.99032336e+00],
       ...,
       [9.83333333e-01, 9.73333333e-01, 1.20621698e+01],
       [9.86250000e-01, 9.73750000e-01, 1.16663569e+01],
       [9.86250000e-01, 9.90000000e-01, 7.68859294e+00]])

### Tune on kmeans data

In [None]:
# Fit one Gaussian-process regressor with a Matern(nu=1.5) kernel on the
# interpolated grid (coordinates -> interpolated target), then report the
# optimised kernel together with its log marginal likelihood.
grid_targets = interpolated_data_y.to_numpy()

gpc = GaussianProcessRegressor(
    kernel=Matern(nu=1.5),
    alpha=0.1,                  # value added to the kernel-matrix diagonal
    n_restarts_optimizer=100,   # extra optimiser restarts for the kernel hyperparameters
    random_state=0,
)
gpc.fit(coor, grid_targets)

print(gpc.kernel_, ' : ', gpc.log_marginal_likelihood())

### Hyperparameter Tuning

In [7]:
# Candidate covariance functions for the kernel comparison. Apart from the
# Matern smoothness `nu`, every kernel starts from its library defaults.
matern_family = [Matern(nu=smoothness) for smoothness in (0.5, 1.5, 2.5)]

kernels = [
    RBF(),
    *matern_family,
    RationalQuadratic(),
    ExpSineSquared(),
    DotProduct() + WhiteKernel(),
]

In [12]:
# Fit one Gaussian-process regressor per candidate kernel on the interpolated
# grid and print each optimised kernel with its log marginal likelihood.
grid_targets = interpolated_data_y.to_numpy()  # same targets for every candidate

for kernel in kernels:
    gpc = GaussianProcessRegressor(
        kernel=kernel,
        alpha=0.1,
        n_restarts_optimizer=100,
        random_state=0,
    )
    gpc.fit(coor, grid_targets)
    print(gpc.kernel_, ' : ', gpc.log_marginal_likelihood())

RBF(length_scale=0.0224)  :  -230199.7803302184


  K_gradient = K[..., np.newaxis] * D \


Matern(length_scale=0.0402, nu=0.5)  :  -147630.3189565221
Matern(length_scale=0.0297, nu=1.5)  :  -174273.92580377776
Matern(length_scale=0.0269, nu=2.5)  :  -188743.0522545688
RationalQuadratic(alpha=0.528, length_scale=0.0119)  :  -120489.31124962137
ExpSineSquared(length_scale=0.00612, periodicity=23)  :  -230197.91099507353
DotProduct(sigma_0=18.2) + WhiteKernel(noise_level=223)  :  -10346.768229266689


In [144]:
# Standard-normal quantile (inverse CDF) at probability 20/21.
# NOTE(review): the 20/21 level presumably comes from a 20:1 asymmetric cost
# ratio — confirm and record the source of this constant.
probability = 20 / 21
norm.ppf(probability)

1.668391193947079