In [1]:
import numpy as np

from matplotlib import pyplot
import matplotlib.pyplot as plt

# You may have to install scikit-learn (imported as sklearn).
# csv is part of the Python standard library and needs no installation.
import csv

from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge

In [5]:
# The csv files air-quality-train.csv and air-quality-test.csv contain the
# training and testing data respectively.
# After loading, each row of X_train / X_test corresponds to CO, NO2, O3, SO2.
# The vectors y_train / y_test contain the PM2.5 concentrations.
# Row i of X_* and entry i of y_* come from the same row (timestamp) of the file.
def load_air_quality(path):
    """Read an air-quality csv file into a feature matrix and a target vector.

    The first line of the file is skipped as a header. Every field of each
    remaining row is parsed as a float, so a non-numeric field raises
    ValueError. Columns 1-4 become the features and column 5 the target;
    column 0 (presumably the timestamp) is parsed but not used.

    Parameters
    ----------
    path : str
        Path of the csv file to read.

    Returns
    -------
    X : np.ndarray
        One row per data line, built from columns 1-4.
    y : np.ndarray
        One entry per data line, taken from column 5.
    """
    features = []
    targets = []

    # newline='' lets the csv module handle line endings itself.
    with open(path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader)  # skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            values = [float(field) for field in row]
            features.append(values[1:5])
            targets.append(values[5])

    return np.array(features), np.array(targets)


X_train, y_train = load_air_quality('air-quality-train.csv')
X_test, y_test = load_air_quality('air-quality-test.csv')

In [105]:
# TODOs for part (a)
#    1. Train an SVR model with an rbf kernel, regularizer (C) set to 1 and rbf kernel parameter (gamma) set to 0.1
#    2. Print the RMSE on the test dataset
svr_rbf = SVR(kernel='rbf', C=1, gamma=0.1)
svr_rbf.fit(X_train, y_train)
y_pred_svr = svr_rbf.predict(X_test)

# RMSE is the square root of the mean squared error on the test set.
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
print('RMSE obtained for SVR: {}'.format(rmse_svr))

# TODOs for part (b)
#    1. Train a Kernel Ridge model with an rbf kernel, regularizer (C) set to 1 and rbf kernel parameter (gamma) set to 0.1
#    2. Print the RMSE on the test dataset
# KernelRidge is parameterized by alpha rather than C; this notebook uses
# alpha = 1 / (2C), so C = 1 corresponds to alpha = 0.5.
krr_rbf = KernelRidge(kernel='rbf', alpha=0.5, gamma=0.1)
krr_rbf.fit(X_train, y_train)
y_pred_krr = krr_rbf.predict(X_test)

# RMSE is the square root of the mean squared error on the test set.
mse_krr = mean_squared_error(y_test, y_pred_krr)
rmse_krr = np.sqrt(mse_krr)
print('RMSE obtained for KRR: {}'.format(rmse_krr))


# Use this seed.
seed = 0
np.random.seed(seed)

K = 5  # The number of folds we will create

# TODOs for part (c)
#   1. Create a partition of the training data into K=5 folds
#   Hint: it suffices to create 5 subarrays of indices
n_train = len(X_train)

# Shuffle the row indices (in place) so every fold is a random subset of the
# training rows, then cut the shuffled indices into K nearly equal pieces.
idx = np.arange(n_train)
np.random.shuffle(idx)
idx_split = np.array_split(idx, K)  # list of K index arrays

# Specify the grid search space
reg_range = np.logspace(-1, 1, 3)    # Regularization parameters
kpara_range = np.logspace(-2, 0, 3)  # Kernel parameters

# TODOs for part (d)
#    1.  Select the best parameters for both SVR and KernelRidge based on the k-fold cross-validation error estimate (use RMSE as the performance metric)
#    2.  Print the best parameters selected for both SVR and KernelRidge
#    3.  Train both SVR and KernelRidge on the full training data with the selected best parameters
#    4.  Print the RMSE on the test dataset for both SVR and KernelRidge

# Lowest summed validation RMSE seen so far for each model family.
best_rmse_svr = float('inf')
best_rmse_krr = float('inf')

for reg in reg_range:
    for kpara in kpara_range:
        # Validation RMSE summed over the K folds; the sum ranks parameter
        # pairs the same way the mean would, since K is fixed.
        sum_rmse_svr = 0
        sum_rmse_krr = 0

        for k in range(K):
            # Fold k is held out for validation; the other folds are used to fit.
            validation_idx = np.array(idx_split[k])
            train_idx = np.concatenate(idx_split[:k] + idx_split[(k + 1):])

            X_fit, y_fit = X_train[train_idx, :], y_train[train_idx]
            X_val, y_val = X_train[validation_idx, :], y_train[validation_idx]

            # svr
            svr_rbf = SVR(kernel='rbf', C=reg, gamma=kpara)
            svr_rbf.fit(X_fit, y_fit)
            y_pred_svr = svr_rbf.predict(X_val)
            sum_rmse_svr += np.sqrt(mean_squared_error(y_val, y_pred_svr))

            # krr (alpha is derived from the same regularizer as 1 / (2C))
            krr_rbf = KernelRidge(kernel='rbf', alpha=1 / (2 * reg), gamma=kpara)
            krr_rbf.fit(X_fit, y_fit)
            y_pred_krr = krr_rbf.predict(X_val)
            sum_rmse_krr += np.sqrt(mean_squared_error(y_val, y_pred_krr))

        # Strict '<' keeps the first parameter pair encountered on ties.
        if sum_rmse_svr < best_rmse_svr:
            best_rmse_svr = sum_rmse_svr
            best_reg_svr, best_kpara_svr = reg, kpara

        if sum_rmse_krr < best_rmse_krr:
            best_rmse_krr = sum_rmse_krr
            best_reg_krr, best_kpara_krr = reg, kpara

# With Optimal Parameters: refit each model on the full training data using
# the parameters selected by cross-validation, then score it on the test data.
svr_rbf = SVR(kernel='rbf', C=best_reg_svr, gamma=best_kpara_svr)
y_pred_svr = svr_rbf.fit(X_train, y_train).predict(X_test)
optimal_rmse_svr = np.sqrt(mean_squared_error(y_test, y_pred_svr))

# KRR must use the kernel parameter selected for KRR (best_kpara_krr), not
# the one selected for SVR; alpha is derived from the regularizer as 1 / (2C).
krr_rbf = KernelRidge(kernel='rbf', alpha=1 / (2 * best_reg_krr), gamma=best_kpara_krr)
y_pred_krr = krr_rbf.fit(X_train, y_train).predict(X_test)
optimal_rmse_krr = np.sqrt(mean_squared_error(y_test, y_pred_krr))

print('\nOptimal SVR\nregualarization term: {} \nkernel parameter: {}'.format(best_reg_svr,best_kpara_svr))
print('RMSE on test data: {}'.format(optimal_rmse_svr))

print('\nOptimal KRR\nregualarization term: {} \nkernel parameter: {}'.format(best_reg_krr,best_kpara_krr))
print('RMSE on test data: {}'.format(optimal_rmse_krr))



RMSE obtained for SVR: 36.152137059139214
RMSE obtained for KRR: 37.84320015147227

Optimal SVR
regualarization term: 10.0 
kernel parameter: 0.01
RMSE on test data: 31.26472587569616

Optimal KRR
regualarization term: 10.0 
kernel parameter: 0.01
RMSE on test data: 33.36136004426222
