#### General guidance

This notebook is a template that guides you through the implementation of this task. We recommend
reading the whole template first to get a sense of the overall structure of the code before filling in any of the TODO gaps.
This is the Jupyter notebook version of the template. For the Python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# Add any additional imports here (however, the task is solvable without using 
# any additional imports)
# import ...

#### Loading data

In [2]:
# Read the training set; the "y" column holds the labels, every other column is a feature
data = pd.read_csv("train.csv")
# Pull the labels out as a numpy array before removing them from the feature table
y = data.loc[:, "y"].to_numpy()
data = data.drop(columns=["y"])
# Show the first few feature rows as a sanity check
print(data.head())

         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  


#### Calculating the RMSE

In [3]:
def calculate_RMSE(w, X, y):
    """Return the empirical root-mean-square error of the linear model
    y_hat = X w evaluated on the data points (X, y).

    Parameters
    ----------
    w: array of floats, dim = (n_features,), weights of the linear model
       (the ridge regression parameters; 13 features in this task)
    X: matrix of floats, dim = (n_samples, n_features), inputs
       (one test fold in this task, i.e. (15, 13))
    y: array of floats, dim = (n_samples,), labels belonging to X

    Returns
    ----------
    rmse: float, RMSE value
    """
    # Residual between the true labels and the linear predictions
    residuals = y - np.dot(X, w)

    # Square, average, then take the root
    mean_squared_error = np.mean(np.square(residuals))
    rmse = np.sqrt(mean_squared_error)

    assert np.isscalar(rmse)
    return rmse

#### Fitting the regressor

In [4]:
def fit(X, y, lam):
    """
    Fit ridge regression in closed form and return the weights w.

    The weights solve the regularized normal equations
    (X^T X + lam * I) w = X^T y. The system is solved with np.linalg.solve
    rather than by forming an explicit matrix inverse. X is used as given,
    so no intercept term is added.

    Parameters
    ----------
    X: matrix of floats, dim = (n_samples, n_features), inputs
       (a training split in this task, i.e. (135, 13))
    y: array of floats, dim = (n_samples,), input labels
    lam: float. lambda parameter, used in regularization term

    Returns
    ----------
    w: array of floats: dim = (n_features,), optimal parameters of ridge regression

    Raises
    ----------
    numpy.linalg.LinAlgError: if X^T X + lam * I is singular
    """
    n_features = X.shape[1]

    # Left- and right-hand side of the regularized normal equations
    A = X.T @ X + lam * np.eye(n_features)
    b = X.T @ y
    weights = np.linalg.solve(A, b)

    # One weight per feature, whatever the width of X is
    assert weights.shape == (n_features,)
    return weights

#### Performing computation

In [5]:
# Main cross-validation loop, implementing K-fold CV (K = n_folds).
# For every train-test split, a ridge regression is fitted for every lambda
# and its RMSE on the held-out fold is stored; the RMSEs are then averaged
# over the folds.
#
# Inputs
# ----------
# X: matrix of floats, inputs (one row per sample, one column per feature)
# y: array of floats, input labels (defined in the data-loading cell)
# lambdas: list of floats, values of lambda for which ridge regression is fitted and RMSE estimated
# n_folds: int, number of folds (pieces in which we split the dataset), parameter K in KFold CV
#
# Computes
# ----------
# avg_RMSE: array of floats, dim = (len(lambdas),), average RMSE value for every lambda
X = data.to_numpy()
lambdas = [0.1, 1, 10, 100, 200]
n_folds = 10

# RMSE_mat[fold, j] is the test RMSE of split `fold` for lambdas[j]
RMSE_mat = np.zeros((n_folds, len(lambdas)))

# Fixed seed keeps the shuffled splits reproducible. The splits are generated
# once and reused for all lambdas, so every lambda is scored on the same folds.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    # Split the data into training and test sets according to the KFold indices
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    for j, lam in enumerate(lambdas):
        w = fit(X_train, y_train, lam)                          # train -> weights
        RMSE_mat[fold, j] = calculate_RMSE(w, X_test, y_test)   # score on held-out fold

# Average over the folds (rows) to get one value per lambda
avg_RMSE = np.mean(RMSE_mat, axis=0)
assert avg_RMSE.shape == (len(lambdas),)
print(avg_RMSE)

[5.38380157 5.36264675 5.36223749 5.88532073 6.20091603]


In [121]:
# Write the averaged RMSE values to disk in the required format (12 decimal places)
np.savetxt(fname="./results.csv", X=avg_RMSE, fmt="%.12f")