<a href="https://colab.research.google.com/github/sk-ruban/dsa5208/blob/main/kernel_ridge_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We will split the notebook into the following five sections:
1. Data preparation
2. Set up MPI processes
3. Applying distributed kernel ridge regression
4. Obtaining predicted median value using kernel function
5. Evaluating model performance
6. Cross Validation / Model Tuning


In [131]:
# Install dependencies
# !pip3 install wheel
# !pip3 install mpi4py

In [5]:
# Import Dependencies
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from mpi4py import MPI

## **Data Preparation**

In [6]:
# Import Data
df = pd.read_csv('data/housing_short.tsv', sep='\t', header=None)

In [7]:
df.columns = ['longitude', 'latitude', 'housingMedianAge', 'totalRooms',
              'totalBedrooms', 'population', 'households', 'medianIncome',
              'oceanProximity', 'medianHouseValue']

features = ['longitude', 'latitude', 'housingMedianAge', 'totalRooms',
              'totalBedrooms', 'population', 'households', 'medianIncome',
              'oceanProximity']

# print(df.info())
# print("\nFirst few rows:")
# print(df.head())

In [8]:
#Normalize Data
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

print(df.head())

   longitude  latitude  housingMedianAge  totalRooms  totalBedrooms  \
0  -1.076704  0.743879          0.761259   -0.834767      -1.012490   
1  -1.071483  0.733996         -0.777695    2.611221       1.598957   
2  -1.081925  0.729055          1.607684   -0.509506      -0.849441   
3  -1.087146  0.729055          1.607684   -0.616449      -0.729160   
4  -1.087146  0.729055          1.607684   -0.420849      -0.608878   

   population  households  medianIncome  oceanProximity  medianHouseValue  
0   -1.060704   -0.996316      2.647538         1.55863            452600  
1    1.127364    1.920014      2.634566         1.55863            358500  
2   -0.877575   -0.849347      2.065575         1.55863            352100  
3   -0.812323   -0.728314      1.185764         1.55863            341300  
4   -0.804955   -0.613044      0.206434         1.55863            342200  


In [9]:
# Split Data
X = df[features]
y = df['medianHouseValue']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (3500, 9)
Test set shape: (1500, 9)


## **Set up MPI Processes**

In [10]:
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

In [11]:
# Split data according to process

chunk_size = len(X_train) // size
start = rank * chunk_size
end = start + chunk_size if rank < size - 1 else len(X_train)
X_chunk = X_train[start:end]
y_chunk = y_train[start:end]

## **Applying Kernel Ridge Regression**


In [12]:
# Define Kernel Computation Function
def compute_gaussian_kernel(X1, X2, sigma):
    dists = np.sum(X1**2, axis=1).reshape(-1, 1) + np.sum(X2**2, axis=1) - 2 * np.dot(X1, X2.T)
    return np.exp(-dists / (2 * sigma ** 2))                                                # exp( - dists / 2.sigma )


# Apply Kernel Computation Function to all split data
def compute_local_kernel(X_chunk, X_full, sigma):
    return compute_gaussian_kernel(X_chunk, X_full, sigma)

# Add up all the computed rows to get matrix K
sigma = 1.0  # TUNE PARAMETER
local_K = compute_local_kernel(X_chunk, X_train, sigma)
K = np.zeros((len(X_train), len(X_train)))
comm.Allreduce(local_K, K, op=MPI.SUM)

AttributeError: 'Series' object has no attribute 'reshape'

In [139]:
# Solve for matrix A which is defined as K + lambda I
lambda_ = 1.0  
A = K + lambda_ * np.eye(K.shape[0])

# Solve for alpha, using A(alpha) = y, and the Conjugate Gradient Method
def conjugate_gradient(A, b, max_iter=1000, tol=1e-6):
    x = np.zeros_like(b)
    r = b - A @ x
    p = r.copy()
    r_norm_sq = np.dot(r, r)
    
    for _ in range(max_iter):
        Ap = A @ p
        alpha = r_norm_sq / np.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        r_norm_sq_new = np.dot(r, r)
        if np.sqrt(r_norm_sq_new) < tol:
            break
        beta = r_norm_sq_new / r_norm_sq
        p = r + beta * p
        r_norm_sq = r_norm_sq_new
    
    return x

alpha = conjugate_gradient(A, y_train)

In [None]:
# Prediction function
def predict(X_new):
    K_new = compute_gaussian_kernel(X_new, X_train, sigma)
    return K_new @ alpha

In [None]:
if rank == 0:
    y_train_pred = predict(X_train)
    y_test_pred = predict(X_test)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    print(f"Train RMSE: ${train_rmse:.2f}")
    print(f"Test RMSE: ${test_rmse:.2f}")

In [140]:
# Solve with SKLEARN
"""
Best parameters for linear kernel: {'alpha': 10}
linear kernel - Train RMSE: $188273.19
linear kernel - Test RMSE: $187795.50

Best parameters for rbf kernel: {'alpha': 0.1, 'gamma': 0.1}
rbf kernel - Train RMSE: $43552.43
rbf kernel - Test RMSE: $49753.43
"""

def train_and_evaluate(kernel, X_train, y_train, X_test, y_test):
    if kernel == 'linear':
        param_grid = {'alpha': [0.1, 1, 10]}
    else:
        param_grid = {'alpha': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}
    
    krr = KernelRidge(kernel=kernel)
    grid_search = GridSearchCV(krr, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    print(f"Best parameters for {kernel} kernel:", best_params)
    
    best_model = grid_search.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    print(f"{kernel} kernel - Train RMSE: ${train_rmse:.2f}")
    print(f"{kernel} kernel - Test RMSE: ${test_rmse:.2f}")
    print()

# Try different kernels
for kernel in ['linear', 'rbf']:
    train_and_evaluate(kernel, X_train, y_train, X_test, y_test)

Best parameters for linear kernel: {'alpha': 10}
linear kernel - Train RMSE: $188273.19
linear kernel - Test RMSE: $187795.50

Best parameters for rbf kernel: {'alpha': 0.1, 'gamma': 0.1}
rbf kernel - Train RMSE: $43552.43
rbf kernel - Test RMSE: $49753.43

