In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Load California Housing Dataset
data = fetch_california_housing(as_frame=True)
df = data.frame
X = df.drop(columns=['MedHouseVal'])
y = df['MedHouseVal']
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [4]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svr = SVR(kernel='rbf', C=10, gamma=0.1)
svr.fit(X_train, y_train)

y_pred = svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("🔍 Mean Squared Error:", mse)
print("📈 R² Score:", r2)

🔍 Mean Squared Error: 1.1002849092569063
📈 R² Score: 0.16034999185221166


## SVR from scratch but memory allocation is big as matrix multiplication is complex 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from cvxopt import matrix, solvers


data = fetch_california_housing(as_frame=True)
df = data.frame
X = df.drop(columns=['MedHouseVal'])
y = df['MedHouseVal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)


scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()

# SVR parameters
C = 1.0
epsilon = 0.1
n = X_train_scaled.shape[0]

# Linear kernel
K = np.dot(X_train_scaled, X_train_scaled.T)

# Setup QP
P = matrix(np.vstack([
    np.hstack([K, -K]),
    np.hstack([-K, K])
]).astype(np.double))

q = matrix(epsilon * np.ones((2 * n, 1)) + np.vstack([y_train_scaled, -y_train_scaled]))
G = matrix(np.vstack([np.identity(2 * n), -np.identity(2 * n)]))
h = matrix(np.vstack([C * np.ones((2 * n, 1)), np.zeros((2 * n, 1))]))
A = matrix(np.hstack([np.ones(n), -np.ones(n)]).reshape(1, -1))
b = matrix(np.zeros(1))

# Solve QP
sol = solvers.qp(P, q, G, h, A, b)
alphas = np.array(sol['x'])


alpha_diff = alphas[:n] - alphas[n:]
w = np.dot(alpha_diff.T, X_train_scaled).flatten()
b = np.mean(y_train_scaled.flatten() - np.dot(X_train_scaled, w))

def predict(X):
    return np.dot(X, w) + b


y_pred_scaled = predict(X_test_scaled)

# Inverse transform predictions and actual values
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
y_test_inv = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()


mse = mean_squared_error(y_test_inv, y_pred)
print("MSE:", mse)

# Identify support vectors
support_vectors = np.where(np.abs(alpha_diff) > 1e-5)[0]

X_plot = np.linspace(X_train_scaled.min(), X_train_scaled.max(), 100).reshape(-1, 1)
y_plot_scaled = predict(X_plot)

# Inverse transform for plotting
X_plot_inv = scaler_X.inverse_transform(X_plot)
y_plot = scaler_y.inverse_transform(y_plot_scaled.reshape(-1, 1))

X_train_inv = scaler_X.inverse_transform(X_train_scaled)
y_train_inv = scaler_y.inverse_transform(y_train_scaled.reshape(-1, 1))
support_X = X_train_inv[support_vectors]
support_y = y_train_inv[support_vectors]

plt.figure(figsize=(10, 6))
plt.scatter(X_train_inv, y_train_inv, color="blue", label="Training data", alpha=0.4)
plt.plot(X_plot_inv, y_plot, color="red", label="SVR prediction")
plt.scatter(support_X, support_y, color="green", label="Support Vectors", s=100, marker='x')
plt.xlabel("Median Income")
plt.ylabel("House Price")
plt.title("Support Vector Regression (Linear Kernel)")
plt.legend()
plt.grid(True)
plt.show()


MemoryError: Unable to allocate 4.06 GiB for an array with shape (16512, 33024) and data type float64

In [5]:
def param_tuning(X_train,y_train,param_grid):
    svr = SVR()
    grid = GridSearchCV(estimator=svr,param_grid=param_grid,cv=3,n_jobs=1,verbose=2,scoring='neg_mean_squared_error')
    grid.fit(X_train,y_train)
    return grid

In [None]:
from urllib.parse import urlparse
import mlflow

from mlflow.models import infer_signature
signature = infer_signature(X_train,y_train)
param_grid = {
    'kernel':['linear','rbf'],
    'degree':[1,2,3],
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1]
}

with mlflow.start_run():
    grid_searh = param_tuning(X_train,y_train,param_grid)
    best_model = grid_searh.best_estimator_
    y_pred = best_model.predict(X_test)
    mse=mean_squared_error(y_test,y_pred)
    r2 =r2_score(y_test,y_pred)
    
    mlflow.log_param('best kernel',grid_searh.best_params_['kernel'])
    mlflow.log_param('best degree',grid_searh.best_params_['degree'])
    mlflow.log_param('best C',grid_searh.best_params_['C'])
    mlflow.log_param('best gamma',grid_searh.best_params_['gamma'])
    mlflow.log_metrics({
        'MSE':mse,
        'R2':r2
    })
    mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
    urltrack_store_type = urlparse(mlflow.get_tracking_uri()).scheme
    
    if urltrack_store_type != ['file']:
        mlflow.sklearn.log_model(best_model,'model',registered_model_name='Best SVR model')
    else:
        mlflow.sklearn.log_model(best_model,'model',signature=signature,registered_model_name='Best SVR model')
        
    print(f'Best Hyperparameters :{grid_searh.best_params_}')
    print(f'MSE:{mse}')
    print(f'r2:{r2}')
    
    


Fitting 3 folds for each of 54 candidates, totalling 162 fits
