# **1. Loading and Preprocessing (2 marks):**

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset into a DataFrame and append the target as 'MedHouseVal'
california = fetch_california_housing()
data = pd.DataFrame(california.data, columns=california.feature_names)
data['MedHouseVal'] = california.target

# Check for missing values
print("Missing values:\n", data.isnull().sum())

# Split data into features and target.
# NOTE: `data` and `X` are left in their original units; only the train/test
# feature matrices below are standardized.
features = california.feature_names
X = data.drop('MedHouseVal', axis=1)
y = data['MedHouseVal']

# Split into train and test sets BEFORE scaling, so that no statistic of the
# held-out rows can influence the preprocessing (avoids data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: learn mean/std from the training rows only, then apply the
# same transformation to both splits. Wrapping the arrays keeps X_train/X_test
# as DataFrames with their original column names and row index.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

Missing values:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


# 2. Regression Algorithm Implementation (5 marks):
Implement the following regression algorithms:

# **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline model: ordinary least squares fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report the held-out metrics, one "<label> <value>" line per metric.
print(
    "Linear Regression:",
    *(
        f"{label} {metric(y_test, y_pred)!s}"
        for label, metric in (
            ("MSE:", mean_squared_error),
            ("MAE:", mean_absolute_error),
            ("R²:", r2_score),
        )
    ),
    sep="\n",
)

Linear Regression:
MSE: 0.5558915986952441
MAE: 0.5332001304956565
R²: 0.575787706032451


# **Decision Tree Regressor**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree; the fixed seed makes the fitted tree reproducible.
# fit() returns the estimator itself, so construction and fitting can be chained.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Report the held-out metrics, one "<label> <value>" line per metric.
print(
    "\nDecision Tree Regressor:",
    *(
        f"{label} {metric(y_test, y_pred)!s}"
        for label, metric in (
            ("MSE:", mean_squared_error),
            ("MAE:", mean_absolute_error),
            ("R²:", r2_score),
        )
    ),
    sep="\n",
)


Decision Tree Regressor:
MSE: 0.4942716777366763
MAE: 0.4537843265503876
R²: 0.6228111330554302


# **Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters; seeded for reproducibility.
# fit() returns the estimator itself, so construction and fitting can be chained.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Report the held-out metrics, one "<label> <value>" line per metric.
print(
    "\nRandom Forest Regressor:",
    *(
        f"{label} {metric(y_test, y_pred)!s}"
        for label, metric in (
            ("MSE:", mean_squared_error),
            ("MAE:", mean_absolute_error),
            ("R²:", r2_score),
        )
    ),
    sep="\n",
)


Random Forest Regressor:
MSE: 0.25549776668540763
MAE: 0.32761306601259704
R²: 0.805024407701793


# **Gradient Boosting Regressor**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters; seeded for reproducibility.
# fit() returns the estimator itself, so construction and fitting can be chained.
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred = gb.predict(X_test)

# Report the held-out metrics, one "<label> <value>" line per metric.
print(
    "\nGradient Boosting Regressor:",
    *(
        f"{label} {metric(y_test, y_pred)!s}"
        for label, metric in (
            ("MSE:", mean_squared_error),
            ("MAE:", mean_absolute_error),
            ("R²:", r2_score),
        )
    ),
    sep="\n",
)


Gradient Boosting Regressor:
MSE: 0.29399901242474274
MAE: 0.37165044848436773
R²: 0.7756433164710084


# **Support Vector Regressor (SVR)**
 For each algorithm:

In [None]:
from sklearn.svm import SVR

# Support vector regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and fitting can be chained.
svr = SVR().fit(X_train, y_train)
y_pred = svr.predict(X_test)

# Report the held-out metrics, one "<label> <value>" line per metric.
print(
    "\nSupport Vector Regressor:",
    *(
        f"{label} {metric(y_test, y_pred)!s}"
        for label, metric in (
            ("MSE:", mean_squared_error),
            ("MAE:", mean_absolute_error),
            ("R²:", r2_score),
        )
    ),
    sep="\n",
)


Support Vector Regressor:
MSE: 0.3551984619989418
MAE: 0.3977630963437868
R²: 0.7289407597956463


# **Provide a brief explanation of how it works**

**Explanation of Preprocessing Steps**

**Loading the Dataset:**

- The `fetch_california_housing` function provides easy access to the dataset.

- Converting it to a pandas DataFrame allows for more intuitive data manipulation and exploration.

**Handling Missing Values:**

- No missing values were found in this dataset (confirmed with `isnull().sum()`, which reports 0 for every column).

- If any were present, they would need to be addressed because most ML algorithms cannot handle them directly.

- The approach (for example, dropping the affected rows or imputing a value such as the column median) would depend on the nature and amount of missing data.

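**Feature Scaling:**

- Standardization (subtracting the mean and dividing by the standard deviation of each feature) was performed with `StandardScaler`.

**Justification:**

- Features in this dataset have very different scales (e.g., 'Population' ranges into the thousands, while 'AveRooms' is typically a single-digit number).

- Many ML algorithms (especially distance- or kernel-based ones like SVM and KNN) perform better when features are on similar scales; tree-based models are largely insensitive to feature scaling.

- Gradient descent-based algorithms converge faster with standardized features.

- Regularization penalizes all coefficients more evenly when features are on a common scale.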

# **Model Evaluation and Comparison (2 marks):**

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd

# 1. Load and prepare data
california_housing = fetch_california_housing(as_frame=True)
X = california_housing.data
y = california_housing.target

# 2. Train-test split
# Done BEFORE scaling so that the held-out rows cannot influence the
# statistics learned during preprocessing (avoids data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Preprocessing - scaling
# The scaler learns mean/std from the training rows only; the same
# transformation is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full feature matrix scaled with the training statistics, kept under its
# original name for any later cell that refers to it.
X_scaled = scaler.transform(X)

# 4. Define models to compare
# Gradient Boosting is included so the comparison covers every regressor
# implemented in the previous section; K-Neighbors is an additional baseline.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(),
    "K-Neighbors": KNeighborsRegressor()
}

# 5. Evaluation function
def evaluate_model(model, X_test, y_test):
    """Score an already-fitted regressor on held-out data.

    Args:
        model: Estimator exposing ``predict``; it is not fitted here.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        dict: The keys ``"MSE"``, ``"MAE"`` and ``"R2"`` mapped to the results
        of ``mean_squared_error``, ``mean_absolute_error`` and ``r2_score``.
    """
    predictions = model.predict(X_test)
    scorers = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

# 6. Train and evaluate all models
# Metrics are collected per model, keyed by the model's display name.
results = {}
for name, model in models.items():
    # fit() returns the estimator itself, so it can be scored directly.
    results[name] = evaluate_model(model.fit(X_train, y_train), X_test, y_test)

# 7. Display results
# Transposed so that each row is a model and each column a metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)

                        MSE       MAE        R2
Linear Regression  0.555892  0.533200  0.575788
Decision Tree      0.494272  0.453784  0.622811
Random Forest      0.255498  0.327613  0.805024
SVR                0.355198  0.397763  0.728941
K-Neighbors        0.433811  0.445525  0.668950
