# California House Price Prediction - Advanced

## 1. Overview
This project aims to predict median house values in California districts. We will implement advanced techniques including feature engineering, pipeline integration, cross-validation, and hyperparameter tuning with Ridge Regression.

## 2. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

%matplotlib inline

## 3. Load Data

In [None]:
# Read the raw district-level housing records from the CSV export.
df = pd.read_csv(filepath_or_buffer='Data_file - data_file.csv')

# Trailing expression: the notebook renders the first five rows for a quick look.
df.head(n=5)

## 4. Feature Engineering
Creating new features to capture more meaningful relationships:
- `rooms_per_household`
- `bedrooms_per_room`
- `population_per_household`

In [None]:
# Derived ratio features, written as new column -> (numerator, denominator).
# Insertion order is preserved, so the columns are appended in this order.
ratio_features = {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
}

for new_col, (numerator, denominator) in ratio_features.items():
    ratio = df[numerator] / df[denominator]
    # A zero denominator makes pandas produce +/-inf instead of raising.
    # Infinite values are rejected by the imputer/scaler used later, so turn
    # them into NaN and let the median imputer fill them like any other gap.
    df[new_col] = ratio.replace([np.inf, -np.inf], np.nan)

# Trailing expression: show the first rows including the new columns.
df.head()

## 5. Data Splitting

In [None]:
# Separate the prediction target from the predictor columns.
target_col = 'median_house_value'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for the final test; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 6. Unified Pipeline Construction
We will create a pipeline that handles preprocessing and modeling together to prevent data leakage.

In [None]:
# Column groups: every numeric column of X, plus the one categorical column.
num_attribs = X.select_dtypes(include=[np.number]).columns.tolist()
cat_attribs = ['ocean_proximity']

# Numeric branch: fill gaps with the column median, then standardize.
median_imputer = SimpleImputer(strategy='median')
standardizer = StandardScaler()
num_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('std_scaler', standardizer),
])

# Categorical branch: one-hot encode; categories unseen during fit are
# encoded as all zeros instead of raising.
category_encoder = OneHotEncoder(handle_unknown='ignore')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', category_encoder, cat_attribs),
])

# Initial Linear Regression Pipeline: preprocessing followed by ordinary
# least squares, so the transformers are fitted only on the data passed to fit().
lin_reg_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('regressor', LinearRegression()),
])

## 7. Cross-Validation
Evaluating the stability of the baseline linear regression pipeline using 5-fold cross-validation on the training set.

In [None]:
# Score the baseline pipeline with 5-fold cross-validation on the training
# split only; the test split stays untouched until the final evaluation.
scores = cross_val_score(
    estimator=lin_reg_pipeline,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
mean_r2 = scores.mean()

print("Cross-validated R2 scores:", scores)
print("Mean R2 Score:", mean_r2)

## 8. Hyperparameter Tuning (Ridge Regression)
Using Ridge Regression and GridSearchCV to tune the regularization strength (`alpha`).

In [None]:
# Same preprocessing as the baseline, with a Ridge regressor as the final step.
ridge_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', Ridge()),
])

# Candidate regularization strengths; the "model__" prefix addresses the
# pipeline step named 'model'.
param_grid = {'model__alpha': [0.1, 1, 10, 50, 100]}

# Exhaustive search over the grid, scored by R2 with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the pipeline refitted with the best alpha found by the search.
final_model = grid_search.best_estimator_

print("Best Ridge R2:", grid_search.best_score_)
print("Best Parameters:", grid_search.best_params_)

## 9. Final Evaluation & Residual Analysis
Evaluating the best model on the test set and analyzing residuals.

In [None]:
# Predict on the held-out test split with the tuned pipeline.
y_pred = final_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is in the same
# units as the target; R2 is reported alongside it.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Final Test RMSE: {rmse}")
print(f"Final Test R2: {r2}")

In [None]:
# Residual = actual minus predicted, so positive values are under-predictions.
residuals = y_test - y_pred

# Draw on an explicit Axes rather than relying on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals, ax=ax)

# Dashed zero line: the reference for a perfect prediction.
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

## 10. Interpretability
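Examining the Ridge coefficients to understand feature importance. Because the numeric features are standardized in the pipeline, their coefficient magnitudes are directly comparable, and the sign shows the direction of the effect; the one-hot encoded `ocean_proximity` columns are not scaled, so read their coefficients as offsets per category.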

In [None]:
# Extract feature names from the preprocessor
onehot_columns = list(final_model.named_steps['preprocessing'].named_transformers_['cat'].get_feature_names_out(cat_attribs))
feature_names = num_attribs + onehot_columns

# Extract coefficients from the Ridge model
coefficients = final_model.named_steps['model'].coef_

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

print(coef_df)

## 11. Save Model
Saving the final trained pipeline for future use.

In [None]:
# Persist the whole fitted pipeline (preprocessing + Ridge) in one file so it
# can be reloaded and applied to raw feature rows.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(value=final_model, filename="california_housing_final.pkl")
print("Model saved to california_housing_final.pkl")