# California Housing Prices

## Setup môi trường

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor
import joblib

## Đọc dữ liệu

In [2]:
data = pd.read_csv('housing.csv')

## Tiền xử lý dữ liệu

### Xác định cột số và cột phân loại

In [3]:
# Every numeric column except the regression target is a numeric predictor.
# 'number' matches all numeric dtypes (int32, float32, ... as well as
# int64/float64), so a numeric column stored with a narrower dtype is not
# silently left out of both feature lists.
numeric_features = [
    column
    for column in data.select_dtypes(include='number').columns
    if column != 'median_house_value'
]

# 'ocean_proximity' is the only categorical predictor handled here; fall back
# to an empty list when the column is absent.
categorical_features = ['ocean_proximity'] if 'ocean_proximity' in data.columns else []

### Tạo ColumnTransformer cho tiền xử lý

In [4]:
# Numeric columns: fill gaps with the column median, then scale with RobustScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category unseen
# at fit time is presumably encoded as all zeros, i.e. the same as the dropped
# first category — confirm this is acceptable.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each list of columns to its matching transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)


### Xác định biến mục tiêu và phân chia dữ liệu

In [5]:
# Separate the regression target from the predictors.
y = data['median_house_value']
X = data.drop(columns='median_house_value')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Tạo hàm đánh giá mô hình (MSE, RMSE, MAE, R2)

In [6]:

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` behind the shared preprocessor and report regression metrics.

    A new ``Pipeline`` is built from the module-level ``preprocessor`` and
    ``model``, fitted on the training split, and scored on both splits with
    MSE, RMSE, MAE and R². A header line and one summary line per split are
    printed.

    Args:
        model: Estimator placed as the final ``'model'`` step of the pipeline.
        X_train: Training features, passed to ``fit`` and ``predict``.
        X_test: Held-out features, passed to ``predict``.
        y_train: Training targets.
        y_test: Held-out targets.
        model_name: Label used in the printed header and stored under ``'Model'``.

    Returns:
        tuple: ``(pipeline, metrics)`` — the fitted pipeline and a dict holding
        ``'Model'`` plus a ``'Train ...'`` and ``'Test ...'`` entry for each of
        MSE, RMSE, MAE and R².
    """
    # NOTE(review): ``preprocessor`` and ``model`` are used as passed, not
    # copied, so fitting here presumably refits those same objects — confirm
    # whether callers need isolated copies.
    fitted = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    fitted.fit(X_train, y_train)

    def _score(y_true, y_pred):
        # All four metrics for one split; RMSE is derived from the MSE.
        mse = mean_squared_error(y_true, y_pred)
        return {
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'MAE': mean_absolute_error(y_true, y_pred),
            'R²': r2_score(y_true, y_pred),
        }

    train_scores = _score(y_train, fitted.predict(X_train))
    test_scores = _score(y_test, fitted.predict(X_test))

    print(f'--- {model_name} ---')
    # 'Test ' carries a trailing space so both rows line up in the printed summary.
    for split_label, scores in (('Train', train_scores), ('Test ', test_scores)):
        print(
            f'{split_label} - MSE: {scores["MSE"]:.2f}, RMSE: {scores["RMSE"]:.2f}, '
            f'MAE: {scores["MAE"]:.2f}, R² Score: {scores["R²"]:.2f}'
        )

    # Interleave train/test per metric so the key order stays
    # Model, Train MSE, Test MSE, Train RMSE, Test RMSE, ...
    metrics = {'Model': model_name}
    for metric_name in ('MSE', 'RMSE', 'MAE', 'R²'):
        metrics[f'Train {metric_name}'] = train_scores[metric_name]
        metrics[f'Test {metric_name}'] = test_scores[metric_name]

    return fitted, metrics

## Huấn luyện mô hình

### Hồi quy tuyến tính (Linear Regression)

In [7]:
# Baseline: plain linear regression on the preprocessed features.
lr_model = LinearRegression()
lr_pipeline, lr_metrics = evaluate_model(
    model=lr_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name='Linear Regression',
)

# Persist the fitted pipeline (preprocessing + model) to disk.
joblib.dump(lr_pipeline, 'linear_regression_model.pkl')

--- Linear Regression ---
Train - MSE: 4683203783.50, RMSE: 68433.94, MAE: 49594.84, R² Score: 0.65
Test  - MSE: 4908290571.35, RMSE: 70059.19, MAE: 50670.49, R² Score: 0.63


['linear_regression_model.pkl']

### Ridge Regression với GridSearchCV

In [8]:

# Tune the Ridge penalty strength with 5-fold cross-validation, selecting on
# negated mean squared error.
ridge_model = Ridge()
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', ridge_model),
])
ridge_param_grid = {'model__alpha': [0.1, 1.0, 10.0, 100.0]}
ridge_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=ridge_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
ridge_search.fit(X_train, y_train)
best_ridge = ridge_search.best_estimator_

# Score the selected Ridge estimator on the train/test split.
# NOTE(review): evaluate_model fits the estimator it receives, so the tuned
# model is presumably fitted a second time here — confirm that is intended.
tuned_ridge = best_ridge.named_steps['model']
_, ridge_metrics = evaluate_model(
    tuned_ridge, X_train, X_test, y_train, y_test, 'Best Ridge Regression'
)

best_alpha = ridge_search.best_params_["model__alpha"]
print(f'Best Alpha for Ridge Regression: {best_alpha}')

# Persist the best pipeline found by the search.
joblib.dump(best_ridge, 'ridge_regression_model.pkl')


--- Best Ridge Regression ---
Train - MSE: 4683389726.50, RMSE: 68435.30, MAE: 49593.46, R² Score: 0.65
Test  - MSE: 4909344636.57, RMSE: 70066.72, MAE: 50674.62, R² Score: 0.63
Best Alpha for Ridge Regression: 1.0


['ridge_regression_model.pkl']

### Neural Network (MLPRegressor) với GridSearchCV

In [None]:

# MLP regressor with a fixed seed and early stopping enabled
# (validation_fraction=0.1, n_iter_no_change=10).
nn_model = MLPRegressor(
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
)
nn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', nn_model),
])

# Search hidden-layer layouts and the alpha penalty with 3-fold
# cross-validation, selecting on negated mean squared error.
nn_param_grid = {
    'model__hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'model__alpha': [0.0001, 0.001, 0.01],
}
nn_search = GridSearchCV(
    estimator=nn_pipeline,
    param_grid=nn_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
nn_search.fit(X_train, y_train)
best_nn = nn_search.best_estimator_

# Score the selected network on the train/test split.
# NOTE(review): evaluate_model fits the estimator it receives, so the tuned
# network is presumably trained a second time here — confirm that is intended.
tuned_nn = best_nn.named_steps['model']
_, nn_metrics = evaluate_model(
    tuned_nn, X_train, X_test, y_train, y_test, 'Best Neural Network'
)

print(f'Best Params for Neural Network: {nn_search.best_params_}')

# Persist the best pipeline found by the search.
joblib.dump(best_nn, 'neural_network_model.pkl')



### Stacking

In [1]:

# Base learners for the stack: the final estimators of the three pipelines
# built in the earlier cells (those cells must have been run first).
estimators = [
    ('lr', lr_pipeline.named_steps['model']),
    ('ridge', best_ridge.named_steps['model']),
    ('nn', best_nn.named_steps['model'])
]

# A Ridge meta-learner combines the base learners' predictions;
# passthrough=False keeps the original features out of the meta-learner input.
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(alpha=1.0),
    passthrough=False,
    n_jobs=-1
)

# evaluate_model wraps the model with the shared preprocessor, fits it and
# returns the fitted pipeline. Taking the pipeline from its return value (as
# the linear regression cell does) avoids fitting the whole stack a second
# time just to compute the metrics.
stacking_pipeline, stacking_metrics = evaluate_model(
    stacking_model, X_train, X_test, y_train, y_test, 'Stacking Model'
)

# Persist the fitted stacking pipeline (preprocessing + stacked model).
joblib.dump(stacking_pipeline, 'stacking_model.pkl')

NameError: name 'lr_pipeline' is not defined

In [None]:

# One row per model, in the order the models were trained.
all_metrics = [lr_metrics, ridge_metrics, nn_metrics, stacking_metrics]
results = pd.DataFrame(all_metrics)

# Heading first, then the comparison table on the following line.
print("\nKết quả đánh giá các mô hình:", results, sep="\n")