In [1]:
# ============================================================
# 1. IMPORTS
# ============================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import joblib

# ============================================================
# 2. LOAD CLEAN ORIGINAL DATA
# ============================================================
# Keep the frame exactly as read from disk under its own name so the raw
# columns stay available for inspection after feature engineering.
housing_raw = pd.read_csv("housing.csv")

# ============================================================
# 3. FEATURE ENGINEERING
# ============================================================
# Derive the two per-household ratios, then drop the raw count columns they
# were built from (plus total_bedrooms, which is not used as a feature).
# Written as one chain that returns a new frame rather than mutating in
# place, so `housing_raw` is never altered.
df = (
    housing_raw
    .assign(
        rooms_per_household=lambda frame: frame["total_rooms"] / frame["households"],
        population_per_household=lambda frame: frame["population"] / frame["households"],
    )
    .drop(columns=["total_rooms", "total_bedrooms", "population", "households"])
)

# ============================================================
# 4. COLUMN GROUPS
# ============================================================
# Columns taken straight from the CSV.
raw_numeric_cols = ["longitude", "latitude", "housing_median_age", "median_income"]

# Ratio columns created in the feature-engineering step.
engineered_numeric_cols = ["rooms_per_household", "population_per_household"]

# Everything routed to the numeric branch of the preprocessor.
numeric_cols = [*raw_numeric_cols, *engineered_numeric_cols]

# Everything routed to the categorical branch of the preprocessor.
categorical_cols = ["ocean_proximity"]

# ============================================================
# 5. PREPROCESSOR
# ============================================================
# Numeric columns are standardised. The categorical column is one-hot
# encoded; handle_unknown="ignore" keeps transform from raising on a
# category that was not present during fit.
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# ============================================================
# 6. FINAL MODEL PIPELINE
# ============================================================
# Seeded random forest; hyperparameters are fixed here rather than searched.
forest = RandomForestRegressor(
    random_state=42,
    n_estimators=200,
    max_depth=30,
    min_samples_leaf=2,
)

# Preprocessing and regressor travel together as a single estimator.
model = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", forest)])

# ============================================================
# 7. TRAIN MODEL
# ============================================================
# The target is median_house_value; every remaining column is a feature.
target_col = "median_house_value"
y = df[target_col]
X = df.drop(columns=target_col)

# Fitted on the whole frame; this cell holds out no rows for evaluation.
model.fit(X, y)

# ============================================================
# 8. SAVE MODEL
# ============================================================
# Persist the fitted pipeline (preprocessing + regressor) as one artifact.
joblib.dump(model, "final_model.pkl")
print("✔ final_model.pkl saved successfully!")


✔ final_model.pkl saved successfully!


In [4]:
import os

# Diagnostic only: shows the directory that relative paths such as
# "housing.csv" and the saved .pkl files resolve against.
working_dir = os.getcwd()
working_dir



'C:\\Users\\asus\\OneDrive\\Documents\\machine learning'

In [5]:
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

# Gradient-boosting variant of the model, trained on the same X / y.
#
# With the default memory=None, Pipeline.fit fits its steps in place, so
# handing the same `preprocessor` object to several pipelines makes them all
# share one fitted transformer. clone() gives this pipeline its own unfitted
# copy with identical settings, so refitting one pipeline can never change
# another.
gb_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("regressor", GradientBoostingRegressor(
        random_state=42,
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4
    ))
])

gb_model.fit(X, y)

# Persist the fitted pipeline (preprocessing + regressor) as one artifact.
joblib.dump(gb_model, "gradient_boosting_model.pkl")
print("✔ gradient_boosting_model.pkl saved!")


✔ gradient_boosting_model.pkl saved!


In [6]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# XGBoost variant of the model, trained on the same X / y.
#
# With the default memory=None, Pipeline.fit fits its steps in place, so
# handing the same `preprocessor` object to several pipelines makes them all
# share one fitted transformer. clone() gives this pipeline its own unfitted
# copy with identical settings, so refitting one pipeline can never change
# another.
xgb_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("regressor", XGBRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        # Stated explicitly for clarity: rmse is already XGBoost's default
        # evaluation metric for regression objectives. No eval_set is passed
        # to fit() below, so no evaluation data is scored against it here.
        eval_metric="rmse"
    ))
])

xgb_model.fit(X, y)

# Persist the fitted pipeline (preprocessing + regressor) as one artifact.
joblib.dump(xgb_model, "xgboost_model.pkl")
print("✔ xgboost_model.pkl saved!")


✔ xgboost_model.pkl saved!
