In [1]:
from math import pi
import polars as pl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
from sklearn.model_selection import KFold
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

set_config(transform_output='polars')

import polars as pl
from sklearn.model_selection import train_test_split

from scipy.io import arff
import pandas as pd

from pathlib import Path

current_folder = globals()['_dh'][0]
path = current_folder / "data" / "dataset.arff"

data, meta = arff.loadarff(path)
df_pd = pd.DataFrame(data)
raw = pl.from_pandas(df_pd)
raw = raw.with_columns(
    pl.col("zipcode").cast(pl.Utf8)
)

X = raw.drop("price")
y = raw["price"].to_numpy()  # 1D array (خیلی مهم)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

class ZipTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, n_splits=5):
        self.n_splits = n_splits

    def fit(self, X, y):
        self.global_mean_ = np.mean(y)

        # mean price per zipcode (برای test / inference)
        self.zip_mean_ = (
            X.with_columns(pl.Series("target", y))
             .group_by("zipcode")
             .agg(zip_mean=pl.col("target").mean())
        )
        return self

    def transform(self, X):
        with pl.StringCache():
            X = X.join(self.zip_mean_, on="zipcode", how="left")

        # zipcodeهای دیده‌نشده
        X = X.with_columns(
            pl.col("zip_mean").fill_null(self.global_mean_)
        )

        return X

numeric_features = ["bedrooms", "bathrooms", "sqft_living"]
categorical_features = ["zipcode"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    ]
)

from sklearn.linear_model import Ridge

print("Training Ridge Regression Model...")

ridge_pipe = Pipeline(
    steps=[
        ("zip_encoder", ZipTargetEncoder()),
        ("preprocess", preprocessor),
        ("model", Ridge())
    ]
)

param_dist = {
    "model__alpha": np.logspace(-3, 3, 50)
}

search = RandomizedSearchCV(
    ridge_pipe,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

best_ridge = search.best_estimator_
print(search.best_params_)

# 7) ارزیابی روی test
pred = best_ridge.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, pred))
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

print(f"Tuned RF RMSE: {rmse:,.2f}")
print(f"Tuned RF MAE: {mae:,.2f}")
print(f"Tuned RF R²:  {r2:.3f}")

# ridge_model = Ridge(alpha=1.0)

# ridge_pipe = Pipeline(
#     steps=[
#         ("zip_encoder", ZipTargetEncoder()),
#         ("preprocess", preprocessor),
#         ("model", ridge_model),
#     ]
# )

# ridge_pipe.fit(X_train, y_train)

# alphas = [0.01, 0.1, 1, 5, 10]

# for a in alphas:
#     ridge_pipe.set_params(model__alpha=a)
#     ridge_pipe.fit(X_train, y_train)
#     pred = ridge_pipe.predict(X_test)
#     rmse = np.sqrt(mean_squared_error(y_test, pred))
#     mae = mean_absolute_error(y_test, pred)
#     print(f"Alpha: {a}, RMSE: {rmse:.2f} --- MAE: {mae:.2f} --- R²: {ridge_pipe.score(X_test, y_test):.2f}")

print("Training Ridge Regression Model Finished.")

print("Training Decision Tree Model...")

dt_pipe = Pipeline(
    steps=[
        ("zip_encoder", ZipTargetEncoder()),
        ("preprocess", preprocessor),
        ("model", DecisionTreeRegressor(random_state=42)),
    ]
)

param_dist = {
    "model__max_depth": [None, 5, 10, 20, 30, 50],
    "model__min_samples_split": [2, 5, 10, 20, 50],
    "model__min_samples_leaf": [1, 2, 5, 10, 20],
    "model__max_features": [None, "sqrt", "log2"],
    "model__criterion": ["squared_error", "friedman_mse"],
}

search = RandomizedSearchCV(
    estimator=dt_pipe,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train)

best_dt = search.best_estimator_

pred = best_dt.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)

print("Best params:", search.best_params_)
print(f"Decision Tree RMSE: {rmse:,.2f}")
print(f"Decision Tree R²: {r2:.3f}")

# dt_model = DecisionTreeRegressor()
# dt_pipe = Pipeline(steps=[
#                       ('zip_encoder', ZipTargetEncoder()),
#                       ('preprocessor', preprocessor),
#                       ('dt', dt_model),
#                       ])

# max_depth = range(1, 20)
# for a in max_depth:
#     dt_pipe.set_params(dt__max_depth=a)
#     dt_pipe.fit(X_train, y_train)
#     pred = dt_pipe.predict(X_test)
#     rmse = np.sqrt(mean_squared_error(y_test, pred))
#     mae = mean_absolute_error(y_test, pred)
#     print(f"max_depth: {a}, RMSE: {rmse:.2f} --- MAE: {mae:.2f} --- R²: {dt_pipe.score(X_test, y_test):.2f}")

print("Training Decision Tree Model Finished.")

print("Training Random Forest Model...")

# 1) مدل پایه
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# 2) پایپ‌لاین
rf_pipe = Pipeline(
    steps=[
        ("zip_encoder", ZipTargetEncoder()),
        ("preprocess", preprocessor),
        ("model", rf),
    ]
)

# 3) فضای جستجو (Hyperparameter Space)
param_dist = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [None, 5, 10, 20, 30, 50],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 4, 8],
    "model__max_features": ["sqrt", "log2", None],
    "model__bootstrap": [True, False],
}

# 4) Randomized Search
search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=param_dist,
    n_iter=30,                 # بیشترش بهتر، ولی کندتر
    cv=5,
    scoring="neg_mean_squared_error",  # سازگار با نسخه‌های قدیمی
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# 5) اجرا
search.fit(X_train, y_train)

print("Best Params:")
print(search.best_params_)

# 6) بهترین مدل
best_model = search.best_estimator_

# 7) ارزیابی روی test
pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, pred))
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

print(f"Tuned RF RMSE: {rmse:,.2f}")
print(f"Tuned RF MAE: {mae:,.2f}")
print(f"Tuned RF R²:  {r2:.3f}")


print("Training Random Forest Model Finished.")

print("Training XGBoost Model...")

import numpy as np
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# مدل پایه XGBoost
xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"  # سریع‌تر (روی خیلی از سیستم‌ها بهتره)
)

xgb_pipe = Pipeline(
    steps=[
        ("zip_encoder", ZipTargetEncoder()),
        ("preprocess", preprocessor),
        ("model", xgb),
    ]
)

param_dist = {
    "model__n_estimators": [300, 600, 900, 1200],
    "model__learning_rate": [0.01, 0.03, 0.05, 0.1, 0.2],
    "model__max_depth": [3, 4, 5, 6, 8, 10],
    "model__min_child_weight": [1, 3, 5, 10],
    "model__subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "model__colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
    "model__gamma": [0, 0.05, 0.1, 0.2, 0.5],
    "model__reg_alpha": [0, 0.01, 0.1, 1.0, 10.0],
    "model__reg_lambda": [0.5, 1.0, 2.0, 5.0, 10.0],
}

search = RandomizedSearchCV(
    estimator=xgb_pipe,
    param_distributions=param_dist,
    n_iter=30,                    # اگر زمان داشتی بکن 50 یا 80
    cv=5,
    scoring="neg_mean_squared_error",  # سازگار با نسخه‌های قدیمی sklearn
    random_state=42,
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train)

print("Best Params:")
print(search.best_params_)

best_xgb = search.best_estimator_

pred = best_xgb.predict(X_test)

mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

print(f"Tuned XGBoost RMSE: {rmse:,.2f}")
print(f"Tuned XGBoost MAE:  {mae:,.2f}")
print(f"Tuned XGBoost R²:   {r2:.3f}")

print("Training XGBoost Model Finished.")

Training Ridge Regression Model...
{'model__alpha': np.float64(0.002329951810515372)}
Tuned RF RMSE: 201,174.41
Tuned RF MAE: 113,750.73
Tuned RF R²:  0.732
Training Ridge Regression Model Finished.
Training Decision Tree Model...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best params: {'model__min_samples_split': 10, 'model__min_samples_leaf': 5, 'model__max_features': None, 'model__max_depth': 50, 'model__criterion': 'friedman_mse'}
Decision Tree RMSE: 229,691.24
Decision Tree R²: 0.651
Training Decision Tree Model Finished.
Training Random Forest Model...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Params:
{'model__n_estimators': 800, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': None, 'model__bootstrap': False}
Tuned RF RMSE: 209,394.89
Tuned RF MAE: 105,645.06
Tuned RF R²:  0.710
Training Random Forest Model Finished.
Training XGBoost Model...
Fitting 5 folds for each of 30 