## imports and other stuff

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# For MLflow tracking
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Use both environment variables and explicit authentication
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/qetibakh/ML-Homework1.mlflow/"
os.environ["MLFLOW_TRACKING_USERNAME"] = "qetibakh"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "5d5729711347a10a69c47eea029df715cc67e1ce"

# Test connection
print("MLflow tracking URI:", mlflow.get_tracking_uri())

# Load data
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Save original copies for reference
train_original = train_data.copy()
test_original = test_data.copy()

MLflow tracking URI: https://dagshub.com/qetibakh/ML-Homework1.mlflow/
Training data shape: (1460, 81)
Test data shape: (1459, 80)


## cleaning

In [48]:
# Start MLflow run for cleaning
if mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name="data_cleaning")

# Function to log cleaning steps
def log_cleaning_step(description, affected_columns, technique):
    mlflow.log_param(f"cleaning_step_{description}_columns", str(affected_columns))
    mlflow.log_param(f"cleaning_step_{description}_technique", technique)

# Combine train and test for preprocessing
all_data = pd.concat([train_data.drop('SalePrice', axis=1), test_data])
print(f"Combined data shape: {all_data.shape}")

# Identify numerical and categorical features
num_features = all_data.select_dtypes(include=[np.number]).columns
cat_features = all_data.select_dtypes(include=['object']).columns

# Handle missing numerical data
numerical_na_cols = all_data[num_features].isnull().sum()[all_data[num_features].isnull().sum() > 0].index.tolist()
print(f"Numerical features with missing values: {numerical_na_cols}")

for col in numerical_na_cols:
    # Custom handling for each column with missing values
    if "garage" in col.lower() or "bsmt" in col.lower():
        # These are likely 0 if missing (no garage/basement)
        all_data[col].fillna(0, inplace=True)
        log_cleaning_step(f"fill_na_{col}", col, "fill_with_zero_as_feature_absent")
    else:
        # Fill with median for other numerical features
        all_data[col].fillna(all_data[col].median(), inplace=True)
        log_cleaning_step(f"fill_na_{col}", col, "fill_with_median")

# Handle missing categorical data
categorical_na_cols = all_data[cat_features].isnull().sum()[all_data[cat_features].isnull().sum() > 0].index.tolist()
print(f"Categorical features with missing values: {categorical_na_cols}")

for col in categorical_na_cols:
    if col in ["Alley", "PoolQC", "Fence", "MiscFeature"]:
        # These are often NA because the feature doesn't exist
        all_data[col].fillna("None", inplace=True)
        log_cleaning_step(f"fill_na_{col}", col, "fill_with_None_as_feature_absent")
    elif "garage" in col.lower() or "bsmt" in col.lower():
        # If these are missing, likely no garage/basement
        all_data[col].fillna("None", inplace=True)
        log_cleaning_step(f"fill_na_{col}", col, "fill_with_None_as_feature_absent")
    else:
        # For other categorical features, use mode
        all_data[col].fillna(all_data[col].mode()[0], inplace=True)
        log_cleaning_step(f"fill_na_{col}", col, "fill_with_mode")

# Check for remaining missing values
missing_after = all_data.isnull().sum().sum()
if missing_after > 0:
    print("Columns with remaining missing values:")
    print(all_data.isnull().sum()[all_data.isnull().sum() > 0])

    # Fill any remaining NaN values with appropriate values
    # For numeric columns, use median
    num_cols = all_data.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        if all_data[col].isnull().sum() > 0:
            all_data[col] = all_data[col].fillna(all_data[col].median())
            print(f"Filled NaN values in numeric column {col} with median")

    # For categorical columns, use mode
    cat_cols = all_data.select_dtypes(include=['object']).columns
    for col in cat_cols:
        if all_data[col].isnull().sum() > 0:
            all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
            print(f"Filled NaN values in categorical column {col} with mode")

# Final verification
assert all_data.isnull().sum().sum() == 0, "There are still missing values after cleaning!"
print("All missing values have been handled successfully.")

# End cleaning MLflow run
mlflow.end_run()

Combined data shape: (2919, 80)
Numerical features with missing values: ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']
Categorical features with missing values: ['MSZoning', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']
All missing values have been handled successfully.
🏃 View run data_cleaning at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0/runs/f1ba8f56753e409ea55373c418e3bd7e
🧪 View experiment at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0


## feature engineering

In [43]:
# Start MLflow run for feature engineering
if mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name="feature_engineering")

# Function to log feature engineering steps
def log_feature_eng_step(description, technique, affected_columns=None, created_columns=None):
    if affected_columns:
        mlflow.log_param(f"fe_step_{description}_affected", str(affected_columns))
    if created_columns:
        mlflow.log_param(f"fe_step_{description}_created", str(created_columns))
    mlflow.log_param(f"fe_step_{description}_technique", technique)

# Transform skewed numerical features
numeric_features = all_data.select_dtypes(include=[np.number]).columns

# Calculate skewness for each numerical feature
from scipy import stats
skewed_features = all_data[numeric_features].apply(lambda x: stats.skew(x.dropna())).sort_values(ascending=False)
high_skew_features = skewed_features[skewed_features > 0.5].index

print(f"Number of highly skewed features: {len(high_skew_features)}")
mlflow.log_param("num_skewed_features", len(high_skew_features))

# Apply log transformation to highly skewed features
for feature in high_skew_features:
    # Adding 1 to avoid log(0)
    all_data[feature] = np.log1p(all_data[feature])

log_feature_eng_step("log_transform_skewed", "log1p_transform", affected_columns=high_skew_features.tolist())

# Encode categorical variables
# Ordinal encoding for quality and condition features
ordinal_quality_mapping = {
    'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0, 'NA': 0
}

quality_cols = [col for col in all_data.columns if 'Qual' in col or 'Cond' in col]
quality_cols = [col for col in quality_cols if col in cat_features]

for col in quality_cols:
    all_data[col + '_Encoded'] = all_data[col].map(ordinal_quality_mapping)

log_feature_eng_step("ordinal_encoding_quality", "ordinal_mapping",
                    affected_columns=quality_cols,
                    created_columns=[col + '_Encoded' for col in quality_cols])

# One-hot encoding for other categorical features
categorical_cols = [col for col in cat_features if col not in quality_cols]
print(f"Number of categorical columns to one-hot encode: {len(categorical_cols)}")

# Apply one-hot encoding
all_data_encoded = pd.get_dummies(all_data, columns=categorical_cols, drop_first=True)
print(f"Shape after one-hot encoding: {all_data_encoded.shape}")

log_feature_eng_step("one_hot_encoding", "pd_get_dummies",
                    affected_columns=categorical_cols,
                    created_columns=[col for col in all_data_encoded.columns if col not in all_data.columns])

# Create new features
# Total square footage
all_data_encoded['TotalSF'] = all_data_encoded['1stFlrSF'] + all_data_encoded['2ndFlrSF'] + all_data_encoded['TotalBsmtSF']
log_feature_eng_step("total_square_feet", "sum_areas",
                    affected_columns=['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'],
                    created_columns=['TotalSF'])

# Total bathrooms
all_data_encoded['TotalBath'] = all_data_encoded['FullBath'] + 0.5 * all_data_encoded['HalfBath'] + \
                               all_data_encoded['BsmtFullBath'] + 0.5 * all_data_encoded['BsmtHalfBath']
log_feature_eng_step("total_bathrooms", "weighted_sum",
                    affected_columns=['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'],
                    created_columns=['TotalBath'])

# House age and when it was remodeled
all_data_encoded['Age'] = all_data_encoded['YrSold'] - all_data_encoded['YearBuilt']
all_data_encoded['Remodeled'] = (all_data_encoded['YearRemodAdd'] != all_data_encoded['YearBuilt']).astype(int)
all_data_encoded['RemodAge'] = all_data_encoded['YrSold'] - all_data_encoded['YearRemodAdd']
log_feature_eng_step("age_features", "year_differences",
                    affected_columns=['YrSold', 'YearBuilt', 'YearRemodAdd'],
                    created_columns=['Age', 'Remodeled', 'RemodAge'])

# Log features created
mlflow.log_param("num_features_before", all_data.shape[1])
mlflow.log_param("num_features_after", all_data_encoded.shape[1])
mlflow.log_param("new_features_created", all_data_encoded.shape[1] - all_data.shape[1])

# End feature engineering MLflow run
mlflow.end_run()

Number of highly skewed features: 26
Number of categorical columns to one-hot encode: 33
Shape after one-hot encoding: (2919, 230)
🏃 View run feature_engineering at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0/runs/8f979e457b224b8b88927596f3474b3a
🧪 View experiment at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0


## feature selection

In [44]:
# Start MLflow run for feature selection
if mlflow.active_run():
    mlflow.end_run()

mlflow.start_run(run_name="feature_selection")

# Let's first check what's happening with the indices
print(f"Original train data shape: {train_original.shape}")
print(f"Original train index range: {min(train_original.index)} to {max(train_original.index)}")
print(f"all_data_encoded shape: {all_data_encoded.shape}")
print(f"all_data_encoded index range: {min(all_data_encoded.index)} to {max(all_data_encoded.index)}")

# We need to understand the difference between train_original.index and train_idx
train_idx = train_original.index

# A better approach - use integer positions since we know train data comes first
train_rows = train_original.shape[0]
X_train_full = all_data_encoded.iloc[:train_rows].copy()
X_test = all_data_encoded.iloc[train_rows:].copy()
y_train = np.log1p(train_original['SalePrice'])  # Log transform for better model performance

# Verify the shapes are correct
print(f"X_train_full shape: {X_train_full.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
assert X_train_full.shape[0] == y_train.shape[0], "X and y have different number of samples!"

# Check for categorical columns that might cause issues
categorical_columns = X_train_full.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:
    print(f"Found categorical columns that need handling: {categorical_columns.tolist()}")
    # Create a numeric-only version of the data
    X_train_full_numeric = X_train_full.select_dtypes(include=[np.number])
    print(f"Using {X_train_full_numeric.shape[1]} numeric features for all feature selection methods")
else:
    X_train_full_numeric = X_train_full
    print("All features are already numeric")

# Check for and handle NaN values
if X_train_full_numeric.isnull().values.any():
    print(f"Found NaN values in numeric data.")
    # Count NaN values by column
    nan_counts = X_train_full_numeric.isnull().sum()
    columns_with_nan = nan_counts[nan_counts > 0]
    print(f"Columns with NaN values:\n{columns_with_nan}")

    # Identify columns that are entirely NaN
    all_nan_columns = [col for col in X_train_full_numeric.columns
                      if X_train_full_numeric[col].isnull().all()]

    if all_nan_columns:
        print(f"These columns are entirely NaN and will be dropped: {all_nan_columns}")
        X_train_full_numeric = X_train_full_numeric.drop(columns=all_nan_columns)

    # For the remaining columns with some NaN values, fill with median
    for col in X_train_full_numeric.columns:
        if X_train_full_numeric[col].isnull().any():
            X_train_full_numeric[col] = X_train_full_numeric[col].fillna(X_train_full_numeric[col].median())
            print(f"Filled NaN values in column {col} with median")

    # Verify no NaN values remain
    assert not X_train_full_numeric.isnull().values.any(), "NaN values still present after filling"
    print("All NaN values have been handled")

# Correlation with target variable
correlation_method = {}

# Add SalePrice back for correlation calculation
X_train_with_target = X_train_full_numeric.copy()
X_train_with_target['SalePrice'] = train_original['SalePrice']

# Calculate correlation using only numeric data
correlation = X_train_with_target.corr()['SalePrice'].sort_values(ascending=False)

# Select features with absolute correlation > threshold
thresholds = [0.1, 0.2, 0.3]
for threshold in thresholds:
    corr_selected = correlation[abs(correlation) > threshold].index.tolist()
    corr_selected.remove('SalePrice')  # Remove target
    correlation_method[f"corr_{threshold}"] = corr_selected
    print(f"Selected {len(corr_selected)} features using correlation threshold {threshold}")
    mlflow.log_param(f"num_features_corr_{threshold}", len(corr_selected))

# Lasso feature selection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Use numeric-only data for Lasso
lasso_selector = SelectFromModel(Lasso(alpha=0.005, random_state=42))
lasso_selector.fit(X_train_full_numeric, y_train)

lasso_selected = X_train_full_numeric.columns[lasso_selector.get_support()].tolist()
print(f"Selected {len(lasso_selected)} features using Lasso")
mlflow.log_param("num_features_lasso", len(lasso_selected))

# Random Forest feature importance
from sklearn.ensemble import RandomForestRegressor

# Use numeric-only data for Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_full_numeric, y_train)

# Get feature importances
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Select top features by importance
rf_selected_top30 = X_train_full_numeric.columns[indices[:min(30, len(indices))]].tolist()
rf_selected_top50 = X_train_full_numeric.columns[indices[:min(50, len(indices))]].tolist()

print(f"Selected top 30 features using Random Forest importance")
print(f"Selected top 50 features using Random Forest importance")
mlflow.log_param("num_features_rf_top30", len(rf_selected_top30))
mlflow.log_param("num_features_rf_top50", len(rf_selected_top50))

# Create different feature sets for experimentation
feature_sets = {
    'all_features_numeric': X_train_full_numeric.columns.tolist(),
    'corr_0.1': correlation_method['corr_0.1'],
    'corr_0.3': correlation_method['corr_0.3'],
    'lasso_selected': lasso_selected,
    'rf_top30': rf_selected_top30,
    'rf_top50': rf_selected_top50
}

# Log feature sets
for set_name, features in feature_sets.items():
    mlflow.log_param(f"feature_set_{set_name}_count", len(features))

# End feature selection MLflow run
mlflow.end_run()

Original train data shape: (1460, 81)
Original train index range: 0 to 1459
all_data_encoded shape: (2919, 235)
all_data_encoded index range: 0 to 1459
X_train_full shape: (1460, 235)
y_train shape: (1460,)
X_test shape: (1459, 235)
Found categorical columns that need handling: ['Condition1', 'Condition2', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'GarageQual', 'GarageCond', 'SaleCondition']
Using 52 numeric features for all feature selection methods
Found NaN values in numeric data.
Columns with NaN values:
Condition1_Encoded       1460
Condition2_Encoded       1460
SaleCondition_Encoded    1460
dtype: int64
These columns are entirely NaN and will be dropped: ['Condition1_Encoded', 'Condition2_Encoded', 'SaleCondition_Encoded']
All NaN values have been handled
Selected 35 features using correlation threshold 0.1
Selected 29 features using correlation threshold 0.2
Selected 23 features using correlation threshold 0.3
Selected 23 features using Lasso
Selected top 

## training

In [45]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import numpy as np

# Define evaluation function
def rmse_cv(model, X, y, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y,
                                   scoring="neg_mean_squared_error",
                                   cv=kf))
    return rmse

# Train models on different feature sets
for set_name, features in feature_sets.items():
    print(f"\nTraining models using feature set: {set_name} with {len(features)} features")

    # Prepare this feature set
    X = X_train_full[features]

    # Start MLflow run for this feature set
    with mlflow.start_run(run_name=f"model_training_{set_name}"):
        mlflow.log_param("feature_set", set_name)
        mlflow.log_param("num_features", len(features))

        # Linear Regression (baseline)
        with mlflow.start_run(run_name="linear_regression", nested=True):
            lr = LinearRegression()
            rmse = rmse_cv(lr, X, y_train).mean()
            print(f"Linear Regression RMSE: {rmse:.4f}")

            mlflow.log_param("model_type", "LinearRegression")
            mlflow.log_metric("rmse_cv", rmse)

            # Fit on full training data
            lr.fit(X, y_train)
            mlflow.sklearn.log_model(lr, "model", input_example=X.iloc[:5])

            # Check for overfitting
            train_pred = lr.predict(X)
            train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
            mlflow.log_metric("train_rmse", train_rmse)
            mlflow.log_metric("overfitting_ratio", train_rmse/rmse)

            print(f"  Train RMSE: {train_rmse:.4f}")
            print(f"  Overfitting ratio: {train_rmse/rmse:.4f}")

        # Random Forest with different parameters
        # Try different values for n_estimators and max_depth
        n_estimators_list = [50, 100, 200]
        max_depth_list = [None, 10, 20, 30]

        for n_est in n_estimators_list:
            for max_d in max_depth_list:
                with mlflow.start_run(run_name=f"rf_nest_{n_est}_depth_{max_d}", nested=True):
                    rf = RandomForestRegressor(n_estimators=n_est, max_depth=max_d, random_state=42)
                    rmse = rmse_cv(rf, X, y_train).mean()
                    print(f"Random Forest (n_est={n_est}, max_depth={max_d}) RMSE: {rmse:.4f}")

                    mlflow.log_param("model_type", "RandomForest")
                    mlflow.log_param("n_estimators", n_est)
                    mlflow.log_param("max_depth", max_d)
                    mlflow.log_metric("rmse_cv", rmse)

                    # Fit on full training data
                    rf.fit(X, y_train)
                    mlflow.sklearn.log_model(rf, "model", input_example=X.iloc[:5])

                    # Check for overfitting
                    train_pred = rf.predict(X)
                    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
                    mlflow.log_metric("train_rmse", train_rmse)
                    mlflow.log_metric("overfitting_ratio", train_rmse/rmse)

                    print(f"  Train RMSE: {train_rmse:.4f}")
                    print(f"  Overfitting ratio: {train_rmse/rmse:.4f}")

        # XGBoost with different parameters
        # Try different learning rates, max_depths, and n_estimators
        learning_rates = [0.01, 0.05, 0.1]
        max_depths = [3, 5, 7]
        n_estimators = [100, 200]

        for lr in learning_rates:
            for depth in max_depths:
                for n_est in n_estimators:
                    with mlflow.start_run(run_name=f"xgb_lr_{lr}_depth_{depth}_nest_{n_est}", nested=True):
                        xgb_model = xgb.XGBRegressor(
                            objective='reg:squarederror',
                            learning_rate=lr,
                            max_depth=depth,
                            n_estimators=n_est,
                            random_state=42
                        )

                        rmse = rmse_cv(xgb_model, X, y_train).mean()
                        print(f"XGBoost (lr={lr}, depth={depth}, n_est={n_est}) RMSE: {rmse:.4f}")

                        mlflow.log_param("model_type", "XGBoost")
                        mlflow.log_param("learning_rate", lr)
                        mlflow.log_param("max_depth", depth)
                        mlflow.log_param("n_estimators", n_est)
                        mlflow.log_metric("rmse_cv", rmse)

                        # Fit on full training data
                        xgb_model.fit(X, y_train)
                        mlflow.xgboost.log_model(xgb_model, "model", input_example=X.iloc[:5])

                        # Check for overfitting
                        train_pred = xgb_model.predict(X)
                        train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
                        mlflow.log_metric("train_rmse", train_rmse)
                        mlflow.log_metric("overfitting_ratio", train_rmse/rmse)

                        print(f"  Train RMSE: {train_rmse:.4f}")
                        print(f"  Overfitting ratio: {train_rmse/rmse:.4f}")

# More detailed hyperparameter tuning for XGBoost
# We'll assume 'rf_top50' was one of the better feature sets
best_features = feature_sets['rf_top50']
X_best = X_train_full[best_features]

with mlflow.start_run(run_name="xgboost_hyperparameter_tuning"):
    mlflow.log_param("feature_set", "rf_top50")
    mlflow.log_param("model_type", "XGBoost")

    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }

    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    grid_search = GridSearchCV(estimator=xgb_model,
                               param_grid=param_grid,
                               scoring='neg_mean_squared_error',
                               cv=5,
                               verbose=1)

    grid_search.fit(X_best, y_train)

    # Log best parameters and score
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best RMSE: {np.sqrt(-grid_search.best_score_):.4f}")

    for param, value in grid_search.best_params_.items():
        mlflow.log_param(f"best_{param}", value)

    mlflow.log_metric("best_rmse", np.sqrt(-grid_search.best_score_))

    # Log best model
    best_model = grid_search.best_estimator_
    mlflow.xgboost.log_model(best_model, "best_model")

    # Check for overfitting on the best model
    train_pred = best_model.predict(X_best)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("overfitting_ratio", train_rmse/np.sqrt(-grid_search.best_score_))

    print(f"Best model train RMSE: {train_rmse:.4f}")
    print(f"Best model overfitting ratio: {train_rmse/np.sqrt(-grid_search.best_score_):.4f}")

    # Register model in MLflow Model Registry
    mlflow.xgboost.log_model(
        best_model,
        "best_model",
        input_example=X_best.iloc[:5],
        registered_model_name="house_price_xgboost"
    )


Training models using feature set: all_features_numeric with 49 features
Linear Regression RMSE: 0.1411
  Train RMSE: 0.1320
  Overfitting ratio: 0.9361
🏃 View run linear_regression at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0/runs/c119bfe1915140e7b1d613e38ec918ab
🧪 View experiment at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0
Random Forest (n_est=50, max_depth=None) RMSE: 0.1469
  Train RMSE: 0.0573
  Overfitting ratio: 0.3899
🏃 View run rf_nest_50_depth_None at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0/runs/aabd007194074bf3a18afb48cc5ddcf7
🧪 View experiment at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0
Random Forest (n_est=50, max_depth=10) RMSE: 0.1474
  Train RMSE: 0.0676
  Overfitting ratio: 0.4587
🏃 View run rf_nest_50_depth_10 at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0/runs/657eb27d58b2421699a88cb8bf2135b6
🧪 View experiment at: https://dagshub.com/qetibakh/ML-



Best model train RMSE: 0.0549
Best model overfitting ratio: 0.4164


Registered model 'house_price_xgboost' already exists. Creating a new version of this model...
2025/04/10 16:25:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: house_price_xgboost, version 2
Created version '2' of model 'house_price_xgboost'.


🏃 View run xgboost_hyperparameter_tuning at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0/runs/71b58a3c56f14349b6309a50df5f0c3e
🧪 View experiment at: https://dagshub.com/qetibakh/ML-Homework1.mlflow/#/experiments/0
