In [1]:
import os
import warnings
from math import sqrt

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings and library
# data warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Location of the training CSV. The path can be overridden through the
# TRAIN_CSV_PATH environment variable so the notebook can run on machines other
# than the author's; when the variable is unset the original absolute path is
# used, so existing behaviour is unchanged.
file_path_train = os.environ.get(
    'TRAIN_CSV_PATH',
    r'C:\Users\asus\Documents\GitHub\tugas_day26\artifacts\train.csv',
)

# load the dataset using pandas
df = pd.read_csv(file_path_train)

# display the first few rows of the dataset
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Columns treated as numeric: missing values are filled with 0 and the columns
# are min-max scaled later in the notebook.
numeric_features = [
    # lot
    'LotFrontage', 'LotArea',
    # years and veneer area
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    # basement areas
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    # floor / living areas
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    # bathrooms
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    # rooms
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    # garage
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    # porches, deck, pool
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea',
    # misc and sale date
    'MiscVal', 'MoSold', 'YrSold',
]


# Columns treated as categorical: each one is label-encoded to integer codes
# later in the notebook.
categorical_features = [
    'MSSubClass', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
    'OverallQual', 'OverallCond',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'CentralAir',
    'KitchenQual', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'Functional',
    'MSZoning', 'Street', 'Alley', 'LotConfig', 'Neighborhood',
    'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'Heating', 'Electrical', 'GarageType', 'MiscFeature',
    'SaleType', 'SaleCondition',
]

In [4]:
# Replace missing values in the numeric feature columns with 0, then preview
# the frame.
df[numeric_features] = df[numeric_features].fillna(value=0)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Drop the 'Id' column. errors='ignore' keeps the cell re-runnable: once 'Id'
# is gone, running the cell again is a no-op instead of raising KeyError.
# (The former axis=1 was redundant next to columns=.)
df = df.drop(columns='Id', errors='ignore')
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Convert categorical variables to integer codes.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`,
# so each column's label -> code mapping stays available afterwards (for
# inverse_transform, or for encoding new data the same way). Previously a
# single encoder was refitted for every column, which discarded all mappings
# except the last one.
label_encoders = {}

for col in categorical_features:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Backward compatibility: `le` used to end up as the encoder fitted on the last
# categorical column (or an unfitted encoder if the list was empty).
le = label_encoders[categorical_features[-1]] if categorical_features else LabelEncoder()

In [7]:
# Preview the frame now that the categorical columns hold integer codes.
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,5,3,65.0,8450,1,2,3,3,0,4,...,0,3,4,4,0,2,2008,8,4,208500
1,0,3,80.0,9600,1,2,3,3,0,2,...,0,3,4,4,0,5,2007,8,4,181500
2,5,3,68.0,11250,1,2,0,3,0,4,...,0,3,4,4,0,9,2008,8,4,223500
3,6,3,60.0,9550,1,2,0,3,0,0,...,0,3,4,4,0,2,2006,8,0,140000
4,5,3,84.0,14260,1,2,0,3,0,2,...,0,3,4,4,0,12,2008,8,4,250000


In [8]:
# Name of the column the models learn to predict.
target_column_name = "SalePrice"

# Feature matrix: every column of df except the target.
X = df.drop(columns=[target_column_name])

# Target vector: base-10 logarithm of the target column, so the models are
# fitted on log10 values rather than on the raw values.
y = np.log10(df[target_column_name])

In [10]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Work on explicit copies: the frames returned by train_test_split are row
# selections of X, and assigning columns into them directly can trigger pandas'
# SettingWithCopyWarning (currently hidden by the global warning filter).
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features with a min-max scaler. The scaler is fitted on the
# training rows only and then applied unchanged to the test rows, so the test
# set does not influence the scaling.
minmax = MinMaxScaler()
X_train[numeric_features] = minmax.fit_transform(X_train[numeric_features])
X_test[numeric_features] = minmax.transform(X_test[numeric_features])

In [12]:
# Write the training features to 'X_train.csv' (relative to the current working
# directory), without the row index.
X_train.to_csv('X_train.csv', index=False)

In [11]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,0,3,0.223642,0.033186,1,2,3,3,0,4,...,0.0,0.0,3,4,4,0.0,0.454545,1.0,8,4
1066,5,3,0.188498,0.030555,1,2,0,3,0,4,...,0.0,0.0,3,4,4,0.0,0.363636,0.75,8,4
638,1,3,0.214058,0.034948,1,2,3,3,0,4,...,0.0,0.0,3,2,4,0.0,0.363636,0.5,8,4
799,4,3,0.191693,0.027577,1,2,3,3,0,0,...,0.0,0.0,3,2,4,0.0,0.454545,0.25,8,4
380,4,3,0.159744,0.017294,1,1,3,3,0,4,...,0.0,0.0,3,4,4,0.0,0.363636,1.0,8,4


In [12]:
# Point MLflow at a local SQLite database file ('mlflow.db', relative to the
# working directory) and select the experiment that the runs are logged under.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('houseprice_prediction')

def evaluate_model(y_true, y_pred):
    """Score predictions against the true values.

    Args:
        y_true: true target values.
        y_pred: predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys 'RMSE' (square root of the mean squared error),
        'MAE' (mean absolute error) and 'R2 Score', in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {}
    scores['RMSE'] = sqrt(mse)
    scores['MAE'] = mean_absolute_error(y_true, y_pred)
    scores['R2 Score'] = r2_score(y_true, y_pred)
    return scores


2025/01/19 14:10:06 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/01/19 14:10:06 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [13]:
# Candidate regressors. Each entry pairs a default-constructed estimator with
# the hyperparameters that are applied to it via set_params() at training time.
models = {
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': dict(
            max_depth=10,  # example value, can be adjusted
            min_samples_split=2,
            random_state=42,
        ),
    },
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': dict(
            n_estimators=100,
            max_depth=10,
            random_state=42,
        ),
    },
    'xgboost': {
        'model': XGBRegressor(),
        'params': dict(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
            objective='reg:squarederror',  # explicit regression objective
        ),
    },
}


In [14]:
# Train and evaluate every candidate, one MLflow run per model.
for model_name, model_info in models.items():
    with mlflow.start_run(run_name=model_name):
        # Record the hyperparameters for this run.
        mlflow.log_params(model_info['params'])

        # Apply the hyperparameters to the estimator stored in `models`
        # (it is configured and fitted in place) and train it.
        model = model_info['model']
        model.set_params(**model_info['params'])
        model.fit(X_train, y_train)

        # Score the held-out rows and log the resulting metrics.
        y_pred = model.predict(X_test)
        metrics = evaluate_model(y_test, y_pred)
        mlflow.log_metrics(metrics)

        # Store the fitted model as a run artifact and register it under
        # "houseprice_<model_name>".
        mlflow.sklearn.log_model(
            model,
            model_name,
            registered_model_name=f"houseprice_{model_name}",
        )

        # Print a short report: a header line, then one line per metric.
        print(f"\nModel: {model_name}")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value:.4f}")

Successfully registered model 'houseprice_decision_tree'.
Created version '1' of model 'houseprice_decision_tree'.



Model: decision_tree
RMSE: 0.0803
MAE: 0.0590
R2 Score: 0.8167


Successfully registered model 'houseprice_random_forest'.
Created version '1' of model 'houseprice_random_forest'.



Model: random_forest
RMSE: 0.0640
MAE: 0.0430
R2 Score: 0.8837


Successfully registered model 'houseprice_xgboost'.
Created version '1' of model 'houseprice_xgboost'.



Model: xgboost
RMSE: 0.0586
MAE: 0.0402
R2 Score: 0.9026


In [15]:
# Transition the best model to production
from mlflow.tracking import MlflowClient

def load_production_model(model_name):
    """Load the Production-stage version of a registered house-price model.

    Args:
        model_name: short model key (e.g. 'xgboost'); the registry name that is
            looked up is ``houseprice_<model_name>``.

    Returns:
        Whatever ``mlflow.pyfunc.load_model`` returns for the URI
        ``models:/houseprice_<model_name>/Production``.
    """
    production_uri = f"models:/houseprice_{model_name}/Production"
    return mlflow.pyfunc.load_model(model_uri=production_uri)



# Module-level MLflow client. NOTE(review): transition_model_to_production
# builds its own client, so nothing in the visible cells reads this one.
client = MlflowClient()

def transition_model_to_production(model_name):
    """Move the latest unstaged version of a registered model to Production.

    Args:
        model_name: short model key (e.g. 'xgboost'); the registry name used is
            ``houseprice_<model_name>``.

    Returns:
        None.

    Raises:
        IndexError: if the registered model has no version in stage "None",
            for example because the version was already promoted by an earlier
            call. (Same exception type as before, now with an explanatory
            message instead of a bare "list index out of range".)
    """
    registered_name = f"houseprice_{model_name}"
    # A client local to the call keeps the function independent of notebook globals.
    client = MlflowClient()

    unstaged_versions = client.get_latest_versions(registered_name, stages=["None"])
    if not unstaged_versions:
        raise IndexError(
            f"No version of '{registered_name}' is in stage 'None'; "
            "nothing to transition to Production."
        )

    latest_version = unstaged_versions[0]
    client.transition_model_version_stage(
        name=registered_name,
        version=latest_version.version,
        stage="Production",
    )

In [16]:
# Example: Transition the best performing model to production
# Note: You should choose the best model based on your evaluation metrics
# ('xgboost' is hard-coded here).
# NOTE(review): the function promotes a version that is still in stage "None";
# re-running this cell without registering a new version presumably fails with
# IndexError because no such version is left — confirm before re-running.
transition_model_to_production('xgboost')

In [17]:
def get_all_runs(experiment_name='houseprice_prediction'):
    """Fetch the runs logged under an MLflow experiment.

    Args:
        experiment_name: name of the experiment to query. Defaults to
            'houseprice_prediction', the experiment this notebook logs to.

    Returns:
        The result of ``mlflow.search_runs`` for that experiment (the notebook
        indexes it as a DataFrame with ``tags.*`` / ``metrics.*`` columns).

    Raises:
        ValueError: if no experiment with that name exists. Previously this
            case surfaced as an AttributeError on ``None``.
    """
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(
            f"MLflow experiment '{experiment_name}' does not exist; "
            "check the tracking URI and experiment name."
        )
    return mlflow.search_runs(experiment_ids=[experiment.experiment_id])

In [19]:
# Fetch the logged runs and keep only the run name plus the three metric
# columns for a side-by-side comparison.
runs_df = get_all_runs()
metrics_comparison = runs_df.loc[
    :,
    ['tags.mlflow.runName', 'metrics.RMSE', 'metrics.MAE', 'metrics.R2 Score'],
]
print("\nModel Performance Comparison:")
print(metrics_comparison)


Model Performance Comparison:
  tags.mlflow.runName  metrics.RMSE  metrics.MAE  metrics.R2 Score
0             xgboost      0.058559     0.040210          0.902575
1       random_forest      0.063975     0.042971          0.883720
2       decision_tree      0.080320     0.058996          0.816712


In [20]:
# Analyze feature importance for the best model (XGBoost).
# The estimator stored in `models` is the same object that was fitted in the
# training loop, so its feature_importances_ are available here.
best_model = models['xgboost']['model']

# One row per feature, sorted from most to least important.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
}).sort_values(by='importance', ascending=False)

# Display top 10 most important features
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
         feature  importance
16   OverallQual    0.238486
59  GarageFinish    0.116719
60    GarageCars    0.107336
40    CentralAir    0.064331
57    GarageType    0.055398
45     GrLivArea    0.053216
19  YearRemodAdd    0.038758
37   TotalBsmtSF    0.035820
29      BsmtQual    0.030437
18     YearBuilt    0.027630
