In [2]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/MLOps/Day 4/Housing.csv")
print(df.head())

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


Identify feature type

In [3]:
#'Price' is the target, rest are features
numeric_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
categorical_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'furnishingstatus']
x = df.drop('price', axis=1)
y = df['price']

Build Preprocessing Transformer

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Numeric pipleines: fill missing and scale feature
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

#Categorical pipeliines: fill then OneHot encode
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both into a column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

Build and Combine Model Pipeline

In [5]:
from sklearn.ensemble import RandomForestRegressor

pipelines = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

Data Splitting

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Tune Hyperparameter with GridSearchCV

In [7]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [5, 10, None],
    'regressor__min_samples_split': [2,5]
}

grid = GridSearchCV(pipelines, param_grid, cv=5, n_jobs=-1)
scoring = 'neg_mean_squared_error'
grid.fit(x_train, y_train)

print(f"Best parameters: {grid.best_params_}")
print("Best RMSE (CV):", (-grid.best_score_)**0.5)


Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 100}
Best RMSE (CV): nan


  print("Best RMSE (CV):", (-grid.best_score_)**0.5)


Evaluate Best Pipeline

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

best_pipeline = grid.best_estimator_
y_pred = best_pipeline.predict(x_test)

print("Test RMSE:", mean_squared_error(y_test, y_pred)**0.5)
print("Test R2:", r2_score(y_test, y_pred))


Test RMSE: 1421865.8390219533
Test R2: 0.6000248555980363


Save the Best Model

In [10]:
from joblib import dump

dump(best_pipeline, 'house_price_pipeline.pkl')
print("Saved to house_price_pipeline.pkl")


Saved to house_price_pipeline.pkl


To predict on new data

In [None]:
from joblib import load
pipeline = load('house_price_pipeline.pkl')
# new_x = pd.DataFrame([{
#     "area": 3000,
#     "bedrooms": 3,
#     "bathrooms": 2,
#     "stories": 2,
#     "mainroad": "yes",
#     "guestroom": "no",
#     "basement": "yes",
#     "hotwaterheating": "no",
#     "airconditioning": "yes",
#     "parking": 2,
#     "prefarea": "yes",
#     "furnishingstatus": "semi-furnished"
# }])

y_new = pipeline.predict(new_x)
print(y_new)
