In [4]:
import pandas as pd
import numpy as np 
import seaborn as sns

# 🏗️ Pipeline Building — Step by Step


In [5]:
# Load your CSV
df = pd.read_csv("Housing.csv")


In [6]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [8]:
df = df.dropna()

### 1️⃣ 🎯 Pick your target & features


In [9]:
target = 'price'

categorical_cols = [
    'mainroad', 'guestroom', 'basement', 
    'hotwaterheating', 'airconditioning', 
    'prefarea', 'furnishingstatus'
]

numeric_cols = [
    'area', 'bedrooms', 'bathrooms', 
    'stories', 'parking'
]


In [10]:
df[categorical_cols].head()

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,yes,no,no,no,yes,yes,furnished
1,yes,no,no,no,yes,no,furnished
2,yes,no,yes,no,no,yes,semi-furnished
3,yes,no,yes,no,yes,yes,furnished
4,yes,yes,yes,no,yes,no,furnished


In [11]:
df[numeric_cols].head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking
0,7420,4,2,3,2
1,8960,4,4,4,3
2,9960,3,2,2,2
3,7500,4,2,2,3
4,7420,4,1,2,2


### 2️⃣ 🛠️ Create preprocessing steps

In [12]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler() # we are scaling num data here 

categorical_transformer = OneHotEncoder(drop='first') # OHE   

preprocessor = ColumnTransformer(
    transformers=[
        # 'num' → a name you give to this transformer (you can call it anything)
        # numeric_transformer → the actual preprocessing pipeline for numeric data (e.g., imputation + scaling)
        # numeric_cols → list of numeric column names from your dataframe
        ('num', numeric_transformer, numeric_cols),

        #Same for 'cat' → categorical pipeline + categorical column names.
        
        ('cat', categorical_transformer, categorical_cols)
    ]
)


### 2️⃣.5️⃣ 📏⚖️ Feature Scaling (inside preprocessor)

In [13]:
from sklearn.feature_selection import SelectKBest, f_regression

# Select top features (you can tune 'k' later)
feature_selector = SelectKBest(score_func=f_regression, k='all')


### 3️⃣ 🤖 Build the pipeline


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression


# 7️⃣ Build pipeline with Linear Regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),       # 2️⃣.5️⃣ Scaling + OHE
    ('feature_select', feature_selector), # 6️⃣ Feature selection
    ('model', LinearRegression())         # 7️⃣ Linear Regression model
])


### 4️⃣ 👀 See the pipeline visually

In [15]:
from sklearn import set_config
set_config(display='diagram')  # 🖼️ Pretty diagram in Jupyter
pipeline


0,1,2
,steps,"[('preprocessor', ...), ('feature_select', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_r...00191B52FC0E0>
,k,'all'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### 5️⃣ ✂️ Train/test split

In [16]:
from sklearn.model_selection import train_test_split, GridSearchCV
X = df.drop(target, axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### 6️⃣ 🏋️ Fit the pipeline

In [17]:
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('feature_select', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_r...00191B52FC0E0>
,k,'all'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


7️⃣ 📊 Evaluate

In [18]:
from sklearn.metrics import r2_score, mean_squared_error
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("R²:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


R²: 0.6529242642153177
RMSE: 1324506.96009144


8️⃣ 🔧 Optional: Hyperparameter Tuning

In [19]:
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, 
                           cv=5, scoring='r2', n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best R²:", grid_search.best_score_)
print("Best params:", grid_search.best_params_)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


ValueError: Invalid parameter 'max_depth' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive', 'tol'].

## 📊 new data on old pipeline 

In [37]:
df2 = pd.read_csv("cal_housing.csv")

In [38]:
df2.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [39]:
print(df2.columns.tolist())


['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']


In [41]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Correct column names from your df2
numeric_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income'
]
categorical_features = ['ocean_proximity']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Full pipeline with HistGradientBoostingRegressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', HistGradientBoostingRegressor(random_state=42))
])

# Prepare data
target = 'median_house_value'
X = df2.drop(columns=[target])
y = df2[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Calculate and print R² and RMSE
print("HistGradientBoosting → R²:", r2_score(y_test, y_pred))
print("HistGradientBoosting → RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

HistGradientBoosting → R²: 0.8251962302630622
HistGradientBoosting → RMSE: 47860.67585122866
