In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold,RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import xgboost as xgb

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('outliers_treated.csv')

In [3]:
df.drop(columns=['Unnamed: 0','price_per_sqft','areaWithType','outliers','area_room_ratio','store room','pooja room','others'],inplace=True)

In [4]:
df['luxury_cat'] = pd.cut(df['luxury_score'],bins=[-1,50,150,175],labels=['low','medium','high'])
df['floorNum_cat'] = pd.cut(df['floorNum'],bins=[-1,3,10,51],labels=['low','medium','high'])

In [5]:
df.drop(columns=['luxury_score','facing','floorNum','society'],inplace=True)

In [6]:
df.loc[df['agePossession'] == 'Undefined','agePossession'] = np.nan

In [7]:
df['furnishing_type'] = df['furnishing_type'].replace({0:'unfurnished',1:'semi_furnished',2:'furnished'})

In [8]:
df['furnishing_type'].value_counts()

furnishing_type
unfurnished       2430
semi_furnished    1026
furnished          193
Name: count, dtype: int64

In [9]:
x = df.drop(columns=['price'])
y = df['price']

In [10]:
y = np.log1p(y)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [12]:
x.isnull().sum()

property_type             0
sector                    0
area                      0
bedRoom                   0
bathroom                  0
balcony                   0
agePossession           315
super_built_up_area    1735
built_up_area          2042
carpet_area            1752
study room                0
servant room              0
furnishing_type           0
luxury_cat                0
floorNum_cat             17
dtype: int64

## Target Encoding with XGBoost

In [24]:
missing_pipe_cat_age = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
   ('Onehot',OneHotEncoder(drop='first',handle_unknown='ignore'))
])

missing_pipe_cat_floor = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
   ('ordinal',OrdinalEncoder())
])

missing_pipe_num = Pipeline([
    ('Missing_Num', SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
])


Target_preprocessor = ColumnTransformer([
    ('missing_pipe_cat_age',missing_pipe_cat_age,['agePossession']),
    ('missing_pipe_cat_floor',missing_pipe_cat_floor,['floorNum_cat']),
    ('missing_pipe_num',missing_pipe_num,[ 'super_built_up_area','built_up_area', 'carpet_area']),
    ('ordinal',OrdinalEncoder(),['balcony','luxury_cat']),
    ('onehot',OneHotEncoder(drop='first', handle_unknown='ignore'), ['furnishing_type','property_type']),
    ('target',ce.TargetEncoder(),['sector']),
    ('scaling',StandardScaler(),['area', 'bedRoom', 'bathroom', 'study room', 'servant room'])
],remainder='passthrough')


model_pipe = Pipeline([

    ('preprocessor',Target_preprocessor),
    ('model',XGBRegressor(random_state=0, n_estimators = 1))    

])

## Early Stopping Rounds with xgb.cv()
### DMatrix does not support object dtypes , so we have to convert all of them to category 
### and we have to enable the enable_categorical=True parameter (Tough work!)

In [None]:
x['property_type'] = x['property_type'].astype('category')
x['sector'] = x['sector'].astype('category')
x['balcony'] = x['balcony'].astype('category')
x['agePossession'] = x['agePossession'].astype('category')
x['furnishing_type'] = x['furnishing_type'].astype('category') 


cv_score = xgb.cv(dtrain=xgb.DMatrix(x,y,enable_categorical=True),
                  params={
                      'gamma': 0,
                      'max_depth': 5,
                      'learning_rate': 0.1,
                      'max_bin': 256,
                      'reg_lambda': 5,
                      'min_child_weight': 1,
                      'subsample': 0.6,
                      'colsample_bynode': 0.5,
                      'objective': 'reg:squarederror'
        
                  }, # Must include a params dictionary
                  num_boost_round=1000,                    # Number of boosting iterations
                  early_stopping_rounds=10,
                  nfold=10,                                # k-fold value 
                  as_pandas=True,                          # Return a pandas DataFrame
                  show_stdv=True)


print(cv_score)

num_estimators = cv_score['test-rmse-mean'].idxmin()
test_rmse_mean = cv_score['test-rmse-mean'].min()

print(f'num_estimators: {num_estimators}')
print(f'test_rmse_mean : {test_rmse_mean}')

     train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0           0.507401        0.001944        0.509786       0.016552
1           0.468907        0.001879        0.473086       0.015095
2           0.434299        0.001662        0.441114       0.014278
3           0.402607        0.001496        0.412079       0.014111
4           0.374424        0.001420        0.386097       0.013344
..               ...             ...             ...            ...
260         0.040798        0.000648        0.165552       0.017392
261         0.040666        0.000659        0.165558       0.017353
262         0.040505        0.000652        0.165532       0.017384
263         0.040360        0.000642        0.165563       0.017366
264         0.040211        0.000630        0.165469       0.017371

[265 rows x 4 columns]
num_estimators: 264
test_rmse_mean : 0.16546866258286141


### The n_estimator = 264 for base XGBoost

In [25]:
model_pipe.fit(x_train,y_train)


In [26]:
y_pred = model_pipe.predict(x_test)

y_pred = np.expm1(y_pred)


R2 = r2_score(np.expm1(y_test),y_pred)
mse = mean_squared_error(np.expm1(y_test),y_pred)
mae = mean_absolute_error(np.expm1(y_test),y_pred)

print(f"R2:{R2}")
print(f"MSE:{mse}")
print(f"MAE:{mae}")

R2:0.22861117263190944
MSE:4.984010379604114
MAE:1.168018014960093


In [27]:
tree_dump = model_pipe.steps[1][1].get_booster().get_dump(dump_format='text',with_stats=True)

for trees in tree_dump:
    print(trees) 

0:[f14<0.381864011] yes=1,no=2,missing=2,gain=386.288574,cover=2919
	1:[f13<1.09369445] yes=3,no=4,missing=4,gain=108.055809,cover=2251
		3:[f14<-0.479729891] yes=7,no=8,missing=8,gain=56.7579041,cover=1780
			7:[f14<-0.996512532] yes=15,no=16,missing=16,gain=15.2498322,cover=818
				15:[f8<3] yes=31,no=32,missing=32,gain=0.741226196,cover=329
					31:[f13<0.737081349] yes=63,no=64,missing=64,gain=0.335510254,cover=298
						63:leaf=-0.222526833,cover=140
						64:leaf=-0.19132176,cover=158
					32:[f13<1.03817868] yes=65,no=66,missing=66,gain=0.21001482,cover=31
						65:leaf=-0.152318671,cover=29
						66:leaf=-0.0319518596,cover=2
				16:[f13<0.899127007] yes=33,no=34,missing=34,gain=2.33795929,cover=489
					33:[f15<0.802489102] yes=67,no=68,missing=68,gain=0.759914398,cover=289
						67:leaf=-0.137073025,cover=283
						68:leaf=-0.0245427508,cover=6
					34:[f8<3] yes=69,no=70,missing=70,gain=0.891414642,cover=200
						69:leaf=-0.109881274,cover=110
						70:leaf=-0.0678034425,co

# With Default parameters

## Gain Distribution Analysis:
Root node gain: 386.77 (very strong initial split)

Wide gain range: From very small (0.0001) to large (386.77)

Many low-gain splits: ~50% of splits have gains < 2.0

High-value splits: Top 10% of splits have gains > 7.0

## Cover Distribution Analysis:
Root cover: 2919 samples (all data)

Uneven distribution: Some nodes cover 500+ samples, others cover just 1-2

Many small nodes: Numerous leaves with cover < 10 (potential overfitting)

Critical Observations:
Overfitting Risk: Many splits with very low gains (< 1.0) and small cover values

Example: gains of 0.0001, 0.02, 0.17 on nodes covering 1-5 samples

## Important Features:

f14 and f13 are dominant features (appear in top splits)

Root split on f14 with very high gain (386.77)

Extreme Leaf Values: Some leaves have very large values

Examples: 0.567, 0.528, 0.457 (risk of overfitting)