In [7]:
import numpy as np
import pandas as pd

from  sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import r2_score,mean_absolute_error


In [93]:
#we made all columns as it is , so that we can perform all operation in a pipeline
df=pd.read_csv("gurgaon_properties_post_feature_selection_v2.csv")

In [6]:
df.head(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [8]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [94]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [10]:
df.sample(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
1294,house,sector 3,1.5,3.0,3.0,0,Old Property,210.0,0.0,0.0,unfurnished,Low,Low Floor
1823,flat,sector 81,2.21,3.0,3.0,3+,Relatively New,1881.0,1.0,0.0,furnished,Medium,Mid Floor
784,flat,sector 37d,1.1,3.0,3.0,3,Relatively New,1990.0,0.0,0.0,unfurnished,Medium,Mid Floor
2142,house,sector 104,0.6,4.0,3.0,1,Moderately Old,1170.0,0.0,0.0,unfurnished,Low,Low Floor
1673,flat,sector 54,6.75,4.0,4.0,2,Moderately Old,2076.0,1.0,0.0,furnished,Medium,Mid Floor


In [12]:
X=df.drop(columns='price')
y=df["price"]

In [9]:
X.head(1)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor


In [13]:
y.head(5)

0    0.82
1    0.95
2    0.32
3    1.60
4    0.48
Name: price, dtype: float64

In [14]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [16]:
df.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [17]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [18]:
preprocessor=ColumnTransformer([
    ('num',StandardScaler(),['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
    ('cat',OrdinalEncoder(),columns_to_encode)],
    remainder="passthrough"
)

In [19]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()

In [21]:
pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',model)
])

In [22]:
kfold=KFold(n_splits=10,shuffle=True,random_state=42)
scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')

In [23]:
scores.mean()

0.7363096633436828

In [24]:
scores.mean(),scores.std()

(0.7363096633436828, 0.0323800575442993)

In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [26]:
pipeline.fit(X_train,y_train)

In [27]:
y_pred=pipeline.predict(X_test)

In [28]:
y_pred = np.expm1(y_pred)

In [29]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.946382216008936

In [30]:
def model_selection(model_name,model):

    output=[]
    output.append(model_name)
    
    preprocessor=ColumnTransformer([
    ('num',StandardScaler(),['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
    ('cat',OrdinalEncoder(),columns_to_encode)],
    remainder="passthrough"
    )

    pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',model)
    ])

    kfold=KFold(n_splits=10,shuffle=True,random_state=42)
    scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
    output.append(scores.mean())

    X_train,X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    pipeline.fit(X_train,y_train)
    y_pred=pipeline.predict(X_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [36]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/124.9 MB 2.8 MB/s eta 0:00:45
    --------------------------------------- 1.8/124.9 MB 3.6 MB/s eta 0:00:35
    --------------------------------------- 2.6/124.9 MB 3.9 MB/s eta 0:00:32
   - -------------------------------------- 3.4/124.9 MB 4.0 MB/s eta 0:00:31
   - -------------------------------------- 4.5/124.9 MB 4.0 MB/s eta 0:00:31
   - -------------------------------------- 6.0/124.9 MB 4.6 MB/s eta 0:00:27
   -- ------------------------------------- 7.3/124.9 MB 4.8 MB/s eta 0:00:25
   -- ------------------------------------- 8.7/124.9 MB 5.0 MB/s eta 0:00:24
   --- ------------------------------------ 9.4/124.9 MB 5.2 MB/s eta 0:00:23
   --- -

In [31]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [32]:
model_list=[]

for name,model in model_dict.items():
    model_list.append(model_selection(name,model))

In [33]:
model_list

[['linear_reg', 0.7363096633436828, 0.946382216008936],
 ['svr', 0.7642012011196353, 0.847263647348393],
 ['ridge', 0.7363125343993552, 0.9463387741853373],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7712747201868904, 0.7273456265460585],
 ['random forest', 0.8805697504974848, 0.525958748894236],
 ['extra trees', 0.8677923269324419, 0.5529135912113864],
 ['gradient boosting', 0.8726556841324218, 0.5759888311796971],
 ['adaboost', 0.7526316998057764, 0.8130765126255426],
 ['mlp', 0.8100169432965572, 0.6829148220681825],
 ['xgboost', 0.8894876835260124, 0.5040475141482346]]

In [34]:
pd.DataFrame(model_list,columns=["model","r2","mae"]).sort_values(by="mae",ascending=True)

Unnamed: 0,model,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.88057,0.525959
6,extra trees,0.867792,0.552914
7,gradient boosting,0.872656,0.575989
9,mlp,0.810017,0.682915
4,decision tree,0.771275,0.727346
8,adaboost,0.752632,0.813077
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### Onehot encoding

In [35]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
scores.mean()

0.8546067827628422

In [36]:
scores.std()

0.015998393588058008

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [38]:
pipeline.fit(X_train,y_train)

In [39]:
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497458331374444

In [40]:
def model_selection(model_name,model):

    output=[]
    output.append(model_name)
    
    pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',model)
    ])

    kfold=KFold(n_splits=10,shuffle=True,random_state=42)
    scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
    output.append(scores.mean())

    X_train,X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    pipeline.fit(X_train,y_train)
    y_pred=pipeline.predict(X_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [41]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [42]:
model_list=[]

for name,model in model_dict.items():
    model_list.append(model_selection(name,model))

In [43]:
model_list

[['linear_reg', 0.8546067827628422, 0.6497458331374444],
 ['svr', 0.7697413260547326, 0.8341243500492146],
 ['ridge', 0.8546870746558243, 0.6528314488879507],
 ['LASSO', 0.05943378064493578, 1.528905986892753],
 ['decision tree', 0.8029556220523979, 0.6809325973029099],
 ['random forest', 0.8913335868850046, 0.5026846459068949],
 ['extra trees', 0.89549975928065, 0.46590240499584434],
 ['gradient boosting', 0.8767090890130989, 0.5700420396852552],
 ['adaboost', 0.7538795256707155, 0.8251888375354758],
 ['mlp', 0.8753773736548431, 0.5128413433605429],
 ['xgboost', 0.8958499681743852, 0.4934562667923469]]

In [44]:
pd.DataFrame(model_list,columns=["model","r2","mae"]).sort_values(by="mae",ascending=True)

Unnamed: 0,model,r2,mae
6,extra trees,0.8955,0.465902
10,xgboost,0.89585,0.493456
5,random forest,0.891334,0.502685
9,mlp,0.875377,0.512841
7,gradient boosting,0.876709,0.570042
0,linear_reg,0.854607,0.649746
2,ridge,0.854687,0.652831
4,decision tree,0.802956,0.680933
8,adaboost,0.75388,0.825189
1,svr,0.769741,0.834124


### OneHotEncoding With PCA

In [45]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [46]:
from sklearn.decomposition  import PCA
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [47]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
scores.mean()

0.06225201431451135

In [48]:
scores.std()

0.01986059407164014

In [49]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [50]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [51]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [52]:
model_output

[['linear_reg', 0.06225201431451135, 1.5267074088549337],
 ['svr', 0.21807348496172213, 1.3611626793047429],
 ['ridge', 0.06225201516179148, 1.5267074078044667],
 ['LASSO', 0.05967578446737004, 1.5287392557835464],
 ['decision tree', 0.6964420082698518, 0.761508966234373],
 ['random forest', 0.764289063779886, 0.6684794621030556],
 ['extra trees', 0.7398403200639376, 0.7012785056283563],
 ['gradient boosting', 0.6106227078866426, 0.9879063301936338],
 ['adaboost', 0.3021787081276231, 1.3783376777942136],
 ['mlp', 0.20949808848647508, 1.4153318875109506],
 ['xgboost', 0.6222047517390725, 0.9675805121065025]]

In [53]:
pd.DataFrame(model_output,columns=["model","r2","mae"]).sort_values(by="mae")

Unnamed: 0,model,r2,mae
5,random forest,0.764289,0.668479
6,extra trees,0.73984,0.701279
4,decision tree,0.696442,0.761509
10,xgboost,0.622205,0.967581
7,gradient boosting,0.610623,0.987906
1,svr,0.218073,1.361163
8,adaboost,0.302179,1.378338
9,mlp,0.209498,1.415332
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707


### Target Encoder

In [53]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.0-py3-none-any.whl.metadata (7.9 kB)
Collecting scikit-learn>=1.6.0 (from category_encoders)
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Downloading category_encoders-2.8.0-py3-none-any.whl (85 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.1 MB 2.1 MB/s eta 0:00:06
   ---- ----------------------------------- 1.3/11.1 MB 3.4 MB/s eta 0:00:03
   ------ --------------------------------- 1.8/11.1 MB 2.8 MB/s eta 0:00:04
   -------- ------------------------------- 2.4/11.1 MB 2.7 MB/s eta 0:00:04
   ----------- ---------------------------- 3.1/11.1 MB 3.0 MB/s eta 0:00:03
   -------------- ------------------------- 3.9/11.1 MB 3.1 MB/s eta 0:00:03
   ---------------- ----------------------- 4.5/11.1 MB 3.1 MB/s eta 0:00:03
   -------------------- -----

In [54]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [55]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [56]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
scores.mean(),scores.std()

(0.829521918225536, 0.01838446337912288)

In [57]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [58]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [59]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [60]:
model_output

[['linear_reg', 0.829521918225536, 0.7130109838896392],
 ['svr', 0.782917405117426, 0.8188507474317226],
 ['ridge', 0.8295359700269425, 0.7135228301064969],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.8306504718294654, 0.5872406623548723],
 ['random forest', 0.9012817610896935, 0.4528334308107171],
 ['extra trees', 0.9020248715201203, 0.45702090176325133],
 ['gradient boosting', 0.8892109962810991, 0.508584133968532],
 ['adaboost', 0.8153271947808836, 0.7238168108834526],
 ['mlp', 0.8522047176395002, 0.6061886392753489],
 ['xgboost', 0.9047983252719011, 0.447518119423869]]

In [62]:
pd.DataFrame(model_output,columns=["model","r2","mae"]).sort_values(by="mae")

Unnamed: 0,model,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.901282,0.452833
6,extra trees,0.902025,0.457021
7,gradient boosting,0.889211,0.508584
4,decision tree,0.83065,0.587241
9,mlp,0.852205,0.606189
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
8,adaboost,0.815327,0.723817
1,svr,0.782917,0.818851


In [65]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
Downloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2
Note: you may need to restart the kernel to use updated packages.


In [66]:
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [114]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 
                     'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [115]:
# Defining the XGBoost pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))
])

In [116]:
# Expanded Bayesian search space
param_space = {
    'regressor__n_estimators': Integer(50, 1000),
    'regressor__max_depth': Integer(3, 20),
    'regressor__learning_rate': Real(0.001, 0.5, prior='log-uniform'),
    'regressor__subsample': Real(0.3, 1.0),
    'regressor__colsample_bytree': Real(0.3, 1.0),
    'regressor__gamma': Real(0, 10),
    'regressor__lambda': Real(0, 20),
    'regressor__alpha': Real(0, 20),
    'regressor__min_child_weight': Integer(1, 10),
    'regressor__max_delta_step': Integer(0, 10)
}

In [117]:
# Setting up Bayesian Optimization with cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
search = BayesSearchCV(pipeline, param_space, cv=kfold, scoring='r2', n_jobs=-1, verbose=4, n_iter=30, random_state=42)

In [118]:
# Fitting the model
search.fit(X, y_transformed)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [119]:
final_pipe = search.best_estimator_

In [120]:
search.best_params_

OrderedDict([('regressor__alpha', 0.0),
             ('regressor__colsample_bytree', 0.3),
             ('regressor__gamma', 0.0),
             ('regressor__lambda', 19.133163135001094),
             ('regressor__learning_rate', 0.014917272023072267),
             ('regressor__max_delta_step', 10),
             ('regressor__max_depth', 14),
             ('regressor__min_child_weight', 6),
             ('regressor__n_estimators', 1000),
             ('regressor__subsample', 1.0)])

In [121]:
search.best_score_

0.906960638940401

In [84]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [122]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(final_pipe, file)

In [123]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [133]:
df.iloc[0].values

array(['flat', 'sector 36', 0.82, 3.0, 2.0, '2', 'New Property', 850.0,
       0.0, 0.0, 'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [128]:
data = [['flat', 'sector 36', 3.0, 2.0, '2', 'New Property', 850.0, 0.0,0.0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor


In [129]:
np.expm1(final_pipe.predict(one_df))

array([0.8239058], dtype=float32)

In [131]:
df.iloc[0]["price"]

0.82