In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('flat_price_prediction_data.csv')

In [4]:
data.columns

Index(['sector', 'num_bedrooms', 'num_bathrooms', 'num_balconies', 'age_group',
       'super_built_up_area', 'floor_number', 'servant_room', 'furnished',
       'Centrally Air Conditioned', 'False Ceiling Lighting',
       'Intercom Facility', 'Private Garden / Terrace',
       'Separate entry for servant room', 'Spacious Interiors',
       'Swimming Pool', 'Club house / Community Center', 'Piped-gas', 'ac',
       'geyser', 'fan', 'wardrobe', 'light', 'Low Density Society',
       'total_num_rooms', 'price'],
      dtype='object')

In [4]:
data

Unnamed: 0,sector,num_bedrooms,num_bathrooms,num_balconies,age_group,super_built_up_area,floor_number,servant_room,furnished,Centrally Air Conditioned,...,Club house / Community Center,Piped-gas,ac,geyser,fan,wardrobe,light,Low Density Society,total_num_rooms,price
0,sector 7,2,2.0,1.0,Moderately New Flat,1169.100000,4.0,0,0,0,...,0,0,0,0,3,1,4,0,5.0,0.45
1,sector 3,2,2.0,1.0,Old Flat,844.350000,1.0,0,1,0,...,0,0,0,1,4,3,3,0,5.0,0.50
2,sohna road,2,2.0,3.0,New Flat,772.905000,12.0,0,0,0,...,1,0,0,0,0,0,0,0,7.0,0.40
3,sector 61,2,2.0,2.0,Under Construction,1558.800000,2.0,0,0,0,...,1,0,0,0,0,0,0,0,7.0,1.47
4,sector 92,2,2.0,3.0,Under Construction,1345.000000,5.0,0,0,0,...,0,0,0,0,0,0,0,0,8.0,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2833,sector 28,2,2.0,2.0,Moderately Old Flat,1280.525333,1.0,0,0,0,...,0,0,0,0,0,0,0,0,6.0,1.35
2834,sector 86,3,3.0,3.0,Under Construction,1895.000000,9.0,1,0,0,...,0,0,0,0,0,0,0,0,10.0,1.05
2835,sector 48,5,5.0,4.0,Old Flat,3905.000000,4.0,1,0,0,...,1,0,0,0,0,0,0,0,15.0,3.30
2836,sector 108,3,3.0,3.0,Moderately New Flat,1822.000000,3.0,0,0,0,...,1,0,0,0,0,0,0,0,10.0,0.95


In [86]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2838 entries, 0 to 2837
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   sector                           2838 non-null   object 
 1   num_bedrooms                     2838 non-null   int64  
 2   num_bathrooms                    2838 non-null   float64
 3   num_balconies                    2838 non-null   float64
 4   age_group                        2838 non-null   object 
 5   super_built_up_area              2838 non-null   float64
 6   floor_number                     2838 non-null   float64
 7   servant_room                     2838 non-null   int64  
 8   furnished                        2838 non-null   int64  
 9   Centrally Air Conditioned        2838 non-null   int64  
 10  False Ceiling Lighting           2838 non-null   int64  
 11  Intercom Facility                2838 non-null   int64  
 12  Private Garden / Ter

In [4]:
X = data.drop(columns=['price'])
y = data['price']

In [9]:
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

In [10]:
categorical_columns

Index(['sector', 'age_group'], dtype='object')

In [11]:
numerical_columns

Index(['num_bedrooms', 'num_bathrooms', 'num_balconies', 'super_built_up_area',
       'floor_number', 'servant_room', 'furnished',
       'Centrally Air Conditioned', 'False Ceiling Lighting',
       'Intercom Facility', 'Private Garden / Terrace',
       'Separate entry for servant room', 'Spacious Interiors',
       'Swimming Pool', 'Club house / Community Center', 'Piped-gas', 'ac',
       'geyser', 'fan', 'wardrobe', 'light', 'Low Density Society',
       'total_num_rooms'],
      dtype='object')

# Ordinal Encoding

In [91]:
preprocessor = ColumnTransformer(
    transformers=[
        ('yeo_johnson', PowerTransformer(method='yeo-johnson'), ['super_built_up_area']),
        ('standardize', StandardScaler(), numerical_columns),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns)
    ], 
    remainder='passthrough'
)

In [92]:
from sklearn.linear_model import LinearRegression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [9]:
def yeo_johnson_transform(column_series):
    index_ = column_series.index
    data_2d = np.array(column_series).reshape(-1, 1)
    yeo_johnson_transformer = PowerTransformer(method='yeo-johnson')

    transformed_data = yeo_johnson_transformer.fit_transform(data_2d)
    transformed_data = pd.Series(transformed_data.flatten(), index=index_)
    return (yeo_johnson_transformer, transformed_data)

In [10]:
transformer, y_transformed = yeo_johnson_transform(y)

In [95]:
# kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [96]:
# print(scores.mean(), scores.std())

In [97]:
def get_model_performance(model_name, model):
    
    output = []
    print(model_name)
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, yeo_johnson_transform(y)[1], cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline.fit(X_train, yeo_johnson_transform(y_train)[1])

    y_pred = pipeline.predict(X_test)

    y_pred = transformer.inverse_transform(y_pred.reshape(-1, 1))

output.append(mean_absolute_error(y_test, y_pred))
    
    return output

In [98]:
get_model_performance('lr', LinearRegression())

In [99]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [100]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [101]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(get_model_performance(model_name, model))

In [102]:
model_performance = pd.DataFrame(model_output, columns=['name','r2','mae'])
# model_performance.sort_values(['mae'])

# One Hot Encoding

In [103]:
preprocessor = ColumnTransformer(
    transformers=[
        ('yeo_johnson', PowerTransformer(method='yeo-johnson'), ['super_built_up_area']),
        ('standardize', StandardScaler(), numerical_columns),
        ('ohe', OneHotEncoder(handle_unknown='infrequent_if_exist'), categorical_columns)
    ], 
    remainder='passthrough'
)

In [104]:
def get_model_performance(model_name, model):
    
    output = []
    print(model_name)
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, yeo_johnson_transform(y)[1], cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, yeo_johnson_transform(y_train)[1])
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = transformer.inverse_transform(y_pred.reshape(-1, 1))
    
    output.append(mean_absolute_error(y_test, y_pred))
    
    return output

In [105]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(get_model_performance(model_name, model))

In [106]:
model_performance = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_performance.sort_values(['mae'])

# Target Encoder

In [12]:
import category_encoders as ce

preprocessor = ColumnTransformer(
    transformers=[
        ('yeo_johnson', PowerTransformer(method='yeo-johnson'), ['super_built_up_area']),
        ('standardize', StandardScaler(), numerical_columns),
        ('target', ce.TargetEncoder(), categorical_columns)
    ], 
    remainder='passthrough'
)

In [108]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(get_model_performance(model_name, model))

In [109]:
model_performance = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_performance.sort_values(['mae'])

# Target Encoder with XGBoost gives an MAE of 25 Lakhs!

In [11]:
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

In [13]:
import category_encoders as ce
from xgboost import XGBRegressor

preprocessor = ColumnTransformer(
    transformers=[
        ('yeo_johnson', PowerTransformer(method='yeo-johnson'), ['super_built_up_area']),
        ('standardize', StandardScaler(), numerical_columns),
        ('target', ce.TargetEncoder(), categorical_columns)
    ], 
    remainder='passthrough'
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor())
])

Default Hyperparameters

- learning_rate: 0.3
- min_split_loss: 0
- max_depth: 6
- min_child_weight: 1
- subsample: 1
- colsample_bytree, colsample_bylevel, colsample_bynode: 1
- lambda: 1
- alpha: 0
- tree_method: auto

Meaning of Hyperparameters

- min_split_loss: Minimum loss that must be reduced if parent node is to be splitted.
- max_depth: if 0 then no limit. 'exact' tree means, we need to specify the value as non 0.
- min_child_weight: For linear regression it is min number of samples required to be in child node for parent to be splitted. More value means lesser overfitting.
- subsample: If set to 0.5 means 50% of the training data will be randomly sampled before each boosting step. So introduces randomness and hence generalize more.
- colsample_bytree: Fraction of total columns, which will be sampled before giving it to each tree.
- colsample_byleve: Subsampling occurs once every level of tree.
- colsample_bynode: It is like random forest.
- lambda: L2 reguralization term.
- alpha: L1 reguralization term.

In [16]:
def get_performance_wrt_hyperparaneters(**hyperparameters):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(**hyperparameters))
    ])
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, yeo_johnson_transform(y)[1], cv=kfold, scoring='r2')
    print('Cross Validation Score : mean = {} and standard deviation = {}'.format(scores.mean(), scores.std()))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline.fit(X_train, yeo_johnson_transform(y_train)[1])

    y_pred = pipeline.predict(X_test)

    y_pred = transformer.inverse_transform(y_pred.reshape(-1, 1))

    print(mean_absolute_error(y_test, y_pred))

In [18]:
get_performance_wrt_hyperparaneters(**{'learning_rate': 0.1})

Cross Validation Score : mean = 0.8913324479528804 and standard deviation = 0.02205402536279398
0.2721114747708952


In [26]:
for min_split_loss in [0, 0.01, 0.001, 0.0009]:
    print(f'min_split_loss : {min_split_loss}')
    get_performance_wrt_hyperparaneters(**{'min_split_loss': min_split_loss})

min_split_loss : 0
Cross Validation Score : mean = 0.8917229962593247 and standard deviation = 0.021733137184142392
0.2536920588369101
min_split_loss : 0.01
Cross Validation Score : mean = 0.8924990526122519 and standard deviation = 0.022058201259877976
0.2587851723063159
min_split_loss : 0.001
Cross Validation Score : mean = 0.8933204268770437 and standard deviation = 0.020898306196916374
0.2558921619284321
min_split_loss : 0.0009
Cross Validation Score : mean = 0.8926832228031177 and standard deviation = 0.021370599917055685
0.258395694309557


In [29]:
for max_depth in [3, 6, 7, 8, 10, 15]:
    print(f'max_depth : {max_depth}')
    get_performance_wrt_hyperparaneters(**{'max_depth': max_depth})

max_depth : 3
Cross Validation Score : mean = 0.8849974639805677 and standard deviation = 0.02140670452858811
0.2858366379771434
max_depth : 6
Cross Validation Score : mean = 0.8917229962593247 and standard deviation = 0.021733137184142392
0.2536920588369101
max_depth : 7
Cross Validation Score : mean = 0.8923258462531212 and standard deviation = 0.019808096073298875
0.2732250122537076
max_depth : 8
Cross Validation Score : mean = 0.8853231846429847 and standard deviation = 0.018647374323609665
0.2555941768095527
max_depth : 10
Cross Validation Score : mean = 0.8854133868341842 and standard deviation = 0.01929808799727534
0.2746263324542784
max_depth : 15
Cross Validation Score : mean = 0.8788744236210562 and standard deviation = 0.020037143640844444
0.28694415423232067


In [27]:
for learning_rate in [0.2, 0.25, 0.3, 0.35, 0.4, 1]:
    print(f'learning_rate : {learning_rate}')
    get_performance_wrt_hyperparaneters(**{'learning_rate': learning_rate})

learning_rate : 0.2
Cross Validation Score : mean = 0.8955243980354792 and standard deviation = 0.02065847567774461
0.271238154030182
learning_rate : 0.25
Cross Validation Score : mean = 0.8956116432322118 and standard deviation = 0.020303900856367303
0.2814686101507133
learning_rate : 0.3
Cross Validation Score : mean = 0.8917229962593247 and standard deviation = 0.021733137184142392
0.2536920588369101
learning_rate : 0.35
Cross Validation Score : mean = 0.8926453305320171 and standard deviation = 0.022003483609311824
0.2672922757309927
learning_rate : 0.4
Cross Validation Score : mean = 0.8887423641805041 and standard deviation = 0.020692140894328224
0.275830446582445
learning_rate : 1
Cross Validation Score : mean = 0.8279707064817522 and standard deviation = 0.030627561998339308
0.3592498455668839


In [30]:
for colsample_bynode in [0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    print(f'colsample_bynode : {colsample_bynode}')
    get_performance_wrt_hyperparaneters(**{'colsample_bynode': colsample_bynode})

colsample_bynode : 0.5
Cross Validation Score : mean = 0.8905738812192953 and standard deviation = 0.01812470729466061
0.27746438337043977
colsample_bynode : 0.6
Cross Validation Score : mean = 0.8889737560254016 and standard deviation = 0.021613235594983713
0.2645759413024069
colsample_bynode : 0.7
Cross Validation Score : mean = 0.8892852561325191 and standard deviation = 0.01992982553685962
0.27161580172223104
colsample_bynode : 0.8
Cross Validation Score : mean = 0.8932259018143409 and standard deviation = 0.020801004030449544
0.27286463080997203
colsample_bynode : 0.9
Cross Validation Score : mean = 0.8920464782069617 and standard deviation = 0.01953763391192177
0.27561660429121737
colsample_bynode : 1
Cross Validation Score : mean = 0.8917229962593247 and standard deviation = 0.021733137184142392
0.2536920588369101


In [32]:
for colsample_bytree in [0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    print(f'colsample_bytree : {colsample_bytree}')
    get_performance_wrt_hyperparaneters(**{'colsample_bytree': colsample_bytree})

colsample_bytree : 0.5
Cross Validation Score : mean = 0.8905991099483532 and standard deviation = 0.015220798473881914
0.26465144693011966
colsample_bytree : 0.6
Cross Validation Score : mean = 0.8948995280376995 and standard deviation = 0.018905848398649082
0.2612301607450969
colsample_bytree : 0.7
Cross Validation Score : mean = 0.8931639186529476 and standard deviation = 0.0203220708628161
0.2720492018108637
colsample_bytree : 0.8
Cross Validation Score : mean = 0.8937469317072726 and standard deviation = 0.02085896396825552
0.2654019955766034
colsample_bytree : 0.9
Cross Validation Score : mean = 0.8957621924220598 and standard deviation = 0.01938199044852358
0.2644769008646549
colsample_bytree : 1
Cross Validation Score : mean = 0.8917229962593247 and standard deviation = 0.021733137184142392
0.2536920588369101


In [31]:
for colsample_bylevel in [0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    print(f'colsample_bylevel : {colsample_bylevel}')
    get_performance_wrt_hyperparaneters(**{'colsample_bylevel': colsample_bylevel})

colsample_bylevel : 0.5
Cross Validation Score : mean = 0.8905738812192953 and standard deviation = 0.01812470729466061
0.27746438337043977
colsample_bylevel : 0.6
Cross Validation Score : mean = 0.8889737560254016 and standard deviation = 0.021613235594983713
0.2645759413024069
colsample_bylevel : 0.7
Cross Validation Score : mean = 0.8892852561325191 and standard deviation = 0.01992982553685962
0.27161580172223104
colsample_bylevel : 0.8
Cross Validation Score : mean = 0.8932259018143409 and standard deviation = 0.020801004030449544
0.27286463080997203
colsample_bylevel : 0.9
Cross Validation Score : mean = 0.8920464782069617 and standard deviation = 0.01953763391192177
0.27561660429121737
colsample_bylevel : 1
Cross Validation Score : mean = 0.8917229962593247 and standard deviation = 0.021733137184142392
0.2536920588369101


In [156]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

param_distributions = {
    'model__learning_rate': uniform(0.01, 0.5),  
    'model__n_estimators': randint(50, 500),
    'model__max_depth': randint(3, 12),
    'model__min_child_weight': randint(1, 7),
    'model__subsample': uniform(0.6, 0.3),  
    'model__colsample_bytree': uniform(0.6, 0.3), 
    'model__gamma': uniform(0, 0.4), 
    'model__alpha': [0, 0.1, 0.01, 0.001], 
    'model__lambda': [1, 0.1, 0.01, 0.001] 
}

kfold = KFold(n_splits=10, shuffle=True, random_state=1)
rcv = RandomizedSearchCV(pipeline, param_distributions, scoring='r2', refit=True, n_iter=2000, cv=kfold, n_jobs=-1)
rcv.fit(X, yeo_johnson_transform(y)[1])

best_params = rcv.best_params_

print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'model__alpha': 0.001, 'model__colsample_bytree': 0.7092405514235083, 'model__gamma': 0.053117016148287725, 'model__lambda': 0.001, 'model__learning_rate': 0.028941172140662767, 'model__max_depth': 7, 'model__min_child_weight': 2, 'model__n_estimators': 456, 'model__subsample': 0.7583825422204908}


In [157]:
pipeline.set_params(**best_params)

pipeline.fit(X_train, yeo_johnson_transform(y_train)[1])

y_pred = pipeline.predict(X_test)
y_pred = transformer.inverse_transform(y_pred.reshape(-1, 1))

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('r2 score : {}'.format(r2))
print('mae : {}'.format(mae))

r2 score : 0.8490917661906368
mae : 0.2561876896233626


# Seems like the default settings perform the best

In [8]:
import category_encoders as ce
from xgboost import XGBRegressor
preprocessor = ColumnTransformer(
    transformers=[
        ('yeo_johnson', PowerTransformer(method='yeo-johnson'), [5]),
        ('standardize', StandardScaler(), [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
        ('target', ce.TargetEncoder(), [0, 4])
    ], 
    remainder='passthrough'
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor())
])

In [9]:
pipeline.fit(X, y)

In [10]:
import joblib

joblib.dump(pipeline, 'flat_pipeline.joblib')

['flat_pipeline.joblib']