In [196]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [157]:
data = pd.read_csv('gurgaon-post-feature-selection.csv')

In [158]:
data.sample(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,study room,store room,furnished_type,luxury_category,floor_category
2395,house,sector 57,1.35,5,3,2,Old Property,550.0,0,0,0,0,Low,Mid Floor
2921,flat,sector 81,2.3,3,3,3+,Relatively New,1617.0,1,1,0,2,Low,High Floor
3119,flat,sector 86,1.26,3,3,3,Moderately Old,1747.0,1,0,0,0,Medium,Mid Floor
2366,flat,sector 78,1.1,3,3,3+,Old Property,2176.0,1,0,0,0,Medium,Mid Floor
1045,flat,sohna road,0.72,2,2,3+,Relatively New,1131.0,0,1,0,0,High,High Floor


In [159]:
df = data.copy()

## furnished_type

In [160]:
df['furnished_type'].value_counts()

furnished_type
0    2356
1    1020
2     189
Name: count, dtype: int64

0 -> unfurnished</br>
1 -> semifurnished</br>
2 -> furnished</br>

In [161]:
df['furnished_type'] = df['furnished_type'].replace({
    0:'unfurnished',
    1:'semifurnished',
    2:'furnished'
})

In [162]:
df.sample(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,study room,store room,furnished_type,luxury_category,floor_category
3455,house,sector 7,2.25,3,3,2,Old Property,1440.0,0,0,0,semifurnished,Low,Low Floor
2807,house,sector 105,0.74,3,3,0,Moderately Old,1000.0,0,0,0,unfurnished,Low,Low Floor
1078,house,sector 4,3.2,3,3,2,Old Property,2250.0,0,0,0,unfurnished,Low,Low Floor
1750,flat,sector 102,2.2,3,4,3+,Moderately Old,1945.0,1,1,0,unfurnished,Medium,Mid Floor
571,flat,sector 70,1.9,4,3,3,New Property,2111.0,0,0,1,unfurnished,Medium,Low Floor


In [163]:
X = df.drop(columns = 'price')
y = df['price']

In [164]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,study room,store room,furnished_type,luxury_category,floor_category
0,flat,sohna road,3,2,0,Relatively New,1120.0,0,0,0,unfurnished,Medium,Low Floor
1,flat,sector 74,3,3,2,Relatively New,1938.0,1,0,0,semifurnished,High,Mid Floor
2,flat,sector 103,3,4,3+,New Property,1734.0,1,0,0,unfurnished,Low,Mid Floor
3,flat,sector 107,2,2,2,Relatively New,543.0,0,0,0,unfurnished,Low,Low Floor
4,house,sector 43,4,4,3+,Moderately Old,540.0,1,1,0,furnished,Medium,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3560,flat,sector 81,2,2,1,Relatively New,613.0,0,0,0,semifurnished,Medium,Mid Floor
3561,flat,sector 76,3,3,3,New Property,1944.0,0,0,0,unfurnished,Low,Low Floor
3562,flat,sector 113,4,4,3,New Property,2353.0,1,1,0,semifurnished,High,High Floor
3563,flat,sector 65,3,3,3+,Relatively New,1493.0,1,1,0,unfurnished,High,Mid Floor


In [45]:
y

0       0.90
1       2.25
2       1.12
3       0.27
4       2.45
        ... 
3560    0.48
3561    1.50
3562    3.45
3563    2.25
3564    0.26
Name: price, Length: 3565, dtype: float64

In [11]:
# after eda we know that our target column is right skewed, applying log transformation
y_transformed = np.log1p(y)

In [47]:
y_transformed

0       0.641854
1       1.178655
2       0.751416
3       0.239017
4       1.238374
          ...   
3560    0.392042
3561    0.916291
3562    1.492904
3563    1.178655
3564    0.231112
Name: price, Length: 3565, dtype: float64

# Ordinal Encoding

In [58]:
cols_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'floor_category', 'luxury_category', 'furnished_type']

In [107]:
# create a column transformer 
preprocessor = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room', 'store room']),
        ('categorical', OrdinalEncoder(handle_unknown = 'use_encoded_value',
                                       unknown_value = -1), cols_to_encode)
    ],
    remainder = 'passthrough'
)

#### Linear Regression
Applying with linear regression and testing the score

In [108]:
# create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LinearRegression())
])

In [109]:
# k-fold cross val
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2')

In [110]:
scores.mean(), scores.std()

(np.float64(0.7379121949539776), np.float64(0.02813216686385457))

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, random_state = 42, test_size = 0.2)

In [112]:
pipeline.fit(X_train, y_train)

In [113]:
y_pred = pipeline.predict(X_test)

In [114]:
y_pred = np.expm1(y_pred)

In [115]:
mean_absolute_error(np.expm1(y_test), y_pred)

0.9165583408570532

#### converting this whole flow into a function for different model testing

In [116]:
def scorer(model_name, model):
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        (f'{model}', model)
    ])
    
    kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
    scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size = 0.2, random_state = 42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [141]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [142]:
model_output = []

for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [143]:
model_df = pd.DataFrame(model_output, columns = ['name', 'r2', 'mae'])

In [145]:
model_df.sort_values(by = 'mae')

Unnamed: 0,name,r2,mae
9,xgboost,0.891208,0.531468
5,random forest,0.883329,0.553389
7,gradient boosting,0.872474,0.592432
6,extra trees,0.868135,0.604375
4,decision tree,0.774473,0.678184
1,svr,0.767206,0.862764
8,adaboost,0.75906,0.866002
0,linear_reg,0.737912,0.916558
2,ridge,0.737915,0.916573
3,LASSO,0.05561,1.537464


# One Hot Encoding

In [197]:
preprocessor = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room', 'store room']),
        ('categorical', OrdinalEncoder(handle_unknown = 'use_encoded_value',
                                       unknown_value = -1), cols_to_encode),
        ('categorical1', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'), ['sector', 'agePossession', 'furnished_type'])
    ],
    remainder = 'passthrough'
)

In [198]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [166]:
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2')



In [168]:
scores.mean(), scores.std()

(np.float64(0.8512679011581064), np.float64(0.02548328422698523))

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, random_state = 42, test_size = 0.2)

In [170]:
pipeline.fit(X_train, y_train)

In [172]:
y_pred = pipeline.predict(X_test)

In [173]:
y_pred = np.expm1(y_pred)

In [174]:
mean_absolute_error(np.expm1(y_test), y_pred)

0.6747210660547811

In [175]:
def scorer(model_name, model):
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        (f'{model}', model)
    ])
    
    kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
    scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size = 0.2, random_state = 42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [176]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [177]:
model_output = []

for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [178]:
model_df = pd.DataFrame(model_output, columns = ['name', 'r2', 'mae'])

In [179]:
model_df.sort_values(by = 'mae')

Unnamed: 0,name,r2,mae
9,xgboost,0.895728,0.520136
6,extra trees,0.896099,0.543517
5,random forest,0.893893,0.546885
7,gradient boosting,0.877293,0.595479
4,decision tree,0.799779,0.669857
0,linear_reg,0.851268,0.674721
2,ridge,0.850339,0.683445
1,svr,0.773311,0.856752
8,adaboost,0.749267,0.862976
3,LASSO,0.05561,1.537464


# Target Encoder

In [20]:
import category_encoders as ce

cols_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'floor_category', 'luxury_category', 'furnished_type']

preprocessor = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room', 'store room']),
        ('categorical', OrdinalEncoder(handle_unknown = 'use_encoded_value',
                                       unknown_value = -1), cols_to_encode),
        ('target_encoding', ce.TargetEncoder(), ['sector'])
    ],
    remainder = 'passthrough'
)

In [21]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [22]:
# K-fold cross-validation
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2')

In [23]:
scores.mean(),scores.std()

(np.float64(0.8235262829755765), np.float64(0.024319361269544537))

In [24]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [25]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [26]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [27]:
model_df = pd.DataFrame(model_output, columns = ['name','r2','mae'])

In [28]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.90339,0.51246
9,xgboost,0.896999,0.520152
6,extra trees,0.9019,0.521144
7,gradient boosting,0.888939,0.548125
4,decision tree,0.818417,0.641659
8,adaboost,0.820265,0.746113
0,linear_reg,0.823526,0.761082
2,ridge,0.823543,0.761371
1,svr,0.784224,0.842766
3,LASSO,0.05561,1.537464


# Hyperparameter Tuning

In [29]:
from sklearn.model_selection import GridSearchCV

In [36]:
param_grid = {
    'regressor__n_estimators':[50, 100, 150, 200, 250],
    'regressor__criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'regressor__max_depth':[None, 10, 20, 30, 40, 50],
    'regressor__max_features':['sqrt', 'log2', None],
    'regressor__min_samples_split':[2, 4, 6]
}

In [37]:
cols_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'floor_category', 'luxury_category', 'furnished_type']

preprocessor = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room', 'store room']),
        ('categorical', OrdinalEncoder(handle_unknown = 'use_encoded_value',
                                       unknown_value = -1), cols_to_encode),
        ('target_encoding', ce.TargetEncoder(), ['sector'])
    ],
    remainder = 'passthrough'
)

In [38]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [39]:
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)

In [40]:
search = GridSearchCV(pipeline, param_grid, cv = kfold, scoring = 'r2', n_jobs = -1, verbose = 4)

In [41]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 1080 candidates, totalling 10800 fits


In [222]:
search.best_params_

{'regressor__criterion': 'absolute_error',
 'regressor__max_depth': 30,
 'regressor__max_features': None,
 'regressor__min_samples_split': 2,
 'regressor__n_estimators': 250}

In [223]:
final_pipeline = search.best_estimator_

In [44]:
search.best_score_

np.float64(0.9052536355724202)

# Exporting the model
since their are not major changes we will export model in default settings

In [213]:
preprocessor = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'study room', 'store room']),
        ('categorical', OrdinalEncoder(handle_unknown = 'use_encoded_value',
                                       unknown_value = -1), cols_to_encode),
        ('categorical1', OneHotEncoder(drop = 'first', handle_unknown = 'ignore', sparse_output = False), ['sector', 'agePossession', 'furnished_type'])
    ],
    remainder = 'passthrough'
)

In [214]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [201]:
pipeline.fit(X, y_transformed)

In [202]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [189]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [209]:
X.sample(2)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,study room,store room,furnished_type,luxury_category,floor_category
2380,flat,sector 81,2,2,1,Under Construction,814.0,0,0,0,unfurnished,Low,Low Floor
1091,flat,sector 104,3,3,3+,Relatively New,2072.0,1,0,0,unfurnished,High,High Floor


# Trying out prediction

In [206]:
data = [['house', 'sector 24', 4, 3, '3+', 'New Property', 2750, 0, 1, 0, 'furnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'study room', 'store room',
       'furnished_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,study room,store room,furnished_type,luxury_category,floor_category
0,house,sector 24,4,3,3+,New Property,2750,0,1,0,furnished,Low,Low Floor


In [207]:
np.expm1(pipeline.predict(one_df))

array([4.49450124])