In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error,r2_score
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn.decomposition import PCA

In [None]:
df=pd.read_csv('gurgaon_properties_v5.csv')

In [None]:
df.head()

Unnamed: 0,property_type,sector,price,bedroom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 41,7.25,9.0,8.0,3,Old Property,2475.0,1.0,0.0,0.0,Medium,Mid Floor
1,flat,sector 81,1.28,3.0,3.0,3,Moderately Old,1605.6,1.0,0.0,0.0,Low,Low Floor
2,flat,sector 107,1.4,3.0,4.0,N,New Property,2129.768041,0.0,0.0,0.0,Medium,Low Floor
3,flat,sector 37,0.78,2.0,2.0,3,Moderately Old,1068.1,0.0,0.0,0.0,Medium,Low Floor
4,flat,sohna road,1.05,2.0,2.0,3,New Property,1175.97,0.0,0.0,1.0,Low,Low Floor


In [None]:
df['furnishing_type'].value_counts()

Unnamed: 0_level_0,count
furnishing_type,Unnamed: 1_level_1
0.0,2207
1.0,1287


In [None]:
#0-->unfurnished
#1-->furnished
df['furnishing_type']=df['furnishing_type'].replace({0:'unfurnished',1:'furnished'})
df['furnishing_type']

Unnamed: 0,furnishing_type
0,unfurnished
1,unfurnished
2,unfurnished
3,unfurnished
4,furnished
...,...
3489,furnished
3490,unfurnished
3491,unfurnished
3492,unfurnished


In [None]:
df.head()

Unnamed: 0,property_type,sector,price,bedroom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 41,7.25,9.0,8.0,3,Old Property,2475.0,1.0,0.0,unfurnished,Medium,Mid Floor
1,flat,sector 81,1.28,3.0,3.0,3,Moderately Old,1605.6,1.0,0.0,unfurnished,Low,Low Floor
2,flat,sector 107,1.4,3.0,4.0,N,New Property,2129.768041,0.0,0.0,unfurnished,Medium,Low Floor
3,flat,sector 37,0.78,2.0,2.0,3,Moderately Old,1068.1,0.0,0.0,unfurnished,Medium,Low Floor
4,flat,sohna road,1.05,2.0,2.0,3,New Property,1175.97,0.0,0.0,furnished,Low,Low Floor


In [None]:
X=df.drop('price',axis=1)
y=df['price']

In [None]:
y_transformed=np.log1p(y)

**Ordinal Encoding**

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [None]:
#creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['bedroom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat',OrdinalEncoder(),columns_to_encode)
    ],
    remainder='passthrough'
)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
#creating a pipeline
pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('model',LinearRegression())
])

In [None]:
#k fold cross valiadtion
kf=KFold(n_splits=5,shuffle=True,random_state=42)
scores=cross_val_score(pipeline,X,y_transformed,cv=kf,scoring='r2')

In [None]:
scores.mean(),scores.std()

(np.float64(0.43461705771975917), np.float64(0.43328281002961894))

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred=pipeline.predict(X_test)

In [None]:
y_pred=np.expm1(y_pred)

In [None]:
mae=mean_absolute_error(np.expm1(y_test),y_pred)
mae

1.0526162052498436

In [None]:
#lets make a function to test all the models
from sklearn.linear_model import LinearRegression,Ridge,Lasso

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR


from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_output

[['linear_reg', np.float64(0.5536727011056062), 1.0526162052498436],
 ['svr', np.float64(0.6902927572734673), 0.9645813657016487],
 ['ridge', np.float64(0.5536928051567429), 1.0527276967420782],
 ['LASSO', np.float64(0.05655236045689057), 1.5849065374201547],
 ['decision tree', np.float64(0.7893063566014047), 0.6902533606091584],
 ['random forest', np.float64(0.8862768554472101), 0.5677494510037051],
 ['extra trees', np.float64(0.8708071601210478), 0.6028270057857417],
 ['gradient boosting', np.float64(0.8765798076602076), 0.6118144961224664],
 ['adaboost', np.float64(0.7623163298577904), 0.8569398525849267],
 ['mlp', np.float64(0.7844120483747622), 0.7765950653487003],
 ['xgboost', np.float64(0.8939965845203822), 0.550301573887915]]

In [None]:
model_df=pd.DataFrame(model_output,columns=['model_name','r2_score','mae'])

In [None]:
model_df.sort_values(by='mae')

Unnamed: 0,model_name,r2_score,mae
10,xgboost,0.893997,0.550302
5,random forest,0.886277,0.567749
6,extra trees,0.870807,0.602827
7,gradient boosting,0.87658,0.611814
4,decision tree,0.789306,0.690253
9,mlp,0.784412,0.776595
8,adaboost,0.762316,0.85694
1,svr,0.690293,0.964581
0,linear_reg,0.553673,1.052616
2,ridge,0.553693,1.052728


**OHE**

In [None]:
# adding OHE (one hot encoding) to the preprocessor and observe the changes

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['bedroom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat',OrdinalEncoder(),['property_type','balcony','luxury_category','floor_category']),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ],
    remainder='passthrough'
)

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.882078,0.539525
10,xgboost,0.895452,0.546314
5,random forest,0.877875,0.560868
1,svr,0.879905,0.606404
9,mlp,0.87695,0.614853
4,decision tree,0.803318,0.650778
7,gradient boosting,0.863168,0.667265
2,ridge,0.749492,0.814708
0,linear_reg,0.749975,0.814864
8,adaboost,0.724631,0.935123


**OHE with PCA**

In [None]:
#OHE with PCA

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('model', LinearRegression())
])

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

**Target Encoder**

In [None]:
!pip install category_encoders

In [None]:
import category_encoders as ce

In [None]:
columns_to_encode=['property_type','sector','balcony','agePossession','furnishing_type','luxury_category','floor_category']

In [None]:
preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['bedroom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat',OrdinalEncoder(),columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),

        ('cat2',ce.TargetEncoder(),['sector'])
    ],
    remainder='passthrough'

)

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae']).sort_values(['mae'])
model_df

**Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid={
    'regressor__n_estimators':[50,100,200,300],
    'regressor__max_depth':[None,10,20,30],
    'regressor__max_samples':[0.1,0.25,0.5,1.0],
    'regressor__max_features':['auto','sqrt']
}


In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [None]:
pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',RandomForestRegressor())
])



In [None]:
kfold=KFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
search=GridSearchCV(pipeline,param_grid,cv=kfold,scoring='r2',n_jobs=-1,verbose=4)

In [None]:
search.fit(X,y_transformed)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [None]:

search.best_params_

In [None]:
search.best_score_

In [None]:
final_pipe=search.best_estimator_
final_pipe.fit(X,y_transformed)

<b><font size="5">Exporting the model</font></b>


In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

with open('pipeline.pkl','wb') as file:
  pickle.dump(pipeline,file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X

**Trying out predications**

In [None]:
X.columns

In [60]:
for i in range(0,10):

    _=pd.DataFrame([X.iloc[i].values],columns=X.columns)
    print(np.expm1(pipeline.predict(_)),y[i])

[6.61527957] 7.25
[1.55370921] 1.28
[1.48340906] 1.4
[0.82947371] 0.78
[0.9885345] 1.05
[2.42851299] 2.7
[1.12194402] 1.02
[3.4709989] 3.15
[3.41441708] 3.25
[5.48065019] 5.25
