In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [58]:
df = pd.read_csv('final_data.csv')
df = df.drop(['Title','Price_per_sqft','Is_in_Dhaka'],axis=1)
df.head()

Unnamed: 0,Bedrooms,Bathrooms,Floor_area,City,location_area,Price_in_Cr,floor_level
0,3.0,4.0,1960.0,dhaka,gulshan,3.9,low
1,3.0,3.0,1705.0,dhaka,kalabagan,1.69,low
2,3.0,3.0,1370.0,dhaka,dhanmondi,1.25,medium
3,3.0,3.0,2125.0,dhaka,bashundhara,2.0,medium
4,3.0,3.0,2687.0,dhaka,banani,4.75,medium


Removing the Price_per_sqft column because customers shouldn't have a knowledge about the Price_per_sqft either

In [48]:
x = df.drop(['Price_in_Cr'],axis=1)
y = df['Price_in_Cr']

In [49]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [50]:
from sklearn.compose import ColumnTransformer,TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PowerTransformer,OneHotEncoder,OrdinalEncoder
from category_encoders import TargetEncoder  
from sklearn.linear_model import LinearRegression

In [51]:
transformer = ColumnTransformer([
    ('Onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False),['City','Bedrooms','Bathrooms','location_area']),
    ('Ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),['floor_level']),
    # ('Target_Encoding',TargetEncoder(),['location_area']),
    ('Scaler',StandardScaler(),['Floor_area'])
],remainder='passthrough')

model_pipe = Pipeline([
    ('preprocessing',transformer),
    ('model',LinearRegression())
])



In [52]:
from sklearn.model_selection import cross_val_score,KFold

cv_original = cross_val_score(estimator=model_pipe,X=x_train,y=y_train,scoring='r2',cv=KFold(n_splits=10,shuffle=True,random_state=0))
cv_original.mean()

np.float64(0.8701043092933576)

In [57]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

model_pipe.fit(x_train,y_train)
y_pred = model_pipe.predict(x_test)

MSE = mean_squared_error(y_test,y_pred)
MAE = mean_absolute_error(y_test,y_pred)

print(f'Mean Squared Error: {MSE}')

print(f'Mean Absolute Error: {MAE}')


Mean Squared Error: 0.07760002237245646
Mean Absolute Error: 0.16690770006811104


In [54]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'lasso': Lasso(), 
    'decision_tree': DecisionTreeRegressor(),
    'random_forest': RandomForestRegressor(),
    'extra_trees': ExtraTreesRegressor(),
    'gradient_boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost': XGBRegressor(),
    'lightgbm':LGBMRegressor()
}

# (1) Without Feature Transformation

In [60]:
transformer = ColumnTransformer([
    ('Onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False),['City','Bedrooms','Bathrooms','location_area']),
    ('Ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),['floor_level']),
    # ('Target_Encoding',TargetEncoder(),['location_area']),
    ('Scaler',StandardScaler(),['Floor_area'])
],remainder='passthrough')


def score(model_name,model):

    output = []
    output.append(model_name)

    model_pipe = Pipeline([
        ('preprocessor',transformer),
        ('regressor',model)
    ])

    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

    # cv = cross_val_score(model_pipe,x_train,y_train,cv=KFold(n_splits=10,shuffle=True,random_state=0),scoring='r2') 
    # output.append(cv.mean())

    model_pipe.fit(x_train,y_train)
    y_pred = model_pipe.predict(x_test)

    R2 = r2_score(y_test,y_pred)
    MSE = mean_squared_error(y_test,y_pred)
    MAE = mean_absolute_error(y_test,y_pred)

    output.append(R2)
    output.append(MSE)
    output.append(MAE)

    return output


model_output  = []
for model_name,model in model_dict.items():
    model_output.append(score(model_name,model))

model_output = pd.DataFrame(model_output,columns=['model_name','r2_score','mse','mae']).sort_values(by='mse',ascending=True)
model_output

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 1681, number of used features: 33
[LightGBM] [Info] Start training from score 0.919131


Unnamed: 0,model_name,r2_score,mse,mae
9,mlp,0.889899,0.053944,0.136656
10,xgboost,0.860554,0.068321,0.142653
6,extra_trees,0.859236,0.068967,0.136304
7,gradient_boosting,0.846171,0.075368,0.159702
2,ridge,0.844019,0.076422,0.1716
0,linear_reg,0.841616,0.0776,0.166908
1,svr,0.838414,0.079169,0.149095
5,random_forest,0.806552,0.094779,0.140958
11,lightgbm,0.791573,0.102119,0.165842
4,decision_tree,0.715159,0.139557,0.167137


# (2) With Feature Transformation (Linear Assumptions)

In [59]:
x_train2,x_test2,y_train2,y_test2 = train_test_split(x,y,test_size=0.2,random_state=0)


floor_area_pipe = Pipeline([
    ('log_transform',PowerTransformer(method='yeo-johnson')),
    ('scalar',StandardScaler())
])

transformer = ColumnTransformer([
    ("floor_area",floor_area_pipe,['Floor_area']),
    ("log_transform",PowerTransformer(method='yeo-johnson'),['Floor_area']),
    ('Onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False),['City','Bedrooms','Bathrooms','location_area']),
    ('Ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),['floor_level']),
    # ('Target_Encoding',TargetEncoder(random_state=0),['location_area']),
    # ('Scaler',StandardScaler(),['Floor_area'])
],remainder='passthrough')


def score_transformed(model_name,model):

    output = []
    output.append(model_name)

    model_pipe = Pipeline([
        ('preprocessor',transformer),
        ('regressor',model)
    ])

    final_pipe = TransformedTargetRegressor(regressor=model_pipe,func=np.log1p,inverse_func=np.expm1)


    # cv = cross_val_score(final_pipe,x_train2,y_train2,cv=KFold(n_splits=10,shuffle=True,random_state=0),scoring='r2') 
    # output.append(cv.mean())

    final_pipe.fit(x_train2,y_train2)
    y_pred2 = final_pipe.predict(x_test2)

    R2 = r2_score(y_test2,y_pred2)
    MSE = mean_squared_error(y_test2,y_pred2)
    MAE = mean_absolute_error(y_test2,y_pred2)

    output.append(R2)
    output.append(MSE)
    output.append(MAE)

    return output



model_output_transformed  = []
for model_name,model in model_dict.items():
    model_output_transformed.append(score_transformed(model_name,model))

model_output_transformed = pd.DataFrame(model_output_transformed,
                                        columns=['model_name','r2_score','mse','mae']).sort_values(by='mse',ascending=True)
model_output_transformed

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 573
[LightGBM] [Info] Number of data points in the train set: 1681, number of used features: 34
[LightGBM] [Info] Start training from score 0.607124


Unnamed: 0,model_name,r2_score,mse,mae
9,mlp,0.89084,0.053483,0.139576
1,svr,0.870947,0.063229,0.15583
10,xgboost,0.866515,0.065401,0.139492
0,linear_reg,0.858094,0.069526,0.158914
2,ridge,0.847247,0.074841,0.161986
6,extra_trees,0.84691,0.075006,0.137463
7,gradient_boosting,0.833376,0.081637,0.157017
11,lightgbm,0.806209,0.094948,0.163231
5,random_forest,0.798523,0.098713,0.140014
8,adaboost,0.717671,0.138327,0.243162
