In [3]:

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [5]:
import sklearn
print(sklearn.__version__)

1.7.0


In [7]:
df = pd.read_csv('data for model selection for price prediction_without encoding.csv')

In [9]:
df

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,study room,servant room,store room,others,furnishing_type,luxury_category,floor_category
0,flat,sector 106,1.20,3,3,3,Relatively New,1145.000000,1,0,0,0,0,High,High Floor
1,flat,sector 9a,1.25,4,3,3,Old Property,1994.777045,0,1,0,0,0,Low,Low Floor
2,flat,sector 77,0.86,2,2,2,New Property,1186.324045,0,0,0,0,1,Medium,High Floor
3,flat,sector 33 road,1.35,3,2,3+,New Property,1378.931760,0,0,0,0,0,Low,High Floor
4,flat,sector 81,2.21,3,3,3+,Relatively New,1866.283075,0,1,0,0,2,Medium,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3580,flat,sector 107,1.15,2,2,3,Relatively New,1300.935993,1,0,0,0,0,Low,Low Floor
3581,flat,sector 107,0.36,2,2,2,Relatively New,733.708470,0,0,0,0,0,Low,Mid Floor
3582,flat,sector 85,1.49,4,5,3+,Relatively New,2017.000000,0,1,0,0,1,Medium,Mid Floor
3583,flat,sector 88a,1.31,2,2,3+,Relatively New,1432.682751,1,0,0,0,0,Medium,Mid Floor


In [11]:
# Took data before encoding because we will do encoding also in pipeline

In [13]:
df['furnishing_type'].value_counts()

furnishing_type
0    2402
1     998
2     185
Name: count, dtype: int64

In [15]:
# Manually converting furnishing type column back to categorical

df['furnishing_type'] = df['furnishing_type'].replace({0:'unfurnished',1:'semifurnished',2:'furnished'})

In [17]:
df.head(10)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,study room,servant room,store room,others,furnishing_type,luxury_category,floor_category
0,flat,sector 106,1.2,3,3,3,Relatively New,1145.0,1,0,0,0,unfurnished,High,High Floor
1,flat,sector 9a,1.25,4,3,3,Old Property,1994.777045,0,1,0,0,unfurnished,Low,Low Floor
2,flat,sector 77,0.86,2,2,2,New Property,1186.324045,0,0,0,0,semifurnished,Medium,High Floor
3,flat,sector 33 road,1.35,3,2,3+,New Property,1378.93176,0,0,0,0,unfurnished,Low,High Floor
4,flat,sector 81,2.21,3,3,3+,Relatively New,1866.283075,0,1,0,0,furnished,Medium,Mid Floor
5,flat,sohna road road,0.54,2,1,3,New Property,932.03379,0,0,0,0,unfurnished,Medium,Low Floor
6,flat,sector 69,2.06,3,3,3,Relatively New,1607.373469,0,0,0,0,unfurnished,Low,High Floor
7,flat,sector 92,0.95,4,5,3+,Relatively New,1895.0,0,0,0,0,unfurnished,Low,High Floor
8,flat,sector 88a,1.4,3,3,3+,Relatively New,1641.415763,0,0,0,0,semifurnished,Low,Mid Floor
9,flat,sector 86,1.93,3,4,3,Relatively New,2031.110444,0,1,0,0,unfurnished,Medium,High Floor


In [19]:
x = df.drop(columns = ['price'])
y = df['price']

In [21]:
# Applyin log1p for target variable due to its right skewness
y_transformed = np.log1p(y)

# Ordinal Encoding

In [24]:
columns_to_encode = ['property_type', 'sector', 'balcony','agePossession','furnishing_type','luxury_category','floor_category']

In [26]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value=-1), columns_to_encode)
    ],
    remainder = 'passthrough'
)

In [28]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [30]:
# KFold Cross validation
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
scores = cross_val_score(pipeline, x, y_transformed, cv = kfold, scoring = 'r2')

In [32]:
scores.mean(), scores.std()

(0.7351675059748926, 0.018499979714666864)

In [34]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y_transformed, test_size = 0.2, random_state = 42)

In [36]:
pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [38]:
y_pred = pipeline.predict(x_test)

In [40]:
y_pred = np.expm1(y_pred)


In [42]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test), y_pred)

0.9422454443858292

In [44]:
# Function for calucating different scores after training different models

def scorer(model_name, model):
    output = []

    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    # k fold cross validation
    kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
    scores = cross_val_score(pipeline, x,y_transformed, cv = kfold, scoring = 'r2')

    output.append(scores.mean())
    x_train,x_test, y_train,  y_test = train_test_split(x,y_transformed, test_size = 0.2, random_state = 42)
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    return output

In [46]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [48]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [56]:
model_output

[['linear_reg', 0.7351675059748926, 0.9422454443858292],
 ['svr', 0.7524437421709462, 0.8686675273357478],
 ['ridge', 0.7351694074589539, 0.9421894404297665],
 ['LASSO', 0.049388709901219276, 1.5290449216329298],
 ['decision tree', 0.776338872163717, 0.7105460122566983],
 ['random forest', 0.8786776733262129, 0.5446314695837012],
 ['extra trees', 0.8668768462399387, 0.5831957757963129],
 ['gradient boosting', 0.873799895628548, 0.5871841524346829],
 ['adaboost', 0.7475371613477891, 0.8891071815035011],
 ['mlp', 0.7975136280059891, 0.7122189673180331],
 ['xgboost', 0.8899917463080362, 0.5169875270485711]]

In [58]:
model_df = pd.DataFrame(model_output, columns = ['name','r2','mae'])

In [60]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889992,0.516988
5,random forest,0.878678,0.544631
6,extra trees,0.866877,0.583196
7,gradient boosting,0.8738,0.587184
4,decision tree,0.776339,0.710546
9,mlp,0.797514,0.712219
1,svr,0.752444,0.868668
8,adaboost,0.747537,0.889107
2,ridge,0.735169,0.942189
0,linear_reg,0.735168,0.942245


### Conclusion
- In Ordinal encoding, Tree based algos are giving best results
- Linear algorithms are bahaving bad because an order has been introduced

# One Hot Encoding

In [64]:
# Create a collumn transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1', OneHotEncoder(drop = 'first',handle_unknown = 'ignore'),['sector','agePossession','furnishing_type'])
    ],
    remainder = 'passthrough'
)

In [66]:
# Creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])    

In [68]:
# KFold cross validation

kfold = KFold(n_splits =10, shuffle = True, random_state = 42)
scores = cross_val_score(pipeline, x, y_transformed, cv = kfold, scoring = 'r2')



In [70]:
scores.mean(), scores.std()

(0.8518905174170335, 0.020094149210952924)

In [72]:
x_train, x_test, y_train, y_test = train_test_split(x,y_transformed, test_size = 0.2, random_state = 42)

In [74]:
pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [76]:
y_pred = pipeline.predict(x_test)

In [78]:
y_pred = np.expm1(y_pred)
mean_absolute_error(np.expm1(y_test), y_pred)

0.707491947428465

In [80]:
# Function for calucating different scores after training different models

def scorer(model_name, model):
    output = []

    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    # k fold cross validation
    kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
    scores = cross_val_score(pipeline, x,y_transformed, cv = kfold, scoring = 'r2')

    output.append(scores.mean())
    x_train,x_test, y_train,  y_test = train_test_split(x,y_transformed, test_size = 0.2, random_state = 42)
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    return output

In [82]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [83]:
model_output

[['linear_reg', 0.8518905174170335, 0.707491947428465],
 ['svr', 0.7566749558257183, 0.8604955234385939],
 ['ridge', 0.8520982947351454, 0.7142409923541285],
 ['LASSO', 0.04938870990121439, 1.529044921632932],
 ['decision tree', 0.7950415633360292, 0.7256393230128143],
 ['random forest', 0.8882035260140775, 0.5198928585452729],
 ['extra trees', 0.8904294293430108, 0.5233973107971913],
 ['gradient boosting', 0.8751476268077731, 0.5822770923333362],
 ['adaboost', 0.754175500990155, 0.844046439398545],
 ['mlp', 0.864410043940994, 0.5785185631868647],
 ['xgboost', 0.8970585573832052, 0.5100219876688061]]

In [84]:
model_df = pd.DataFrame(model_output, columns = ['name','r2','mae'])

In [85]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.897059,0.510022
5,random forest,0.888204,0.519893
6,extra trees,0.890429,0.523397
9,mlp,0.86441,0.578519
7,gradient boosting,0.875148,0.582277
0,linear_reg,0.851891,0.707492
2,ridge,0.852098,0.714241
4,decision tree,0.795042,0.725639
8,adaboost,0.754176,0.844046
1,svr,0.756675,0.860496


# OHE With PCA

In [87]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown = 'ignore'),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [88]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    # Basically model laane se pahle dimensionality kam kr rhe hai through PCA
    ('regressor', LinearRegression())
])

In [89]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')



In [90]:
scores.mean(), scores.std()

(0.05201155660307631, 0.019775024801624238)

In [91]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(x_train,y_train)
    
    y_pred = pipeline.predict(x_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [92]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [93]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [94]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [95]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.741517,0.808862
6,extra trees,0.713433,0.829054
4,decision tree,0.671184,0.905605
10,xgboost,0.604796,0.952638
7,gradient boosting,0.606412,1.070879
1,svr,0.227339,1.360057
9,mlp,0.211988,1.407098
8,adaboost,0.28847,1.445852
0,linear_reg,0.052012,1.526809
2,ridge,0.052012,1.526809
