## Ridge , Lasso, Elasticnet with companyName

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import r2_score


import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("../Dataset/ProcessedFile.csv")

In [3]:
df.head()

Unnamed: 0,size_sq_ft,propertyType,bedrooms,latitude,longitude,localityName,suburbName,companyName,closest_metro_station_km,AP_dist_km,Aiims_dist_km,NDRLW_dist_km,price
0,400,Independent Floor,1,28.64101,77.284386,Swasthya Vihar,East Delhi,Other,0.577495,21.741188,11.119239,6.227231,9000
1,1050,Apartment,2,28.594969,77.298668,mayur vihar phase 1,East Delhi,Other,0.417142,21.401856,9.419061,9.217502,20000
2,2250,Independent Floor,2,28.641806,77.293922,Swasthya Vihar,East Delhi,Other,0.125136,22.620365,11.829486,7.159184,28000
3,1350,Independent Floor,2,28.644363,77.293228,Krishna Nagar,East Delhi,Other,0.371709,22.681201,11.982708,7.097348,28000
4,450,Apartment,2,28.594736,77.31115,New Ashok Nagar,East Delhi,Other,1.08776,22.59281,10.571573,10.263271,12500


In [4]:
X = df.drop(['price','localityName'], axis=1)
y = df['price']

In [5]:
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

In [6]:
ohc = OneHotEncoder()
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int64','float64'])))
X_transf = ct.fit_transform(X).toarray()
X_transf = pd.DataFrame(X_transf, columns=ct.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X_transf, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType'])

In [7]:
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi'
 'onehotencoder__companyName_A R Realtors'
 'onehotencoder__companyName_AB Estate'
 'onehotencoder__companyName_AMIT CHHABRA'
 'onehotencoder__companyName_AMPM Realtors'
 'onehotencoder__companyName_AVS Realtors'
 'onehotencoder__companyName_Abhishek yadav'
 'onehotencoder__companyName_Ahuja properties'
 'onehotencoder__companyName_Angel Property Services'
 'onehotencoder__companyName_Arjun Raj'
 'onehotencoder__companyName_Ashish Bansal'
 'onehotencoder__companyName_Ashish Talwar'
 'onehotencoder__companyName_Atul Kumar'
 'onehotencoder__companyName_B Kumar and Brothers'
 'onehotencoder__companyName_Baghla Estates'
 'onehotencoder__companyName_Bhagirathi Estat

In [8]:
X_train['passthrough__propertyType'] = X_train['passthrough__propertyType'].astype(object)
X_test['passthrough__propertyType'] = X_test['passthrough__propertyType'].astype(object)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [9]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

## Ridge

In [10]:
ridge = Ridge()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
params = {'alpha':np.arange(1,20)}
gcv = GridSearchCV(ridge, param_grid=params, cv=kfold,scoring='r2',n_jobs=-1)
gcv.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
             scoring='r2')

In [11]:
print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 17}
0.39185870465761463


In [12]:
#Coefficients
best_model = gcv.best_estimator_
print(best_model.coef_)

[ 1.00819266e+04 -7.74226836e+03 -4.10963806e+03  3.47183838e+03
 -1.63468116e+03 -1.18635410e+03 -4.75808865e+03  5.87726533e+03
 -1.73604441e+04 -6.77713395e+03 -2.35408933e+03  1.55448326e+04
 -1.18419429e+03 -3.95137531e+03 -2.31015260e+04 -2.05127147e+04
 -2.82158721e+03  5.33956804e+03 -6.44129712e+03 -3.73133135e+04
  4.45682593e+05 -1.10881322e+04  3.34943875e+03  9.93004646e+02
  4.96394525e+04 -7.00509331e+03  2.76366124e+03 -6.54083886e+03
 -5.45269181e+03 -1.94785986e+04 -3.90845846e+03  5.09080186e+00
 -3.02381251e+03  5.55917439e+04 -2.04970481e+03 -3.64448123e+03
 -2.72227813e+04  5.33753100e+03 -3.39650193e+04 -2.30788551e+04
  0.00000000e+00  2.28283495e+04 -1.15186349e+04 -6.81772916e+04
 -2.44315320e+04 -2.08136067e+03 -7.38244557e+03 -4.29972786e+03
  0.00000000e+00 -5.89901205e+03  7.28392302e+03 -1.16118419e+04
 -9.26588554e+04 -2.45155422e+04 -4.18679008e+04 -1.65153426e+02
 -1.63523099e+03 -4.99901659e+03  1.54384220e+04 -3.25325659e+04
  1.21833356e+04  4.64335

In [13]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.5097321902871619


## Lasso

In [14]:
lasso = Lasso()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
params = {'alpha':np.arange(1,50)}
gcv = GridSearchCV(lasso, param_grid=params, cv=kfold,scoring='r2',n_jobs=-1)
gcv.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
             scoring='r2')

In [15]:
print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 49}
0.34038544471761767


In [16]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.48383863078914646


## Elasticnet

In [17]:
elastic = ElasticNet()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
params = {'alpha':[0.01,0.1,0.5,1,2,3,6,10],'l1_ratio':[0,0.25,0.5,0.75,1]}
gcv = GridSearchCV(elastic, param_grid=params, cv=kfold,scoring='r2',n_jobs=-1)
gcv.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=ElasticNet(), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1, 2, 3, 6, 10],
                         'l1_ratio': [0, 0.25, 0.5, 0.75, 1]},
             scoring='r2')

In [18]:
print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 0.01, 'l1_ratio': 0.75}
0.38561988806984343


In [19]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.4903385056723052


## Ridge , Lasso, Elasticnet without companyName

In [20]:
X = df.drop(['price','localityName','companyName'], axis=1)
y = df['price']

In [21]:
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

In [22]:
ohc = OneHotEncoder()
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int64','float64'])))
X_transf = ct.fit_transform(X)
X_transf = pd.DataFrame(X_transf, columns=ct.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X_transf, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType'])

In [23]:
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi' 'passthrough__size_sq_ft'
 'passthrough__propertyType' 'passthrough__bedrooms'
 'passthrough__latitude' 'passthrough__longitude'
 'passthrough__closest_metro_station_km' 'passthrough__AP_dist_km'
 'passthrough__Aiims_dist_km' 'passthrough__NDRLW_dist_km']


In [24]:
X_train['passthrough__propertyType'] = X_train['passthrough__propertyType'].astype(object)
X_test['passthrough__propertyType'] = X_test['passthrough__propertyType'].astype(object)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [25]:
## Ridge
ridge = Ridge()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
params = {'alpha':np.arange(1,20)}
gcv = GridSearchCV(ridge, param_grid=params, cv=kfold,scoring='r2',n_jobs=-1)
gcv.fit(X_train,y_train)

print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 19}
0.2810062102417459


In [26]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.33209238079293146


In [27]:
## Lasso
lasso = Lasso()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
params = {'alpha':np.arange(1,50)}
gcv = GridSearchCV(lasso, param_grid=params, cv=kfold,scoring='r2',n_jobs=-1)
gcv.fit(X_train,y_train)

print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 49}
0.2824108302417986


In [28]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.3296187013220817


In [29]:
## Elasticnet 

elastic = ElasticNet()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
params = {'alpha':[0.01,0.1,0.5,1,2,3,6,10],'l1_ratio':[0,0.25,0.5,0.75,1]}
gcv = GridSearchCV(elastic, param_grid=params, cv=kfold,scoring='r2',n_jobs=-1)
gcv.fit(X_train,y_train)

print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 10, 'l1_ratio': 0}
0.30072525530845207


In [30]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.3197846003498559
