<a href="https://colab.research.google.com/github/shriju/real-estate-project-end-to-end/blob/main/insights_module_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('/content/gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room','floor_category','balcony'])


In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,2.0,New Property,850.0,0.0,0.0,Low
1,flat,sector 89,0.95,2.0,2.0,New Property,1226.0,1.0,0.0,Low
2,flat,sohna road,0.32,2.0,2.0,New Property,1000.0,0.0,0.0,Low
3,flat,sector 92,1.6,3.0,4.0,Relatively New,1615.0,1.0,1.0,High
4,flat,sector 102,0.48,2.0,2.0,Relatively New,582.0,0.0,0.0,High


In [None]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished

In [None]:
# Numerical = bedRoom, bathroom, built_up_area, servant room
# Ordinal = property_type, furnishing_type, luxury_category
# OHE = sector, agePossession

In [4]:
df['agePossession'].value_counts()

Relatively New        1732
Moderately Old         619
New Property           599
Old Property           327
Under Construction     277
Name: agePossession, dtype: int64

In [5]:
df['agePossession'] = df['agePossession'].replace(
    {
        'Relatively New':'new',
        'Moderately Old':'old',
        'New Property' : 'new',
        'Old Property' : 'old',
        'Under Construction' : 'under construction'
    }
)

In [6]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,2.0,new,850.0,0.0,0.0,Low
1,flat,sector 89,0.95,2.0,2.0,new,1226.0,1.0,0.0,Low
2,flat,sohna road,0.32,2.0,2.0,new,1000.0,0.0,0.0,Low
3,flat,sector 92,1.6,3.0,4.0,new,1615.0,1.0,1.0,High
4,flat,sector 102,0.48,2.0,2.0,new,582.0,0.0,0.0,High


In [7]:
df['property_type'] = df['property_type'].replace({'flat':0,'house':1})

In [8]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 36,0.82,3.0,2.0,new,850.0,0.0,0.0,Low
1,0,sector 89,0.95,2.0,2.0,new,1226.0,1.0,0.0,Low
2,0,sohna road,0.32,2.0,2.0,new,1000.0,0.0,0.0,Low
3,0,sector 92,1.6,3.0,4.0,new,1615.0,1.0,1.0,High
4,0,sector 102,0.48,2.0,2.0,new,582.0,0.0,0.0,High


In [9]:
df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2})

In [10]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 36,0.82,3.0,2.0,new,850.0,0.0,0.0,0
1,0,sector 89,0.95,2.0,2.0,new,1226.0,1.0,0.0,0
2,0,sohna road,0.32,2.0,2.0,new,1000.0,0.0,0.0,0
3,0,sector 92,1.6,3.0,4.0,new,1615.0,1.0,1.0,2
4,0,sector 102,0.48,2.0,2.0,new,582.0,0.0,0.0,2


In [11]:
df.shape

(3554, 10)

In [12]:
new_df=pd.get_dummies(df,columns=['sector','agePossession'],drop_first=True)

In [13]:
new_df.shape

(3554, 113)

In [14]:
X = new_df.drop(columns=['price'])
y = new_df['price']

In [15]:
y_log = np.log1p(y)

In [16]:
y_log

0       0.598837
1       0.667829
2       0.277632
3       0.955511
4       0.392042
          ...   
3549    0.314811
3550    1.945910
3551    0.470004
3552    2.803360
3553    1.022451
Name: price, Length: 3554, dtype: float64

In [17]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [18]:
X_scaled = pd.DataFrame(X_scaled,columns=X.columns)

In [19]:
X_scaled

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_gwal pahari,sector_manesar,sector_sector 1,...,sector_sector 9,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sohna road,agePossession_old,agePossession_under construction
0,-0.517180,-0.074329,-0.874300,-0.831662,-0.747968,-0.668281,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.602271,-0.290738
1,-0.517180,-0.877269,-0.874300,-0.522517,1.336956,-0.668281,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.602271,-0.290738
2,-0.517180,-0.877269,-0.874300,-0.708333,-0.747968,-0.668281,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,4.561105,-0.602271,-0.290738
3,-0.517180,-0.074329,0.505173,-0.202684,1.336956,1.037949,1.866207,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,5.877074,-0.050386,-0.125375,-0.109357,-0.219245,-0.602271,-0.290738
4,-0.517180,-0.877269,-0.874300,-1.052010,-0.747968,-0.668281,1.866207,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.602271,-0.290738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,-0.517180,-0.877269,-0.874300,-1.093119,-0.747968,-0.668281,0.440783,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.602271,-0.290738
3550,1.933563,1.531549,1.194909,3.590095,1.336956,-0.668281,1.866207,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.602271,-0.290738
3551,-0.517180,-1.680208,-1.564036,-0.983768,-0.747968,1.037949,0.440783,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,1.660383,-0.290738
3552,1.933563,1.531549,1.884645,2.983317,1.336956,-0.668281,0.440783,-0.071348,-0.093805,-0.041123,...,-0.078923,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,1.660383,-0.290738


In [20]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(LinearRegression(), X_scaled, y_log, cv=kfold, scoring='r2')

In [21]:
scores.mean(),scores.std()

(0.8512613057405425, 0.016992929105286218)

In [67]:
lr = LinearRegression()
ridge = Ridge(alpha=0.0001)

In [68]:
lr.fit(X_scaled,y_log)


In [62]:
ridge.fit(X_scaled, y_log)

In [69]:
lr.coef_

array([ 1.20165036e-01,  5.40015778e-02,  6.51193988e-02,  2.10637561e-01,
        5.09461847e-02,  8.28171481e-03,  6.18569187e-03,  9.81917509e-03,
       -2.27136751e-02, -5.19263751e-03,  5.24027544e-03,  2.69999569e-02,
       -2.90628642e-03,  1.96385005e-03, -1.93922433e-02,  5.72258238e-04,
       -1.29872300e-02,  1.68170547e-02,  2.61496408e-02, -1.49660399e-02,
        8.73379008e-03,  1.62286944e-02,  3.08257676e-02,  3.21709327e-02,
       -1.94575986e-02, -1.22503903e-02,  2.87410970e-02,  3.29139011e-03,
        1.39170889e-02,  6.72176140e-03, -9.41789321e-03,  3.54737020e-02,
        2.19599278e-03,  1.77019789e-02,  5.75604298e-02,  7.38834785e-02,
        6.74158385e-03,  4.18540805e-02, -1.27147058e-02,  6.27615381e-03,
        2.24917111e-02,  2.59569793e-02,  2.49451572e-03, -1.14658162e-02,
        1.14313378e-03,  1.21374481e-02,  2.57701788e-03, -2.07745570e-02,
        7.65349283e-03,  1.67856368e-02,  6.03752164e-02,  2.69570318e-02,
        1.58617969e-02,  

In [63]:
ridge.coef_

array([ 1.20165033e-01,  5.40015843e-02,  6.51193965e-02,  2.10637554e-01,
        5.09461864e-02,  8.28171743e-03,  6.18569538e-03,  9.81907453e-03,
       -2.27138067e-02, -5.19269573e-03,  5.24021239e-03,  2.69997136e-02,
       -2.90643912e-03,  1.96365747e-03, -1.93923489e-02,  5.72112298e-04,
       -1.29874109e-02,  1.68168747e-02,  2.61494229e-02, -1.49661346e-02,
        8.73367597e-03,  1.62285832e-02,  3.08256534e-02,  3.21707775e-02,
       -1.94577320e-02, -1.22504613e-02,  2.87409954e-02,  3.29133160e-03,
        1.39170137e-02,  6.72157638e-03, -9.41795148e-03,  3.54735949e-02,
        2.19590678e-03,  1.77018721e-02,  5.75602841e-02,  7.38833273e-02,
        6.74154978e-03,  4.18539312e-02, -1.27148213e-02,  6.27609996e-03,
        2.24916348e-02,  2.59567879e-02,  2.49440485e-03, -1.14659854e-02,
        1.14294781e-03,  1.21373726e-02,  2.57693909e-03, -2.07747149e-02,
        7.65343901e-03,  1.67855570e-02,  6.03750346e-02,  2.69569636e-02,
        1.58617170e-02,  

In [25]:
lr.coef_.shape

(112,)

In [70]:
coef_df = pd.DataFrame(lr.coef_.reshape(1,112), columns=X.columns).stack().reset_index().drop(columns=['level_0']).rename(columns={'level_1':'feature', 0:'coefficient'}) # finding out the coefficients of each feature

In [65]:
coef_ridge = pd.DataFrame(ridge.coef_.reshape(1,112), columns=X.columns).stack().reset_index().drop(columns=['level_0']).rename(columns={'level_1':'feature', 0:'coefficient'}) # finding out the coefficients of each feature

In [66]:
coef_ridge

Unnamed: 0,feature,coefficient
0,property_type,0.120165
1,bedRoom,0.054002
2,bathroom,0.065119
3,built_up_area,0.210638
4,servant room,0.050946
...,...,...
107,sector_sector 95,-0.025222
108,sector_sector 99,-0.010312
109,sector_sohna road,-0.029515
110,agePossession_old,-0.007900


In [35]:
coef_df

Unnamed: 0,feature,coefficient
0,property_type,0.120165
1,bedRoom,0.054002
2,bathroom,0.065119
3,built_up_area,0.210638
4,servant room,0.050946
...,...,...
107,sector_sector 95,-0.025222
108,sector_sector 99,-0.010312
109,sector_sohna road,-0.029515
110,agePossession_old,-0.007900


### Regression Analysis

In [36]:
# 1. Import necessary libraries
import statsmodels.api as sm

# 2. Add a constant to X
X_with_const = sm.add_constant(X_scaled)

# 3. Fit the model
model = sm.OLS(y_log, X_with_const).fit()

# 4. Obtain summary statistics
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                     196.7
Date:                Fri, 17 Nov 2023   Prob (F-statistic):               0.00
Time:                        17:49:05   Log-Likelihood:                 588.22
No. Observations:                3554   AIC:                            -950.4
Df Residuals:                    3441   BIC:                            -252.6
Df Model:                         112                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [37]:
y_log.std()

0.5579613263072812

In [38]:
X['bedRoom'].std()

1.2455995038118572

In [39]:
# bedroom, unstandardized coefficient for bedroom
0.054 *(0.557/1.245)

0.024159036144578313

In [None]:
# still y is in log form, so we need to take exponent of 0.024159036144578313

In [40]:
np.expm1(0.0241) # final value of coeffient of bedroom in unstanderdized form

0.024392752044032906

In [41]:
X_scaled['built_up_area'].std() # for built up area

1.000140716246387

In [42]:
0.2106 *(0.557/1.000)

0.11730420000000001

In [43]:
np.expm1(0.117)

0.12411942969053685