In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv(r'C:\Users\pradeep dubey\Desktop\project\Real_State_Project\data\feature_engineering\gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room','floor_category','balcony'])

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 70a,1.6,4.0,4.0,Relatively New,1600.0,0,0,Low
1,house,sector 111,9.0,5.0,5.0,Relatively New,8000.0,1,0,Medium
2,flat,sector 74,2.25,3.0,3.0,Relatively New,1900.0,0,0,High
3,house,sector 55,5.6,6.0,4.0,Old Property,2700.0,1,2,Low
4,house,sector 43,2.45,4.0,4.0,Moderately Old,540.0,1,1,Medium


In [4]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished

In [5]:
# Numerical = bedRoom, bathroom, built_up_area, servant room
# Ordinal = property_type, furnishing_type, luxury_category 
# OHE = sector, agePossession

In [6]:
df['agePossession'] = df['agePossession'].replace(
    {
        'Relatively New':'new',
        'Moderately Old':'old',
        'New Property' : 'new',
        'Old Property' : 'old',
        'Under Construction' : 'under construction'
    }
)

In [7]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 70a,1.6,4.0,4.0,new,1600.0,0,0,Low
1,house,sector 111,9.0,5.0,5.0,new,8000.0,1,0,Medium
2,flat,sector 74,2.25,3.0,3.0,new,1900.0,0,0,High
3,house,sector 55,5.6,6.0,4.0,old,2700.0,1,2,Low
4,house,sector 43,2.45,4.0,4.0,old,540.0,1,1,Medium


In [8]:
df['property_type'] = df['property_type'].replace({'flat':0,'house':1})

  df['property_type'] = df['property_type'].replace({'flat':0,'house':1})


In [9]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 70a,1.6,4.0,4.0,new,1600.0,0,0,Low
1,1,sector 111,9.0,5.0,5.0,new,8000.0,1,0,Medium
2,0,sector 74,2.25,3.0,3.0,new,1900.0,0,0,High
3,1,sector 55,5.6,6.0,4.0,old,2700.0,1,2,Low
4,1,sector 43,2.45,4.0,4.0,old,540.0,1,1,Medium


In [10]:
df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2})

  df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2})


In [11]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 70a,1.6,4.0,4.0,new,1600.0,0,0,0
1,1,sector 111,9.0,5.0,5.0,new,8000.0,1,0,1
2,0,sector 74,2.25,3.0,3.0,new,1900.0,0,0,2
3,1,sector 55,5.6,6.0,4.0,old,2700.0,1,2,0
4,1,sector 43,2.45,4.0,4.0,old,540.0,1,1,1


In [12]:
new_df = pd.get_dummies(df,columns=['sector','agePossession'],drop_first=True)

In [13]:
X = new_df.drop(columns=['price'])
y = new_df['price']

In [14]:
y_log = np.log1p(y)

In [15]:
y_log

0       0.955511
1       2.302585
2       1.178655
3       1.887070
4       1.238374
          ...   
3548    3.349904
3549    1.022451
3550    0.667829
3551    0.662688
3552    1.081805
Name: price, Length: 3553, dtype: float64

In [16]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [17]:
X_scaled = pd.DataFrame(X_scaled,columns=X.columns)

In [18]:
X_scaled

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_gwal pahari,sector_manesar,sector_new,...,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sector 99a,sector_sector 9a,sector_sohna road,sector_sohna road road,agePossession_old,agePossession_under construction
0,-0.516835,0.728339,0.504910,-0.215456,-0.748132,-0.685482,-0.984915,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,-0.606292,-0.197203
1,1.934854,1.531252,1.194623,5.047388,1.336663,-0.685482,0.440504,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,-0.606292,-0.197203
2,-0.516835,-0.074574,-0.184804,0.031240,-0.748132,-0.685482,1.865922,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,-0.606292,-0.197203
3,1.934854,2.334166,0.504910,0.689095,1.336663,1.554074,-0.984915,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,1.649369,-0.197203
4,1.934854,0.728339,0.504910,-1.087115,1.336663,0.434296,0.440504,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,1.649369,-0.197203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3548,1.934854,2.334166,2.574050,7.749529,1.336663,1.554074,-0.984915,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,-0.606292,-0.197203
3549,-0.516835,-0.877487,-0.874517,-0.230258,-0.748132,-0.685482,0.440504,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,-0.606292,-0.197203
3550,-0.516835,-0.877487,-0.874517,-0.639773,-0.748132,1.554074,0.440504,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,1.649369,-0.197203
3551,-0.516835,-0.877487,-0.874517,-0.514780,-0.748132,-0.685482,0.440504,-0.071358,-0.093818,-0.033572,...,-0.170177,-0.050393,-0.125392,-0.058214,-0.092279,-0.053127,-0.211407,-0.055728,1.649369,-0.197203


In [19]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(LinearRegression(), X_scaled, y_log, cv=kfold, scoring='r2')

In [20]:
scores.mean(),scores.std()

(0.8521029622610794, 0.020282328781258456)

In [21]:
lr = LinearRegression()
ridge = Ridge(alpha=0.0001)

In [22]:
lr.fit(X_scaled,y_log)

In [23]:
ridge.fit(X_scaled,y_log)

In [24]:
coef_df = pd.DataFrame(ridge.coef_.reshape(1,112),columns=X.columns).stack().reset_index().drop(columns=['level_0']).rename(columns={'level_1':'feature',0:'coef'})

ValueError: cannot reshape array of size 121 into shape (1,112)

In [25]:
coef_df

NameError: name 'coef_df' is not defined

In [26]:
# 1. Import necessary libraries
import statsmodels.api as sm

# 2. Add a constant to X
X_with_const = sm.add_constant(X_scaled)

# 3. Fit the model
model = sm.OLS(y_log, X_with_const).fit()

# 4. Obtain summary statistics
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.861
Method:                 Least Squares   F-statistic:                     182.3
Date:                Wed, 07 Aug 2024   Prob (F-statistic):               0.00
Time:                        00:44:59   Log-Likelihood:                 593.90
No. Observations:                3553   AIC:                            -943.8
Df Residuals:                    3431   BIC:                            -190.4
Df Model:                         121                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [27]:
y_log.std()

0.5580360979062747

In [28]:
X_scaled['bedRoom'].std()

1.0001407558596598

In [29]:
0.21 * (0.557/1)

0.11697

In [30]:
np.expm1(0.030)

0.030454533953516858

In [31]:
2.4726962617564903e-05 * 100

0.0024726962617564905