In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
df = pd.read_csv(("House_Rent_Dataset.csv"))
display(df)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner
...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2022-05-18,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner
4742,2022-05-15,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner
4743,2022-07-10,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent
4744,2022-07-06,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent


In [3]:
df = df.drop(["Posted On"],axis = 1)

In [4]:
categorical1 = df["Floor"].astype('category')
df["Floor"] = categorical1.cat.codes + 1

In [5]:
categorical2 = df["Area Type"].astype('category')
df["Area Type"] = categorical2.cat.codes + 1

In [6]:
categorical3 = df["Area Locality"].astype('category')
df["Area Locality"] = categorical3.cat.codes + 1

In [7]:
categorical4 = df["City"].astype('category')
df["City"] = categorical4.cat.codes + 1

In [8]:
categorical5 = df["Furnishing Status"].astype('category')
df["Furnishing Status"] = categorical5.cat.codes + 1

In [9]:
categorical6 = df["Tenant Preferred"].astype('category')
df["Tenant Preferred"] = categorical6.cat.codes + 1

In [10]:
categorical7 = df["Point of Contact"].astype('category')
df["Point of Contact"] = categorical7.cat.codes + 1

In [11]:
x = df[["BHK","Size","Floor","Area Type","Area Locality","City","Furnishing Status",
        "Tenant Preferred", "Bathroom", "Point of Contact"]]
y = df["Rent"]

### 1st Iteration - Simple Linear Regression

In [12]:
function_dict = {'predictor': [], 'r-squared':[]}
for col in x.columns:
    selected_x = x[[col]]
    model = sm.OLS(y, sm.add_constant(selected_x)).fit()
    y_preds = model.predict(sm.add_constant(selected_x))
    function_dict['predictor'].append(col)
    r2 = np.corrcoef(y, y_preds)[0, 1]**2
    function_dict['r-squared'].append(r2)
    
function_df = pd.DataFrame(function_dict).sort_values(by=['r-squared'], ascending = False)
display(function_df.head())

Unnamed: 0,predictor,r-squared
8,Bathroom,0.194671
1,Size,0.171024
0,BHK,0.136691
9,Point of Contact,0.11538
3,Area Type,0.046168


### 2nd Iteration

In [13]:
def next_possible_feature (x_npf, y_npf, current_features, ignore_features=[]):  
    function_dict = {'predictor': [], 'r-squared':[]}
    for col in x_npf.columns:
        if col not in (current_features+ignore_features):
            selected_x = x_npf[current_features + [col]]
            model = sm.OLS(y_npf, sm.add_constant(selected_x)).fit()
            y_preds = model.predict(sm.add_constant(selected_x))
            function_dict['predictor'].append(col)
            r2 = np.corrcoef(y_npf, y_preds)[0, 1]**2
            function_dict['r-squared'].append(r2)
    function_df = pd.DataFrame(function_dict).sort_values(by=['r-squared'],ascending = False)
    display(function_df.head())

In [14]:
selected_features = ['Bathroom']
ignore_features = []
next_possible_feature (x_npf=x, y_npf=y, current_features=selected_features)

Unnamed: 0,predictor,r-squared
8,Point of Contact,0.237315
5,City,0.221476
3,Area Type,0.213282
1,Size,0.211341
6,Furnishing Status,0.199203


### Multicollinearity Check

In [15]:
vif_2 = pd.DataFrame() 
x_2 = x[['Bathroom', 'Point of Contact']]
vif_2["features"] = x_2.columns
vif_2["VIF"] = [variance_inflation_factor(x_2.values, i) \
                for i in range(len(x_2.columns))]
vif_2

Unnamed: 0,features,VIF
0,Bathroom,2.75442
1,Point of Contact,2.75442


### 3rd Iteration

In [16]:
selected_features = ['Bathroom', 'Point of Contact']
ignore_features = ['BHK', 'Size']
next_possible_feature (x_npf=x, y_npf=y, current_features=selected_features)

Unnamed: 0,predictor,r-squared
1,Size,0.256493
5,City,0.247113
6,Furnishing Status,0.239969
0,BHK,0.23908
3,Area Type,0.238221


### 4th Iteration

In [19]:
vif_3 = pd.DataFrame()
x_3 = x[['Bathroom', 'Point of Contact','City']]
vif_3["features"] = x_3.columns
vif_3["VIF"] = [variance_inflation_factor(x_3.values, i) \
                for i in range(len(x_3.columns))]
vif_3

Unnamed: 0,features,VIF
0,Bathroom,4.057138
1,Point of Contact,3.03716
2,City,3.554184


### Results

In [20]:
final_features = ['Bathroom', 'Point of Contact', 'City']
final_x = x[final_features]
final_model = sm.OLS(y, sm.add_constant(final_x)).fit()
final_model.summary()

0,1,2,3
Dep. Variable:,Rent,R-squared:,0.247
Model:,OLS,Adj. R-squared:,0.247
Method:,Least Squares,F-statistic:,518.8
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,1.4600000000000001e-291
Time:,17:01:34,Log-Likelihood:,-59528.0
No. Observations:,4746,AIC:,119100.0
Df Residuals:,4742,BIC:,119100.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9852.2647,5129.525,-1.921,0.055,-1.99e+04,203.987
Bathroom,3.273e+04,1177.720,27.790,0.000,3.04e+04,3.5e+04
Point of Contact,-1.507e+04,1186.226,-12.707,0.000,-1.74e+04,-1.27e+04
City,4631.5042,589.598,7.855,0.000,3475.618,5787.390

0,1,2,3
Omnibus:,12361.287,Durbin-Watson:,1.939
Prob(Omnibus):,0.0,Jarque-Bera (JB):,385282579.711
Skew:,29.625,Prob(JB):,0.0
Kurtosis:,1397.568,Cond. No.,26.3
