In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression



import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"C:\Users\Saidabrorkhon\Downloads\archive 2\Indian_housing_Mumbai_data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   house_type        5000 non-null   object 
 1   house_size        5000 non-null   object 
 2   location          5000 non-null   object 
 3   city              5000 non-null   object 
 4   latitude          5000 non-null   float64
 5   longitude         5000 non-null   float64
 6   price             5000 non-null   int64  
 7   currency          5000 non-null   object 
 8   numBathrooms      4986 non-null   float64
 9   numBalconies      733 non-null    float64
 10  isNegotiable      571 non-null    object 
 11  priceSqFt         0 non-null      float64
 12  verificationDate  5000 non-null   object 
 13  description       4922 non-null   object 
 14  SecurityDeposit   5000 non-null   object 
 15  Status            5000 non-null   object 
dtypes: float64(5), int64(1), object(10)
memory

In [11]:
df.head()

Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,isNegotiable,verificationDate,description,SecurityDeposit,Status
0,2 BHK Apartment,"1,180 sq ft",Ulwe,Mumbai,18.971046,73.017723,19000,INR,2.0,2.06412,Negotiable,Posted 2 years ago,It has area of 1180 sqft and is available at a...,No Deposit,Furnished
1,2 BHK Apartment,"1,120 sq ft",Ulwe,Mumbai,18.966377,73.012802,12500,INR,2.0,2.06412,Negotiable,Posted 2 years ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished
2,2 BHK Apartment,"1,200 sq ft",Panvel,Mumbai,18.965979,73.124649,22000,INR,2.0,2.06412,Negotiable,Posted a month ago,A spacious 2 bhk multistorey apartment is avai...,No Deposit,Unfurnished
3,2 BHK Apartment,750 sq ft,Kandivali West,Mumbai,19.213598,72.833633,28500,INR,2.0,2.06412,Negotiable,Posted 3 years ago,It has a built-up area of 750 sqft and is avai...,No Deposit,Unfurnished
4,3 BHK Apartment,"1,400 sq ft",Chembur,Mumbai,19.062933,72.892395,50000,INR,3.0,2.06412,Negotiable,Posted 3 years ago,The house is semi-furnished. It has power back...,No Deposit,Semi-Furnished


In [7]:
for col in df.columns:
  if np.issubdtype(df[col].dtype, np.number):
    df[col].fillna(df[col].mean(),inplace=True)
  else:
    df[col].fillna(df[col].mode()[0], inplace=True)

df.drop(columns=['priceSqFt'], inplace=True)

In [10]:
df.isnull().sum()

house_type          0
house_size          0
location            0
city                0
latitude            0
longitude           0
price               0
currency            0
numBathrooms        0
numBalconies        0
isNegotiable        0
verificationDate    0
description         0
SecurityDeposit     0
Status              0
dtype: int64

In [13]:
df.drop(columns=['verificationDate'], inplace=True)

In [14]:
df['price_per_bathroom'] = df['price'] / df['numBathrooms']

In [15]:
cat_col = df.select_dtypes(include='object').columns
le = LabelEncoder()

for col in cat_col:
  if df[col].nunique() > 5:
    df[col] = le.fit_transform(df[col])
  else:
    df = pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   house_type             5000 non-null   int64  
 1   house_size             5000 non-null   int64  
 2   location               5000 non-null   int64  
 3   latitude               5000 non-null   float64
 4   longitude              5000 non-null   float64
 5   price                  5000 non-null   int64  
 6   numBathrooms           5000 non-null   float64
 7   numBalconies           5000 non-null   float64
 8   description            5000 non-null   int64  
 9   SecurityDeposit        5000 non-null   int64  
 10  price_per_bathroom     5000 non-null   float64
 11  city_Mumbai            5000 non-null   int64  
 12  Status_Semi-Furnished  5000 non-null   int64  
 13  Status_Unfurnished     5000 non-null   int64  
dtypes: float64(5), int64(9)
memory usage: 547.0 KB


In [17]:
scaler = StandardScaler()
num_col = df.select_dtypes(include='number').columns

df[num_col] = scaler.fit_transform(df[num_col])

In [10]:
df.head()

Unnamed: 0,house_type,house_size,location,latitude,longitude,price,numBathrooms,numBalconies,verificationDate,description,SecurityDeposit,city_Mumbai,Status_Semi-Furnished,Status_Unfurnished
0,0.235485,-1.264763,1.567631,-0.431945,0.114293,-0.489027,-0.042387,0.0,-0.87257,-0.346216,0.495598,0.040032,-0.842607,-0.674043
1,0.235485,-1.404781,1.567631,-0.442668,0.105572,-0.562278,-0.042387,0.0,-0.87257,1.54914,0.495598,0.040032,-0.842607,1.483585
2,0.235485,-1.21809,0.827534,-0.443583,0.303773,-0.455219,-0.042387,0.0,1.051054,-0.978259,0.495598,0.040032,-0.842607,1.483585
3,0.235485,0.949605,-0.007447,0.125067,-0.211926,-0.381968,-0.042387,0.0,-0.088871,-0.59347,0.495598,0.040032,-0.842607,1.483585
4,1.124554,-0.855079,-0.899358,-0.220931,-0.107797,-0.139675,1.208146,0.0,-0.088871,0.795015,0.495598,0.040032,1.186793,-0.674043


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   house_type             5000 non-null   float64
 1   house_size             5000 non-null   float64
 2   location               5000 non-null   float64
 3   latitude               5000 non-null   float64
 4   longitude              5000 non-null   float64
 5   price                  5000 non-null   float64
 6   numBathrooms           5000 non-null   float64
 7   numBalconies           5000 non-null   float64
 8   description            5000 non-null   float64
 9   SecurityDeposit        5000 non-null   float64
 10  price_per_bathroom     5000 non-null   float64
 11  city_Mumbai            5000 non-null   float64
 12  Status_Semi-Furnished  5000 non-null   float64
 13  Status_Unfurnished     5000 non-null   float64
dtypes: float64(14)
memory usage: 547.0 KB


In [19]:
def mtop_scores(df, target_column, top_n=9):
    
    x = df.copy()
    y = x.pop(target_column)
    
    mi_scores = mutual_info_regression(x, y)
    mi_scores_df = pd.DataFrame(mi_scores, index=x.columns, columns=['Mutual Info'])
    mi_scores_df = mi_scores_df.sort_values(by='Mutual Info', ascending=False)

    return mi_scores_df.head(top_n)


In [20]:
top_scores = mtop_scores(df, 'price', top_n=9)
top_scores

Unnamed: 0,Mutual Info
price_per_bathroom,3.70867
longitude,0.92083
latitude,0.831547
location,0.81768
house_size,0.693007
description,0.484144
SecurityDeposit,0.357726
house_type,0.343068
numBathrooms,0.286443


In [21]:
x = df.drop(columns=['price'])
y = df['price']

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [22]:
model = DecisionTreeRegressor(random_state=42)
dt_model = model.fit(x_train, y_train)

In [23]:
y_pred = dt_model.predict(x_val)

In [31]:
dt_mse = mean_squared_error(y_val, y_pred)
dt_r2 = r2_score(y_val, y_pred)
print(dt_mse)
print(dt_r2)

# mse: 0.13427876286517653
# r2:  0.9197168935761537

dt_mse = 0.13427876286517653
dt_r2 = 0.9197168935761537

0.022874350870483156
0.9796564204248567


In [None]:
mi_mse = mean_squared_error(y_val, y_pred)
mi_r2 = r2_score(y_val, y_pred)
print(mi_mse)
print(mi_r2)

# 0.022874350870483156
# 0.9796564204248567

0.022874350870483156
0.9796564204248567


In [25]:
from tabulate import tabulate

In [None]:
headers=['Model', 'MSE', 'R2']
table=[
  ['After feature engineering', dt_mse, dt_r2],
  ['DecisionTreeRegressor', mi_mse, mi_r2]
]

print(tabulate(table, headers=headers, tablefmt='grid'))

+---------------------------+-----------+----------+
| Model                     |       MSE |       R2 |
| After feature engineering | 0.134279  | 0.919717 |
+---------------------------+-----------+----------+
| DecisionTreeRegressor     | 0.0228744 | 0.979656 |
+---------------------------+-----------+----------+
