## House Price Prediction

### Dataset Information

The dataset was obtained from Kaggle: https://www.kaggle.com/datasets/anmolkumar/house-price-prediction-challenge

In [35]:
import pandas as pd

# Read the training CSV from the current working directory.
train_dataset = pd.read_csv('./train.csv')

# Preview the first two rows, then list every column label.
head_preview = train_dataset.head(2)
print(f"Dataset Head:\n\n{head_preview}\n\n")

column_labels = train_dataset.columns
print(f"Dataset Columns:\n{column_labels}")

Dataset Head:

  POSTED_BY  UNDER_CONSTRUCTION  RERA  BHK_NO. BHK_OR_RK    SQUARE_FT  \
0     Owner                   0     0        2       BHK  1300.236407   
1    Dealer                   0     0        2       BHK  1275.000000   

   READY_TO_MOVE  RESALE                    ADDRESS  LONGITUDE   LATITUDE  \
0              1       1      Ksfc Layout,Bangalore  12.969910  77.597960   
1              1       1  Vishweshwara Nagar,Mysore  12.274538  76.644605   

   TARGET(PRICE_IN_LACS)  
0                   55.0  
1                   51.0  


Dataset Columns:
Index(['POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK',
       'SQUARE_FT', 'READY_TO_MOVE', 'RESALE', 'ADDRESS', 'LONGITUDE',
       'LATITUDE', 'TARGET(PRICE_IN_LACS)'],
      dtype='object')


In [36]:
## Check for missing values

# Count the NaN entries in each column before printing the summary.
missing_per_column = train_dataset.isna().sum()
print(f"Missing Value:\n{missing_per_column}")

Missing Value:
POSTED_BY                0
UNDER_CONSTRUCTION       0
RERA                     0
BHK_NO.                  0
BHK_OR_RK                0
SQUARE_FT                0
READY_TO_MOVE            0
RESALE                   0
ADDRESS                  0
LONGITUDE                0
LATITUDE                 0
TARGET(PRICE_IN_LACS)    0
dtype: int64


In [37]:
# Define Outlier check function 

import numpy as np

def find_outliers_iqr(data, factor=1.5):
    """Return the elements of ``data`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        One-dimensional numeric values. Must support element-wise comparison
        and boolean-mask indexing.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    Same type as ``data``
        The subset of ``data`` lying strictly below the lower fence or
        strictly above the upper fence. Empty input is returned unchanged.
    """
    # Percentiles of an empty input are undefined; there is nothing to flag.
    if np.size(data) == 0:
        return data

    # nanpercentile ignores missing values; with plain percentile a single NaN
    # turns both quartiles into NaN and silently hides every outlier.
    q1, q3 = np.nanpercentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # NaN compares False on both sides, so missing values are never flagged.
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return outliers

# Numeric columns that are screened for outliers with find_outliers_iqr.
outlier_prone_features = [
    'BHK_NO.',
    'SQUARE_FT',
    'TARGET(PRICE_IN_LACS)',
]



In [38]:
# Report how many IQR outliers each screened column contains
for feature in outlier_prone_features:
    outlier_count = len(find_outliers_iqr(train_dataset[feature]))
    print(f"Outlier For {feature}:\n{outlier_count}\n")

Outlier For BHK_NO.:
284

Outlier For SQUARE_FT:
1635

Outlier For TARGET(PRICE_IN_LACS):
3084



In [39]:
# Check data size before outlier deletion
rows_before = len(train_dataset)
print(f"Data size before outlier deletion: {rows_before}")

# Delete outliers one column at a time. Each pass works on the frame left by
# the previous pass, so later columns get their IQR fences from already-trimmed data.
# NOTE: train_dataset is reassigned from itself, so re-running this cell
# recomputes the fences on the trimmed frame and may remove further rows.
for feature in outlier_prone_features:
    column = train_dataset[feature]
    outlier_values = find_outliers_iqr(column)
    keep_mask = ~column.isin(outlier_values)
    train_dataset = train_dataset[keep_mask]

# Check data size after outlier deletion
rows_after = len(train_dataset)
print(f"Data size after outlier deletion: {rows_after}")

Data size before outlier deletion: 29451
Data size after outlier deletion: 25633


In [40]:
# Check for imbalance
from collections import Counter

# Categorical columns whose value frequencies are inspected below.
imbalance_prone_features = [
    'POSTED_BY',
    'BHK_OR_RK',
    'ADDRESS',
]

# Print a value -> occurrence-count mapping for each of those columns.
for column_name in imbalance_prone_features:
    value_counts = Counter(train_dataset[column_name])
    print(f"{value_counts}")

Counter({'Dealer': 14918, 'Owner': 10131, 'Builder': 584})
Counter({'BHK': 25609, 'RK': 24})
Counter({'Zirakpur,Chandigarh': 504, 'Raj Nagar Extension,Ghaziabad': 213, 'Whitefield,Bangalore': 196, 'Sector-137 Noida,Noida': 130, 'New Town,Kolkata': 122, 'Jagatpura,Jaipur': 118, 'Rajarhat,Kolkata': 109, 'Noida Extension,Noida': 106, 'Thanisandra,Bangalore': 101, 'Sector 88 Faridabad,Faridabad': 100, 'Sector 86 Faridabad,Faridabad': 96, 'Sector-75 Noida,Noida': 95, 'Ajmer Road,Jaipur': 92, 'Sarjapur Road,Bangalore': 85, 'Electronics City Phase 1,Bangalore': 84, 'Sector-150 Noida,Noida': 82, 'Baner,Pune': 80, 'Wagholi,Pune': 79, 'Kundli,Sonipat': 78, 'Sector-74 Noida,Noida': 77, 'Mansarovar,Jaipur': 74, 'Yelahanka,Bangalore': 73, 'Sector-78 Noida,Noida': 73, 'Crossing Republik,Ghaziabad': 69, 'Alwar Bypass Road,Bhiwadi': 67, 'Wakad,Pune': 67, 'Vaishali Nagar,Jaipur': 67, 'Vaibhav Khand,Ghaziabad': 67, 'Sector 82 Faridabad,Faridabad': 66, 'Akshaya Nagar,Bangalore': 66, 'NIBM,Pune': 63, 'Kha

In [41]:
# Copy Dataset
# Work on an independent (deep) copy so the feature engineering below
# does not modify train_dataset.
processed_train_df = train_dataset.copy(deep=True)

In [42]:
# Create a new feature called city
# ADDRESS values look like "<locality>,<city>"; keep only the text after the
# last comma. The vectorised .str accessor replaces the per-row lambda, strips
# stray whitespace so "X, Pune" and "X,Pune" yield the same city, and passes
# missing values through as NaN instead of raising.
processed_train_df['CITY'] = (
    processed_train_df['ADDRESS']
    .str.rsplit(',', n=1)
    .str[-1]
    .str.strip()
)

print(Counter(processed_train_df['CITY']))

Counter({'Bangalore': 3793, 'Lalitpur': 2120, 'Pune': 1811, 'Kolkata': 1582, 'Noida': 1578, 'Mumbai': 1396, 'Chennai': 1190, 'Maharashtra': 1079, 'Ghaziabad': 1057, 'Jaipur': 909, 'Chandigarh': 675, 'Faridabad': 618, 'Mohali': 498, 'Vadodara': 482, 'Surat': 379, 'Nagpur': 331, 'Lucknow': 312, 'Indore': 283, 'Gurgaon': 258, 'Bhubaneswar': 228, 'Bhopal': 211, 'Kochi': 178, 'Visakhapatnam': 169, 'Bhiwadi': 161, 'Goa': 141, 'Coimbatore': 134, 'Dehradun': 123, 'Ranchi': 117, 'Mangalore': 116, 'Gandhinagar': 114, 'Sonipat': 112, 'Palghar': 107, 'Secunderabad': 102, 'Guwahati': 89, 'Raipur': 88, 'Kanpur': 84, 'Jamshedpur': 84, 'Siliguri': 82, 'Rajkot': 81, 'Patna': 81, 'Agra': 79, 'Panchkula': 77, 'Vijayawada': 75, 'Aurangabad': 66, 'Jamnagar': 66, 'Raigad': 64, 'Dharuhera': 63, 'Durgapur': 55, 'Thrissur': 54, 'Gwalior': 53, 'Allahabad': 49, 'Meerut': 49, 'Bahadurgarh': 47, 'Anand': 46, 'Ernakulam': 44, 'Mysore': 43, 'Bharuch': 42, 'Valsad': 41, 'Kota': 41, 'Varanasi': 40, 'Ratnagiri': 40, 'H

In [43]:
# Drop Address
# CITY was derived from ADDRESS in the previous cell; remove the raw column.
processed_train_df = processed_train_df.drop(columns=['ADDRESS'])

In [44]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

features_to_encode = ['POSTED_BY', 'BHK_OR_RK', 'CITY']

# Fit a separate encoder per column and keep each one. Re-fitting a single
# shared encoder discards the mapping of every column except the last, which
# makes it impossible to inverse-transform or to encode new data consistently.
label_encoders = {}

for feature_to_encode in features_to_encode:
    le = LabelEncoder()
    processed_train_df[feature_to_encode] = le.fit_transform(processed_train_df[feature_to_encode])
    label_encoders[feature_to_encode] = le

# After the loop `le` still refers to the encoder fitted on the last column
# ('CITY'), matching what the shared-encoder version left behind.

In [45]:
# Normalize and split the data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

features = ['POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK',
       'SQUARE_FT', 'READY_TO_MOVE', 'RESALE', 'CITY', 'LONGITUDE',
       'LATITUDE']

target = 'TARGET(PRICE_IN_LACS)'

x = processed_train_df[features]
y = processed_train_df[target]

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows influence the min/max used to transform the training rows
# (data leakage), which makes the test score optimistic.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the per-feature min/max from the training rows only, then apply the
# same transform to the held-out rows. Test values may fall outside [0, 1].
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Full feature matrix scaled with the train-fitted scaler, kept so that any
# code referring to `x_scaled` keeps working.
x_scaled = scaler.transform(x)

In [106]:
# Random forest regressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Build Model
# random_state pins the forest's internal randomness (e.g. bootstrap
# sampling) so the reported MAE values are reproducible across runs.
rf_regressor = RandomForestRegressor(
    n_estimators=150,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
)
rf_regressor.fit(x_train, y_train)

# Make predictions on the held-out split
rf_pred = rf_regressor.predict(x_test)

# Calculate MAE on the held-out split
rf_mae = mean_absolute_error(y_test, rf_pred)
print(f"Random Forest MAE: {rf_mae:.2f}")

# Overfit Check: compare against the error on the data the model was fit on;
# a much lower training MAE than test MAE would indicate overfitting.
rf_train_pred = rf_regressor.predict(x_train)
rf_train_mae = mean_absolute_error(y_train, rf_train_pred)

print(f"Random Forest MAE for Train Data: {rf_train_mae:.2f}")

Random Forest MAE: 13.60
Random Forest MAE for Train Data: 12.11
