<a href="https://colab.research.google.com/github/sharunraj14/ICT_Projects/blob/main/House_pricing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

1. Load Dataset

In [2]:
df = pd.read_csv('House_Pricing.csv')

print("--- Dataset Info ---")
df.info()
print("\n--- Descriptive Statistics ---")
display(df.describe())
print("\n--- Initial Missing Values ---")
print(df.isnull().sum())

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ID                                         21613 non-null  int64  
 1   Date House was Sold                        21613 non-null  object 
 2   Sale Price                                 21609 non-null  float64
 3   No of Bedrooms                             21613 non-null  int64  
 4   No of Bathrooms                            21609 non-null  float64
 5   Flat Area (in Sqft)                        21604 non-null  float64
 6   Lot Area (in Sqft)                         21604 non-null  float64
 7   No of Floors                               21613 non-null  float64
 8   Waterfront View                            21613 non-null  object 
 9   No of Times Visited                        2124 non-null   object 
 10  C

Unnamed: 0,ID,Sale Price,No of Bedrooms,No of Bathrooms,Flat Area (in Sqft),Lot Area (in Sqft),No of Floors,Overall Grade,Area of the House from Basement (in Sqft),Basement Area (in Sqft),Age of House (in Years),Renovated Year,Zipcode,Latitude,Longitude,Living Area after Renovation (in Sqft),Lot Area after Renovation (in Sqft)
count,21613.0,21609.0,21613.0,21609.0,21604.0,21604.0,21613.0,21613.0,21610.0,21613.0,21613.0,21613.0,21612.0,21612.0,21612.0,21612.0,21613.0
mean,4580302000.0,540198.4,3.370842,2.114732,2079.931772,15107.76,1.494309,7.623467,1788.344193,291.509045,46.994864,84.402258,98077.937766,47.560048,-122.213892,1986.538914,12768.455652
std,2876566000.0,367389.0,0.930062,0.770138,918.487597,41428.27,0.539989,1.105439,827.982604,442.575043,29.373411,401.67924,53.505425,0.138565,0.14083,685.404255,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,1.0,290.0,0.0,3.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1429.25,5040.0,1.0,7.0,1190.0,0.0,21.0,0.0,98033.0,47.470975,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7617.5,1.5,7.0,1560.0,0.0,43.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.25,2.0,8.0,2210.0,560.0,67.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,10.0,9410.0,4820.0,118.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0



--- Initial Missing Values ---
ID                                               0
Date House was Sold                              0
Sale Price                                       4
No of Bedrooms                                   0
No of Bathrooms                                  4
Flat Area (in Sqft)                              9
Lot Area (in Sqft)                               9
No of Floors                                     0
Waterfront View                                  0
No of Times Visited                          19489
Condition of the House                           0
Overall Grade                                    0
Area of the House from Basement (in Sqft)        3
Basement Area (in Sqft)                          0
Age of House (in Years)                          0
Renovated Year                                   0
Zipcode                                          1
Latitude                                         1
Longitude                                        1

2. Remove duplicate values

In [3]:
# Remove duplicate rows
initial_rows = len(df)
df.drop_duplicates(inplace=True)
print(f"Duplicate rows removed: {initial_rows - len(df)}")

# Remove duplicate columns
duplicate_cols = []
cols = df.columns
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        if df[cols[i]].equals(df[cols[j]]):
            duplicate_cols.append(cols[j])

df.drop(columns=duplicate_cols, inplace=True)
print(f"Duplicate columns removed: {duplicate_cols}")

Duplicate rows removed: 0
Duplicate columns removed: []


3. Handling Missing Values

In [4]:
# Median
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in num_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
        print(f"Imputed numerical column '{col}' with median: {median_val}")

# Mode
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    if df[col].isnull().any():
        mode_val = df[col].mode()[0]
        df[col].fillna(mode_val, inplace=True)
        print(f"Imputed categorical column '{col}' with mode: {mode_val}")

print("\nMissing values after imputation:", df.isnull().sum().sum())

Imputed numerical column 'Sale Price' with median: 450000.0
Imputed numerical column 'No of Bathrooms' with median: 2.25
Imputed numerical column 'Flat Area (in Sqft)' with median: 1910.0
Imputed numerical column 'Lot Area (in Sqft)' with median: 7617.5
Imputed numerical column 'Area of the House from Basement (in Sqft)' with median: 1560.0
Imputed numerical column 'Zipcode' with median: 98065.0
Imputed numerical column 'Latitude' with median: 47.5718
Imputed numerical column 'Longitude' with median: -122.23
Imputed numerical column 'Living Area after Renovation (in Sqft)' with median: 1840.0
Imputed categorical column 'No of Times Visited' with mode: Twice

Missing values after imputation: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_val, inplace=True)


4. Outlier Removal-IQR

In [5]:
# Identify numerical columns for outlier detection

outlier_features = ['Sale Price', 'Flat Area (in Sqft)', 'Lot Area (in Sqft)']

initial_shape = df.shape[0]
for col in outlier_features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filtering
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

print(f"Total rows removed as outliers: {initial_shape - df.shape[0]}")

Total rows removed as outliers: 3590


5. Scaling Numerical Variables

In [6]:
scaler = StandardScaler()

features_to_scale = [col for col in num_cols if col not in ['Sale Price', 'ID']]

df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

print("--- Scaled Features (First 5 rows) ---")
display(df[features_to_scale].head())

--- Scaled Features (First 5 rows) ---


Unnamed: 0,No of Bedrooms,No of Bathrooms,Flat Area (in Sqft),Lot Area (in Sqft),No of Floors,Overall Grade,Area of the House from Basement (in Sqft),Basement Area (in Sqft),Age of House (in Years),Renovated Year,Zipcode,Latitude,Longitude,Living Area after Renovation (in Sqft),Lot Area after Renovation (in Sqft)
0,-0.330875,-1.469091,-1.024904,-0.443258,-0.868647,-0.470639,-0.685261,-0.662914,0.499469,-0.193112,1.787339,-0.354482,-0.216239,-0.907818,-0.36328
1,-0.330875,0.343215,0.992519,0.022624,0.972433,-0.470639,0.827197,0.36638,0.633208,5.165028,0.804533,1.162954,-0.681005,-0.287308,0.104018
2,-1.433773,-1.469091,-1.619972,0.829724,-0.868647,-1.524084,-1.311632,-0.662914,1.235036,-0.193112,-0.994186,1.285188,-0.03633,1.538764,0.203398
3,0.772022,1.430599,0.107175,-0.633474,-0.868647,-0.470639,-0.883866,1.678729,0.165121,-0.193112,1.008512,-0.285047,-1.235726,-0.87236,-0.515992
4,-0.330875,-0.019246,-0.299212,0.267856,-0.868647,0.582805,0.078607,-0.662914,-0.570446,-0.193112,-0.141185,0.409299,1.372961,-0.09229,0.072066


6. Encoding Categorical Variables

In [7]:
# Label Encoding (Ordinal: Bad < Okay < Fair < Good < Excellent)

cond_map = {'Bad':1, 'Okay':2, 'Fair':3, 'Good':4, 'Excellent':5}
df['Condition of the House'] = df['Condition of the House'].map(cond_map)

# One-Hot Encoding (Nominal)
df = pd.get_dummies(df, columns=['Waterfront View'], drop_first=True)

print("--- Encoded Columns (First 5 rows) ---")
display(df[['Condition of the House', 'Waterfront View_Yes']].head())

--- Encoded Columns (First 5 rows) ---


Unnamed: 0,Condition of the House,Waterfront View_Yes
0,3,False
1,3,False
2,3,False
3,5,False
4,3,False


7. Train-Test Split

In [8]:
# Setting target variable
X = df.drop(columns=['Sale Price', 'ID', 'Date House was Sold'], errors='ignore')
y = df['Sale Price']

# Split 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (14418, 18)
Testing set size: (3605, 18)
