In [5]:
#Loading the Dataset & Inspect Structure
import pandas as pd
df = pd.read_csv('Dataset.csv')
print("========== [MODULE 1: Dataset Overview] ==========\n")
print(f" Dataset Loaded Successfully!")
print(f" Total Records   : {df.shape[0]}")
print(f" Total Features  : {df.shape[1]}\n")
print("Column Names:")
print("-" * 50)
for col in df.columns:
    print(f"‚Ä¢ {col}")
print("-" * 50)

print("\nSample Records (Top 5 Rows):")
print(df.head().to_markdown(index=False))



 Dataset Loaded Successfully!
 Total Records   : 21609
 Total Features  : 31

Column Names:
--------------------------------------------------
‚Ä¢ Sale_Price
‚Ä¢ No of Bedrooms
‚Ä¢ No of Bathrooms
‚Ä¢ Flat Area (in Sqft)
‚Ä¢ Lot Area (in Sqft)
‚Ä¢ No of Floors
‚Ä¢ No of Times Visited
‚Ä¢ Overall Grade
‚Ä¢ Area of the House from Basement (in Sqft)
‚Ä¢ Basement Area (in Sqft)
‚Ä¢ Age of House (in Years)
‚Ä¢ Latitude
‚Ä¢ Longitude
‚Ä¢ Living Area after Renovation (in Sqft)
‚Ä¢ Lot Area after Renovation (in Sqft)
‚Ä¢ Years Since Renovation
‚Ä¢ Condition_of_the_House_Excellent
‚Ä¢ Condition_of_the_House_Fair
‚Ä¢ Condition_of_the_House_Good
‚Ä¢ Condition_of_the_House_Okay
‚Ä¢ Ever_Renovated_Yes
‚Ä¢ Waterfront_View_Yes
‚Ä¢ Zipcode_Group_Zipcode_Group_1
‚Ä¢ Zipcode_Group_Zipcode_Group_2
‚Ä¢ Zipcode_Group_Zipcode_Group_3
‚Ä¢ Zipcode_Group_Zipcode_Group_4
‚Ä¢ Zipcode_Group_Zipcode_Group_5
‚Ä¢ Zipcode_Group_Zipcode_Group_6
‚Ä¢ Zipcode_Group_Zipcode_Group_7
‚Ä¢ Zipcode_Group_Zipcode_Group_8
‚Ä¢ Z

In [6]:
# Removing Duplicates
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]

print("\n========== [MODULE 2: Duplicate Removal] ==========")
print(f" Rows Before Duplicate Removal : {before}")
print(f" Duplicates Removed            : {before - after}")
print(f" Rows After Duplicate Removal  : {after}")



 Rows Before Duplicate Removal : 21609
 Duplicates Removed            : 11
 Rows After Duplicate Removal  : 21598


In [12]:
# Checking Missing Values
missing = df.isnull().sum()
missing = missing[missing > 0]

print("\n========== [MODULE 3: Missing Value Check] ==========")
if not missing.empty:
    print("‚ùó Missing Values Found:\n")
    print(missing.to_frame(name='Missing Count').to_markdown())
else:
    print("‚úÖ No Missing Values Detected!")


‚ùó Missing Values Found:

|                 |   Missing Count |
|:----------------|----------------:|
| House Age Group |            1242 |


In [11]:
#Detecting and Remove Outliers
print("\n========== [MODULE 4: Outlier Detection] ==========")
print(f" Max Bedrooms       : {df['No of Bedrooms'].max()}")
print(f" Max Bathrooms      : {df['No of Bathrooms'].max()}")
print(f" Min Flat Area (sqft): {df['Flat Area (in Sqft)'].min()}\n")

# Remove unreasonable outliers
df = df[(df['No of Bedrooms'] <= 10) &
        (df['No of Bathrooms'] <= 6) &
        (df['Flat Area (in Sqft)'] >= 300)]

print("‚úÖ Outlier Records Removed Based On:")
print("   ‚Ä¢ Bedrooms > 10")
print("   ‚Ä¢ Bathrooms > 6")
print("   ‚Ä¢ Flat Area < 300 sqft")
print(f"\nüìä Remaining Rows After Cleaning: {df.shape[0]}")


 Max Bedrooms       : 10
 Max Bathrooms      : 6.0
 Min Flat Area (sqft): 370

‚úÖ Outlier Records Removed Based On:
   ‚Ä¢ Bedrooms > 10
   ‚Ä¢ Bathrooms > 6
   ‚Ä¢ Flat Area < 300 sqft

üìä Remaining Rows After Cleaning: 21585


In [13]:
# Feature Engineering
df['Total Area (in Sqft)'] = df['Flat Area (in Sqft)'] + df['Basement Area (in Sqft)']
df['House Age Group'] = pd.cut(df['Age of House (in Years)'],
                               bins=[0, 10, 20, 30, 40, 100],
                               labels=['0-10', '11-20', '21-30', '31-40', '40+'])

print("\n========== [MODULE 5: Feature Engineering] ==========")
print("üÜï New Features Added:")
print("   ‚Ä¢ Total Area (in Sqft)")
print("   ‚Ä¢ House Age Group\n")

print("Sample View of Engineered Features:\n")
print(df[['Flat Area (in Sqft)', 'Basement Area (in Sqft)',
          'Total Area (in Sqft)', 'Age of House (in Years)',
          'House Age Group']].head().to_markdown(index=False))



üÜï New Features Added:
   ‚Ä¢ Total Area (in Sqft)
   ‚Ä¢ House Age Group

Sample View of Engineered Features:

|   Flat Area (in Sqft) |   Basement Area (in Sqft) |   Total Area (in Sqft) |   Age of House (in Years) | House Age Group   |
|----------------------:|--------------------------:|-----------------------:|--------------------------:|:------------------|
|                  1180 |                         0 |                   1180 |                        63 | 40+               |
|                  2570 |                       400 |                   2970 |                        67 | 40+               |
|                   770 |                         0 |                    770 |                        85 | 40+               |
|                  1960 |                       910 |                   2870 |                        53 | 40+               |
|                  1680 |                         0 |                   1680 |                        31 | 31-40           

In [14]:
# Saving Cleaned Data
df.to_csv("Cleaned_Dataset.csv", index=False)

print("\n========== [MODULE 6: Save Cleaned Dataset] ==========")
print("üíæ Cleaned dataset successfully saved as Processed_Dataset.csv")


üíæ Cleaned dataset successfully saved as Processed_Dataset.csv
