In [None]:
# Exploratory Data Analysis (EDA) for Real Estate Pricing

# This notebook performs an exploratory data analysis to understand the factors that influence house prices. We analyze the data through visualization, feature engineering, and clustering to extract actionable insights.


In [None]:
## Importing Required Libraries

## We use Pandas for data manipulation, NumPy for array operations, Matplotlib and Seaborn for visualization, and scikit-learn for clustering.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Apply the white-grid seaborn theme to every figure drawn below.
sns.set_theme(style="whitegrid")


In [None]:
## Loading the Dataset

## Read the housing CSV into a DataFrame, then print its dimensions and its leading rows as a sanity check on the load.

# Load dataset
csv_path = 'housing_data.csv'
df = pd.read_csv(csv_path)

# Display basic info
print("Dataset shape:", df.shape)
preview = df.head()
print(preview)


In [None]:
## Dataset Overview

## `info()` reports each column with its dtype and non-null count, which gives a first look at types and missing data.

# info() writes its report to stdout itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()


In [None]:
## Checking for Missing Values

## Count the missing entries in every column; gaps in the data can skew the analysis that follows.

missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
## Handling Missing Values

## Missing values are filled so that later steps receive a frame without gaps.

# Forward fill copies the last seen value down each column. On its own it cannot
# fill gaps in the first rows (there is nothing above them to copy), so a
# backward fill follows to cover those leading gaps.
# NOTE(review): row order presumably carries no meaning in this dataset, so both
# fills are a convenience rather than a principled imputation - confirm whether a
# per-column strategy (e.g. median / most frequent value) is more appropriate.
df = df.ffill().bfill()



In [None]:
## Removing Duplicate Entries

## Rows that repeat an earlier row exactly are discarded so they are not counted twice.

# keep='first' retains the first occurrence of each repeated row.
df = df.drop_duplicates(keep='first')
print("Duplicates removed, new shape:", df.shape)


In [None]:
## Summary Statistics

## Descriptive statistics (count, mean, spread, quartiles) for the numerical columns.

summary_stats = df.describe()
print(summary_stats)


In [None]:
## Distribution of Sale Price

## Histogram with a KDE overlay showing how sale prices are distributed.

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='SalePrice', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Sale Price')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
## Boxplot of Sale Prices

## The boxplot summarizes the spread of sale prices and makes outliers visible.

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Prices')
plt.show()


In [None]:
## Correlation Among Features

## Pairwise correlations between the numerical columns, followed by each column's correlation with the target, SalePrice.

# Correlation is only defined for numeric columns, so keep just those
numeric_df = df.select_dtypes(include=[np.number])

# Pairwise correlation matrix
corr = numeric_df.corr()

# Correlations with SalePrice, strongest positive first
saleprice_corr = corr['SalePrice'].sort_values(ascending=False)
print(saleprice_corr)



In [None]:
## Heatmap of Correlations

## The full correlation matrix drawn as an annotated heatmap,
## which makes strong positive and negative relationships easy to spot.

# A large canvas and a small annotation font keep the cell values legible
fig, ax = plt.subplots(figsize=(16, 12))
heatmap_options = dict(
    annot=True,             # write the value in each cell
    cmap='coolwarm',        # color scheme
    fmt=".2f",              # two decimal places
    annot_kws={"size": 8},  # annotation font size
    cbar=True,              # show the color bar
    square=True,            # square cells
)
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix Heatmap', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # slanted x labels for readability
plt.setp(ax.get_yticklabels(), rotation=0)               # horizontal y labels
fig.tight_layout()                                       # avoid clipped labels
plt.show()


In [None]:
## Lower Triangle Heatmap of Correlations

## The correlation matrix is symmetric, so the upper triangle (diagonal included) is
## masked out and only the lower triangle is drawn.

# True marks a cell to hide: everything on or above the diagonal
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(16, 12))
heatmap_options = dict(
    mask=mask,              # hide the upper triangle
    annot=True,             # write the value in each cell
    cmap='coolwarm',        # color scheme
    fmt=".2f",              # two decimal places
    annot_kws={"size": 8},  # annotation font size
    cbar=True,              # show the color bar
    square=True,            # square cells
    linewidths=0.5,         # thin lines between cells
)
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix Heatmap (Lower Triangle)', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()


In [None]:
## Relationship Between Living Area and Sale Price

## Scatter plot of living area (GrLivArea) against sale price.

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice', ax=ax)
ax.set_title('Living Area vs Sale Price')
ax.set_xlabel('Living Area (sq ft)')
ax.set_ylabel('Sale Price')
plt.show()


In [None]:
## Feature Engineering: Price per Square Foot and Property Age

## Two derived features are added:
##   - PricePerSqFt: sale price divided by living area (GrLivArea).
##   - Age: years between the construction year (YearBuilt) and a fixed reference year.

# Year against which property age is measured.
REFERENCE_YEAR = 2025

# A living area of zero would make the ratio infinite, so such rows get a
# missing PricePerSqFt instead.
living_area = df['GrLivArea'].where(df['GrLivArea'] != 0)
df['PricePerSqFt'] = df['SalePrice'] / living_area

df['Age'] = REFERENCE_YEAR - df['YearBuilt']

print(df[['YearBuilt', 'Age']].head())
print(df[['SalePrice', 'GrLivArea', 'PricePerSqFt']].head())


In [None]:
## Impact of Bedrooms on Sale Price

## One box per bedroom count (BedroomAbvGr) showing the sale price distribution for that count.

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='BedroomAbvGr', y='SalePrice', ax=ax)
ax.set_title('Bedrooms vs Sale Price')
ax.set_xlabel('Number of Bedrooms')
ax.set_ylabel('Sale Price')
plt.show()


In [None]:
## Impact of Bathrooms on Sale Price

## One box per full-bathroom count (FullBath) showing the sale price distribution for that count.

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='FullBath', y='SalePrice', ax=ax)
ax.set_title('Bathrooms vs Sale Price')
ax.set_xlabel('Number of Bathrooms')
ax.set_ylabel('Sale Price')
plt.show()


In [None]:
## Market Trends Over Time

## Build a first-of-month sale date from YrSold and MoSold so prices can be analyzed over time.
## MoSold is accepted either as a month number (1-12) or as a month name or
## abbreviation in any letter case ('Jan', 'january', 'JAN', ...).

month_by_abbreviation = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

raw_month = df['MoSold']

# Numeric months pass through unchanged; anything that is not a whole number
# from 1 to 12 becomes missing at this step.
numeric_month = pd.to_numeric(raw_month, errors='coerce')
numeric_month = numeric_month.where(numeric_month.isin(list(range(1, 13))))

# Text months are matched on their first three letters, ignoring letter case
# and surrounding whitespace.
named_month = (
    raw_month.astype(str).str.strip().str[:3].str.title().map(month_by_abbreviation)
)

# The nullable integer dtype keeps whole-number months even if some rows could
# not be resolved.
df['month_num'] = numeric_month.fillna(named_month).astype('Int64')

resolved = df['month_num'].notna()
if not resolved.all():
    print("Rows with unrecognized MoSold (Date left as NaT):", int((~resolved).sum()))

# Dates are assembled only from rows with a usable month and then aligned back
# to the full index, which leaves NaT in the rows that could not be resolved.
sale_dates = pd.to_datetime(dict(
    year=df.loc[resolved, 'YrSold'],
    month=df.loc[resolved, 'month_num'].astype('int64'),
    day=1,
))
df['Date'] = sale_dates.reindex(df.index)


In [None]:
## Visualizing Price Trends Over Years

## Average sale price per sale year, drawn as a line with one marker per year.

sale_year = df['Date'].dt.year
price_trend = df['SalePrice'].groupby(sale_year).mean()

fig, ax = plt.subplots(figsize=(10, 6))
price_trend.plot(marker='o', ax=ax)
ax.set_title('Average Sale Price Over Years')
ax.set_xlabel('Year')
ax.set_ylabel('Average Sale Price')
ax.grid(True)
plt.show()


In [None]:
## Impact of Garage on Sale Price

## One box per garage capacity (GarageCars) showing the sale price distribution for that capacity.

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='GarageCars', y='SalePrice', ax=ax)
ax.set_title('Garage Capacity vs Sale Price')
ax.set_xlabel('Number of Cars Garage Can Hold')
ax.set_ylabel('Sale Price')
plt.show()


In [None]:
## Impact of Pool Area on Sale Price

## Scatter plot of pool area (PoolArea) against sale price.

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='PoolArea', y='SalePrice', ax=ax)
ax.set_title('Pool Area vs Sale Price')
ax.set_xlabel('Pool Area (sq ft)')
ax.set_ylabel('Sale Price')
plt.show()


In [None]:
## Clustering Based on Amenities

## Select the amenity columns (pool area, garage capacity, garage area) that the clustering step will use.

amenity_columns = ['PoolArea', 'GarageCars', 'GarageArea']
features = df[amenity_columns]


In [None]:
## Applying KMeans Clustering

## Group properties into three clusters by their amenity features and store the label in df['Cluster'].

# KMeans assigns points by Euclidean distance, so each feature is standardized
# first; otherwise the columns with the largest numeric range would dominate
# the cluster assignment. As a result, kmeans.cluster_centers_ is expressed in
# standardized units, not the original feature units.
scaled_features = StandardScaler().fit_transform(features)

# n_init is set explicitly because its default differs between scikit-learn
# releases; random_state fixes the centroid initialization so the labels are
# reproducible from run to run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_features)


In [None]:
## Visualizing Clusters

## Living area against sale price, with each point colored by its amenity cluster.

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='GrLivArea',
    y='SalePrice',
    hue='Cluster',
    palette='Set1',
    ax=ax,
)
ax.set_title('Clusters of Houses Based on Amenities')
ax.set_xlabel('Living Area (sq ft)')
ax.set_ylabel('Sale Price')
ax.legend(title='Cluster')
plt.show()
