# 🏠 House Data Exploration
This notebook performs initial data exploration on the `house_data.csv` dataset.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw house dataset from the project's data directory
df = pd.read_csv('../data/raw/house_data.csv')

# ℹ️ Display basic information about the dataset
print(f"📐 Dataset shape (rows, columns): {df.shape}")
print("\n📊 Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()


In [None]:
# ❓ Data-quality check: count the null entries in every column
missing_per_column = df.isna().sum()  # isna() is the alias of isnull()
print("\n🔍 Missing values per column:")
print(missing_per_column)

In [None]:
# 📈 Summary statistics (count, mean, std, min, quartiles, max) via describe()
stats_summary = df.describe()
print("\n📋 Statistical Summary:")
print(stats_summary)


In [None]:
# 💰 Histogram of house prices with a kernel-density curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], kde=True, ax=ax)
ax.set_title('House Price Distribution')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

print("📊 Plotted house price distribution.")

In [None]:
# 🔗 Checking how the numeric features are correlated with each other
plt.figure(figsize=(12, 8))
# Correlation is only defined for numeric data. Since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises ValueError instead, so
# the frame is restricted to numeric columns first (this form works on every
# pandas version). NOTE(review): `location` is plotted as a category further down
# and is presumably a text column, which is what would trigger the error — confirm.
numeric_features = df.select_dtypes(include='number')
correlation_matrix = numeric_features.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()

print("📈 Displayed feature correlation heatmap.")


In [None]:
# 📏 Scatter of price against square footage to eyeball their relationship
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='sqft', y='price', ax=ax)
ax.set_title('Price vs Square Footage')
ax.set_xlabel('Square Footage')
ax.set_ylabel('Price')
plt.show()

print("📐 Analyzed relation between square footage and price.")


In [None]:
# 📍 Number of houses recorded for each location
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='location', ax=ax)
ax.set_title('Houses by Location')
ax.set_xlabel('Location')
ax.set_ylabel('Count')
# Tilt the category labels so neighbouring location names do not overlap
plt.xticks(rotation=45)
plt.show()

print("📍 Visualized distribution of houses by location.")


In [None]:
# 🛏️ Bar chart of house price aggregated per bedroom count
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='bedrooms', y='price', ax=ax)
ax.set_title('Average Price by Number of Bedrooms')
ax.set_xlabel('Number of Bedrooms')
ax.set_ylabel('Average Price')
plt.show()

print("🛏️ Compared average prices based on bedroom count.")


In [None]:
# Next step: feature engineering
# Create new features or modify existing ones.
