In [1]:
# 📓 Feature Engineering Notebook for DevOps Learners

# ---
# 🧪 Imports
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

# ---
# 📥 Load Cleaned Data
# The path is relative, so it is resolved against the kernel's current working
# directory; if the file is not found, check where the notebook was launched from.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_house_data.csv")

# Preview the first five rows as the cell's rich output
df.head(5)


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/cleaned_house_data.csv'

In [None]:
# ---
# 🔍 3. Initial Data Exploration

# Print dataset shape; DataFrame.shape is ordered (rows, columns)
print("Dataset shape (rows, columns):", df.shape)

# Print summary of column names, dtypes, and non-null counts
df.info()

# Show basic stats (count, mean, std, min, quartiles, max) for numeric columns
df.describe()

# ---
# 🧠 Why this matters:
# This gives a quick overview of the data:
#  Are there missing values?
#  What kind of columns are there (numeric, text)?
#  Do the values look reasonable?


In [None]:
# ---
# 🎯 Feature Engineering (Human-Readable)


def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, mapping non-finite results to 0.

    A zero denominator yields +/-inf (or NaN for 0/0) in pandas; those values
    are converted to NaN and then filled with 0. NaN results caused by missing
    inputs are filled with 0 as well.

    Args:
        numerator: Series of dividend values.
        denominator: Series of divisor values, aligned with ``numerator``.

    Returns:
        Series of ratios with no inf or NaN entries.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# Feature 1: Age of the house
# NOTE: uses the year at execution time, so values shift if the notebook is
# re-run in a later year.
df['house_age'] = datetime.now().year - df['year_built']

# Feature 2: Price per square foot
# Guarded against sqft == 0 the same way as bed_bath_ratio below, so inf values
# cannot reach the plots or the saved CSV.
# NOTE(review): this feature is derived from `price`; if `price` is the
# prediction target it would leak the target into the inputs — confirm before
# using it as a model feature.
df['price_per_sqft'] = _safe_ratio(df['price'], df['sqft'])

# Feature 3: Ratio of bedrooms to bathrooms (0 when bathrooms == 0)
df['bed_bath_ratio'] = _safe_ratio(df['bedrooms'], df['bathrooms'])

# 📊 Quick Look at Engineered Data
df[['house_age', 'price_per_sqft', 'bed_bath_ratio']].describe()


In [None]:
# 📊 5. Visualize the New Features

# Draw one histogram (with a KDE overlay) per engineered feature, side by side,
# as a sanity check that the distributions make sense.
feature_titles = [
    ('house_age', 'Distribution of House Age'),
    ('price_per_sqft', 'Price per Sqft'),
    ('bed_bath_ratio', 'Bedroom to Bathroom Ratio'),
]

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# Pair each subplot axis with its (column, title) entry
for ax, (column, title) in zip(axs, feature_titles):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()


In [None]:
# 💾 Save Final Engineered Features (Human-Readable CSV)

# Columns written to disk: the original columns first, then the engineered ones
selected_columns = [
    # original columns
    'price',
    'sqft',
    'bedrooms',
    'bathrooms',
    'location',
    'year_built',
    'condition',
    # engineered columns
    'house_age',
    'price_per_sqft',
    'bed_bath_ratio',
]

# index=False keeps the DataFrame index out of the CSV
df[selected_columns].to_csv(
    path_or_buf="../data/processed/data_scientists_features.csv",
    index=False,
)
print("✅ Saved human-readable engineered features to ../data/processed/data_scientists_features.csv")


In [None]:
# ✅ 8. Summary
# We:
# - Created domain-informed features: house_age, price_per_sqft, bed_bath_ratio
# - Explored and visualized them
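# - Saved the engineered features as a human-readable CSV (data_scientists_features.csv) for downstream ML tasks
# - NOTE(review): no sklearn preprocessing pipeline is built or saved in the cells shown here — confirm where that step lives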

# 👨‍🔬 This notebook represents the **data scientist/MLE workflow** for feature engineering,
# before these steps are automated via a script like `engineer.py` for production.

# 📦 Next step: Experimentation to find out the right Algorithm and its Configurations (Hyperparameters)!
