In [None]:
import pandas as pd
import numpy as np

# 1. Load the Dataset
# Source: a GitHub-hosted copy of the Kaggle House Prices training CSV.
url = "https://raw.githubusercontent.com/joolsa/Kaggle_House_Prices/master/train.csv"

# Read the CSV and discard the 'Id' column in one step so it is never
# carried along as a feature.
train_df = pd.read_csv(url).drop(columns='Id')

# ---------------------------------------------------------
# METRIC JUSTIFICATION: RMSLE vs RMSE Example
# ---------------------------------------------------------
print("--- RMSLE vs RMSE Demonstration ---")
from sklearn.metrics import root_mean_squared_error, root_mean_squared_log_error

# Hypothetical scenario: the same $10k miss on a cheap home and on an
# expensive home (a 10% miss versus a 1% miss).
actuals = [100_000, 1_000_000]
predictions = [110_000, 1_010_000]  # $10k error for both

# Pooled metrics over both homes.
rmse = root_mean_squared_error(actuals, predictions)
rmsle = root_mean_squared_log_error(actuals, predictions)

print(f"Absolute RMSE for both combined: ${rmse:.2f}")
print(f"RMSLE (Relative error): {rmsle:.4f}")

# The pooled numbers alone do not show the contrast, so score each home on
# its own: the RMSE is the same $10k for both, while the RMSLE is larger for
# the cheap home because the miss is a bigger fraction of its price.
for label, actual, predicted in zip(("Cheap home", "Expensive home"), actuals, predictions):
    home_rmse = root_mean_squared_error([actual], [predicted])
    home_rmsle = root_mean_squared_log_error([actual], [predicted])
    print(f"  {label}: RMSE ${home_rmse:,.2f} | RMSLE {home_rmsle:.4f}")

print("RMSLE naturally normalizes the errors across huge price differences.\n")

# ---------------------------------------------------------
# DATA OVERVIEW & BASELINE REPORT
# ---------------------------------------------------------
print("--- Dataset Overview ---")
print(f"Shape (Rows, Columns): {train_df.shape}\n")

# Target Summary: central tendency and skewness of the sale price.
target = train_df['SalePrice']
mean_price = target.mean()
median_price = target.median()
price_skew = target.skew()

print("--- Target (SalePrice) Summary ---")
print(f"Mean:   ${mean_price:,.0f}")
print(f"Median: ${median_price:,.0f}")
print(f"Skew:   {price_skew:.2f}")
print("")

# Feature Typology
# Exclude the target from the feature count
features = train_df.drop('SalePrice', axis=1)
num_features = features.select_dtypes(include=[np.number]).columns
# Treat every non-numeric column as categorical instead of matching the
# 'object' dtype by name. Depending on the pandas version, text columns may
# be loaded with a dedicated string dtype rather than 'object'; taking the
# complement of the numeric columns keeps the two groups a full partition of
# the features either way.
cat_features = features.select_dtypes(exclude=[np.number]).columns

print("--- Feature Types ---")
print(f"Numeric features:     {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")
print()

# Missing Values Table
print("--- Top 10 Missing Features ---")

# Per-column count of missing cells and the same figure as a share of rows.
missing_counts = features.isna().sum()
missing_pct = (missing_counts / len(features)) * 100

# Combine both series into a single table indexed by feature name.
missing_df = missing_counts.to_frame(name='Missing Count').assign(Percentage=missing_pct)

# Keep only features with at least one gap, worst first, limited to ten rows.
has_missing = missing_df['Missing Count'] > 0
ranked = missing_df.loc[has_missing].sort_values(by='Missing Count', ascending=False)
top_missing = ranked.iloc[:10]

# Render the percentage as text with one decimal place for display.
top_missing = top_missing.assign(
    Percentage=top_missing['Percentage'].map(lambda pct: f"{pct:.1f}%")
)

print(top_missing)