In [None]:
# Import libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting libraries.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet and seaborn colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(palette='husl')

# Input CSV and figure output directory (relative paths)
DATA_PATH = Path('../data/train.csv')
FIGURES_PATH = Path('../report/figures')
# Create the output directory with any missing parents; no error if it already exists.
FIGURES_PATH.mkdir(exist_ok=True, parents=True)

print(f"Figures will be saved to: {FIGURES_PATH.resolve()}")

In [None]:
# Read the training CSV into `df` and report its dimensions
df = pd.read_csv(DATA_PATH)
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of instances: {n_rows}")
print(f"Number of features: {n_cols}")
# Bare last expression so the notebook renders the first rows as a table
df.head()

## 1. Missing Values Analysis

In [None]:
# Per-column null counts, restricted to columns that have at least one null,
# ordered from most to fewest missing entries
null_counts = df.isna().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)
# Share of rows that are null, expressed as a percentage
missing_pct = missing / len(df) * 100

# Tabular view: one row per affected column
missing_df = pd.DataFrame({
    'Feature': missing.index,
    'Missing Count': missing.to_numpy(),
    'Missing %': missing_pct.to_numpy(),
})

print(f"Features with missing values: {len(missing_df)}")
missing_df.head(20)

In [None]:
# Plot missing values: horizontal bar chart of the features with the highest
# missing percentage, saved to FIGURES_PATH / 'missing_values.png'.
TOP_N_MISSING = 20  # upper bound on the number of features drawn

fig, ax = plt.subplots(figsize=(12, 8))

# head() returns fewer rows when fewer features have missing values
top_missing = missing_df.head(TOP_N_MISSING)
# One shade of red per bar, sampled evenly from the Reds colormap
colors = plt.cm.Reds(np.linspace(0.3, 0.9, len(top_missing)))

bars = ax.barh(top_missing['Feature'], top_missing['Missing %'], color=colors)
ax.set_xlabel('Missing Percentage (%)', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
# Report the number of bars actually drawn instead of a hard-coded "20",
# so the title stays truthful when fewer features have missing values.
ax.set_title(f'Top {len(top_missing)} Features with Missing Values',
             fontsize=14, fontweight='bold')
# Put the first row of top_missing (highest missing %) at the top of the chart
ax.invert_yaxis()

# Add percentage labels just past the end of each bar
for bar, pct in zip(bars, top_missing['Missing %']):
    ax.text(bar.get_width() + 0.5, bar.get_y() + bar.get_height()/2,
            f'{pct:.1f}%', va='center', fontsize=9)

plt.tight_layout()
missing_fig_path = FIGURES_PATH / 'missing_values.png'
plt.savefig(missing_fig_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {missing_fig_path}")

## 2. Target Variable: SalePrice Distribution

In [None]:
# Descriptive statistics of the target column
sale_price = df['SalePrice']

print("SalePrice Statistics:")

# Statistics reported as whole-dollar amounts
dollar_stats = {
    'Min': sale_price.min(),
    'Max': sale_price.max(),
    'Mean': sale_price.mean(),
    'Median': sale_price.median(),
    'Std': sale_price.std(),
}
for stat_name, stat_value in dollar_stats.items():
    print(f"  {stat_name}: ${stat_value:,.0f}")

# Shape statistics reported to three decimals
shape_stats = {
    'Skewness': sale_price.skew(),
    'Kurtosis': sale_price.kurtosis(),
}
for stat_name, stat_value in shape_stats.items():
    print(f"  {stat_name}: {stat_value:.3f}")

In [None]:
# SalePrice distribution: Before and After log1p transformation
#
# 2x2 figure: the top row shows the raw prices, the bottom row the log1p
# prices; each row pairs a histogram/KDE with a normal Q-Q plot.
# `stats` (scipy.stats) comes from the notebook's import cell.

def _plot_price_distribution(values, hist_ax, qq_ax, *, variant, xlabel, color, value_fmt):
    """Draw one row of the figure: histogram with KDE plus a normal Q-Q plot.

    Parameters
    ----------
    values : pandas.Series
        Data to plot; its ``mean()`` and ``median()`` are marked on the histogram.
    hist_ax, qq_ax : matplotlib.axes.Axes
        Target axes for the histogram and the Q-Q plot respectively.
    variant : str
        Text placed in parentheses in both panel titles.
    xlabel : str
        X-axis label of the histogram.
    color : str
        Histogram colour.
    value_fmt : str
        ``str.format`` template used to render mean/median in the legend.
    """
    mean_value = values.mean()
    median_value = values.median()

    sns.histplot(values, kde=True, ax=hist_ax, color=color, bins=50)
    hist_ax.set_title(f'SalePrice Distribution ({variant})', fontsize=12, fontweight='bold')
    hist_ax.set_xlabel(xlabel)
    hist_ax.axvline(mean_value, color='red', linestyle='--',
                    label=f"Mean: {value_fmt.format(mean_value)}")
    hist_ax.axvline(median_value, color='green', linestyle='--',
                    label=f"Median: {value_fmt.format(median_value)}")
    hist_ax.legend()

    # Sample quantiles against theoretical normal quantiles
    stats.probplot(values, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'Q-Q Plot ({variant})', fontsize=12, fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top row: original prices
_plot_price_distribution(
    df['SalePrice'], axes[0, 0], axes[0, 1],
    variant='Original', xlabel='SalePrice ($)',
    color='#3498db', value_fmt='${:,.0f}',
)

# Bottom row: log1p-transformed prices
log_price = np.log1p(df['SalePrice'])
_plot_price_distribution(
    log_price, axes[1, 0], axes[1, 1],
    variant='Log1p Transformed', xlabel='log1p(SalePrice)',
    color='#2ecc71', value_fmt='{:.2f}',
)

plt.tight_layout()
distribution_fig_path = FIGURES_PATH / 'saleprice_distribution.png'
plt.savefig(distribution_fig_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"\nSkewness after log1p: {log_price.skew():.3f}")
print(f"Saved: {distribution_fig_path}")

## 3. Correlation Analysis

In [None]:
# Compute correlations with SalePrice
#
# `correlations` holds the signed correlation of every numeric column
# with SalePrice (SalePrice itself removed), strongest relationship first.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlations = (
    df[numeric_cols]
    .corr()['SalePrice']
    .drop('SalePrice')
    # Rank by magnitude: sorting on the signed value would push a strong
    # negative correlation to the bottom of a "top correlated" list.
    .sort_values(key=lambda s: s.abs(), ascending=False)
)

print("Top 15 features correlated with SalePrice:")
print(correlations.head(15))

In [None]:
# Heatmap of pairwise correlations among the first ten entries of
# `correlations` plus the target itself
top_features = [*correlations.index[:10], 'SalePrice']
corr_matrix = df[top_features].corr()

# Boolean mask that is True on the diagonal and above, so only the lower
# triangle of the matrix is drawn
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    annot_kws={'size': 10},
    ax=ax,
)
ax.set_title('Correlation Heatmap: Top 10 Features + SalePrice', fontsize=14, fontweight='bold')

plt.tight_layout()
heatmap_fig_path = FIGURES_PATH / 'correlation_heatmap.png'
plt.savefig(heatmap_fig_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {heatmap_fig_path}")

In [None]:
# Bar plot of correlations: the first 15 entries of `correlations` as
# horizontal bars, green for positive and red for non-positive values.
fig, ax = plt.subplots(figsize=(10, 8))

top_corr = correlations.head(15)
colors = ['#2ecc71' if c > 0 else '#e74c3c' for c in top_corr.values]

bars = ax.barh(top_corr.index, top_corr.values, color=colors)
ax.set_xlabel('Correlation with SalePrice', fontsize=12)
ax.set_title('Top 15 Features Correlated with SalePrice', fontsize=14, fontweight='bold')
# Zero reference line separating positive from negative bars
ax.axvline(x=0, color='black', linewidth=0.5)
# Put the first entry of top_corr at the top of the chart
ax.invert_yaxis()

# Value labels at the outer end of each bar. bar_label places the text on
# the correct side for negative bars, whereas a fixed `val + 0.01` offset
# would draw the label on top of a negative bar.
ax.bar_label(bars, fmt='%.3f', padding=3, fontsize=9)

plt.tight_layout()
barplot_fig_path = FIGURES_PATH / 'correlation_barplot.png'
plt.savefig(barplot_fig_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {barplot_fig_path}")

## 4. Scatter Plots: Top Features vs SalePrice

In [None]:
# Scatter of six numeric features against SalePrice, each with a fitted
# straight trend line and the correlation coefficient in the panel title
top_6_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF']

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for ax, feature in zip(axes, top_6_features):
    feature_values = df[feature]
    ax.scatter(feature_values, df['SalePrice'], alpha=0.5, c='#3498db', s=30)

    # Degree-1 least-squares fit on the rows where the feature is present
    has_value = feature_values.notna()
    trend_coeffs = np.polyfit(feature_values[has_value], df.loc[has_value, 'SalePrice'], 1)
    x_line = np.linspace(feature_values.min(), feature_values.max(), 100)
    ax.plot(x_line, np.polyval(trend_coeffs, x_line), 'r--', linewidth=2, label='Trend')

    r_value = feature_values.corr(df['SalePrice'])
    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel('SalePrice ($)', fontsize=11)
    ax.set_title(f'{feature} vs SalePrice (r={r_value:.3f})', fontsize=12, fontweight='bold')
    ax.legend(loc='upper left')

plt.tight_layout()
scatter_fig_path = FIGURES_PATH / 'scatter_plots.png'
plt.savefig(scatter_fig_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {scatter_fig_path}")

## 5. Boxplots: Categorical Features vs SalePrice

In [None]:
# SalePrice boxplots grouped by four features treated as categories
# NOTE(review): recent seaborn releases reportedly deprecate `palette` without
# `hue`; confirm against the installed version before changing the calls.
cat_features = ['OverallQual', 'Neighborhood', 'KitchenQual', 'GarageCars']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for ax, feature in zip(axes, cat_features):
    is_neighborhood = feature == 'Neighborhood'

    # Neighborhood boxes are ordered by ascending median price; every other
    # feature passes order=None and keeps seaborn's own ordering.
    if is_neighborhood:
        order = df.groupby(feature)['SalePrice'].median().sort_values().index
    else:
        order = None

    sns.boxplot(data=df, x=feature, y='SalePrice', ax=ax, order=order, palette='viridis')

    if is_neighborhood:
        # Rotate and shrink the neighborhood tick labels
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=8)

    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel('SalePrice ($)', fontsize=11)
    ax.set_title(f'SalePrice by {feature}', fontsize=12, fontweight='bold')

plt.tight_layout()
boxplot_fig_path = FIGURES_PATH / 'boxplots.png'
plt.savefig(boxplot_fig_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {boxplot_fig_path}")

## 6. Feature Types Summary

In [None]:
# Split column names by dtype: numeric columns versus object-dtype columns
numeric_features = list(df.select_dtypes(include=[np.number]).columns)
categorical_features = list(df.select_dtypes(include=['object']).columns)

n_total = len(df.columns)
n_numeric = len(numeric_features)
n_categorical = len(categorical_features)

print(f"Total Features: {n_total}")
print(f"Numeric Features: {n_numeric}")
print(f"Categorical Features: {n_categorical}")
# Show only the first ten names of each group
print(f"\nNumeric: {numeric_features[:10]}...")
print(f"\nCategorical: {categorical_features[:10]}...")

In [None]:
# Pairplot for top 4 features: lower-triangle scatter matrix with KDE
# diagonals, saved to FIGURES_PATH / 'pairplot.png'.
top_4 = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'SalePrice']
g = sns.pairplot(df[top_4], diag_kind='kde', corner=True,
                 plot_kws={'alpha': 0.5, 's': 30})
# `figure` is the preferred accessor; `fig` is a legacy alias on seaborn grids.
g.figure.suptitle('Pairplot: Top Features vs SalePrice', y=1.02, fontsize=14, fontweight='bold')

# Save through the grid object so the output does not depend on which
# figure pyplot currently considers active.
pairplot_fig_path = FIGURES_PATH / 'pairplot.png'
g.savefig(pairplot_fig_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {pairplot_fig_path}")

## 7. Summary Statistics

In [None]:
# Text recap of the dataset size, target statistics, leading correlations
# and missing-value counts computed in the earlier cells
divider = "=" * 60
target = df['SalePrice']

print(divider)
print("EDA SUMMARY")
print(divider)
print("Dataset: Ames Housing")
print(f"Instances: {len(df)}")
print(f"Features: {len(df.columns)}")
print(f"  - Numeric: {len(numeric_features)}")
print(f"  - Categorical: {len(categorical_features)}")

print("\nTarget Variable: SalePrice")
print(f"  - Range: ${target.min():,.0f} - ${target.max():,.0f}")
print(f"  - Mean: ${target.mean():,.0f}")
print(f"  - Median: ${target.median():,.0f}")

print("\nTop Correlated Features:")
for feature_name, r_value in correlations.head(5).items():
    print(f"  - {feature_name}: {r_value:.3f}")

# Number of features whose missing percentage exceeds 50
n_mostly_missing = int((missing_df['Missing %'] > 50).sum())
print(f"\nFeatures with >50% missing: {n_mostly_missing}")
print(f"Features with any missing: {len(missing_df)}")
print(divider)
print(f"\nAll figures saved to: {FIGURES_PATH.resolve()}")