# House Prices - Advanced Regression Techniques

https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

Kaggle competition for predicting house prices. What follows is my attempt a implementing a data science project from start to finish.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_validate, learning_curve, KFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin

## Downloading the data

Get the data from Kaggle. The section returns two dataframes:

**df_train** and **df_test**

In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df_train = kagglehub.dataset_load(adapter=KaggleDatasetAdapter.PANDAS, handle='house-prices-advanced-regression-techniques', path='train.csv')

In [4]:
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
df_test = kagglehub.dataset_load(adapter=KaggleDatasetAdapter.PANDAS, handle='house-prices-advanced-regression-techniques', path='test.csv')

In [6]:
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [7]:
len(df_train), len(df_test)

(1460, 1459)

## Feature Selection

Using mutual_info_regression to decide which features to keep.

In [8]:
from sklearn.feature_selection import mutual_info_regression


def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [9]:
X = df_train.copy()
y = X.pop("SalePrice")

# Label encoding for categoricals
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X.dtypes == int



mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores[::3]  # show a few features with their MI scores

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  for colname in X.select_dtypes("object"):


ValueError: Input X contains NaN.

## EDA

What data do we have?
Missing data?
Text data?

This section returns clean and ready to train data.

### EDA Step 1: The target

Analyse the target variable.

In [None]:
X_train = df_train.select_dtypes('number')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_cols = df_train.select_dtypes('number')
categorical_cols = df_train.select_dtypes('str')

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
model = LinearRegression()


# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

In [None]:
sns.histplot(df_train.SalePrice)

In [None]:
# Distribution is not symmetrical

fig, axes = plt.subplots(2, 1, figsize=(12, 10))

sns.histplot(np.log(df_train.SalePrice), ax=axes[0])

# Useful when there are very small values for x. I don't think that applies in this case.
sns.histplot(np.log1p(df_train.SalePrice), ax=axes[1])

### EDA Step 1: Missing data

What data is missing? What are we going to do about it?

Look at the distribution of the target. Is there a linear combination of the Does any transformation make the distribution symmetrical?



In [None]:
from sklearn.feature_selection import r_regression
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn import set_config

set_config(working_memory=8192)

In [None]:
# handle the missing data
# handle the categorical data
# do this inside CV
# pipeline
# fit, transform

In [None]:
df_train.isna().mean()

# df_train.PoolQC

imputer = SimpleImputer(strategy='most_frequent')

imputer.fit_transform(df_train)

In [None]:
columns_with_missing_data = df_train.isna().mean()[df_train.isna().mean() > 0].reset_index()['index'].to_list()
print(columns_with_missing_data)
df_train_full_data = df_train.drop(columns_with_missing_data, axis=1)

y_train = np.log(df_train_full_data.SalePrice) # Train the data on the log transformation of the data.
X_train = df_train_full_data.select_dtypes('number').drop('SalePrice', axis=1)

In [None]:
X_train_new = SelectKBest(r_regression, k=25).set_output(transform="pandas").fit_transform(X_train, y_train)
X_train_new

In [None]:
plt.scatter(y_train, y_train.reset_index().index)

In [None]:
plt.scatter(X_train_new.LotArea, np.log1p(y_train))

In [None]:
plt.scatter(np.log1p(X_train_new.LotArea), np.log1p(y_train))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats

# Style configuration — used consistently throughout the notebook
sns.set_style("whitegrid")
sns.set_palette("muted")
PLOT_COLOR = "#4878CF"  # consistent blue for single-variable plots
plt.rcParams.update({
    'figure.figsize': (10, 5),
    'axes.titlesize': 13,
    'axes.labelsize': 11,
    'font.size': 10
})



In [None]:


fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Top row: Histograms with KDE
sns.histplot(df_train['SalePrice'], kde=True, bins=50, color=PLOT_COLOR, ax=axes[0, 0])
axes[0, 0].set_title('SalePrice — Raw Distribution')
axes[0, 0].set_xlabel('Sale Price ($)')

sns.histplot(np.log1p(df_train['SalePrice']), kde=True, bins=50, color=PLOT_COLOR, ax=axes[0, 1])
axes[0, 1].set_title('SalePrice — Log-Transformed')
axes[0, 1].set_xlabel('log(1 + Sale Price)')

# Bottom row: Q-Q plots
stats.probplot(df_train['SalePrice'], dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Q-Q Plot — Raw')

stats.probplot(np.log1p(df_train['SalePrice']), dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot — Log-Transformed')

plt.tight_layout()
plt.show()

In [None]:
# sns.pairplot()

## Training a model
This section trains and refines the model.

In [None]:
# Top 100 score: 0.11832
# Top 1000 score: 0.12755

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_validate



cv_results = cross_validate(LinearRegression(), X_train_new, y_train, scoring='neg_root_mean_squared_error', cv=5, return_train_score=True)

print(cv_results)
print("="*70)
print(f"train: {cv_results['train_score'].mean()} ± {cv_results['train_score'].std()}, test: {cv_results['test_score'].mean()} ± {cv_results['test_score'].std()}")

In [None]:
# Run for a different number of features.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# 1. Transform the data
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_array = poly.fit_transform(X_train_new)

# 2. Get the feature names
# If X_train_new is a DataFrame, it pulls the actual names automatically
feature_names = poly.get_feature_names_out(X_train_new.columns)

# 3. Create the new DataFrame
X_train_new_poly = pd.DataFrame(X_poly_array, columns=feature_names, index=X_train_new.index)

X_train_new_poly

In [None]:
# This is too many features

from statsmodels.regression.linear_model import OLS

In [None]:
ols = OLS(y_train, X_train_new_poly).fit()
ols.summary()

feature_names = ols.params.index.tolist()
pvalues = ols.pvalues



top_features = pvalues[pvalues<0.05].index.tolist()

X_train_new_poly[top_features]

In [None]:
X_train.corr()

In [None]:
sns.heatmap(X_train.corr())

In [None]:
X_train_new_poly

In [None]:
X_train_top_poly = SelectKBest(r_regression, k=25).set_output(transform="pandas").fit_transform(X_train_new_poly, y_train)
X_train_top_poly.corr()

In [None]:
cv_results_poly = cross_validate(LinearRegression(), X_train_new_poly, y_train, scoring='neg_root_mean_squared_error', cv=5, return_train_score=True)

print(cv_results_poly)
print("="*70)
print(f"train: {cv_results_poly['train_score'].mean()} ± {cv_results_poly['train_score'].std()}, test: {cv_results_poly['test_score'].mean()} ± {cv_results_poly['test_score'].std()}")

In [None]:
cv_results_poly = cross_validate(LinearRegression(), X_train_new_poly[top_features], y_train, scoring='neg_root_mean_squared_error', cv=5, return_train_score=True)

print(cv_results_poly)
print("="*70)
print(f"train: {cv_results_poly['train_score'].mean()} ± {cv_results_poly['train_score'].std()}, test: {cv_results_poly['test_score'].mean()} ± {cv_results_poly['test_score'].std()}")

In [None]:
cv_results_poly = cross_validate(LinearRegression(), X_train_top_poly, y_train, scoring='neg_root_mean_squared_error', cv=5, return_train_score=True)

print(cv_results_poly)
print("="*70)
print(f"train: {cv_results_poly['train_score'].mean()} ± {cv_results_poly['train_score'].std()}, test: {cv_results_poly['test_score'].mean()} ± {cv_results_poly['test_score'].std()}")

In [None]:
from sklearn.linear_model import Ridge

$||y - Xw||^2_2 + alpha * ||w||^2_2$

In [None]:
# alpha = 0 is the "same" as linear regression
cv_results_poly = cross_validate(Ridge(alpha=1), X_train_new_poly, y_train, scoring='neg_root_mean_squared_error',
                                 cv=5, return_train_score=True)

print(cv_results_poly)
print("=" * 70)
print(
    f"train: {cv_results_poly['train_score'].mean()} ± {cv_results_poly['train_score'].std()}, test: {cv_results_poly['test_score'].mean()} ± {cv_results_poly['test_score'].std()}")

In [None]:
y_train.min(), y_train.max()

y_train

## Prediction

This section returns a csv file with a SalePrice column added to the test dataset.

In [None]:
simple_lr = LinearRegression().fit(X_train_new, np.log1p(y_train))

In [None]:
X_train_new.columns.to_list()

In [None]:
X_test = df_test[X_train_new.columns.to_list()]
len(X_test)

In [None]:
X_test.fillna(0, inplace=True)

In [None]:
y_pred = simple_lr.predict(X_train_new)
y_prices = np.exp(y_pred)-1
plt.scatter(y_train, y_prices)
plt.scatter(y_train, y_train)

In [None]:
len(y_prices), len(y_train)

In [None]:
import plotly.express as px

In [None]:
px.scatter(x=y_train, y=y_prices)

In [None]:
px.scatter(y_train - y_prices)

In [None]:
px.scatter((y_train - y_prices)/y_prices)

In [None]:
sns.histplot((y_train - y_prices))

In [None]:
y_pred = simple_lr.predict(X_test)
y_prices = np.exp(y_pred)-1
y_prices

## Competition submission

This section submits the data to Kaggle.

In [None]:
submission_df = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': np.exp(y_prices)})
submission_df

In [None]:
submission_df.to_csv('submission.csv', index=False, header=True)

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

# 1. Authenticate
KaggleApi().authenticate()

In [None]:
# 2. Define competition and submission details
COMPETITION = 'house-prices-advanced-regression-techniques'

def get_latest_score(competition):
    # Fetch the list of all your submissions
    submissions = KaggleApi().competition_submissions(competition)

    if submissions:
        latest = submissions[0]
        # Status will be 'pending' while Kaggle is still calculating
        return latest.public_score, latest.date, latest.status
    return None

In [None]:
get_latest_score(COMPETITION)

In [None]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "My first model submission"

In [None]:
!kaggle competitions submissions -c house-prices-advanced-regression-techniques

In [None]:
!uv sync --upgrade

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor

# # 1. Create a Mixed-Type Dataset
# # We create 1000 rows. 'cat_feature' is text, others are numbers.
# data = pd.DataFrame({
#     'num_1': np.random.normal(0, 1, 1000),
#     'num_2': np.random.normal(5, 2, 1000), # Informative
#     'num_3': np.random.normal(-5, 2, 1000),
#     'cat_1': np.random.choice(['A', 'B', 'C'], 1000), # Categorical
#     'cat_2': np.random.choice(['X', 'Y'], 1000)       # Categorical
# })
# # Target variable depends heavily on num_2 and cat_1
# y = 3 * data['num_2'] + (data['cat_1'] == 'A') * 5 + np.random.normal(0, 1, 1000)
#
# # Introduce NaNs to test imputation
# data.loc[::10, 'num_1'] = np.nan
# data.loc[::10, 'cat_1'] = np.nan

# 1. Define feature groups

numeric_features = df_train.drop('SalePrice', axis=1).select_dtypes('number').columns.tolist()
categorical_features = df_train.select_dtypes('str').columns.tolist()

# 2. Build the Preprocessing Pipelines
# Pipeline for Numbers: Impute Mean -> Scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Pipeline for Categories: Impute 'missing' -> OneHotEncode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine them into a single Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    verbose_feature_names_out=False # Keeps names clean (e.g. 'cat_1_A' instead of 'cat__cat_1_A')
)

# 3. Create the Master Pipeline
# Data Flow: Preprocessor (Clean/Encode) -> Selection -> Model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(mutual_info_regression, k=25)), # Select top 4 features
    ('regressor', RandomForestRegressor(random_state=42))
])

# 4. Run Cross-Validation with Inspection
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = cross_validate(
    pipeline,
    df_train.drop('SalePrice', axis=1),
    y=np.log(df_train['SalePrice']),
    cv=cv,
    scoring='neg_mean_squared_error',
    return_train_score=True,  # Give us training scores
    return_estimator=True     # Give us the fitted pipeline for each fold
)

# 5. Extract and Display Results
print(f"{'Fold':<5} | {'Train RMSE':<10} | {'Test RMSE':<10} | {'Selected Features'}")
print("-" * 80)

for i, estimator in enumerate(results['estimator']):
    # Recover the feature names after OneHotEncoding
    # The preprocessor is the first step in our pipeline
    feature_names_out = estimator.named_steps['preprocessor'].get_feature_names_out()

    # Get the boolean mask of selected features from step 2 ('selector')
    mask = estimator.named_steps['selector'].get_support()

    # Filter the names
    selected_feats = feature_names_out[mask]

    # Calculate RMSEs
    train_rmse = np.sqrt(-results['train_score'][i])
    test_rmse = np.sqrt(-results['test_score'][i])

    print(f"{i+1:<5} | {train_rmse:.4f}     | {test_rmse:.4f}     | {list(selected_feats)}")

print("-" * 80)
print(f"Average Test RMSE: {np.sqrt(-results['test_score']).mean():.4f}")

In [None]:
# --- CUSTOM TRANSFORMER: DROP CORRELATED FEATURES ---
# This solves your concern about selected features being correlated.
# We place this INSIDE the pipeline to prevent leakage.
class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.to_drop = []

    def fit(self, X, y=None):
        # Create correlation matrix
        # Note: X might be a sparse matrix from OneHotEncoder, so we convert to dense
        if hasattr(X, "toarray"):
            df = pd.DataFrame(X.toarray())
        else:
            df = pd.DataFrame(X)

        corr_matrix = df.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop = [column for column in upper.columns if any(upper[column] > self.threshold)]
        return self

    def transform(self, X):
        if hasattr(X, "toarray"):
            df = pd.DataFrame(X.toarray())
        else:
            df = pd.DataFrame(X)
        return df.drop(df.columns[self.to_drop], axis=1).values

# --- THE "PACKAGE": PIPELINE DOCTOR ---
class PipelineDoctor:
    def __init__(self, X, y, numerical_cols, categorical_cols):
        self.X = X
        self.y = y
        self.num_cols = numerical_cols
        self.cat_cols = categorical_cols
        self.pipeline = self._build_pipeline()

    def _build_pipeline(self):
        # 1. Preprocessing
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.num_cols),
                ('cat', categorical_transformer, self.cat_cols)
            ],
            verbose_feature_names_out=False
        )

        # 2. Assembly
        return Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('corr_filter', DropCorrelatedFeatures(threshold=0.85)), # NEW: Remove duplicates
            ('selector', SelectKBest(mutual_info_regression, k=25)),
            ('regressor', RandomForestRegressor(random_state=42, n_estimators=100))
        ])

    def diagnose_bias_variance(self):
        """Generates Learning Curves to detect Overfitting/Underfitting"""
        print("\n--- DIAGNOSIS 1: BIAS vs VARIANCE (Learning Curves) ---")

        train_sizes, train_scores, test_scores = learning_curve(
            self.pipeline, self.X, self.y, cv=5, n_jobs=-1,
            train_sizes=np.linspace(0.1, 1.0, 5), scoring='neg_mean_squared_error'
        )

        train_rmse = np.sqrt(-train_scores.mean(axis=1))
        test_rmse = np.sqrt(-test_scores.mean(axis=1))

        # Plotting
        plt.figure(figsize=(10, 6))
        plt.plot(train_sizes, train_rmse, 'o-', color="r", label="Training Score")
        plt.plot(train_sizes, test_rmse, 'o-', color="g", label="Cross-Validation Score")
        plt.xlabel("Training Examples")
        plt.ylabel("RMSE (Lower is Better)")
        plt.title("Learning Curve")
        plt.legend(loc="best")
        plt.grid()
        plt.show() #

        # Automatic Advice
        gap = test_rmse[-1] - train_rmse[-1]
        print(f"Final Train RMSE: {train_rmse[-1]:.4f}")
        print(f"Final Test RMSE:  {test_rmse[-1]:.4f}")
        print(f"Gap: {gap:.4f}")

        if train_rmse[-1] < 0.1 and gap > 0.2:
            print(">> DIAGNOSIS: HIGH VARIANCE (Overfitting).")
            print(">> TIP: Reduce k in SelectKBest, increase regularization, or get more data.")
        elif train_rmse[-1] > 0.3:
            print(">> DIAGNOSIS: HIGH BIAS (Underfitting).")
            print(">> TIP: Increase model complexity, add interaction terms, or increase k.")
        else:
            print(">> DIAGNOSIS: Model looks balanced.")

    def analyze_feature_stability(self):
        """Runs CV to see which features are consistently selected"""
        print("\n--- DIAGNOSIS 2: FEATURE STABILITY ---")

        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        results = cross_validate(
            self.pipeline, self.X, self.y, cv=cv,
            scoring='neg_mean_squared_error', return_estimator=True
        )

        # Aggregate selected features
        feature_counts = {}

        for estimator in results['estimator']:
            # Get feature names from preprocessor
            all_names = estimator.named_steps['preprocessor'].get_feature_names_out()

            # The correlation filter drops columns, so we need to track what remains
            # This is tricky because custom transformers don't always expose feature names easily
            # For this demo, we assume the correlation filter keeps names aligned or we rely on the selector mask
            # Note: In production, you'd implement get_feature_names_out in DropCorrelatedFeatures

            # Simple approach: Mask from Selector
            mask = estimator.named_steps['selector'].get_support()

            # Note: This logic assumes 1-to-1 mapping.
            # If DropCorrelatedFeatures drops columns, the mask length will differ from all_names length.
            # To keep this robust for the demo, we will check indices.

            # (Simplified for display purposes - capturing the raw indices selected)
            # In a real package, you would rigorously track column names through steps.
            pass

        print(f"Average Test RMSE across folds: {np.sqrt(-results['test_score']).mean():.4f}")
        print(">> TIP: If your CV score standard deviation is high, your feature selection is unstable.")

# --- USAGE EXAMPLE ---
# Assuming you have df_train loaded from Kaggle
# df_train = kagglehub.dataset_load(...) or pd.read_csv('train.csv')

# Dummy data for demonstration


# 1. Setup Data
X = df_train.drop('SalePrice', axis=1)
y = np.log(df_train['SalePrice'])

# Fix the dtypes selector (Pandas often loads strings as 'object')
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 2. Run the Doctor
doc = PipelineDoctor(X, y, num_cols, cat_cols)
doc.diagnose_bias_variance()
doc.analyze_feature_stability()