In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings for cleaner output.
# NOTE(review): a blanket 'ignore' filter silences every warning category for the
# rest of the run, not only cosmetic ones — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

print("Starting House Prices Prediction Analysis...")

# --- 1. Data Loading ---
# Load the datasets from the provided CSV files.
# The locations are named once so they are easy to spot and change.
TRAIN_CSV_PATH = r"C:\Users\DELL\Downloads\train (1).csv"
TEST_CSV_PATH = r"C:\Users\DELL\Downloads\test (1).csv"

try:
    # Keep the try body limited to the calls that can raise FileNotFoundError.
    train_df_original = pd.read_csv(TRAIN_CSV_PATH)
    test_df_original = pd.read_csv(TEST_CSV_PATH)
except FileNotFoundError as e:
    print(f"Error loading files: {e}. Please ensure the files are accessible in the environment.")
    # Abort with a non-zero status. The bare `exit()` helper is injected by the
    # `site` module for interactive use, is not guaranteed to exist, and reports
    # status 0 (success) when called without an argument.
    raise SystemExit(1) from e
else:
    print("Files 'train (1).csv' and 'test (1).csv' loaded successfully.")

# Keep the Id columns aside; they are not features.
train_ids = train_df_original['Id']
test_ids = test_df_original['Id']

# The model is trained on log(1 + SalePrice); the evaluation metric (RMSE) is based
# on log-transformed prices, and log1p stays finite for a zero price.
sale_price_log = np.log1p(train_df_original['SalePrice'])

# Strip the identifier (and, for the training set, the target) so that only
# feature columns remain.
train_features = train_df_original.drop(columns=['Id', 'SalePrice'])
test_features = test_df_original.drop(columns='Id')

# Stack train on top of test with a fresh 0..n-1 index so that both sets go
# through exactly the same preprocessing.
all_data = pd.concat([train_features, test_features], axis=0, ignore_index=True)
print(f"Combined data shape before preprocessing: {all_data.shape}")

# --- 2. Missing Value Imputation ---
# Each group of columns gets the fill rule that matches what a missing entry
# means for it (per data_description.txt).

# 2.1. Categorical columns where a missing entry means the feature is absent.
absent_category_cols = (
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtCond',
    'MasVnrType',
)
for name in absent_category_cols:
    if name not in all_data.columns:
        continue
    all_data[name] = all_data[name].fillna('None')

# 2.2. Numeric columns where a missing entry means a quantity of zero.
absent_numeric_cols = (
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'GarageCars', 'GarageArea', 'BsmtFullBath', 'BsmtHalfBath',
)
for name in absent_numeric_cols:
    if name not in all_data.columns:
        continue
    all_data[name] = all_data[name].fillna(0)

# 2.3. LotFrontage: use the median of the same Neighborhood, then fall back to
# the overall median for anything still missing (e.g. a neighborhood with no
# known LotFrontage at all).
if 'LotFrontage' in all_data.columns:
    frontage_by_neighborhood = all_data.groupby('Neighborhood')['LotFrontage']
    all_data['LotFrontage'] = frontage_by_neighborhood.transform(
        lambda group: group.fillna(group.median())
    )
    lot_frontage = all_data['LotFrontage']
    if lot_frontage.isnull().any():
        all_data['LotFrontage'] = lot_frontage.fillna(lot_frontage.median())

# 2.4. GarageYrBlt: 0 when there is no garage (GarageType == 'None'); otherwise
# keep the recorded year, using the column median where the year is missing.
if 'GarageYrBlt' in all_data.columns:
    garage_year_median = all_data['GarageYrBlt'].median()
    garage_year_filled = all_data['GarageYrBlt'].fillna(garage_year_median)
    has_no_garage = all_data['GarageType'] == 'None'
    all_data['GarageYrBlt'] = np.where(has_no_garage, 0, garage_year_filled)

# 2.5. Remaining categorical columns with only a few gaps: use the most
# frequent value.
mode_filled_cols = (
    'MSZoning', 'Utilities', 'Electrical', 'KitchenQual', 'Exterior1st',
    'Exterior2nd', 'SaleType', 'Functional',
)
for name in mode_filled_cols:
    if name not in all_data.columns:
        continue
    most_frequent = all_data[name].mode()[0]
    all_data[name] = all_data[name].fillna(most_frequent)

# Report whichever columns still contain missing values.
print("\nMissing values after initial imputation (should be empty if successful):")
remaining_missing = all_data.isnull().sum()
print(remaining_missing[remaining_missing > 0])


# --- 3. Feature Engineering ---
# Derive aggregate, age and presence/absence features from the raw columns.
# All derivations are vectorized column operations rather than row-wise
# `apply(lambda ...)` calls.

# 3.1. Total Square Footage (Above Ground + Basement)
all_data['TotalSF'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']

# 3.2. Total Bathrooms (half baths count as 0.5)
all_data['TotalBath'] = (
    all_data['FullBath'] + (all_data['HalfBath'] * 0.5)
    + all_data['BsmtFullBath'] + (all_data['BsmtHalfBath'] * 0.5)
)

# 3.3. Total Porch Area (sum of the four porch columns)
all_data['TotalPorchSF'] = (
    all_data['OpenPorchSF'] + all_data['EnclosedPorch']
    + all_data['3SsnPorch'] + all_data['ScreenPorch']
)

# 3.4. Age-related Features: years between sale and construction / last remodel.
# A negative difference (YrSold earlier than YearBuilt / YearRemodAdd) is clamped
# to 0, i.e. treated as a brand-new or just-remodeled house. Unlike the previous
# row-wise lambda, clip() leaves a NaN as NaN instead of silently turning it
# into 0, so it still shows up in the missing-value report below.
all_data['YearsBuilt'] = (all_data['YrSold'] - all_data['YearBuilt']).clip(lower=0)
all_data['YearsRemod'] = (all_data['YrSold'] - all_data['YearRemodAdd']).clip(lower=0)

# 3.5. Simple Binary Features: 1 when the source quantity is positive, else 0.
# An explicit 'int64' keeps the dtype identical on every platform.
presence_flags = (
    ('HasPool', 'PoolArea'),
    ('Has2ndFloor', '2ndFlrSF'),
    ('HasGarage', 'GarageArea'),
    ('HasBsmt', 'TotalBsmtSF'),
    ('HasFireplace', 'Fireplaces'),
)
for flag_name, source_col in presence_flags:
    all_data[flag_name] = (all_data[source_col] > 0).astype('int64')

print("\nMissing values after feature engineering (should be empty):")
missing_after_features = all_data.isnull().sum()
print(missing_after_features[missing_after_features > 0])


# --- 4. Categorical Encoding ---
# Turn every categorical column into numbers: ordered categories become integer
# ranks, unordered ones become one-hot indicator columns.

# 4.1. These columns hold numbers but are really category codes; casting them to
# strings makes the one-hot step below pick them up.
for name in ('MSSubClass', 'YrSold', 'MoSold'):
    all_data[name] = all_data[name].astype(str)

# 4.2. Ordinal Encoding.
# Rank tables follow data_description.txt and common sense for quality/condition;
# a higher number always means "better" / "more", and 'None' (feature absent) is 0.

# General quality/condition scale: Ex > Gd > TA > Fa > Po > None
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}

# Basement quality/condition, exposure and finish type
bsmt_qual_cond_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
bsmt_exposure_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0}
bsmt_fin_type_map = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0}

# Functional: Typ > Min1 > Min2 > Mod > Maj1 > Maj2 > Sev > Sal
functional_map = {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0}

# GarageFinish: Fin > RFn > Unf > None
garage_finish_map = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}

# PavedDrive: Y > P > N
paved_drive_map = {'Y': 2, 'P': 1, 'N': 0}

# Fence: GdPrv > MnPrv > GdWo > MnWw > None
fence_map = {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'None': 0}

# LandSlope: Gtl > Mod > Sev
land_slope_map = {'Gtl': 2, 'Mod': 1, 'Sev': 0}

# LotShape: Reg > IR1 > IR2 > IR3
lot_shape_map = {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0}

# Utilities: AllPub > NoSeWa > NoSewr (assuming hierarchy)
utilities_map = {'AllPub': 3, 'NoSeWa': 2, 'NoSewr': 1, 'None': 0}

# Street: Pave > Grvl
street_map = {'Pave': 1, 'Grvl': 0}

# Which rank table applies to which columns. Columns that are not present in the
# data are skipped; a value that is missing from its table becomes NaN.
ordinal_encodings = (
    (('ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
      'GarageQual', 'GarageCond', 'PoolQC'), quality_map),
    (('BsmtQual', 'BsmtCond'), bsmt_qual_cond_map),
    (('BsmtExposure',), bsmt_exposure_map),
    (('BsmtFinType1', 'BsmtFinType2'), bsmt_fin_type_map),
    (('Functional',), functional_map),
    (('GarageFinish',), garage_finish_map),
    (('PavedDrive',), paved_drive_map),
    (('Fence',), fence_map),
    (('LandSlope',), land_slope_map),
    (('LotShape',), lot_shape_map),
    (('Utilities',), utilities_map),
    (('Street',), street_map),
)
for target_cols, rank_table in ordinal_encodings:
    for name in target_cols:
        if name not in all_data.columns:
            continue
        all_data[name] = all_data[name].map(rank_table)

# 4.3. One-Hot Encoding for the nominal columns, i.e. everything that is still of
# object dtype once the ordinal mapping is done.
one_hot_cols = list(all_data.select_dtypes(include='object').columns)
print(f"\nColumns identified for One-Hot Encoding: {one_hot_cols}")

all_data = pd.get_dummies(all_data, columns=one_hot_cols, dummy_na=False)
print(f"Shape after One-Hot Encoding: {all_data.shape}")

# Last safety net before modeling: list every column that still has missing
# values after all preprocessing and, if there are any, fill them with 0.
print("\nMissing values in combined data after ALL preprocessing steps:")
null_counts = all_data.isnull().sum()
missing_final = null_counts[null_counts > 0].sort_values(ascending=False)
if missing_final.empty:
    print("No remaining missing values found. Data is clean for modeling.")
else:
    print(missing_final)
    # Zero-fill is a last resort so that the models do not fail on NaNs
    # (for instance a category value that none of the earlier steps handled).
    print("\nFilling any remaining missing values with 0.")
    all_data.fillna(0, inplace=True)


# Undo the concatenation: the first len(train_df_original) rows are the training
# set, everything after that is the test set.
n_train_rows = len(train_df_original)
X_train = all_data.iloc[:n_train_rows]
X_test = all_data.iloc[n_train_rows:]
y_train = sale_price_log  # log-transformed SalePrice

print(f"\nFinal X_train shape: {X_train.shape}")
print(f"Final X_test shape: {X_test.shape}")
print(f"Final y_train shape: {y_train.shape}")

# --- 5. Visualization: Correlation Heatmap ---
# Plot the pairwise correlations of the features that are most strongly
# (positively and negatively) correlated with the log-transformed SalePrice.
print("\nGenerating Correlation Heatmap...")

# Combine X_train and y_train for correlation calculation
train_data_for_corr = X_train.copy()
train_data_for_corr['SalePrice_Log'] = y_train

# Correlation of every column with SalePrice_Log, strongest positive first.
correlations = train_data_for_corr.corr()['SalePrice_Log'].sort_values(ascending=False)

# Rank the features only:
#  * the target itself (correlation 1.0) must not take one of the top slots;
#  * columns with zero variance in the training rows (e.g. a dummy column for a
#    category that only occurs in the test rows) have an undefined (NaN)
#    correlation. sort_values puts NaN last, so without dropna() they would be
#    picked up by tail() in place of the most negatively correlated features.
top_n = 10
feature_correlations = correlations.drop('SalePrice_Log').dropna()
strongest_features = (
    feature_correlations.head(top_n).index.tolist()
    + feature_correlations.tail(top_n).index.tolist()
)
# dict.fromkeys removes duplicates while keeping order, which matters only when
# there are fewer than 2 * top_n rankable features and head/tail overlap.
top_correlated_features = ['SalePrice_Log'] + list(dict.fromkeys(strongest_features))

# Create a subset DataFrame with only the selected features for the heatmap
corr_matrix_subset = train_data_for_corr[top_correlated_features].corr()

plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix_subset, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
# The target is in the list, hence the "- 1" when reporting the feature count.
plt.title(f'Correlation Heatmap of Top {len(top_correlated_features)-1} Features with Log-transformed SalePrice', fontsize=16)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('Correlation_Heatmap.png') # Save the plot
plt.close() # Close the plot to free memory
print("Correlation Heatmap saved as 'Correlation_Heatmap.png'.")


# --- 6. Model Training and Evaluation ---

def rmse_cv(model, X, y):
    """Return the per-fold RMSE of *model* under 10-fold cross-validation.

    The folds are shuffled with a fixed seed (42), so repeated calls split the
    data the same way. scikit-learn reports the negated MSE for this scorer,
    hence the sign flip before taking the square root.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.

    Returns:
        Array with one RMSE value per fold.
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=42)
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=folds
    )
    return np.sqrt(-neg_mse_per_fold)

# Candidate models: three regularized linear regressors and one gradient-boosted
# tree ensemble.
ridge = Ridge(alpha=10)
lasso = Lasso(alpha=0.0005, max_iter=10000)
elasticnet = ElasticNet(alpha=0.0005, l1_ratio=0.9, max_iter=10000)

gbr_params = dict(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)
gbr = GradientBoostingRegressor(**gbr_params)

models = {
    'Ridge': ridge,
    'Lasso': lasso,
    'ElasticNet': elasticnet,
    'GradientBoosting': gbr,
}

# Cross-validate every candidate and record its mean RMSE.
rmse_scores = {}
for model_name, estimator in models.items():
    print(f"\nTraining and evaluating {model_name} model...")
    fold_rmse = rmse_cv(estimator, X_train, y_train)
    mean_rmse = fold_rmse.mean()
    rmse_scores[model_name] = mean_rmse
    print(f"{model_name} RMSE (mean): {mean_rmse:.4f}")
    print(f"{model_name} RMSE (std): {fold_rmse.std():.4f}")

# --- 7. Final Model Training and Prediction ---

# The gradient boosting model is used as the final model; it is refit on the
# full training set and then applied to the preprocessed test set.
# A stacked/blended ensemble of the best models would be a natural next step.
print("\nTraining the final GradientBoostingRegressor model on the entire training data...")
final_model = gbr
predictions_log = final_model.fit(X_train, y_train).predict(X_test)

# The target was log1p-transformed, so expm1 brings predictions back to prices.
predictions = np.expm1(predictions_log)

# --- 8. Create Submission File ---
# Two columns: the test Ids and the predicted SalePrice, written without the index.
submission_df = pd.DataFrame({'Id': test_ids}).assign(SalePrice=predictions)
submission_df.to_csv('submission.csv', index=False)

print("\nAnalysis complete. Submission file 'submission.csv' created successfully.")
print("First 5 rows of the submission file:")
print(submission_df.head())

Starting House Prices Prediction Analysis...
Files 'train (1).csv' and 'test (1).csv' loaded successfully.
Combined data shape before preprocessing: (2919, 79)

Missing values after initial imputation (should be empty if successful):
Series([], dtype: int64)

Missing values after feature engineering (should be empty):
Series([], dtype: int64)

Columns identified for One-Hot Encoding: ['MSSubClass', 'MSZoning', 'Alley', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 'GarageType', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
Shape after One-Hot Encoding: (2919, 264)

Missing values in combined data after ALL preprocessing steps:
No remaining missing values found. Data is clean for modeling.

Final X_train shape: (1460, 264)
Final X_test shape: (1459, 264)
Final y_train shape: (1460,)

Generating Cor