# 🏠 Intelligent House Price Prediction System

**Project Title:** Intelligent House Price Prediction System Using Machine Learning

**Group Members:**
- Muhammad Usman Rajput (450327)
- Muhammad Ramish Ali (537262)
- Malik Huzaifa Saeed (539701)

---

## Table of Contents
1. [Imports & Setup](#1-imports--setup)
2. [Data Loading](#2-data-loading)
3. [Data Preprocessing](#3-data-preprocessing)
4. [Exploratory Data Analysis](#4-exploratory-data-analysis)
5. [Feature Engineering](#5-feature-engineering)
6. [Model Training](#6-model-training)
7. [Model Evaluation & Comparison](#7-model-evaluation--comparison)
8. [Predictions & Submission](#8-predictions--submission)
9. [Conclusion](#9-conclusion)

## 1. Imports & Setup

In [1]:
# Core Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Let pandas print up to 100 columns and 100 rows before truncating
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

# Figure appearance: matplotlib style sheet first, then the seaborn palette
# (kept in this order so the palette is applied on top of the style sheet)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("‚úÖ All libraries imported successfully!")

ModuleNotFoundError: No module named 'xgboost'

## 2. Data Loading

In [None]:
# Read the training and test splits from the local ``dataset`` directory
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

# Report each frame's (rows, columns) shape and name the target column
print(
    f"üìä Training Data Shape: {train_df.shape}",
    f"üìä Test Data Shape: {test_df.shape}",
    f"\nüéØ Target Variable: SalePrice",
    sep="\n",
)

In [None]:
# Preview the first five training rows; the trailing expression is what the
# notebook renders as the cell output
print("\nüìã First 5 rows of training data:")
train_df.head(n=5)

In [None]:
# Dataset dimensions plus a count of numeric vs. object-typed columns
n_samples, n_columns = train_df.shape
numeric_column_count = train_df.select_dtypes(include=[np.number]).shape[1]
object_column_count = train_df.select_dtypes(include=['object']).shape[1]

print("\nüìã Dataset Information:")
print("="*50)
# One column is subtracted from the total (presumably the SalePrice target)
print(f"Total Features: {n_columns - 1}")
print(f"Total Samples: {n_samples}")
print(f"\nNumerical Features: {numeric_column_count}")
print(f"Categorical Features: {object_column_count}")

In [None]:
# Descriptive statistics as produced by DataFrame.describe(); the trailing
# expression is rendered as the cell output
print("\nüìà Statistical Summary:")
summary_table = train_df.describe()
summary_table

In [None]:
# Descriptive statistics for the prediction target
sale_price_summary = train_df['SalePrice'].describe()

print("\nüéØ Target Variable (SalePrice) Statistics:")
print("="*50)
print(sale_price_summary)

## 3. Data Preprocessing

### 3.1 Missing Values Analysis

In [None]:
# Count NaNs per training column, keep only columns that have any,
# and order them from most to fewest missing entries
null_counts = train_df.isnull().sum()
missing_train = null_counts.loc[null_counts > 0].sort_values(ascending=False)

# Same counts expressed as a percentage of all training rows
missing_percent = missing_train / len(train_df) * 100

# Side-by-side table of counts and (rounded) percentages
missing_df = pd.DataFrame({
    'Missing Count': missing_train,
    'Missing %': missing_percent.round(2)
})

print("\nüîç Missing Values in Training Data:")
print("="*50)
print(missing_df)

In [None]:
# Visualize missing values
# The figure is saved under ``visualizations/``. This cell runs before the EDA
# cell that creates that directory, so create it here too; otherwise savefig
# raises FileNotFoundError on a fresh checkout.
import os
os.makedirs('visualizations', exist_ok=True)

# Bar chart of the per-column missing percentage computed in ``missing_df``
plt.figure(figsize=(12, 6))
missing_df['Missing %'].plot(kind='bar', color='coral', edgecolor='black')
plt.title('Missing Values Percentage by Feature', fontsize=14, fontweight='bold')
plt.xlabel('Features', fontsize=12)
plt.ylabel('Missing Percentage (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('visualizations/missing_values.png', dpi=150, bbox_inches='tight')
plt.show()

### 3.2 Handle Missing Values

In [None]:
# Stack train and test into one frame so later preprocessing is applied to
# both at once. The target and the Id columns are set aside first.
y_train = train_df['SalePrice'].copy()
train_ids, test_ids = train_df['Id'].copy(), test_df['Id'].copy()

# Remove the identifier (and, for train, the target) from the predictor frames
train_df = train_df.drop(columns=['Id', 'SalePrice'])
test_df = test_df.drop(columns=['Id'])

# Number of training rows; rows from this position onward are test rows
ntrain = len(train_df)
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(f"Combined dataset shape: {combined_df.shape}")

In [None]:
# Impute missing values; the fill strategy depends on the column.

# Categorical columns whose gaps are filled with the literal category 'None'
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'MasVnrType']

# Numeric columns whose gaps are filled with 0
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
             'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']

# Apply both constant fills, skipping any listed column that is not present
for fill_value, fill_columns in (('None', none_cols), (0, zero_cols)):
    for col in fill_columns:
        if col in combined_df.columns:
            combined_df[col] = combined_df[col].fillna(fill_value)

# LotFrontage: fill each gap with the median LotFrontage of its Neighborhood
lot_frontage_by_area = combined_df.groupby('Neighborhood')['LotFrontage']
combined_df['LotFrontage'] = lot_frontage_by_area.transform(
    lambda frontage: frontage.fillna(frontage.median())
)

# Whatever is still missing: column median for numeric columns ...
num_cols = combined_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    column = combined_df[col]
    if column.isnull().sum() > 0:
        combined_df[col] = column.fillna(column.median())

# ... and the first mode for object-typed columns
cat_cols = combined_df.select_dtypes(include=['object']).columns
for col in cat_cols:
    column = combined_df[col]
    if column.isnull().sum() > 0:
        combined_df[col] = column.fillna(column.mode()[0])

print(f"\n‚úÖ Missing values after handling: {combined_df.isnull().sum().sum()}")

## 4. Exploratory Data Analysis

### 4.1 Target Variable Distribution

In [None]:
# Make sure the output directory for saved figures exists
import os
os.makedirs('visualizations', exist_ok=True)

# log1p-transformed copy of the target, plotted next to the raw values
y_train_log = np.log1p(y_train)
price_mean, price_median = y_train.mean(), y_train.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
raw_ax, log_ax = axes

# Left panel: raw SalePrice with mean / median reference lines
sns.histplot(y_train, kde=True, ax=raw_ax, color='steelblue', edgecolor='black')
raw_ax.set_title('SalePrice Distribution (Original)', fontsize=14, fontweight='bold')
raw_ax.set_xlabel('SalePrice', fontsize=12)
raw_ax.axvline(price_mean, color='red', linestyle='--', label=f'Mean: ${price_mean:,.0f}')
raw_ax.axvline(price_median, color='green', linestyle='--', label=f'Median: ${price_median:,.0f}')
raw_ax.legend()

# Right panel: the log-transformed target
sns.histplot(y_train_log, kde=True, ax=log_ax, color='coral', edgecolor='black')
log_ax.set_title('SalePrice Distribution (Log Transformed)', fontsize=14, fontweight='bold')
log_ax.set_xlabel('Log(SalePrice)', fontsize=12)

plt.tight_layout()
plt.savefig('visualizations/target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Skewness before and after the transform
print(f"\nüìä Skewness (Original): {y_train.skew():.4f}")
print(f"üìä Skewness (Log-transformed): {y_train_log.skew():.4f}")

### 4.2 Correlation Analysis

In [None]:
# Take the training rows of the combined frame and re-attach the target
train_with_target = combined_df.iloc[:ntrain].copy()
train_with_target['SalePrice'] = y_train.values

# Correlation of every numeric column with SalePrice, highest first
numeric_df = train_with_target.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()
correlations = correlation_matrix['SalePrice'].sort_values(ascending=False)

print("\nüîó Top 15 Features Correlated with SalePrice:")
print("="*50)
# 16 rows are shown: SalePrice's correlation with itself is part of the
# series, so it takes one row in addition to 15 features
print(correlations.head(16))

In [None]:
# Heatmap of pairwise correlations among the 11 highest-ranked entries of
# ``correlations`` (SalePrice itself is part of that series)
top_features = list(correlations.head(11).index)
corr_matrix = numeric_df[top_features].corr()

# Boolean mask hiding the upper triangle (diagonal included)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    fmt='.2f',
    square=True,
    linewidths=0.5,
    cbar_kws={'shrink': 0.8},
)
plt.title('Correlation Heatmap - Top 10 Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('visualizations/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

### 4.3 Feature Distributions

In [None]:
# One scatter panel per selected feature, each with a fitted straight line
top_corr_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

sale_price = train_with_target['SalePrice']
for ax, feature in zip(axes, top_corr_features):
    feature_values = train_with_target[feature]

    ax.scatter(feature_values, sale_price,
               alpha=0.5, c='steelblue', edgecolors='black', linewidth=0.5)
    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel('SalePrice', fontsize=11)
    ax.set_title(f'{feature} vs SalePrice', fontsize=12, fontweight='bold')

    # Degree-1 least-squares fit, drawn over the sorted x values
    trend_line = np.poly1d(np.polyfit(feature_values, sale_price, 1))
    sorted_values = feature_values.sort_values()
    ax.plot(sorted_values, trend_line(sorted_values),
            color='red', linewidth=2, linestyle='--')

plt.tight_layout()
plt.savefig('visualizations/scatter_plots.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# SalePrice box plots grouped by four selected features
cat_features = ['OverallQual', 'Neighborhood', 'GarageCars', 'FullBath']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, feature in zip(axes, cat_features):
    if feature != 'Neighborhood':
        sns.boxplot(data=train_with_target, x=feature, y='SalePrice', ax=ax, palette='viridis')
    else:
        # Neighborhood: keep only the 10 with the highest median price,
        # ordered from most to least expensive
        median_by_area = train_with_target.groupby('Neighborhood')['SalePrice'].median()
        order = median_by_area.sort_values(ascending=False).head(10).index
        data = train_with_target[train_with_target['Neighborhood'].isin(order)]
        sns.boxplot(data=data, x=feature, y='SalePrice', ax=ax, palette='viridis', order=order)
        ax.tick_params(axis='x', rotation=45)
    ax.set_title(f'SalePrice by {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel('SalePrice', fontsize=11)

plt.tight_layout()
plt.savefig('visualizations/box_plots.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Feature Engineering

In [None]:
# Derive additional columns from existing ones. New columns are appended in
# the order written here.
print("\nüîß Creating new features...")

# Combined floor area: basement + first floor + second floor
combined_df['TotalSF'] = (
    combined_df['TotalBsmtSF'] + combined_df['1stFlrSF'] + combined_df['2ndFlrSF']
)

# Years between the sale and the build year / the remodel year
sale_year = combined_df['YrSold']
for age_column, year_column in (('HouseAge', 'YearBuilt'), ('RemodelAge', 'YearRemodAdd')):
    combined_df[age_column] = sale_year - combined_df[year_column]

# Bathroom count, with half baths weighted 0.5 (above grade and basement)
above_grade_baths = combined_df['FullBath'] + 0.5 * combined_df['HalfBath']
combined_df['TotalBath'] = (
    above_grade_baths + combined_df['BsmtFullBath'] + 0.5 * combined_df['BsmtHalfBath']
)

# Sum of the four porch-area columns
combined_df['TotalPorchSF'] = (
    combined_df['OpenPorchSF'] + combined_df['EnclosedPorch']
    + combined_df['3SsnPorch'] + combined_df['ScreenPorch']
)

# 0/1 indicator columns: 1 when the source column is greater than zero
presence_flags = {
    'HasPool': 'PoolArea',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
}
for flag_column, source_column in presence_flags.items():
    combined_df[flag_column] = (combined_df[source_column] > 0).astype(int)

print(f"‚úÖ New features created! Total features: {combined_df.shape[1]}")

In [None]:
# Turn object-typed columns into numbers: integer codes for the quality-style
# columns listed below, one-hot columns for everything else.
print("\nüîß Encoding categorical variables...")

cat_cols = combined_df.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns to encode: {len(cat_cols)}")

# Category lists; a category's position in its list becomes its integer code
quality_scale = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
quality_scale_with_none = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_features = {
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'BsmtQual': quality_scale_with_none,
    'BsmtCond': quality_scale_with_none,
    'HeatingQC': quality_scale,
    'KitchenQual': quality_scale,
    'FireplaceQu': quality_scale_with_none,
    'GarageQual': quality_scale_with_none,
    'GarageCond': quality_scale_with_none,
    'PoolQC': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
}

for feature, categories in ordinal_features.items():
    if feature not in combined_df.columns:
        continue
    category_codes = dict(zip(categories, range(len(categories))))
    # Values missing from the list map to NaN and are then set to 0
    combined_df[feature] = combined_df[feature].map(category_codes).fillna(0)

# One-hot encode whatever is still object-typed, dropping the first level
remaining_cat_cols = combined_df.select_dtypes(include=['object']).columns.tolist()
combined_df = pd.get_dummies(combined_df, columns=remaining_cat_cols, drop_first=True)

print(f"‚úÖ Encoding complete! Total features: {combined_df.shape[1]}")

In [None]:
# Handle skewness in numerical features
from scipy.stats import skew

# Skewness of every numeric column, largest first; keep those whose absolute
# skewness exceeds 0.75
numeric_features = combined_df.select_dtypes(include=[np.number]).columns.tolist()
skewed_features = combined_df[numeric_features].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewed_features = skewed_features[abs(skewed_features) > 0.75]

print(f"\nüìä Features with high skewness (>0.75): {len(skewed_features)}")

# log1p(x) is -inf at x == -1 and NaN below it. Warnings are silenced in this
# notebook, so a column with negative entries (for example an age computed as
# YrSold minus a later year) would be corrupted without any message and break
# the scaler/models downstream. Transform only columns that are non-negative.
transformable = [
    feature for feature in skewed_features.index
    if combined_df[feature].min() >= 0
]
skipped = [feature for feature in skewed_features.index if feature not in transformable]

# Apply log transformation to highly skewed features
for feature in transformable:
    combined_df[feature] = np.log1p(combined_df[feature])

if skipped:
    print(f"Skipped (contain negative values): {skipped}")
print("‚úÖ Log transformation applied to skewed features!")

## 6. Model Training

In [None]:
# Split the combined frame back apart: the first ``ntrain`` rows are the
# training rows, the rest are the test rows
X, X_test_final = combined_df.iloc[:ntrain].copy(), combined_df.iloc[ntrain:].copy()

# Models are fitted on log1p(SalePrice) rather than the raw price
y = np.log1p(y_train)

print(
    f"Training data shape: {X.shape}",
    f"Test data shape: {X_test_final.shape}",
    f"Target shape: {y.shape}",
    sep="\n",
)

In [None]:
# Hold out 20% of the training rows for validation (fixed seed)
X_train, X_val, y_train_split, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")

In [None]:
# Standardize features. The scaler is fitted on the training split only and
# then reused, unchanged, for the validation and test matrices.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_val, X_test_final)
)

print("‚úÖ Features scaled!")

In [None]:
# Root-mean-squared-error helper used by every model cell
def rmse(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# Per-model metrics, keyed by model name; filled in by the training cells
results = dict()

### 6.1 Linear Regression

In [None]:
# Ordinary least squares on the standardized features
print("\nüîÑ Training Linear Regression...")
lr_model = LinearRegression().fit(X_train_scaled, y_train_split)

# Predictions on the data it was fitted on and on the held-out split
lr_train_pred, lr_val_pred = (
    lr_model.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# Train/validation RMSE and validation R-squared
lr_train_rmse = rmse(y_train_split, lr_train_pred)
lr_val_rmse = rmse(y_val, lr_val_pred)
lr_r2 = r2_score(y_val, lr_val_pred)

results['Linear Regression'] = dict(
    zip(('Train RMSE', 'Val RMSE', 'R¬≤ Score'), (lr_train_rmse, lr_val_rmse, lr_r2))
)

print(
    f"‚úÖ Linear Regression Results:",
    f"   Train RMSE: {lr_train_rmse:.4f}",
    f"   Validation RMSE: {lr_val_rmse:.4f}",
    f"   R¬≤ Score: {lr_r2:.4f}",
    sep="\n",
)

### 6.2 Ridge Regression

In [None]:
# Ridge regression (alpha=10) on the standardized features
print("\nüîÑ Training Ridge Regression...")
ridge_model = Ridge(alpha=10).fit(X_train_scaled, y_train_split)

# Predictions on the data it was fitted on and on the held-out split
ridge_train_pred, ridge_val_pred = (
    ridge_model.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# Train/validation RMSE and validation R-squared
ridge_train_rmse = rmse(y_train_split, ridge_train_pred)
ridge_val_rmse = rmse(y_val, ridge_val_pred)
ridge_r2 = r2_score(y_val, ridge_val_pred)

results['Ridge Regression'] = dict(
    zip(('Train RMSE', 'Val RMSE', 'R¬≤ Score'), (ridge_train_rmse, ridge_val_rmse, ridge_r2))
)

print(
    f"‚úÖ Ridge Regression Results:",
    f"   Train RMSE: {ridge_train_rmse:.4f}",
    f"   Validation RMSE: {ridge_val_rmse:.4f}",
    f"   R¬≤ Score: {ridge_r2:.4f}",
    sep="\n",
)

### 6.3 Lasso Regression

In [None]:
# Lasso regression (alpha=0.0005) on the standardized features
print("\nüîÑ Training Lasso Regression...")
lasso_model = Lasso(alpha=0.0005).fit(X_train_scaled, y_train_split)

# Predictions on the data it was fitted on and on the held-out split
lasso_train_pred, lasso_val_pred = (
    lasso_model.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# Train/validation RMSE and validation R-squared
lasso_train_rmse = rmse(y_train_split, lasso_train_pred)
lasso_val_rmse = rmse(y_val, lasso_val_pred)
lasso_r2 = r2_score(y_val, lasso_val_pred)

results['Lasso Regression'] = dict(
    zip(('Train RMSE', 'Val RMSE', 'R¬≤ Score'), (lasso_train_rmse, lasso_val_rmse, lasso_r2))
)

print(
    f"‚úÖ Lasso Regression Results:",
    f"   Train RMSE: {lasso_train_rmse:.4f}",
    f"   Validation RMSE: {lasso_val_rmse:.4f}",
    f"   R¬≤ Score: {lasso_r2:.4f}",
    sep="\n",
)

### 6.4 Random Forest

In [None]:
# Random forest, fitted on the unscaled feature matrix
print("\nüîÑ Training Random Forest...")
rf_params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train_split)

# Predictions on the data it was fitted on and on the held-out split
rf_train_pred, rf_val_pred = (
    rf_model.predict(features) for features in (X_train, X_val)
)

# Train/validation RMSE and validation R-squared
rf_train_rmse = rmse(y_train_split, rf_train_pred)
rf_val_rmse = rmse(y_val, rf_val_pred)
rf_r2 = r2_score(y_val, rf_val_pred)

results['Random Forest'] = dict(
    zip(('Train RMSE', 'Val RMSE', 'R¬≤ Score'), (rf_train_rmse, rf_val_rmse, rf_r2))
)

print(
    f"‚úÖ Random Forest Results:",
    f"   Train RMSE: {rf_train_rmse:.4f}",
    f"   Validation RMSE: {rf_val_rmse:.4f}",
    f"   R¬≤ Score: {rf_r2:.4f}",
    sep="\n",
)

### 6.5 Gradient Boosting

In [None]:
# Gradient boosting, fitted on the unscaled feature matrix
print("\nüîÑ Training Gradient Boosting...")
gb_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
gb_model = GradientBoostingRegressor(**gb_params).fit(X_train, y_train_split)

# Predictions on the data it was fitted on and on the held-out split
gb_train_pred, gb_val_pred = (
    gb_model.predict(features) for features in (X_train, X_val)
)

# Train/validation RMSE and validation R-squared
gb_train_rmse = rmse(y_train_split, gb_train_pred)
gb_val_rmse = rmse(y_val, gb_val_pred)
gb_r2 = r2_score(y_val, gb_val_pred)

results['Gradient Boosting'] = dict(
    zip(('Train RMSE', 'Val RMSE', 'R¬≤ Score'), (gb_train_rmse, gb_val_rmse, gb_r2))
)

print(
    f"‚úÖ Gradient Boosting Results:",
    f"   Train RMSE: {gb_train_rmse:.4f}",
    f"   Validation RMSE: {gb_val_rmse:.4f}",
    f"   R¬≤ Score: {gb_r2:.4f}",
    sep="\n",
)

### 6.6 XGBoost

In [None]:
# XGBoost regressor, fitted on the unscaled feature matrix
print("\nüîÑ Training XGBoost...")
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0,
)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train_split)

# Predictions on the data it was fitted on and on the held-out split
xgb_train_pred, xgb_val_pred = (
    xgb_model.predict(features) for features in (X_train, X_val)
)

# Train/validation RMSE and validation R-squared
xgb_train_rmse = rmse(y_train_split, xgb_train_pred)
xgb_val_rmse = rmse(y_val, xgb_val_pred)
xgb_r2 = r2_score(y_val, xgb_val_pred)

results['XGBoost'] = dict(
    zip(('Train RMSE', 'Val RMSE', 'R¬≤ Score'), (xgb_train_rmse, xgb_val_rmse, xgb_r2))
)

print(
    f"‚úÖ XGBoost Results:",
    f"   Train RMSE: {xgb_train_rmse:.4f}",
    f"   Validation RMSE: {xgb_val_rmse:.4f}",
    f"   R¬≤ Score: {xgb_r2:.4f}",
    sep="\n",
)

## 7. Model Evaluation & Comparison

In [None]:
# One row per model, one column per metric, lowest validation RMSE first
metrics_by_model = pd.DataFrame(results).T
results_df = metrics_by_model.sort_values('Val RMSE')

print("\nüìä Model Comparison Results:")
print("="*70)
print(results_df.round(4))

In [None]:
# Two-panel comparison: grouped RMSE bars (left), R-squared bars (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
rmse_ax, r2_ax = axes

# Left panel: train and validation RMSE drawn side by side for each model
x = np.arange(len(results_df))
width = 0.35
bar_specs = (
    (-width/2, 'Train RMSE', 'Train RMSE', 'steelblue'),
    (width/2, 'Val RMSE', 'Validation RMSE', 'coral'),
)
for offset, column, label, color in bar_specs:
    rmse_ax.bar(x + offset, results_df[column], width, label=label, color=color, edgecolor='black')
rmse_ax.set_xlabel('Model', fontsize=12)
rmse_ax.set_ylabel('RMSE (log scale)', fontsize=12)
rmse_ax.set_title('RMSE Comparison Across Models', fontsize=14, fontweight='bold')
rmse_ax.set_xticks(x)
rmse_ax.set_xticklabels(results_df.index, rotation=45, ha='right')
rmse_ax.legend()

# Right panel: horizontal R-squared bars, each annotated with its value
r2_scores = results_df['R¬≤ Score']
colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(results_df)))
r2_ax.barh(results_df.index, r2_scores, color=colors, edgecolor='black')
r2_ax.set_xlabel('R¬≤ Score', fontsize=12)
r2_ax.set_title('R¬≤ Score Comparison', fontsize=14, fontweight='bold')
r2_ax.set_xlim(0, 1)

for row, score in enumerate(r2_scores):
    r2_ax.text(score + 0.01, row, f'{score:.4f}', va='center', fontsize=10)

plt.tight_layout()
plt.savefig('visualizations/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Pair each feature name with the importance reported by the fitted XGBoost
# model and keep the 20 highest-ranked rows
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
})
feature_importance = importance_table.sort_values('Importance', ascending=False).head(20)

plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Top 20 Feature Importances (XGBoost)', fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.tight_layout()
plt.savefig('visualizations/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Actual vs Predicted plot for the XGBoost model
# The model is fitted on log1p(SalePrice). Convert the validation targets and
# predictions back to prices once, so the points, the reference line and the
# residuals all share the same (price) scale.
actual_price = np.expm1(y_val)
predicted_price = np.expm1(xgb_val_pred)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scatter plot
axes[0].scatter(actual_price, predicted_price, alpha=0.5, c='steelblue', edgecolors='black', linewidth=0.5)
# Identity line across the observed price range. It was previously drawn from
# the log-scale min/max of y_val, which put it far outside the plotted points.
price_range = [actual_price.min(), actual_price.max()]
axes[0].plot(price_range, price_range, 'r--', lw=2, label='Perfect Prediction')
axes[0].set_xlabel('Actual SalePrice', fontsize=12)
axes[0].set_ylabel('Predicted SalePrice', fontsize=12)
axes[0].set_title('Actual vs Predicted (XGBoost)', fontsize=14, fontweight='bold')
axes[0].legend()

# Residual plot: actual minus predicted price against the predicted price
residuals = actual_price - predicted_price
axes[1].scatter(predicted_price, residuals, alpha=0.5, c='coral', edgecolors='black', linewidth=0.5)
axes[1].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1].set_xlabel('Predicted SalePrice', fontsize=12)
axes[1].set_ylabel('Residuals', fontsize=12)
axes[1].set_title('Residual Plot (XGBoost)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('visualizations/actual_vs_predicted.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Predictions & Submission

In [None]:
# Refit XGBoost on every training row (X, y) for the final predictions
print("\nüîÑ Training final XGBoost model on full training data...")

final_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0,
)
final_model = XGBRegressor(**final_params)
final_model.fit(X, y)

print("‚úÖ Final model trained!")

In [None]:
# Predict on the test rows; the model outputs log1p(price), so invert with expm1
test_predictions_log = final_model.predict(X_test_final)
test_predictions = np.expm1(test_predictions_log)

print(
    f"\nüìä Test Predictions Summary:",
    f"   Min: ${test_predictions.min():,.0f}",
    f"   Max: ${test_predictions.max():,.0f}",
    f"   Mean: ${test_predictions.mean():,.0f}",
    f"   Median: ${np.median(test_predictions):,.0f}",
    sep="\n",
)

In [None]:
# Two-column frame (Id, SalePrice) written to submission.csv without the index
submission_columns = {
    'Id': test_ids,
    'SalePrice': test_predictions
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

print("\n‚úÖ Submission file created: submission.csv")
print(f"\nüìÑ Preview of submission:")
# Trailing expression: rendered by the notebook as the cell output
submission.head(10)

In [None]:
# Side-by-side histograms: training SalePrice (left), test predictions (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (values, bar color, label, panel title) for each panel, left to right
panels = (
    (y_train, 'steelblue', 'Training Data', 'Training SalePrice Distribution'),
    (test_predictions, 'coral', 'Test Predictions', 'Test Predictions Distribution'),
)
for ax, (values, color, label, title) in zip(axes, panels):
    sns.histplot(values, kde=True, ax=ax, color=color, edgecolor='black', label=label)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('SalePrice', fontsize=12)

plt.tight_layout()
plt.savefig('visualizations/predictions_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Conclusion

In [None]:
# Final summary. Row/feature counts and the best model are read from the
# objects built earlier in the notebook instead of being typed in, so the
# summary cannot contradict the data or the metrics that were computed.
print("\n" + "="*70)
print("                    üèÜ PROJECT SUMMARY üèÜ")
print("="*70)

print("\nüìä Dataset Statistics:")
print(f"   ‚Ä¢ Training samples: {X.shape[0]:,}")
print(f"   ‚Ä¢ Test samples: {X_test_final.shape[0]:,}")
# train_df had Id and SalePrice dropped before the train/test frames were
# combined, so its column count is the number of original predictors
print(f"   ‚Ä¢ Original features: {train_df.shape[1]}")
print(f"   ‚Ä¢ Engineered features: {X.shape[1]}")

print("\nü§ñ Models Trained:")
for model_name in results.keys():
    print(f"   ‚Ä¢ {model_name}")

# Best model = lowest validation RMSE among everything stored in ``results``
best_model_name = min(results, key=lambda name: results[name]['Val RMSE'])
best_metrics = results[best_model_name]
print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   ‚Ä¢ Validation RMSE: {best_metrics['Val RMSE']:.4f}")
print(f"   ‚Ä¢ R¬≤ Score: {best_metrics['R¬≤ Score']:.4f}")
if best_model_name != 'XGBoost':
    # The final-model cell always fits XGBoost; say so when another model
    # scored better on the validation split
    print("   ‚Ä¢ Note: submission.csv was generated with XGBoost")

print("\nüìà Key Insights:")
print("   ‚Ä¢ Overall Quality is the most important predictor of house prices")
print("   ‚Ä¢ Living area (GrLivArea) strongly correlates with sale price")
print("   ‚Ä¢ Newer homes and recent remodels command higher prices")
print("   ‚Ä¢ Location (Neighborhood) significantly impacts property values")

print("\nüìÅ Generated Files:")
print("   ‚Ä¢ submission.csv - Kaggle submission file")
print("   ‚Ä¢ visualizations/ - All generated plots")

print("\n" + "="*70)
print("         ‚úÖ Project completed successfully!")
print("="*70)