In [None]:
# =================================================
# COMPLETE HOUSE PRICE PREDICTION - BASELINE MODEL
# =================================================

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# ============================================
# STEP 1: LOAD DATA
# ============================================
# Read the Kaggle training CSV into `train`; later steps copy from this raw frame.
print("STEP 1: Loading data...")
train_csv_path = '/kaggle/input/datasets/shadalishah/house-prices-advanced-regression-techniques/train.csv'
train = pd.read_csv(train_csv_path)
row_count, column_count = train.shape
print(f" Data loaded: {row_count} rows, {column_count} columns\n")

# ============================================
# STEP 2: ANALYZE TARGET VARIABLE
# ============================================
# Summary statistics and skewness of the target column.
print("STEP 2: Analyzing SalePrice...")
sale_price = train['SalePrice']
print(sale_price.describe())
print(f"Skewness: {sale_price.skew():.2f}\n")

# ============================================
# STEP 3: CHECK MISSING VALUES
# ============================================
# Count nulls per column and keep only the columns that have any,
# ordered from most to fewest missing entries.
print("STEP 3: Checking missing values...")
null_counts = train.isnull().sum()
missing = null_counts.loc[null_counts > 0].sort_values(ascending=False)
print("Columns with missing values:")
print(missing)
print(f"\nTotal columns with missing data: {len(missing)}\n")

# ============================================
# STEP 4: HANDLE MISSING VALUES
# ============================================
# Impute on a copy so the raw `train` frame is left untouched:
#   * numeric columns     -> column median
#   * categorical columns -> the literal label 'None'
print("STEP 4: Handling missing values...")
train_clean = train.copy()

# Get column types
numerical_cols = train_clean.select_dtypes(include=[np.number]).columns
categorical_cols = train_clean.select_dtypes(include=['object']).columns

# Fill missing values.
# The filled Series is assigned back to the frame instead of calling
# `train_clean[col].fillna(..., inplace=True)`: that chained in-place form
# acts on a temporary object and does not update `train_clean` when pandas
# Copy-on-Write is active.
for col in numerical_cols:
    if train_clean[col].isnull().any():
        train_clean[col] = train_clean[col].fillna(train_clean[col].median())

for col in categorical_cols:
    if train_clean[col].isnull().any():
        train_clean[col] = train_clean[col].fillna('None')

print(" Missing values handled!\n")

# ============================================
# STEP 5: FEATURE ENGINEERING
# ============================================
# Adds four derived columns to `train_clean`, in this order:
# TotalSF, TotalBath, HouseAge, RemodAge.
print("STEP 5: Creating new features...")

# Combined area: basement + first floor + second floor.
train_clean['TotalSF'] = (
    train_clean['TotalBsmtSF']
    .add(train_clean['1stFlrSF'])
    .add(train_clean['2ndFlrSF'])
)

# Bathroom count where every half bath contributes 0.5.
full_baths = train_clean['FullBath'] + train_clean['BsmtFullBath']
half_baths = train_clean['HalfBath'] + train_clean['BsmtHalfBath']
train_clean['TotalBath'] = full_baths + 0.5 * half_baths

# Years between the sale and the build / remodel year.
sale_year = train_clean['YrSold']
train_clean['HouseAge'] = sale_year - train_clean['YearBuilt']
train_clean['RemodAge'] = sale_year - train_clean['YearRemodAdd']

print(" New features created: TotalSF, TotalBath, HouseAge, RemodAge\n")

# ============================================
# STEP 6: ENCODE CATEGORICAL VARIABLES
# ============================================
# Replace every categorical column of a copy of `train_clean` with integer
# codes. One LabelEncoder instance is re-fitted per column, so after this
# step `le` only holds the mapping of the last column processed.
print("STEP 6: Encoding categorical variables...")
train_encoded = train_clean.copy()
le = LabelEncoder()

encoded_by_column = {
    name: le.fit_transform(train_encoded[name].astype(str))
    for name in categorical_cols
}
for name, codes in encoded_by_column.items():
    train_encoded[name] = codes

print(f" Encoded {len(categorical_cols)} categorical variables\n")

# ============================================
# STEP 7: PREPARE FEATURES AND TARGET
# ============================================
# Features are every encoded column except the row identifier and the target;
# the target is log1p(SalePrice).
print("STEP 7: Preparing data for modeling...")
non_feature_cols = ['Id', 'SalePrice']
X = train_encoded.drop(columns=non_feature_cols)
y = train_encoded['SalePrice'].pipe(np.log1p)  # Log transform

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}\n")

# ============================================
# STEP 8: TRAIN-VALIDATION SPLIT
# ============================================
# 80/20 hold-out split with a fixed seed.
print("STEP 8: Splitting data...")
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}\n")

# ============================================
# STEP 9: TRAIN LINEAR REGRESSION MODEL
# ============================================
# Ordinary least squares on the log-transformed target.
print("STEP 9: Training Linear Regression model...")
model = LinearRegression().fit(X_train, y_train)
print(" Model trained!\n")

# ============================================
# STEP 10: EVALUATE MODEL
# ============================================
# RMSE is computed on the log scale for both the training and validation parts.
print("STEP 10: Evaluating model...")
y_pred_train, y_pred_val = (
    model.predict(features) for features in (X_train, X_val)
)

rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))

banner = "=" * 50
print(banner, "MODEL PERFORMANCE", banner, sep="\n")
print(f"Training RMSE: {rmse_train:.4f}")
print(f"Validation RMSE: {rmse_val:.4f}")
print(banner)

# ============================================
# STEP 11: VISUALIZE PREDICTIONS
# ============================================
# Two side-by-side scatter plots (training, validation) of actual vs predicted
# log prices; the dashed red diagonal marks a perfect prediction.
print("\nSTEP 11: Visualizing predictions...")
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

panels = [
    (y_train, y_pred_train, f'Training Set (RMSE: {rmse_train:.4f})'),
    (y_val, y_pred_val, f'Validation Set (RMSE: {rmse_val:.4f})'),
]
for ax, (actual, predicted, panel_title) in zip(axes, panels):
    ax.scatter(actual, predicted, alpha=0.5)
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax.set_xlabel('Actual Log(Price)')
    ax.set_ylabel('Predicted Log(Price)')
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()



The gap between the training RMSE and the validation RMSE is about 0.025, which is very small.
This suggests the linear model generalizes well to unseen data, so next we apply more advanced techniques to see whether the remaining error and this small gap can be reduced further.

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees, a fixed seed and all CPU cores,
# scored on the same validation split as the linear model.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_val)
rf_mse = mean_squared_error(y_val, y_pred_rf)
rmse_rf = np.sqrt(rf_mse)
print(f"Random Forest RMSE: {rmse_rf:.4f}")

In [None]:
# ============================================
# XGBOOST MODEL
# ============================================
# Gradient-boosted trees trained on the same split as the earlier models.
print("Training XGBoost model...")
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb = xgb_model.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))

print(f"XGBoost RMSE: {rmse_xgb:.4f}")
# Compare against the Linear Regression RMSE computed in this run (`rmse_val`)
# rather than a number copied from an earlier execution, so the percentage
# stays correct when the data or preprocessing changes.
print(f"Improvement over Linear Regression: {((rmse_val - rmse_xgb) / rmse_val * 100):.1f}%")

Final Model Comparison

In [None]:
# ============================================
# MODEL COMPARISON TABLE
# ============================================
# All scores come from the variables computed earlier in this run
# (`rmse_val`, `rmse_rf`, `rmse_xgb`), so the table and chart always match
# the models that were actually trained.
baseline_rmse = rmse_val  # Linear Regression is the baseline

results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'XGBoost'],
    'Validation RMSE': [rmse_val, rmse_rf, rmse_xgb],
})

# Percentage reduction in validation RMSE relative to the baseline.
results['Improvement vs Baseline'] = (
    (baseline_rmse - results['Validation RMSE']) / baseline_rmse * 100
).round(2)
results = results.sort_values('Validation RMSE')

print("\n" + "="*60)
print("MODEL PERFORMANCE COMPARISON")
print("="*60)
print(results.to_string(index=False))
print("="*60)

# Visualize: horizontal bars, best (lowest RMSE) model at the top.
plt.figure(figsize=(10, 5))
plt.barh(results['Model'], results['Validation RMSE'], color=['#ff9999', '#66b3ff', '#99ff99'])
plt.xlabel('Validation RMSE (Lower is Better)')
plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
# Annotate each bar with its RMSE just past the bar end.
for i, v in enumerate(results['Validation RMSE']):
    plt.text(v + 0.002, i, f'{v:.4f}', va='center')
plt.tight_layout()
plt.show()

The XGBoost model has a lower validation RMSE than the other two models (Linear Regression and Random Forest), so XGBoost is chosen for the final sale price prediction.

Now I load the test dataset and apply the XGBoost model to predict the final sale price of each house.

In [None]:
# Load the test dataset (same columns as the training file, without SalePrice).
print("STEP 1: Loading data...")
test = pd.read_csv('/kaggle/input/datasets/shadalishah/house-prices-advanced-regression-techniques/test.csv')
# Report the shape of the frame that was just loaded (`test`, not `train`).
print(f" Data loaded: {test.shape[0]} rows, {test.shape[1]} columns\n")
print(test.head())

In [None]:
# Preprocess the test set with the SAME transformations the model was trained
# on, then predict. Work happens on a copy so the raw `test` frame stays intact
# and this cell can be re-run without side effects.
test_clean = test.copy()

# Handle missing values.
# Numeric gaps are filled with the TRAINING median (the statistic the model saw
# during fitting); the test median is only a fallback for columns that are not
# numeric columns of `train`. Results are assigned back instead of using
# chained `inplace=True`, which does not update the frame under Copy-on-Write.
for col in test_clean.columns:
    if pd.api.types.is_numeric_dtype(test_clean[col]):
        if col in train.columns and pd.api.types.is_numeric_dtype(train[col]):
            fill_value = train[col].median()
        else:
            fill_value = test_clean[col].median()
        test_clean[col] = test_clean[col].fillna(fill_value)
    else:
        test_clean[col] = test_clean[col].fillna('None')

# Create features (same definitions and order as for the training data)
test_clean['TotalSF'] = test_clean['TotalBsmtSF'] + test_clean['1stFlrSF'] + test_clean['2ndFlrSF']
test_clean['TotalBath'] = (test_clean['FullBath'] + 0.5*test_clean['HalfBath']
                           + test_clean['BsmtFullBath'] + 0.5*test_clean['BsmtHalfBath'])
test_clean['HouseAge'] = test_clean['YrSold'] - test_clean['YearBuilt']
test_clean['RemodAge'] = test_clean['YrSold'] - test_clean['YearRemodAdd']

# Encode categorical columns with the label -> integer mapping learned from the
# training data. Fitting a new LabelEncoder on the test values would assign
# different integers to the same category whenever the two sets of labels
# differ, silently corrupting the model inputs.
train_categorical_cols = train_clean.select_dtypes(include=['object']).columns
for col in train_categorical_cols:
    if col not in test_clean.columns:
        continue  # filled with 0 by the column alignment below
    train_labels = train_clean[col].astype(str)
    encoder = LabelEncoder().fit(train_labels)
    # Labels never seen in training (including 'None' for columns that had no
    # gaps in train) cannot be transformed; map them to the most frequent
    # training label.
    fallback_label = train_labels.mode().iloc[0]
    test_labels = test_clean[col].astype(str)
    test_labels = test_labels.where(test_labels.isin(encoder.classes_), fallback_label)
    test_clean[col] = encoder.transform(test_labels)

# Prepare features
test_ids = test['Id'].copy()
X_test = test_clean.drop(['Id'], axis=1)

# Align columns: add any training column missing from the test set as 0 and
# put the columns in exactly the order the model was fitted with.
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X_train.columns]

# Predict; the model outputs log1p(price), so invert with expm1.
predictions = np.expm1(xgb_model.predict(X_test))

# Create submission: one row per test house with its predicted sale price.
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions})

# Save to CSV
submission.to_csv('house_sale_price_predictions.csv', index=False)

# Print all predictions.
# The row limit is lifted only inside this `with` block so the global pandas
# display settings are not changed for the rest of the session.
print("ALL HOUSE PRICE PREDICTIONS")
print("="*60)
with pd.option_context('display.max_rows', None):
    print(submission)
print("="*60)
print(f"\n Total houses predicted: {len(submission)}")
print(" File saved: house_sale_price_predictions.csv")
print("\nPrice Statistics:")
print(f"  Minimum: ${predictions.min():,.2f}")
print(f"  Maximum: ${predictions.max():,.2f}")
print(f"  Average: ${predictions.mean():,.2f}")
print(f"  Median:  ${np.median(predictions):,.2f}")