# Kaggle House Price Prediction: Advanced Regression
### Project Goal:
Predict the final price of each home using 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa.

### Skills Demonstrated:
* Data Cleaning & Imputation
* Feature Encoding (One-Hot Encoding)
* Linear Regression Modeling
* Model Evaluation (RMSE)

## 1. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Keep cell output readable: silence Python warnings and TensorFlow log chatter.
# NOTE: filterwarnings('ignore') hides every warning category, not just noisy ones.
import warnings
warnings.filterwarnings('ignore')
# Only messages at ERROR level or above are emitted by TensorFlow's logger
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(0)

# Reproducibility: fix TensorFlow's global random seed.
# NOTE(review): only TensorFlow is seeded in this cell; NumPy's global RNG is not.
tf.random.set_seed(42)
# Display only: print NumPy arrays with 3 decimals and without scientific notation
np.set_printoptions(precision=3, suppress=True)

## 2. Load the Data

In [None]:
# Load the Kaggle training set (path is relative to the notebook's working directory)
train_df = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')

# 'Id' is only a row identifier, not a predictor, so it is removed
train_df = train_df.drop(columns='Id')

print(f"Dataset Shape: {train_df.shape}")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
train_df.info()

In [None]:
# Preview the first five rows of the loaded training data
train_df.head(n=5)

## 3. Simplified Data Cleaning

In [None]:
# Drop columns in which more than 80% of the values are missing.
# The columns are derived from the data instead of being hard-coded (the
# notebook previously listed 'Alley', 'PoolQC', 'Fence', 'MiscFeature' for this
# rule), so the cell can be re-run without a KeyError on already-dropped columns.
MISSING_THRESHOLD = 0.8
missing_share = train_df.isna().mean()
sparse_cols = missing_share[missing_share > MISSING_THRESHOLD].index
train_df = train_df.drop(columns=sparse_cols)

# Rows without a target value cannot be used for training
train_df = train_df.dropna(subset=['SalePrice'])

# Separate numerical and categorical columns
# ('number' selects every numeric dtype, not only int64/float64)
num_cols = train_df.select_dtypes(include='number').columns
cat_cols = train_df.select_dtypes(include=['object']).columns

# Fill missing numerical values with the per-column median
train_df[num_cols] = train_df[num_cols].fillna(train_df[num_cols].median())

# Fill missing categorical values with the literal label "None"
# (the quality encoding later maps this label to 0)
train_df[cat_cols] = train_df[cat_cols].fillna('None')

## 4. Feature Engineering

In [None]:
# New features.
# They are added to train_df itself: the encoding step in section 5 builds the
# model matrix from train_df, so features stored anywhere else would never
# reach the model.
train_df['TotalSF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
train_df['TotalBath'] = (
    train_df['FullBath'] + 0.5 * train_df['HalfBath']
    + train_df['BsmtFullBath'] + 0.5 * train_df['BsmtHalfBath']
)
train_df['Age'] = train_df['YrSold'] - train_df['YearBuilt']
train_df['IsRemodeled'] = (train_df['YearBuilt'] != train_df['YearRemodAdd']).astype(int)
train_df['TotalPorchSF'] = (
    train_df['OpenPorchSF'] + train_df['EnclosedPorch']
    + train_df['3SsnPorch'] + train_df['ScreenPorch']
)

# train_final is the working copy used by the correlation analysis and the
# feature selection below. It has to be created here: this is the first cell
# that uses it, and referencing it before assignment raises a NameError.
train_final = train_df.copy()

# Log scale the target variable to reduce skewness.
# The log target lives on the working copy only, so the model matrix built from
# train_df never contains a transformed copy of the target.
train_final['SalePrice_Log'] = np.log1p(train_final['SalePrice'])

### Correlation Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Columns whose pairwise correlations are inspected (log target listed first)
corr_features = [
    'SalePrice_Log', 'TotalSF', 'TotalBath', 'Age', 'IsRemodeled', 'TotalPorchSF',
    'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF',
    'FullBath', 'YearRemodAdd', 'TotRmsAbvGrd', 'Fireplaces',
    'MasVnrArea', 'LotFrontage', 'LotArea', 'GarageArea',
]

# Correlation matrix of the chosen columns
corrmat = train_final[corr_features].corr()

# Hide the diagonal: every column correlates perfectly with itself
mask = np.eye(corrmat.shape[0], dtype=bool)

# Diverging colour scale fixed to [-1, 1] and centred on 0, values annotated
heatmap_style = dict(
    annot=True, fmt='.2f', cmap='RdBu_r', square=True,
    vmin=-1, vmax=1, center=0, linewidths=0.5,
)

plt.figure(figsize=(12, 9))
ax = sns.heatmap(corrmat, mask=mask, **heatmap_style)
ax.set_title('Correlation Matrix of Top Features')
plt.show()

In [None]:
# Features kept after the correlation analysis, grouped by kind
selected_features = [
    # Numeric
    'OverallQual', 'TotalSF', 'GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt',
    'YearRemodAdd', 'Fireplaces', 'TotalBath', 'Age', 'TotalPorchSF',
    # Ordinal
    'ExterQual', 'KitchenQual', 'BsmtQual', 'HeatingQC',
    # Nominal
    'Neighborhood', 'Foundation', 'CentralAir', 'MSZoning', 'SaleCondition',
]

# Reduce the working frame to the selected features plus the log target
final_columns = [*selected_features, 'SalePrice_Log']
train_final = train_final[final_columns]

## 5. Feature Encoding

In [None]:
# Ordinal quality scale shared by the columns below; 'None' is the label the
# cleaning step writes for missing categorical values.
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}

# List of columns that use this specific scale
ord_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
            'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

# 'CentralAir' is a yes/no flag and becomes a binary column
central_air_map = {'Y': 1, 'N': 0}

encodings = [(col, quality_map) for col in ord_cols] + [('CentralAir', central_air_map)]

for col, mapping in encodings:
    # Already-numeric columns were encoded by an earlier run of this cell.
    # Mapping them again would turn every value into NaN, so they are skipped.
    if pd.api.types.is_numeric_dtype(train_df[col]):
        continue

    encoded = train_df[col].map(mapping)

    # Series.map() yields NaN for labels missing from the mapping; fail loudly
    # instead of letting NaNs flow silently into the model matrix.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = train_df.loc[unmapped, col].unique().tolist()
        raise ValueError(f"Unexpected labels in column '{col}': {unknown}")

    train_df[col] = encoded

# Create dummy variables for the remaining categorical data (One-Hot Encoding).
# NOTE(review): this rebuilds train_final from the full train_df and therefore
# replaces the feature-selected train_final produced in section 4 - confirm
# whether the model is meant to use all columns or only the selected ones.
train_final = pd.get_dummies(train_df)

print(f"Final Data Shape after Encoding: {train_final.shape}")

## 6. Model Training

In [None]:
# Define Features (X) and Target (y).
# The target is the log-transformed sale price. Use the precomputed
# 'SalePrice_Log' column when the frame has one, otherwise derive it from
# 'SalePrice' (a KeyError is raised if neither column exists).
if 'SalePrice_Log' in train_final.columns:
    y = train_final['SalePrice_Log']
else:
    y = np.log1p(train_final['SalePrice'])

# Remove every form of the target from the features: leaving either column in
# X would leak the answer into the model.
target_cols = [col for col in ('SalePrice', 'SalePrice_Log') if col in train_final.columns]
X = train_final.drop(columns=target_cols)

# Split: 80% train, 20% hold-out validation (the *_cv variables);
# random_state fixes the shuffle so the split is reproducible
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Set Shape: {X_train.shape}, {y_train.shape}")
print(f"CV Set Shape: {X_cv.shape}, {y_cv.shape}")

First, let's try a Random Forest Regressor.