# In [15]:
# House Price Prediction - Data Preprocessing & Feature Engineering

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the raw train/test CSVs from the working directory and keep a separate
# reference to the test-set Id column.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ID = test['Id']

# Tag every row with its origin (1 = train, 0 = test) and give the test rows
# a NaN SalePrice placeholder so both frames expose the same columns.
train['TrainFlag'] = 1
test['TrainFlag'] = 0
test['SalePrice'] = np.nan

# Stack the two frames so each preprocessing step is applied once to all rows.
# NOTE: the original row labels are kept, so the combined index contains
# duplicate labels (each frame contributes its own labels).
data = pd.concat([train, test], sort=False)

# Handle missing values
#
# 1) Categorical columns where a missing entry is replaced by the explicit
#    level 'None' (presumably NaN means the house lacks that feature --
#    confirm against the dataset's data description).
none_cols = ['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','GarageType',
             'GarageFinish','GarageQual','GarageCond','BsmtQual','BsmtCond',
             'BsmtExposure','BsmtFinType1','BsmtFinType2','MasVnrType']
for col in none_cols:
    data[col] = data[col].fillna('None')

# 2) Numeric columns filled with their median. The catch-all numeric pass
#    below would fill these as well; they are listed explicitly to document
#    the choice.
median_cols = ['GarageYrBlt', 'MasVnrArea', 'LotFrontage']
for col in median_cols:
    data[col] = data[col].fillna(data[col].median())

# 3) Any remaining categorical gaps: fill with the most frequent level.
#    mode() is empty for a column with no non-missing values, in which case
#    there is nothing sensible to fill with and the column is left untouched
#    (indexing [0] on the empty result would raise).
for col in data.select_dtypes(include='object'):
    modes = data[col].mode()
    if not modes.empty:
        data[col] = data[col].fillna(modes[0])

# 4) Any remaining numeric gaps: fill with the column median.
#    SalePrice is excluded on purpose: its NaNs are the placeholder values of
#    the test rows, and imputing them would fabricate target values and
#    distort any statistic later computed on that column.
numeric_cols = data.select_dtypes(include=np.number).columns
for col in numeric_cols.drop('SalePrice', errors='ignore'):
    data[col] = data[col].fillna(data[col].median())

# Feature engineering: derive aggregate columns from the raw attributes.

# Combined square footage: basement + first floor + second floor.
data['TotalSF'] = (
    data['TotalBsmtSF']
    .add(data['1stFlrSF'])
    .add(data['2ndFlrSF'])
)

# Bathroom count where every half bath contributes 0.5.
data['TotalBath'] = (
    data['BsmtFullBath']
    .add(data['BsmtHalfBath'].mul(0.5))
    .add(data['FullBath'])
    .add(data['HalfBath'].mul(0.5))
)

# Years between construction and sale.
data['Age'] = data['YrSold'].sub(data['YearBuilt'])

# 1 when the remodel year differs from the build year, otherwise 0.
data['IsRemodeled'] = data['YearBuilt'].ne(data['YearRemodAdd']).astype(int)

# Label encoding (ordinal categoricals)
label_cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageType',
              'ExterQual', 'ExterCond', 'BsmtFinType1', 'BsmtFinType2']
for col in label_cols:
    data[col] = LabelEncoder().fit_transform(data[col].astype(str))

# Transforming skewed numeric features
#
# Every numeric column whose absolute skewness exceeds 0.75 is replaced by
# log1p(column), with two safeguards:
#   * Bookkeeping columns are never transformed. 'TrainFlag' is compared to
#     the exact values 1 and 0 when the frame is split back into train/test,
#     so log1p would silently break that split; 'Id' is a row identifier.
#   * log1p is only finite for values greater than -1, so a column whose
#     minimum is <= -1 is skipped instead of being filled with -inf/NaN.
#
# NOTE: SalePrice is a numeric column and remains a candidate, as before.
# If its skewness exceeds the threshold the target is log1p-transformed here,
# and predictions then need np.expm1 to get back to the original scale.
protected_cols = ['Id', 'TrainFlag']

numeric_feats = data.select_dtypes(include=[np.number]).columns
skewed_feats = data[numeric_feats].skew().abs()
high_skew = skewed_feats[
    (skewed_feats > 0.75) & ~skewed_feats.index.isin(protected_cols)
].index

for col in high_skew:
    if data[col].min() <= -1:
        # Outside log1p's domain; leave the column unchanged.
        continue
    data[col] = np.log1p(data[col])

# One-hot encode whatever categorical columns are still non-numeric.
data = pd.get_dummies(data)

# Undo the earlier concatenation using the TrainFlag marker, dropping the
# marker itself; the test part also loses its SalePrice column.
train_clean = data.loc[data['TrainFlag'].eq(1)].drop(columns='TrainFlag')
test_clean = data.loc[data['TrainFlag'].eq(0)].drop(
    columns=['TrainFlag', 'SalePrice']
)

# Features & Target
# NOTE(review): 'Id' is removed from X but is still present in test_clean,
# so the two frames do not have identical columns at this point -- confirm
# that downstream code drops 'Id' before predicting.
X = train_clean.drop(columns=['Id', 'SalePrice'])
y = train_clean['SalePrice']
