In [None]:
%load_ext autoreload
%autoreload 2

In [46]:
import pandas as pd

In [48]:
# Read the preprocessed training set. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../../data/clean_data/train_df_preprocessed.csv')

# Show the first five rows as the cell output.
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,No Alley,Reg,Lvl,AllPub,...,0,No Pool,No Fence,No feature,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,No Alley,Reg,Lvl,AllPub,...,0,No Pool,No Fence,No feature,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,No Alley,IR1,Lvl,AllPub,...,0,No Pool,No Fence,No feature,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,No Alley,IR1,Lvl,AllPub,...,0,No Pool,No Fence,No feature,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,No Alley,IR1,Lvl,AllPub,...,0,No Pool,No Fence,No feature,0,12,2008,WD,Normal,250000


*New Feature Creation*

* Age/Date Features

In [49]:
# Ages are measured relative to the year of sale (YrSold).
df['HouseAge'] = df['YrSold'] - df['YearBuilt']
df['YearsSinceRemodel'] = df['YrSold'] - df['YearRemodAdd']

# 1 when the recorded remodel year is later than the build year, else 0.
# Vectorized comparison replaces the row-wise apply; comparisons involving NaN
# evaluate to False and therefore yield 0, exactly as the lambda did.
df['WasRemodeled'] = (df['YearRemodAdd'] > df['YearBuilt']).astype('int64')

# NOTE(review): if preprocessing encodes "no garage" as GarageYrBlt == 0 (or
# leaves it NaN), GarageAge will be ~2000 (or NaN) for those rows -- confirm
# how missing garages are represented upstream.
df['GarageAge'] = df['YrSold'] - df['GarageYrBlt']

# include_lowest=True makes the first bin [0, 10] instead of (0, 10], so a
# house sold in the year it was built (HouseAge == 0) is labelled 'New'
# rather than falling outside every bin and becoming NaN.
# Ages below 0 or above 200 still fall outside the bins and stay NaN.
df['HouseAgeCategory'] = pd.cut(
    df['HouseAge'],
    bins=[0, 10, 20, 50, 100, 200],
    labels=['New', 'Recent', 'Modern', 'Old', 'Very Old'],
    include_lowest=True,
)

# Months 3-5 -> Spring, 6-8 -> Summer, 9-11 -> Fall; every other value
# (including 12, 1, 2 and anything unexpected) -> Winter, as before.
season_by_month = {
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}
df['SeasonSold'] = df['MoSold'].map(season_by_month).fillna('Winter')

* Quality and Condition Features

In [50]:
# Interaction features between the overall quality and overall condition ratings:
# their element-wise product and their element-wise ratio (quality / condition).
df['QualCondMult'] = df['OverallQual'].mul(df['OverallCond'])
df['QualCondRatio'] = df['OverallQual'].div(df['OverallCond'])

In [51]:
# Ordinal encoding of the quality/condition rating codes.
# Any value that is not one of these codes (e.g. a "not present" placeholder,
# NaN, or an unexpected label) is encoded as 0.
qc_mapping = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1
}

qc_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond', 'PoolQC', 'FireplaceQu']

# Every value an already-encoded column can contain: the mapped scores plus the 0 fallback.
encoded_values = [0, *qc_mapping.values()]

for feature in qc_features:
    # The columns are overwritten in place, so re-running this cell would push the
    # numeric scores through the string mapping again, turning everything into
    # NaN -> 0. Skip columns that are already encoded to keep the cell idempotent.
    if df[feature].isin(encoded_values).all():
        continue
    df[feature] = df[feature].map(qc_mapping).fillna(0)

In [52]:
# Aggregate score: row-wise sum of the ten quality/condition columns.
quality_score_columns = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual',
    'GarageQual', 'GarageCond',
    'PoolQC', 'FireplaceQu',
]
# skipna=False mirrors chained `+`: a NaN in any component makes the total NaN.
df['OverallQualScore'] = df[quality_score_columns].sum(axis=1, skipna=False)

* Has {...} Features

In [58]:
# Binary presence flags (1 = present, 0 = absent), built with vectorized comparisons.

# Flags derived from the placeholder label that marks absence in the column.
df['HasAlley'] = df['Alley'].ne('No Alley').astype('int64')
df['HasMasVnr'] = df['MasVnrType'].ne('No masonry veneer').astype('int64')
df['HasFence'] = df['Fence'].ne('No Fence').astype('int64')

# Flags derived from a quality column: any value other than 0 counts as "present".
df['HasBsmt'] = df['BsmtQual'].ne(0).astype('int64')
df['HasFireplace'] = df['FireplaceQu'].ne(0).astype('int64')
df['HasGarage'] = df['GarageQual'].ne(0).astype('int64')
df['HasPool'] = df['PoolQC'].ne(0).astype('int64')

# A porch is present when the combined area of the four porch columns is positive.
porch_area_columns = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
df['HasPorch'] = df[porch_area_columns].sum(axis=1).gt(0).astype('int64')

* Footage/Area Features

In [59]:
# Combined area features. skipna=False mirrors chained `+`: a NaN in any
# component makes the total NaN.

# Basement plus first and second floor area.
df['TotalSF'] = df[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1, skipna=False)

# Combined area of the four porch columns.
df['TotalPorchSF'] = df[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1, skipna=False)

# Sum of the two finished-basement area columns.
df['TotalBsmtFinSF'] = df[['BsmtFinSF1', 'BsmtFinSF2']].sum(axis=1, skipna=False)

# Above-ground living area as a fraction of the lot area.
df['HouseToLotRatio'] = df['GrLivArea'].div(df['LotArea'])

* Neighborhood-Related Features

In [60]:
# Rank neighborhoods by mean sale price and keep the ten most expensive ones.
# NOTE(review): this flag is derived from the target (SalePrice) of the same
# rows it is attached to -- confirm this does not leak into validation/test data.
mean_price_by_neighborhood = df.groupby('Neighborhood')['SalePrice'].mean()
rich_neighborhoods = mean_price_by_neighborhood.nlargest(10).index

# 1 when the row's neighborhood is among those ten, else 0.
df['IsRichNeighborhood'] = df['Neighborhood'].isin(rich_neighborhoods).astype('int64')

*Feature transformation*

In [None]:
# TODO: apply np.log1p to the right-skewed features (see the outliers section in data_processing.ipynb).