In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
import category_encoders as ce
import xgboost as xgb
from datetime import datetime

# TRAIN

In [2]:
train = pd.read_csv(r'D:\Data Science\Projects\Housing Project\Data\train.csv')

In [3]:
# train = pd.read_csv(r'/Users/kimberly/Documents/GitHub/githubtest/Data/train.csv')

## FillNA Functions

In [4]:
def fillna_groupby_train(df, feature, group, agg_funct):
    df[feature] = df.groupby(group)[feature].transform(lambda x: x.fillna(x.agg(agg_funct).iloc[0]))
    return df

def fillna_mode_groupby_test(test, feature, group):
    mode_dict = train.groupby(group)[feature].apply(lambda x: x.mode().iloc[0]).to_dict()
    test.loc[test[feature].isnull(), feature] = test[group].map(mode_dict)
    return test

def fillna_mean_groupby_test(test, feature, group):
    mode_dict = train.groupby(group)[feature].apply(lambda x: x.mean()).to_dict()
    test.loc[test[feature].isnull(), feature] = test[group].map(mode_dict)
    return test


def fillna_NA(df, feature):
    df[feature] = np.where(df[feature].isnull(), 'NA', df[feature])
    return df

def fillna_ZERO(df, feature):
    df[feature] = np.where(df[feature].isnull(), 0, df[feature])
    return df

def fillna__NA_basedon_feat_equalzero(df, depend_feat, feature):
    df[feature] = np.where((df[feature].isnull()) & (df[depend_feat]==0), 'NA', df[feature])
    return df

def fillna__NA_basedon_feat_greaterzero(df, depend_feat, feature):
    df[feature] = np.where((df[feature].isnull()) & (df[depend_feat]> 0), 'NA', df[feature])
    return df

def fillna__value_basedon_feat_greaterzero(df, depend_feat, feature, value):
    df[feature] = np.where((df[feature].isnull()) & (df[depend_feat]> 0), value, df[feature])
    return df

def fillna__mode(df, feature):
    df[feature] = np.where(df[feature].isnull(), df[feature].mode(), df[feature])
    return df

def fillna_mean(df, feature):
    df[feature] = np.where(df[feature].isnull(), df[feature].mean(), df[feature])
    return df

def fillna_mode_test(test, train, feature):
    test[feature] = np.where(test[feature].isnull(), train[feature].mode(), test[feature])
    return test

def fillna_ZERO_basedon_feat_equalzero(df, depend_feat, feature):
    df[feature] = np.where((df[feature].isnull()) & (df[depend_feat]==0), 0, df[feature])
    return df

def fillna_otherfeat(df, depend_feat, feature):
    df[feature] = np.where(df[feature].isnull(), df[depend_feat], df[feature])
    return df
 
# FillNA Specific Features
def fillna_MiscFeature(df):
    df['MiscFeature'] = np.where((df['MiscFeature'].isnull()) & (df['GarageType']=='2Types'), 'Gar2', df['MiscFeature'])
    df['MiscFeature'] = np.where(df['MiscFeature'].isnull(), 'NA', df['MiscFeature'])
    return df

def fillna_BsmtFinType2(df):
    df['BsmtFinType2'] = np.where((df['BsmtFinType2'].isnull() & df['BsmtUnfSF'] > 0), 'Unf', df['BsmtFinType2'])
    df['BsmtFinType2'] = np.where(df['BsmtFinType2'].isnull(), 'NA', df['BsmtFinType2'])
    return df


## Feature Engineering Functions

In [5]:
def feateng_age(df, feat, newfeat):
    df[newfeat] = datetime.now().year - df[feat]
    return df

def feateng_ratio(df, feat1, feat2, newfeat):
    df[newfeat] = (df[feat1]/df[feat2]).round(2)
    return df

def feateng_has_YN(df, feat, newfeat):
    df[newfeat]= np.where(df[feat]=='Y', 1, 0)
    return df

def feateng_has_greaterzero(df, feat, newfeat):
    df[newfeat] = df[feat].apply(lambda x: 1 if x > 0 else 0)
    return df

## Transformation Functions

In [6]:
def log_feat(df):
    skewed_feats = df.skew(numeric_only=True)[df.skew(numeric_only=True) > 0.75]
    for feat in skewed_feats.index:
        df[feat] = np.log1p(df[feat])

In [7]:
def log_each_feat(df, feat):
    df[feat] = np.log1p(df[feat])
    return df

## Datatype Functions

In [8]:
def convert_dtype(df, feat, dtype):
    df[feat] = df[feat].astype(dtype)
    return 

def convert_datetime_year(df, feat):
    df[feat] = pd.to_datetime(df[feat], format='%Y').dt.year
    return df

def convert_datetime_month(df, feat):
    df[feat] = pd.to_datetime(df[feat], format='%m').dt.month
    return df

def convert_datetime_number_year(df, feat):
    df[feat] = pd.to_datetime(df[feat], format='%Y').dt.strftime('%Y')
    return df

def convert_datetime_number_month(df, feat):
    df[feat] = pd.to_datetime(df[feat], format='%m').dt.month
    return df

def convert_float_int(df, feat):
    df[feat] = df[feat].astype(int)

## Encoding/Scaling Functions

In [9]:
def ordinal_encoding(df, ord_feats):
    ord = OrdinalEncoder()
    ord.fit(ord_feats)
    ord_feats_encoded = ord.transform(ord_feats)
    new_ord_feats = pd.DataFrame(ord_feats_encoded, columns=ord_feats.columns)
    df.drop(columns=ord_feats, inplace=True)
    df = df.merge(new_ord_feats, left_index=True, right_index=True)
    return df

def nom_encoding(df, nom_feats):
    one = ce.OneHotEncoder(use_cat_names=True)
    one.fit(nom_feats)
    nom_feats_encoded = one.transform(nom_feats)
    df.drop(columns=nom_feats, inplace=True)
    df = df.merge(nom_feats_encoded, left_index=True, right_index=True)
    return df

def cont_scaling(df, cont_feats):
    scaler_minmax = MinMaxScaler()
    scaler_minmax.fit(cont_feats)
    cont_feats_scaled = scaler_minmax.transform(cont_feats)
    new_cont_feats = pd.DataFrame(cont_feats_scaled, columns=cont_feats.columns)
    df.drop(columns=cont_feats, inplace=True)
    df = df.merge(new_cont_feats, left_index=True, right_index=True)
    return df

## Outliers

In [10]:
# train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 200000)].index).reset_index(drop=True)

In [11]:
train = train.drop(train[train['LotArea']> 150000].index).reset_index(drop=True)

## Fill Null Features

In [12]:
### Categorical Features
fillna__NA_basedon_feat_equalzero(train, 'PoolArea', 'PoolQC')#
fillna_MiscFeature(train)#
fillna_NA(train, ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'Fence'])#
fillna__NA_basedon_feat_equalzero(train, 'Fireplaces', 'FireplaceQu')#
fillna__NA_basedon_feat_equalzero(train, 'GarageCars', 'GarageType')#
fillna__NA_basedon_feat_equalzero(train, 'GarageCars', 'GarageCond')#
fillna__NA_basedon_feat_equalzero(train, 'GarageCars', 'GarageQual')#
fillna__NA_basedon_feat_equalzero(train, 'GarageCars', 'GarageFinish')#
fillna_BsmtFinType2(train)#
fillna__NA_basedon_feat_equalzero(train, 'TotalBsmtSF', 'BsmtExposure')#
fillna__mode(train, ['Electrical', 'BsmtExposure'])#

### Numerical Features
fillna_groupby_train(train, 'LotFrontage', 'Neighborhood', 'mode')##
fillna_ZERO(train, 'MasVnrArea')#
fillna_ZERO_basedon_feat_equalzero(train, 'GarageCars', 'GarageYrBlt')#


train.isnull().sum().sort_values(ascending=False)

Id             0
CentralAir     0
GarageYrBlt    0
GarageType     0
FireplaceQu    0
              ..
MasVnrArea     0
MasVnrType     0
Exterior2nd    0
Exterior1st    0
SalePrice      0
Length: 81, dtype: int64

## Feature Engineering 

In [13]:
feateng_age(train, 'YearBuilt', 'age')
feateng_age(train, 'YearRemodAdd', 'remodel_age')
feateng_ratio(train, 'GrLivArea', 'LotArea', 'livingtolot')
feateng_has_YN(train, 'PavedDrive', 'has_drivewaypaved')
feateng_has_greaterzero(train, 'Fireplaces', 'has_fireplace')
feateng_has_greaterzero(train, 'PoolArea', 'has_pool')


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,YrSold,SaleType,SaleCondition,SalePrice,age,remodel_age,livingtolot,has_drivewaypaved,has_fireplace,has_pool
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,2008,WD,Normal,208500,21,21,0.20,1,0,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,2007,WD,Normal,181500,48,48,0.13,1,1,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,2008,WD,Normal,223500,23,22,0.16,1,1,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,2006,WD,Abnorml,140000,109,54,0.18,1,1,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,2008,WD,Normal,250000,24,24,0.15,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,2007,WD,Normal,175000,25,24,0.21,1,1,0
1453,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,2010,WD,Normal,210000,46,36,0.16,1,1,0
1454,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,2010,WD,Normal,266500,83,18,0.26,1,1,0
1455,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,2010,WD,Normal,142125,74,28,0.11,1,0,0


## Transformation

In [14]:
# log_feat(train)

In [15]:
skewed_feats = [
'MSSubClass',
'LotFrontage',
'LotArea',
'MasVnrArea',
'BsmtFinSF1',
'BsmtFinSF2',
'BsmtUnfSF',
'TotalBsmtSF',
'1stFlrSF',
'2ndFlrSF',
'LowQualFinSF',
'GrLivArea',
'BsmtHalfBath',
'KitchenAbvGr',
'WoodDeckSF',
'OpenPorchSF',
'EnclosedPorch',
'3SsnPorch',
'ScreenPorch',
'PoolArea',
'MiscVal',
'SalePrice'
]

log_each_feat(train, skewed_feats)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,YrSold,SaleType,SaleCondition,SalePrice,age,remodel_age,livingtolot,has_drivewaypaved,has_fireplace,has_pool
0,1,4.110874,RL,4.189655,9.042040,Pave,,Reg,Lvl,AllPub,...,2008,WD,Normal,12.247699,21,21,0.20,1,0,0
1,2,3.044522,RL,4.394449,9.169623,Pave,,Reg,Lvl,AllPub,...,2007,WD,Normal,12.109016,48,48,0.13,1,1,0
2,3,4.110874,RL,4.234107,9.328212,Pave,,IR1,Lvl,AllPub,...,2008,WD,Normal,12.317171,23,22,0.16,1,1,0
3,4,4.262680,RL,4.110874,9.164401,Pave,,IR1,Lvl,AllPub,...,2006,WD,Abnorml,11.849405,109,54,0.18,1,1,0
4,5,4.110874,RL,4.442651,9.565284,Pave,,IR1,Lvl,AllPub,...,2008,WD,Normal,12.429220,24,24,0.15,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,1456,4.110874,RL,4.143135,8.976894,Pave,,Reg,Lvl,AllPub,...,2007,WD,Normal,12.072547,25,24,0.21,1,1,0
1453,1457,3.044522,RL,4.454347,9.486152,Pave,,Reg,Lvl,AllPub,...,2010,WD,Normal,12.254868,46,36,0.16,1,1,0
1454,1458,4.262680,RL,4.204693,9.109746,Pave,,Reg,Lvl,AllPub,...,2010,WD,Normal,12.493133,83,18,0.26,1,1,0
1455,1459,3.044522,RL,4.234107,9.181735,Pave,,Reg,Lvl,AllPub,...,2010,WD,Normal,11.864469,74,28,0.11,1,0,0


## Datatype conversions

In [16]:
convert_dtype(train, ['MSSubClass', 'OverallCond', 'GarageFinish'], 'str')
convert_dtype(train, ['LotFrontage', 'MasVnrArea'], 'int')
convert_datetime_year(train, 'YearBuilt')
convert_datetime_year(train, 'YearRemodAdd')
convert_datetime_month(train, 'MoSold')
convert_datetime_number_year(train, 'YrSold')
convert_float_int(train, 'GarageYrBlt')

## Encoding/Scaling

In [17]:
df = train.copy()

In [18]:
ord_feats = train[[
'LotShape',
'LandContour',
'Utilities',
'LandSlope',
'OverallQual',
'OverallCond',
'YearBuilt',
'YearRemodAdd',
'ExterQual',
'ExterCond',
'BsmtQual',
'BsmtCond',
'BsmtExposure',
'BsmtFinType1',
'BsmtFinType2',
'HeatingQC',
'CentralAir',
'KitchenQual',
'Functional',
'FireplaceQu',
'GarageYrBlt',
'GarageFinish',
'GarageQual',
'GarageCond',
'PoolQC',
'Fence',
'MiscVal'
]]

nom_feats = train[[
'MSSubClass',
'MSZoning',
'Street',
'Alley',
'LotConfig',
'Neighborhood',
'Condition1',
'Condition2',
'BldgType',
'HouseStyle',
'RoofStyle',
'RoofMatl',
'Exterior1st',
'Exterior2nd',
'MasVnrType',
'Foundation',
'Heating',
'Electrical',
'GarageType',
'PavedDrive',
'MiscFeature',
'SaleType',
'SaleCondition'

]]


cont_feats = train[[
'LotFrontage',
'LotArea',
'MasVnrArea',
'BsmtFinSF1',
'BsmtFinSF2',
'BsmtUnfSF',
'TotalBsmtSF',
'1stFlrSF',
'2ndFlrSF',
'LowQualFinSF',
'GrLivArea',
'BsmtFullBath',
'BsmtHalfBath',
'FullBath',
'HalfBath',
'BedroomAbvGr',
'KitchenAbvGr',
'TotRmsAbvGrd',
'Fireplaces',
'GarageCars',
'GarageArea',
'WoodDeckSF',
'OpenPorchSF',
'EnclosedPorch',
'3SsnPorch',
'ScreenPorch',
'PoolArea',
'MoSold',
'YrSold'


]]


In [19]:
# ord = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# ord.fit(ord_feats)
# ord_feats_encoded = ord.transform(ord_feats)
# new_ord_feats = pd.DataFrame(ord_feats_encoded, columns=ord_feats.columns)
# df.drop(columns=ord_feats, inplace=True)
# df = df.merge(new_ord_feats, left_index=True, right_index=True)

# one = ce.OneHotEncoder(use_cat_names=True)
# one.fit(nom_feats)
# nom_feats_encoded = one.transform(nom_feats)
# df.drop(columns=nom_feats, inplace=True)
# df = df.merge(nom_feats_encoded, left_index=True, right_index=True)

# scaler_minmax = MinMaxScaler()
# scaler_minmax.fit(cont_feats)
# cont_feats_scaled = scaler_minmax.transform(cont_feats)
# new_cont_feats = pd.DataFrame(cont_feats_scaled, columns=cont_feats.columns)
# df.drop(columns=cont_feats, inplace=True)
# df = df.merge(new_cont_feats, left_index=True, right_index=True)

In [20]:
ord = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ord.fit(ord_feats)
ord_feats_encoded = ord.transform(ord_feats)
new_ord_feats = pd.DataFrame(ord_feats_encoded, columns=ord_feats.columns)
df.drop(columns=ord_feats, inplace=True)
df = df.merge(new_ord_feats, left_index=True, right_index=True)
df.shape

(1457, 87)

In [21]:
one = ce.OneHotEncoder(use_cat_names=True)
one.fit(nom_feats)
nom_feats_encoded = one.transform(nom_feats)
nom_feats_encoded

Unnamed: 0,MSSubClass_4.110873864173311,MSSubClass_3.044522437723423,MSSubClass_4.2626798770413155,MSSubClass_3.9318256327243257,MSSubClass_5.25227342804663,MSSubClass_3.828641396489095,MSSubClass_4.51085950651685,MSSubClass_4.795790545596741,MSSubClass_3.4339872044851463,MSSubClass_4.454347296253507,...,SaleType_CWD,SaleType_ConLw,SaleType_Con,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_Partial,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1453,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1454,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1455,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [59]:
nom_feats_encoded.to_csv("nom_feats_encoded.csv", index=False)

In [23]:
one = ce.OneHotEncoder(use_cat_names=True)
one.fit(nom_feats)
nom_feats_encoded = one.transform(nom_feats)
df.drop(columns=nom_feats, inplace=True)
df = df.merge(nom_feats_encoded, left_index=True, right_index=True)
df.shape

(1457, 245)

In [24]:
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(cont_feats)
cont_feats_scaled = scaler_minmax.transform(cont_feats)
new_cont_feats = pd.DataFrame(cont_feats_scaled, columns=cont_feats.columns)
df.drop(columns=cont_feats, inplace=True)
df = df.merge(new_cont_feats, left_index=True, right_index=True)
df.shape

(1457, 245)

In [25]:
# ordinal_encoding(df, ord_feats)
# nom_encoding(df, nom_feats)
# cont_scaling(df, cont_feats)

## Train Test Split

In [26]:
y = df['SalePrice']
x = df.drop(columns=['Id', 'SalePrice'])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)
xtrain.shape, ytrain.shape, xtest.shape, ytest.shape

((1165, 243), (1165,), (292, 243), (292,))

## XGB

In [27]:
xgbr = xgb.XGBRegressor(n_estimators=1000, max_depth=2) #regularization and increase or reduce learning rate
xgbr.fit(xtrain, ytrain)

test_pred = xgbr.predict(xtest)
train_pred = xgbr.predict(xtrain)

ytrain = np.expm1(ytrain)
ytest = np.expm1(ytest)
test_pred = np.expm1(test_pred)
train_pred = np.expm1(train_pred)

print('RMSE Test:', mean_squared_error(ytest, test_pred)**0.5)
print('RMSE Train:', mean_squared_error(ytrain, train_pred)**0.5)

RMSE Test: 36043.854459849754
RMSE Train: 3648.9336293172714


# TEST

In [28]:
test = pd.read_csv(r'D:\Data Science\Projects\Housing Project\Data\test.csv')

In [29]:
# test = pd.read_csv(r'/Users/kimberly/Documents/GitHub/githubtest/Data/test.csv')

In [30]:
test.isnull().sum().sort_values(ascending=False)

PoolQC           1456
MiscFeature      1408
Alley            1352
Fence            1169
MasVnrType        894
                 ... 
Electrical          0
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
SaleCondition       0
Length: 80, dtype: int64

In [31]:
test['MSZoning'].isnull().sum()

4

In [32]:
test[test['MSZoning'].isnull()][['Neighborhood', 'MSZoning']]

Unnamed: 0,Neighborhood,MSZoning
455,IDOTRR,
756,IDOTRR,
790,IDOTRR,
1444,Mitchel,


In [33]:
train.groupby('Neighborhood')['MSZoning'].apply(lambda x: x.mode().iloc[0])

Neighborhood
Blmngtn    RL
Blueste    RM
BrDale     RM
BrkSide    RM
ClearCr    RL
CollgCr    RL
Crawfor    RL
Edwards    RL
Gilbert    RL
IDOTRR     RM
MeadowV    RM
Mitchel    RL
NAmes      RL
NPkVill    RL
NWAmes     RL
NoRidge    RL
NridgHt    RL
OldTown    RM
SWISU      RL
Sawyer     RL
SawyerW    RL
Somerst    FV
StoneBr    RL
Timber     RL
Veenker    RL
Name: MSZoning, dtype: object

In [34]:
test['MSZoning'].value_counts()

MSZoning
RL         1114
RM          242
FV           74
C (all)      15
RH           10
Name: count, dtype: int64

MSZoning
RL         1115
RM          245
FV           74
C (all)      15
RH           10

## Fill Null Features

In [35]:
### Categorical Features
fillna__NA_basedon_feat_equalzero(test, 'PoolArea', 'PoolQC')
fillna_mode_test(test, train, 'PoolQC')
fillna_MiscFeature(test)
fillna_NA(test, 'MiscFeature')
fillna_NA(test, 'Alley')
fillna_NA(test, 'Fence')
fillna_NA(test, 'MasVnrType')
fillna__NA_basedon_feat_equalzero(test, 'Fireplaces', 'FireplaceQu')
fillna__NA_basedon_feat_equalzero(test, 'GarageArea', 'GarageCond')
fillna_mode_groupby_test(test, 'GarageCond', 'GarageFinish')#
fillna__NA_basedon_feat_equalzero(test, 'GarageArea', 'GarageFinish')#
fillna_mode_groupby_test(test, 'GarageQual', 'GarageFinish')#
fillna_mode_test(test, train, 'GarageFinish')
fillna__NA_basedon_feat_equalzero(test, 'GarageArea', 'GarageQual')
fillna_mode_groupby_test(test, 'GarageFinish', 'GarageQual')
fillna__NA_basedon_feat_equalzero(test, 'GarageArea', 'GarageType')
fillna_NA(test, 'BsmtCond')
fillna_NA(test, 'BsmtQual')
fillna__NA_basedon_feat_equalzero(test, 'TotalBsmtSF', 'BsmtExposure')
fillna_mode_test(test, train, 'BsmtExposure')
fillna_BsmtFinType2(test)
fillna_NA(test, 'BsmtFinType2')
fillna_NA(test, 'BsmtFinType1')
fillna_mode_groupby_test(test, 'MSZoning', 'Neighborhood')
fillna_mode_test(test, train, 'Utilities')
fillna_mode_test(test, train, 'Functional')
fillna_mode_test(test, train, 'Exterior2nd')
fillna_mode_groupby_test(test, 'KitchenQual', 'KitchenAbvGr')
fillna_mode_test(test, train, 'KitchenQual')
fillna_mode_test(test, train, 'SaleType')
fillna_mode_test(test, train, 'Exterior1st')


### Numerical Features
fillna_mean_groupby_test(test,'LotFrontage', 'Neighborhood')
fillna_ZERO_basedon_feat_equalzero(test, 'GarageArea', 'GarageYrBlt')
fillna_otherfeat(test, 'YearRemodAdd', 'GarageYrBlt')
fillna_ZERO(test, 'MasVnrArea')
fillna_ZERO(test, 'BsmtFullBath')
fillna_ZERO(test, 'BsmtHalfBath')
fillna_ZERO(test, 'BsmtFinSF2')
fillna_mode_test(test, train, 'GarageCars')
fillna_mean(test, 'GarageArea')
fillna_ZERO(test, 'TotalBsmtSF')
fillna_ZERO(test, 'BsmtUnfSF')
fillna_ZERO(test, 'BsmtFinSF1')



# fillna_mode_test(test, train, 'Neighborhood')
# fillna_mode_groupby_test(test, 'Electrical', 'MSZoning')

test.isnull().sum().sort_values(ascending=False)





GarageQual       2
GarageCond       2
Id               0
BedroomAbvGr     0
FireplaceQu      0
                ..
MasVnrType       0
Exterior2nd      0
Exterior1st      0
RoofMatl         0
SaleCondition    0
Length: 80, dtype: int64

## Feature Engineering

In [36]:
feateng_age(test, 'YearBuilt', 'age')
feateng_age(test, 'YearRemodAdd', 'remodel_age')
feateng_ratio(test, 'GrLivArea', 'LotArea', 'livingtolot')
feateng_has_YN(test, 'PavedDrive', 'has_paveddrive')
feateng_has_greaterzero(test, 'Fireplaces', 'has_fireplace')
feateng_has_greaterzero(test, 'PoolArea', 'has_pool')


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,age,remodel_age,livingtolot,has_paveddrive,has_fireplace,has_pool
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,6,2010,WD,Normal,63,63,0.08,1,0,0
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,66,66,0.09,1,0,0
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,3,2010,WD,Normal,27,26,0.12,1,1,0
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,26,26,0.16,1,1,0
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,1,2010,WD,Normal,32,32,0.26,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,6,2006,WD,Normal,54,54,0.56,1,0,0
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,4,2006,WD,Abnorml,54,54,0.58,1,0,0
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,9,2006,WD,Abnorml,64,28,0.06,1,1,0
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,7,2006,WD,Normal,32,32,0.09,1,0,0


## Datatype conversions

In [37]:
test['LotFrontage'].isnull().sum()

0

In [38]:
test['LotFrontage'].value_counts()

LotFrontage
60.0     133
80.0      68
70.0      63
50.0      60
75.0      52
        ... 
117.0      1
28.0       1
119.0      1
25.0       1
140.0      1
Name: count, Length: 135, dtype: int64

In [39]:
convert_dtype(test, ['MSSubClass', 'OverallCond'], 'str')
convert_dtype(test, ['LotFrontage', 'MasVnrArea'], 'int')
convert_datetime_year(test, 'YearBuilt')
convert_datetime_year(test, 'YearRemodAdd')
convert_datetime_month(test, 'MoSold')
convert_datetime_number_year(test, 'YrSold')
convert_float_int(test, 'GarageYrBlt')

## Encoding/Scaling

In [40]:
ord_feats = test[[
'LotShape',
'LandContour',
'Utilities',
'LandSlope',
'OverallQual',
'OverallCond',
'YearBuilt',
'YearRemodAdd',
'ExterQual',
'ExterCond',
'BsmtQual',
'BsmtCond',
'BsmtExposure',
'BsmtFinType1',
'BsmtFinType2',
'HeatingQC',
'CentralAir',
'KitchenQual',
'Functional',
'FireplaceQu',
'GarageYrBlt',
'GarageFinish',
'GarageQual',
'GarageCond',
'PoolQC',
'Fence',
'MiscVal'
]]

nom_feats = test[[
'MSSubClass',
'MSZoning',
'Street',
'Alley',
'LotConfig',
'Neighborhood',
'Condition1',
'Condition2',
'BldgType',
'HouseStyle',
'RoofStyle',
'RoofMatl',
'Exterior1st',
'Exterior2nd',
'MasVnrType',
'Foundation',
'Heating',
'Electrical',
'GarageType',
'PavedDrive',
'MiscFeature',
'SaleType',
'SaleCondition'

# 'has_paveddrive'
]]


cont_feats = test[[
'LotFrontage',
'LotArea',
'MasVnrArea',
'BsmtFinSF1',
'BsmtFinSF2',
'BsmtUnfSF',
'TotalBsmtSF',
'1stFlrSF',
'2ndFlrSF',
'LowQualFinSF',
'GrLivArea',
'BsmtFullBath',
'BsmtHalfBath',
'FullBath',
'HalfBath',
'BedroomAbvGr',
'KitchenAbvGr',
'TotRmsAbvGrd',
'Fireplaces',
'GarageCars',
'GarageArea',
'WoodDeckSF',
'OpenPorchSF',
'EnclosedPorch',
'3SsnPorch',
'ScreenPorch',
'PoolArea',
'MoSold',
'YrSold'


]]


In [41]:
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,age,remodel_age,livingtolot,has_paveddrive,has_fireplace,has_pool
0,1461,20,RH,80,11622,Pave,,Reg,Lvl,AllPub,...,6,2010,WD,Normal,63,63,0.08,1,0,0
1,1462,20,RL,81,14267,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,66,66,0.09,1,0,0
2,1463,60,RL,74,13830,Pave,,IR1,Lvl,AllPub,...,3,2010,WD,Normal,27,26,0.12,1,1,0
3,1464,60,RL,78,9978,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,26,26,0.16,1,1,0
4,1465,120,RL,43,5005,Pave,,IR1,HLS,AllPub,...,1,2010,WD,Normal,32,32,0.26,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21,1936,Pave,,Reg,Lvl,AllPub,...,6,2006,WD,Normal,54,54,0.56,1,0,0
1455,2916,160,RM,21,1894,Pave,,Reg,Lvl,AllPub,...,4,2006,WD,Abnorml,54,54,0.58,1,0,0
1456,2917,20,RL,160,20000,Pave,,Reg,Lvl,AllPub,...,9,2006,WD,Abnorml,64,28,0.06,1,1,0
1457,2918,85,RL,62,10441,Pave,,Reg,Lvl,AllPub,...,7,2006,WD,Normal,32,32,0.09,1,0,0


In [42]:
ord_feats_encoded = ord.transform(ord_feats)
new_ord_feats = pd.DataFrame(ord_feats_encoded, columns = ord_feats.columns)
test.drop(columns=ord_feats, inplace=True)
test = test.merge(new_ord_feats, left_index=True, right_index=True)

In [43]:
nom_feats_encoded = one.transform(nom_feats)
test.drop(columns=nom_feats, inplace=True)
test = test.merge(nom_feats_encoded, right_index=True, left_index=True)

In [44]:
cont_feats_scaled = scaler_minmax.transform(cont_feats)
new_cont_feats = pd.DataFrame(cont_feats_scaled, columns=cont_feats.columns)
test.drop(columns=cont_feats, inplace=True)
test = test.merge(new_cont_feats, right_index=True, left_index=True)

In [45]:
set(xtrain.columns.tolist()) - set(test.columns.tolist())
set(test.columns.tolist()) - set(df.columns.tolist())
df_test = test.reindex(columns=xtrain.columns).fillna(0)
df_test = df_test[xtrain.columns]

In [46]:
price_pred_test = xgbr.predict(df_test)
price_pred_test = np.expm1(price_pred_test)
price_pred_test

array([213868.81, 193845.31, 256461.  , ..., 211121.7 , 165949.12,
       280689.72], dtype=float32)

In [47]:
submission = test[['Id']]
submission.loc[:, 'SalePrice'] = price_pred_test
submission

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission.loc[:, 'SalePrice'] = price_pred_test


Unnamed: 0,Id,SalePrice
0,1461,213868.812500
1,1462,193845.312500
2,1463,256461.000000
3,1464,270523.687500
4,1465,283787.500000
...,...,...
1454,2915,205686.187500
1455,2916,187918.921875
1456,2917,211121.703125
1457,2918,165949.125000


In [48]:
# submission.to_csv("baseline_function_reorg_4-22-24.csv", index=False)

Feature of Important after XGBOOST,
Check for multicolinearity
Use pearson correlation or spearmans correlation