# Encode some fields based on their definitions, then drop the original columns

In [181]:
import pandas as pd
# Load the two CSV files into separate DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [182]:
# Tag each frame with the file it came from (presumably so the rows can be
# told apart again after stacking).
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin

# Stack train on top of test, then renumber the rows 0..n-1; the per-file
# row index is discarded rather than kept as a column.
combine = pd.concat([train, test]).reset_index(drop=True)


In [183]:
# Reformat Exterior1st and Exterior2nd into one shared set of 0/1 indicator
# columns, one per exterior value, prefixed with 'ext_'.
first_cover = pd.get_dummies(combine['Exterior1st'])

# Three Exterior2nd column labels are renamed before merging (presumably so
# they line up with the spellings used by Exterior1st).
relabel = {'Brk Cmn': 'BrkComm', 'CmentBd': 'CemntBd', 'Wd Shng': 'WdShing'}
second_cover = pd.get_dummies(combine['Exterior2nd']).rename(columns=relabel)

# Add the two indicator sets; a column present in only one of them is treated
# as 0 in the other. A value appearing in both slots sums to 2 and is mapped
# back to 1 so the result stays a 0/1 flag.
summed = first_cover.add(second_cover, fill_value=0).astype('uint8')
ext = summed.replace(2, 1).add_prefix('ext_')

# Attach the indicator columns and drop the two original columns.
combine = pd.concat([combine, ext], axis=1)
combine = combine.drop(columns=['Exterior1st', 'Exterior2nd'])


In [184]:
# Combine BsmtFullBath and BsmtHalfBath (basement bathroom counts) into a
# single 'bathtot' column, counting each half bath as 0.5.
# The sum is taken from `combine` itself: computing it from `train` would only
# align with the train rows and leave every test row as NaN.
combine['bathtot'] = combine['BsmtFullBath'] + combine['BsmtHalfBath'] * 0.5
combine = combine.drop(['BsmtFullBath', 'BsmtHalfBath'], axis=1)

In [185]:
# Reformat PoolQC / PoolArea: one column per PoolQC value, holding the row's
# PoolArea where that value applies and 0 elsewhere.
pool_area = combine['PoolArea']

# Cast the indicators to the area's dtype so the result does not depend on
# whether get_dummies produced uint8 or bool columns.
pool_flags = pd.get_dummies(combine['PoolQC']).astype(pool_area.dtype)

# Vectorised form of "replace every 1 in the row with that row's PoolArea":
# keep the 0 entries, take the row's pool area everywhere else.
pool = pool_flags.where(pool_flags == 0, pool_area, axis=0)

# NOTE(review): unlike the 'ext_' / 'bsmt_' columns built in other cells, these
# columns carry no prefix; left unchanged in case later cells rely on the names.
combine = pd.concat([combine, pool], axis=1).drop(['PoolArea', 'PoolQC'], axis=1)

In [211]:
# BsmtFinType1, BsmtFinType2: the two basement finish-type columns
# BsmtFinSF1: Type 1 finished square feet
# BsmtFinSF2: Type 2 finished square feet
# BsmtUnfSF: Unfinished square feet of basement area
# TotalBsmtSF: Total square feet of basement area
#
# Builds one 'bsmt_<type>' column per finish type, holding the square footage
# recorded for that type (Type 1 and Type 2 contributions added together).

# Type 1: put BsmtFinSF1 onto the BsmtFinType1 indicator columns. Entries that
# are 0 stay 0; set entries take the row's BsmtFinSF1. Unfinished area is not
# handled here, it is folded into the Type 2 side below.
type1_sf = combine['BsmtFinSF1']
type1_flags = pd.get_dummies(combine['BsmtFinType1']).astype(type1_sf.dtype)
bsmt1 = type1_flags.where(type1_flags == 0, type1_sf, axis=0)

# Type 2: where BsmtFinSF2 is 0, substitute BsmtUnfSF so the unfinished area
# can be put onto an indicator column as well. This is done on a local Series:
# assigning the two-column result of a row-wise apply back to the single
# BsmtFinSF2 column fails, and that column is dropped below anyway.
# NOTE(review): the substituted value lands on whichever BsmtFinType2 indicator
# is set for the row (presumably 'Unf' when BsmtFinSF2 is 0 - verify), and
# BsmtUnfSF is not carried over for rows whose BsmtFinSF2 is non-zero.
raw_sf2 = combine['BsmtFinSF2']
type2_sf = raw_sf2.mask(raw_sf2 == 0, combine['BsmtUnfSF'])
type2_flags = pd.get_dummies(combine['BsmtFinType2']).astype(type2_sf.dtype)
bsmt2 = type2_flags.where(type2_flags == 0, type2_sf, axis=0)

# Combine both sides per finish type; a value missing on one side counts as 0.
bsmt = bsmt1.add(bsmt2, fill_value=0).add_prefix('bsmt_')

# NOTE(review): BsmtFinType1, BsmtFinType2 and BsmtUnfSF are still left in
# `combine`, as before; confirm whether they should be dropped as well.
combine = pd.concat([combine, bsmt], axis=1).drop(['BsmtFinSF1', 'BsmtFinSF2'], axis=1)

In [None]:
#MasVnrType, MasVnrArea



In [None]:
#OpenPorchSF ( Open porch area in square feet)
#EnclosedPorch (Enclosed porch area in square feet)
#3SsnPorch (Three season porch area in square feet)
#ScreenPorch (Screen porch area in square feet)
# TODO: decide how to encode the porch area columns listed above