# Index

In [1]:
import gc
gc.collect()

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Helper modules from my own toolbox package.
from my_toolbox import data_transform as dt
from my_toolbox import data_describe as dd
from my_toolbox import outlier 
from my_toolbox import corr_select as cs
from my_toolbox import missing_value as mv
from my_toolbox import stepwise

# Prints the documentation of the data_describe helper module.
# NOTE(review): looks like a leftover exploration aid — consider removing it.
help(dd)

Using TensorFlow backend.


In [2]:
# Base names (without extension) of the input CSV files, and their folder.
data_name = ['train', 'test', 'sample_submission']
path = '../data/'

In [3]:
# Read the three input tables and print each one's (rows, columns) shape.
csv_template = "{}{}.csv"
train_name, test_name, submission_name = data_name

df_train = pd.read_csv(csv_template.format(path, train_name), encoding='utf8')
print(df_train.shape)

df_test = pd.read_csv(csv_template.format(path, test_name), encoding='utf8')
print(df_test.shape)

df_submission = pd.read_csv(csv_template.format(path, submission_name), encoding='utf8')
print(df_submission.shape)

(1460, 81)
(1459, 80)
(1459, 2)


In [4]:
# Build one per-column summary table for each data set.
data_describe1 = dd.data_describe(df_train)
data_describe2 = dd.data_describe(df_test)

# Show only the training columns flagged as containing nulls.
has_null = data_describe1['is_null'] == True
data_describe1[has_null]

Unnamed: 0,duplicate,is_null,null_number,null_rate(%)
LotFrontage,111,True,259,17.74
Alley,3,True,1369,93.77
MasVnrType,5,True,8,0.55
MasVnrArea,328,True,8,0.55
BsmtQual,5,True,37,2.53
BsmtCond,5,True,37,2.53
BsmtExposure,5,True,38,2.6
BsmtFinType1,7,True,37,2.53
BsmtFinType2,7,True,38,2.6
Electrical,6,True,1,0.07


In [5]:
# Column groups used to label the type of every feature.

# Unordered (nominal) categorical features.
category = ['MSSubClass', 'Street', 'LandContour', 'LotConfig', 'Neighborhood', 'HouseStyle', 'Condition1', 'Condition2',
            'RoofStyle', 'RoofMatl', 'Foundation', 'Heating', 'CentralAir', 'PavedDrive', 'MoSold', 'SaleCondition',
            'YrSold', 'Electrical', 'MSZoning', 'MasVnrType', 'Functional', 'Exterior1st', 'Exterior2nd', 'SaleType',
            'GarageType','BldgType']

# Numeric features. This list is kept for reference only; it is not part of
# data_type_dict below.
numeric = ['SalePrice','LotArea', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath', 'HalfBath', 
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 
       '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal','LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'Fireplaces',
       'GarageCars', 'GarageArea']

# Ordered (ordinal) categorical features; every name is listed exactly once.
# NOTE(review): 'CentralAir' also appears in `category` — confirm which type
# dd.data_type should end up assigning to it.
category_with_order = ['BsmtCond', 'LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond',
                       'ExterQual', 'ExterCond', 'CentralAir', 'HeatingQC', 'BsmtQual', 'BsmtExposure',
                       'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                       'KitchenQual']

# Maps each type label to the columns that carry it.
data_type_dict = {'category':category, 'category_with_order':category_with_order}

# 'numeric' is the fallback type because df_test has no 'SalePrice' column,
# so the numeric columns are not enumerated per table.
default_type = 'numeric'

# Label the columns of both describe tables (train first, then test).
for describe_table in (data_describe1, data_describe2):
    dd.data_type(describe_table, default_type, data_type_dict)

--- 
# Table: Train
## 1. Category in order
* Normal:   
        LotShape, Utilities, LandSlope, OverallQual, OverallCond, ExterQual, ExterCond, CentralAir, HeatingQC, KitchenQual     
* NA is a category:  
        BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, FireplaceQu, GarageFinish, GarageQual, GarageCond,

In [None]:
# For these columns a null is a category of its own: store it as 'NA'.
tar_list = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
            'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond']
dt.NA_as_cat(df_train, tar_list, 'NA')

# Spot-check a handful of the converted columns for remaining nulls.
checked_cols = ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageCond', 'GarageQual']
df_train[checked_cols].isnull().any()

In [None]:
# Turn the ordered categories into numbers. Each entry pairs a group of
# columns with its level list, passed to the helper in the order written.
# (Presumably the helper scales the result to the 0-1 range — confirm in
# my_toolbox.data_transform.)
ordinal_specs = [
    (['ExterQual', 'ExterCond','HeatingQC', 'KitchenQual'], ['Po', 'Fa', 'TA', 'Gd', 'Ex']),
    (['BsmtQual','BsmtCond', 'GarageQual', 'GarageCond', 'FireplaceQu'],
     ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']),
    (['BsmtExposure'], ['NA', 'No', 'Mn', 'Av', 'Gd']),
    (['LotShape'], ['IR3', 'IR2', 'IR1', 'Reg']),
    (['Utilities'], ['ELO', 'NoSeWa', 'NoSewr', 'AllPub']),
    (['LandSlope'], ['Sev', 'Mod', 'Gtl']),
    (['OverallQual', 'OverallCond'], [1,2,3,4,5,6,7,8,9,10]),
    (['GarageFinish'], ['NA', 'Unf', 'RFn', 'Fin']),
    (['BsmtFinType1','BsmtFinType2'], ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']),
    (['CentralAir'], ['N', 'Y']),
]

for ordinal_cols, ordinal_levels in ordinal_specs:
    dt.order_cat_to_num(df_train, ordinal_cols, ordinal_levels)

[back to index](#Index)

## 2. Numeric
* Normal:   
        SalePrice, LotArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal    
* With null number:    
        LotFrontage, MasVnrArea, GarageYrBlt 
* Drop the columns that have too many null values:  
        Alley, PoolQC, Fence, MiscFeature
* The years span a wide range -> treat them as numeric:  
        YearBuilt, YearRemodAdd, GarageYrBlt

In [None]:
# Where FireplaceQu holds the code 0, force the fireplace count to 0.
# (0 is presumably the encoded 'NA' level — confirm against dt.order_cat_to_num.)
no_fireplace = df_train['FireplaceQu'] == 0
df_train.loc[no_fireplace, 'Fireplaces'] = 0

In [None]:
# Missing basement bathroom counts (half and full) are treated as 0.
df_train.fillna({'BsmtHalfBath': 0, 'BsmtFullBath': 0}, inplace=True)

In [None]:
# A missing 'Functional' value is replaced by 'Typ'.
df_train['Functional'] = df_train['Functional'].fillna('Typ')

In [None]:
# Houses without a basement: zero out the matching basement areas.
# The comparisons test for the finish-type code 0 (presumably the encoded
# 'NA' level — confirm against dt.order_cat_to_num).
no_finish1 = df_train['BsmtFinType1'] == 0
no_finish2 = df_train['BsmtFinType2'] == 0
no_basement = no_finish1 & no_finish2

# Finished area of each type is 0 when that finish type is absent.
df_train.loc[no_finish1, 'BsmtFinSF1'] = 0
df_train.loc[no_finish2, 'BsmtFinSF2'] = 0

# Total and unfinished areas are 0 only when both finish types are absent.
# (BsmtFinSF1 is already 0 on those rows because of the first assignment.)
df_train.loc[no_basement, ['TotalBsmtSF', 'BsmtUnfSF']] = 0

In [None]:
# Drop the columns that have too many null values.
mostly_null_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
df_train.drop(mostly_null_cols, axis=1, inplace=True)
df_train.shape    # check

In [None]:
# These columns contain many zeros; putting them through Tukey_test for
# outlier handling would cause problems, so they are only previewed here.
many_zero_cols = ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr',
                  'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
print(df_train[many_zero_cols].head())

In [None]:
# Outlier handling for the continuous columns; each column is listed once.
tar_list = ['SalePrice', 'LotArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
            '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath',
            'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
            'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
            'LotFrontage', 'MasVnrArea', 'GarageYrBlt',
            'YearBuilt', 'YearRemodAdd']

df_train_tmp = outlier.Tukey_test(df_train, tar_list, k=3)

# Write the columns returned by Tukey_test back into the training table.
df_train[list(df_train_tmp.columns)] = df_train_tmp

# Requested columns that Tukey_test did not return are removed from df_train.
drop_columns = list(set(tar_list).difference(set(df_train_tmp.columns)))
df_train.drop(drop_columns, inplace=True, axis=1)

print(df_train.shape)

# Check whether any of the processed columns still holds string values.
for col in tar_list:
    if col not in df_train.columns:
        # Removed just above, so there is nothing left to inspect.
        continue
    # isinstance is required here: comparing type(value) with the text 'str'
    # is always False and would hide every string.
    is_str = df_train[col].map(lambda value: isinstance(value, str))
    for row in is_str[is_str].index:
        print(col, row)
    print('"{}" {} numbers:{}'.format(col, 'str', int(is_str.sum())))


In [None]:
# Fill the missing values of these numeric columns with the KNN helper
# (5 is passed as its second argument).
knn_cols = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
knn_filled = mv.fill_knn(df_train[knn_cols], 5)
df_train[knn_cols] = knn_filled

# Confirm whether any missing value is left in them.
df_train[knn_cols].isnull().any()

In [None]:
# Columns with many zeros.
many_zero_cols = ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr',
                  'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']

# Ordered categories (converted to numbers earlier in the notebook).
ordered_cat_cols = ['LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond',
                    'ExterQual', 'ExterCond', 'CentralAir', 'HeatingQC', 'KitchenQual',
                    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond']

# Build the column list without growing tar_list in place, so re-running the
# cell gives the same list. Duplicates and columns that are no longer in
# df_train are skipped.
corr_cols = []
for col in tar_list + many_zero_cols + ordered_cat_cols:
    if col in df_train.columns and col not in corr_cols:
        corr_cols.append(col)
tar_list = corr_cols

# Spearman correlation table of the selected columns.
corr_table = df_train[tar_list].corr(method = 'spearman')

# Heatmap of the correlation table on a 20 x 20 inch figure.
fig = plt.figure()
fig.set_figheight(20)
fig.set_figwidth(20)

g=sns.heatmap(corr_table, vmax=1, linewidths=0.01,
            square=True,cmap='YlGnBu',linecolor="white")
plt.title('Correlation between numeric features')

In [None]:
# Collect the features whose absolute Spearman correlation with SalePrice
# is below `level`.
level = 0.2
sale_price = df_train['SalePrice']
drop_list = [
    col for col in tar_list
    if abs(sale_price.corr(df_train[col], method = 'spearman')) < level
]
print('correlation under %.2f.\n'%level,drop_list)

In [None]:

# Look up the summary rows of two categorical columns.
inspected_cols = ['BldgType', 'HouseStyle']
data_describe1.loc[inspected_cols]

[back to index](#Index)

## 3. Category
* one-hot-encoding(24):  
        MSSubClass, MSZoning, Street, LandContour, LotConfig, Neighborhood, Condition1, Condition2, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, Foundation, Heating, CentralAir (only 2 classes), PavedDrive, MoSold, SaleType, SaleCondition, YrSold, HouseStyle  
* Has null number(3):    
        MasVnrType, Electrical (FuseA, FuseF, FuseP are ordered; SBrkr, Mix are unordered; should be split), Functional   
* NA is a category(1): 
        GarageType   

In [None]:
# MSSubClass and MoSold are numeric codes that this notebook lists among the
# categorical columns, so store them as strings. The result has to be assigned
# back: Series.apply returns a new Series and leaves df_train unchanged.
df_train['MSSubClass'] = df_train['MSSubClass'].apply(str)
df_train['MoSold'] = df_train['MoSold'].apply(str)

In [None]:
# 'GarageType': its null entries (76 rows according to the original note —
# TODO confirm the count for this table) are treated as a category of their
# own, filled with 'NA'.
dt.NA_as_cat(df_train, ['GarageType'], fill_with='NA')

### check category features' scatter with SalePrice

In [None]:
# List the columns typed 'category' whose 'duplicate' value exceeds 10.
# [YearBuilt, YearRemodAdd, GarageYrBlt] -> these are treated as numeric.
many_levels = data_describe1['duplicate'] > 10
is_category = data_describe1['type'] == 'category'
data_describe1[many_levels & is_category]

In [None]:
# Only a few MasVnrType values are missing, so fill them with the mode.
# (To inspect the column: dt.cat_plot(df_train, 'MasVnrType', 'SalePrice').)
masvnrtype_mode = df_train['MasVnrType'].mode()[0]
df_train['MasVnrType'] = df_train['MasVnrType'].fillna(masvnrtype_mode)

In [None]:
# Most of the data falls into 'SBrkr'; dropping this column may be worth
# considering. (To inspect: dt.cat_plot(df_train, 'Electrical', 'SalePrice').)
# Only a few values are missing, so fill them with the mode.
electrical_mode = df_train['Electrical'].mode()[0]
df_train['Electrical'] = df_train['Electrical'].fillna(electrical_mode)

In [None]:
# Most of the data falls into 'Typ' and the relationship with SalePrice is
# unclear, so the column is dropped.
# (To inspect: dt.cat_plot(df_train, 'Functional', 'SalePrice').)
df_train.drop('Functional', axis=1, inplace=True)

In [None]:
# Most of the data falls into 'Pave', so the column is dropped.
# (To inspect: dt.cat_plot(df_train, 'Street', 'SalePrice').)
df_train.drop('Street', axis=1, inplace=True)

In [None]:
# The scatter against SalePrice is unclear and most rows fall into 'Y',
# so the column is dropped.
# (To inspect: dt.cat_plot(df_train, 'PavedDrive', 'SalePrice').)
df_train.drop('PavedDrive', axis=1, inplace=True)

In [None]:
# The scatter between the sale year and SalePrice is unclear, so the column
# is dropped. (To inspect: dt.cat_plot(df_train, 'YrSold', 'SalePrice').)
df_train.drop('YrSold', axis=1, inplace=True)

In [None]:
# The scatter between LandContour and SalePrice is unclear, so the column
# is dropped. (To inspect: dt.cat_plot(df_train, 'LandContour', 'SalePrice').)
df_train.drop('LandContour', axis=1, inplace=True)

In [None]:
# The scatter between LotConfig and SalePrice is unclear, so the column
# is dropped. (To inspect: dt.cat_plot(df_train, 'LotConfig', 'SalePrice').)
df_train.drop('LotConfig', axis=1, inplace=True)

In [None]:
# The scatter between RoofStyle and SalePrice is unclear, so the column
# is dropped. (To inspect: dt.cat_plot(df_train, 'RoofStyle', 'SalePrice').)
df_train.drop('RoofStyle', axis=1, inplace=True)

In [None]:
# Show the table size and remaining columns, then report whether any null
# value is left in df_train.
print(df_train.shape)
print(df_train.columns)
columns_with_null = df_train.isnull().any()
columns_with_null.any()

[back to index](#Index)

## Undecided whether to drop or not

[back to index](#Index)

In [None]:
# State before encoding: table shape and whether any null remains.
print(df_train.shape, df_train.isnull().any().any())

# One-hot encode the columns typed 'category' that are still in df_train.
category_cols = data_describe1[data_describe1['type']=='category'].index
tar_col = set(category_cols) & set(df_train.columns)
df_train = dt.one_hot_encoding(df_train, tar_col, False)

# State after encoding.
print(df_train.shape, df_train.isnull().any().any())
print(df_train.columns)

---

---
# Table: Test

## 1. Category in order
* Normal:   
       LotShape, Utilities, LandSlope, OverallQual, OverallCond, ExterQual, ExterCond, CentralAir, HeatingQC     

* NA is a category:  
        BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, FireplaceQu, GarageFinish, GarageQual, GarageCond, KitchenQual
          
* has null number:   
        Utilities,

In [None]:
# For these columns a null is a category of its own: store it as 'NA'.
tar_list = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
]
dt.NA_as_cat(df_test, tar_list, 'NA')

In [None]:
# 'Utilities' has only two null values: fill them with the mode.
utilities_mode = df_test['Utilities'].mode()[0]
df_test['Utilities'] = df_test['Utilities'].fillna(utilities_mode)

In [None]:
# Turn the ordered categories into numbers. Each entry pairs a group of
# columns with its level list, passed to the helper in the order written.
# (Presumably the helper scales the result to the 0-1 range — confirm in
# my_toolbox.data_transform.)
ordinal_specs = [
    (['ExterQual', 'ExterCond','HeatingQC', 'KitchenQual'], ['Po', 'Fa', 'TA', 'Gd', 'Ex']),
    (['BsmtQual','BsmtCond', 'GarageQual', 'GarageCond', 'FireplaceQu'],
     ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']),
    (['BsmtExposure'], ['NA', 'No', 'Mn', 'Av', 'Gd']),
    (['LotShape'], ['IR3', 'IR2', 'IR1', 'Reg']),
    (['Utilities'], ['ELO', 'NoSeWa', 'NoSewr', 'AllPub']),
    (['LandSlope'], ['Sev', 'Mod', 'Gtl']),
    (['OverallQual', 'OverallCond'], [1,2,3,4,5,6,7,8,9,10]),
    (['GarageFinish'], ['NA', 'Unf', 'RFn', 'Fin']),
    (['BsmtFinType1','BsmtFinType2'], ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']),
    (['CentralAir'], ['N', 'Y']),
]

for ordinal_cols, ordinal_levels in ordinal_specs:
    dt.order_cat_to_num(df_test, ordinal_cols, ordinal_levels)

## 2. Numeric
* Normal:   
        SalePrice, LotArea, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal    
* has null numbers:    
        LotFrontage, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath, BsmtHalfBath, Fireplaces, GarageCars, GarageArea
* Drop the columns that have too many null values:  
        Alley, PoolQC, Fence, MiscFeature
* The years span a wide range -> treat them as numeric:  
        YearBuilt, YearRemodAdd, GarageYrBlt

In [None]:
# Where FireplaceQu holds the code 0, force the fireplace count to 0.
# (0 is presumably the encoded 'NA' level — confirm against dt.order_cat_to_num.)
no_fireplace = df_test['FireplaceQu'] == 0
df_test.loc[no_fireplace, 'Fireplaces'] = 0

In [None]:
# Missing basement bathroom counts (half and full) are treated as 0.
df_test.fillna({'BsmtHalfBath': 0, 'BsmtFullBath': 0}, inplace=True)

In [None]:
# Houses without a basement: zero out the matching basement areas.
# The comparisons test for the finish-type code 0 (presumably the encoded
# 'NA' level — confirm against dt.order_cat_to_num).
no_finish1 = df_test['BsmtFinType1'] == 0
no_finish2 = df_test['BsmtFinType2'] == 0
no_basement = no_finish1 & no_finish2

# Finished area of each type is 0 when that finish type is absent.
df_test.loc[no_finish1, 'BsmtFinSF1'] = 0
df_test.loc[no_finish2, 'BsmtFinSF2'] = 0

# Total and unfinished areas are 0 only when both finish types are absent.
# (BsmtFinSF1 is already 0 on those rows because of the first assignment.)
df_test.loc[no_basement, ['TotalBsmtSF', 'BsmtUnfSF']] = 0

In [None]:
# A missing 'Functional' value is replaced by 'Typ'.
df_test['Functional'] = df_test['Functional'].fillna('Typ')

In [None]:
# Drop the columns that have too many null values; the shape is shown
# before and after as a check.
print(df_test.shape)
mostly_null_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
df_test.drop(mostly_null_cols, axis=1, inplace=True)
df_test.shape    # check

In [None]:
# These columns contain many zeros; putting them through Tukey_test for
# outlier handling would cause problems, so they are only previewed here.
many_zero_cols = ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr',
                  'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
print(df_test[many_zero_cols].head())

In [None]:
# Outlier handling for the continuous columns; each column is listed once.
tar_list = ['LotArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
            '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath',
            'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
            'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
            'LotFrontage', 'MasVnrArea', 'GarageYrBlt',
            'YearBuilt', 'YearRemodAdd']

df_test_tmp = outlier.Tukey_test(df_test, tar_list, k=3)

# Write the columns returned by Tukey_test back into the test table.
# No column is dropped from df_test in this step.
df_test[list(df_test_tmp.columns)] = df_test_tmp

print(df_test.shape)

# Check whether any of the processed columns still holds string values.
for col in tar_list:
    if col not in df_test.columns:
        continue
    # isinstance is required here: comparing type(value) with the text 'str'
    # is always False and would hide every string.
    is_str = df_test[col].map(lambda value: isinstance(value, str))
    for row in is_str[is_str].index:
        print(col, row)
    print('"{}" {} numbers:{}'.format(col, 'str', int(is_str.sum())))

In [None]:
# Fill the missing values of the numeric columns below with the KNN helper
# (5 is passed as its second argument).
null_list = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'Fireplaces', 'GarageCars',
    'GarageArea', 'GarageYrBlt',
]
knn_filled = mv.fill_knn(df_test[null_list], 5)
df_test[null_list] = knn_filled

# Confirm whether any missing value is left in them.
df_test[null_list].isnull().any()

## 3. Category
* one-hot-encoding:  
        MSSubClass,  Street, LandContour, LotConfig, Neighborhood, Condition1, Condition2, RoofStyle, RoofMatl,  Foundation, Heating, CentralAir (only 2 classes), PavedDrive, MoSold, SaleCondition, YrSold, Electrical (FuseA, FuseF, FuseP are ordered; SBrkr, Mix are unordered; should be split), HouseStyle  
        
* Has null values:    
        MSZoning, MasVnrType, Functional, Exterior1st, Exterior2nd, SaleType  

* NA is a category: 
        GarageType 

In [None]:
# MSSubClass and MoSold are numeric codes that this notebook lists among the
# categorical columns, so store them as strings. The result has to be assigned
# back: Series.apply returns a new Series and leaves df_test unchanged.
df_test['MSSubClass'] = df_test['MSSubClass'].apply(str)
df_test['MoSold'] = df_test['MoSold'].apply(str)

In [None]:
# A null GarageType means the house has no garage, so it is stored as 'NA'.
# The null count is printed before and after the fill.
garage_nulls_before = df_test['GarageType'].isnull().sum()
print(garage_nulls_before)

dt.NA_as_cat(df_test, ['GarageType'], fill_with='NA')

garage_nulls_after = df_test['GarageType'].isnull().sum()
print(garage_nulls_after)

In [None]:
# SaleType has a single null value; record it as 'Oth' (other).
df_test['SaleType'] = df_test['SaleType'].fillna('Oth')

In [None]:
# MSZoning (4 nulls per the original note): fill with the mode.
print(df_test['MSZoning'].isnull().sum())
df_test.fillna({'MSZoning': df_test['MSZoning'].mode()[0]}, inplace=True)

# MasVnrType (16 nulls per the original note): fill with the mode.
print(df_test['MasVnrType'].isnull().sum())
df_test.fillna({'MasVnrType': df_test['MasVnrType'].mode()[0]}, inplace=True)

# MasVnrArea: set to 0 where MasVnrType is 'None'; any null that is still
# left afterwards is filled with the mode.
df_test['MasVnrArea'] = np.where(df_test['MasVnrType'] == 'None', 0, df_test['MasVnrArea'])
# Report the nulls of the column being filled (MasVnrArea, not MasVnrType).
print(df_test['MasVnrArea'].isnull().sum())
df_test.fillna({'MasVnrArea': df_test['MasVnrArea'].mode()[0]}, inplace=True)

In [None]:
# Functional (2 nulls per the original note): print the null count, then
# replace missing values with 'Typ'.
functional_nulls = df_test['Functional'].isnull().sum()
print(functional_nulls)
df_test['Functional'] = df_test['Functional'].fillna('Typ')

In [None]:
# For each column, in this order: print its null count, then fill the nulls
# with the column's mode.
mode_filled_cols = ['Exterior1st', 'Exterior2nd', 'Utilities', 'KitchenQual']
for mode_col in mode_filled_cols:
    print(df_test[mode_col].isnull().sum())
    df_test.fillna({mode_col: df_test[mode_col].mode()[0]}, inplace=True)

In [None]:
# Show the table size and remaining columns, then report whether any null
# value is left in df_test.
print(df_test.shape)
print(df_test.columns)
columns_with_null = df_test.isnull().any()
columns_with_null.any()

In [None]:
# One-hot encode the columns typed 'category' that are still in df_test.
category_cols = data_describe2[data_describe2['type']=='category'].index
tar_col = set(category_cols) & set(df_test.columns)
df_test = dt.one_hot_encoding(df_test, tar_col, False)

# State after encoding: shape, whether any null remains, and the columns.
print(df_test.shape, df_test.isnull().any().any())
print(df_test.columns)

In [None]:
# Feature columns of the training table (the target 'SalePrice' is excluded).
train_col = set(df_train.columns)
train_col.remove('SalePrice')

test_col = set(df_test.columns)

# Features present in both tables. Sorting gives a fixed column order;
# converting the set intersection straight to a list would follow string
# hash order, which can differ from one interpreter run to the next.
res = sorted(train_col.intersection(test_col))

print('train length',len(train_col), df_train.isnull().any().any())
print('test length',len(test_col), df_test.isnull().any().any())
print('result length',len(res))

In [None]:
# Keep only the shared feature columns; the training table also keeps the
# target. `res` itself is left untouched so this cell can be re-run: appending
# 'SalePrice' to it would make df_test[res] fail on the next run.
df_test_res = df_test[res]
df_train_res = df_train[res + ['SalePrice']]

print(df_train_res.shape, df_train_res.isnull().any().any())
print(df_test_res.shape, df_test_res.isnull().any().any())

In [None]:
# Names of the columns in the final training table that still contain nulls.
null_column_mask = df_train_res.isnull().any().values
df_train_res.columns[null_column_mask]

In [None]:
# Write the prepared tables to disk (UTF-8, without the index column).
export_targets = (
    (df_train_res, '../data/df_train.csv'),
    (df_test_res, '../data/df_test.csv'),
)
for result_frame, csv_path in export_targets:
    result_frame.to_csv(csv_path, encoding='utf8', index=False)