## Cleaning Test Dataset
---

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw test set from disk and lift pandas' row-display limit so that
# per-column summaries further down are printed in full instead of truncated.
test_ = pd.read_csv(filepath_or_buffer='../data/test.csv')
pd.set_option('display.max_rows', None)

In [3]:
# Normalise the column names to lower snake_case: every space becomes an
# underscore and the result is lower-cased (e.g. 'Lot Area' -> 'lot_area').
# The former `.str.replace('Ms ', '')` step was dropped: it never matched any
# column (the names `ms_subclass` / `ms_zoning` survive the renaming and are
# used by later cells), so it was dead code that only obscured the intent.
# `regex=False` makes the space replacement a literal match on every pandas version.
test_.columns = test_.columns.str.replace(' ', '_', regex=False).str.lower()

In [4]:
# Number of missing values in each column of the raw test set
test_.isna().sum()

id                   0
pid                  0
ms_subclass          0
ms_zoning            0
lot_frontage       160
lot_area             0
street               0
alley              820
lot_shape            0
land_contour         0
utilities            0
lot_config           0
land_slope           0
neighborhood         0
condition_1          0
condition_2          0
bldg_type            0
house_style          0
overall_qual         0
overall_cond         0
year_built           0
year_remod/add       0
roof_style           0
roof_matl            0
exterior_1st         0
exterior_2nd         0
mas_vnr_type         1
mas_vnr_area         1
exter_qual           0
exter_cond           0
foundation           0
bsmt_qual           25
bsmt_cond           25
bsmt_exposure       25
bsmtfin_type_1      25
bsmtfin_sf_1         0
bsmtfin_type_2      25
bsmtfin_sf_2         0
bsmt_unf_sf          0
total_bsmt_sf        0
heating              0
heating_qc           0
central_air          0
electrical 

In [5]:
# Remove the columns judged not relevant for the model
col_deleted = [
    'lot_frontage',
    'alley',
    'fireplace_qu',
    'pool_qc',
    'fence',
    'misc_feature',
]
test_.drop(labels=col_deleted, axis='columns', inplace=True)

In [6]:
# Feature engineering and column pruning for the test set.
#
# The derived features are created first, in the same order as before, so they
# are appended to the frame in the same positions. Every column that is no
# longer needed is then removed with a single drop() instead of one call each.

# Total building square footage = basement area + above-ground living area
test_['total_bldg_sqft'] = test_['total_bsmt_sf'] + test_['gr_liv_area']

# Building age at the time of sale: bldg_age = yr_sold - year_built
test_['bldg_age'] = test_['yr_sold'] - test_['year_built']

# Total number of bathrooms; full and half baths each count as 1.
# Plain `+` is used on purpose so that a missing component yields NaN
# rather than being silently treated as 0.
test_['total_baths'] = (
    test_['bsmt_full_bath']
    + test_['bsmt_half_bath']
    + test_['full_bath']
    + test_['half_bath']
)

# Total porch area across all porch types
test_['total_porch'] = (
    test_['open_porch_sf']
    + test_['enclosed_porch']
    + test_['3ssn_porch']
    + test_['screen_porch']
)

# Encode central air conditioning as 1 (yes) / 0 (no)
test_['central_air'] = test_['central_air'].map({'Y': 1, 'N': 0})

# Null check on the derived features. The original cell evaluated
# `isnull().sum()` mid-cell, where the result is discarded; here the counts are
# reported, but only when something is actually missing.
derived_features = ['total_bldg_sqft', 'bldg_age', 'total_baths', 'total_porch']
derived_nulls = test_[derived_features].isnull().sum()
if derived_nulls.any():
    print('Nulls in derived features:')
    print(derived_nulls[derived_nulls > 0])

cols_to_drop = [
    # Unnecessary columns
    'street', 'pid', 'sale_type', 'pool_area', 'misc_val',
    # Components of gr_liv_area
    '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf',
    # Components of total_bsmt_sf
    'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf',
    # Inputs replaced by total_bldg_sqft
    'total_bsmt_sf', 'gr_liv_area',
    # Inputs replaced by bldg_age
    'yr_sold', 'year_built',
    # Inputs replaced by total_baths
    'bsmt_full_bath', 'bsmt_half_bath', 'full_bath', 'half_bath',
    # Inputs replaced by total_porch
    'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch',
    # Collinear or otherwise unnecessary columns
    'heating_qc', 'mo_sold', 'garage_yr_blt', 'garage_type',
    'electrical', 'bsmt_qual', 'year_remod/add',
]
test_ = test_.drop(columns=cols_to_drop)

test_.shape

(878, 48)

In [7]:
# Cast each remaining feature column to the dtype that matches its role.

# Categorical features -> str
# NOTE(review): astype(str) turns missing values into the literal string 'nan'
# (at least with the pandas version that produced the recorded outputs), so a
# later isnull() check no longer counts them for these columns - confirm this
# is intended.
col_tp_object = [
    'id', 'ms_subclass', 'ms_zoning', 'lot_shape', 'land_contour',
    'utilities', 'lot_config', 'land_slope', 'neighborhood', 'condition_1',
    'condition_2', 'bldg_type', 'house_style', 'roof_style', 'roof_matl',
    'exterior_1st', 'exterior_2nd', 'mas_vnr_type', 'exter_qual', 'exter_cond',
    'foundation', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2',
    'heating', 'kitchen_qual', 'functional', 'garage_finish', 'garage_qual',
    'garage_cond', 'paved_drive',
]

# Discrete features -> int
col_tp_disc = [
    'overall_cond', 'overall_qual', 'central_air',
    'bedroom_abvgr', 'kitchen_abvgr', 'totrms_abvgrd',
    'fireplaces', 'garage_cars', 'total_baths', 'bldg_age',
]

# Continuous features -> float
col_tp_continuos = [
    'lot_area', 'mas_vnr_area',
    'garage_area', 'wood_deck_sf',
    'total_bldg_sqft', 'total_porch',
]

# One column -> dtype mapping, applied in a single astype() call
dtype_by_column = {
    **dict.fromkeys(col_tp_object, str),
    **dict.fromkeys(col_tp_disc, int),
    **dict.fromkeys(col_tp_continuos, float),
}
test_ = test_.astype(dtype_by_column)

In [8]:
# Look at the masonry veneer areas in ascending order (missing values sort last)
test_.loc[:, 'mas_vnr_area'].sort_values()


0         0.0
499       0.0
500       0.0
502       0.0
503       0.0
504       0.0
506       0.0
507       0.0
497       0.0
508       0.0
510       0.0
512       0.0
513       0.0
514       0.0
517       0.0
520       0.0
521       0.0
509       0.0
522       0.0
496       0.0
491       0.0
466       0.0
468       0.0
469       0.0
470       0.0
472       0.0
475       0.0
476       0.0
492       0.0
478       0.0
480       0.0
482       0.0
483       0.0
484       0.0
485       0.0
489       0.0
490       0.0
479       0.0
465       0.0
523       0.0
527       0.0
560       0.0
561       0.0
562       0.0
564       0.0
565       0.0
566       0.0
568       0.0
559       0.0
569       0.0
571       0.0
572       0.0
573       0.0
575       0.0
579       0.0
580       0.0
583       0.0
570       0.0
526       0.0
558       0.0
555       0.0
528       0.0
529       0.0
530       0.0
531       0.0
534       0.0
535       0.0
536       0.0
557       0.0
537       0.0
539       0.0
541   

In [9]:
# Treat a missing masonry veneer area as 0
test_ = test_.assign(mas_vnr_area=test_['mas_vnr_area'].fillna(0))

In [10]:
# Missing values per column after cleaning
test_.isna().sum()

id                 0
ms_subclass        0
ms_zoning          0
lot_area           0
lot_shape          0
land_contour       0
utilities          0
lot_config         0
land_slope         0
neighborhood       0
condition_1        0
condition_2        0
bldg_type          0
house_style        0
overall_qual       0
overall_cond       0
roof_style         0
roof_matl          0
exterior_1st       0
exterior_2nd       0
mas_vnr_type       0
mas_vnr_area       0
exter_qual         0
exter_cond         0
foundation         0
bsmt_cond          0
bsmt_exposure      0
bsmtfin_type_1     0
bsmtfin_type_2     0
heating            0
central_air        0
bedroom_abvgr      0
kitchen_abvgr      0
kitchen_qual       0
totrms_abvgrd      0
functional         0
fireplaces         0
garage_finish      0
garage_cars        0
garage_area        0
garage_qual        0
garage_cond        0
paved_drive        0
wood_deck_sf       0
total_bldg_sqft    0
bldg_age           0
total_baths        0
total_porch  

In [11]:
# Take an independent snapshot of the cleaned frame and write it to disk.
# NOTE(review): the row index is written as well (index=True is the to_csv
# default), so the file starts with an extra unnamed column - verify that the
# notebooks reading this file expect it before changing this.
df_test_processed = test_.copy(deep=True)
df_test_processed.to_csv('../data/test_cleaned.csv', index=True)

___________