In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from scipy import stats
import os
import re
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.linear_model import LinearRegression

In [2]:
# Load the raw test split; later cells rebind and mutate `test` as cleaning proceeds.
test = pd.read_csv('datasets/test.csv')

In [3]:
# Show up to 100 columns when a DataFrame is rendered.
pd.options.display.max_columns = 100

In [4]:
# Column groups, using the raw (pre-rename) column names.

# Unordered categorical columns and identifiers.
nominal = [
    'Id', 'PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley',
    'Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1',
    'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
    'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating',
    'Central Air', 'Garage Type', 'Misc Feature', 'Sale Type',
]

# Ordered categorical columns.
ordinal = [
    'Lot Shape', 'Utilities', 'Land Slope', 'Overall Qual', 'Overall Cond',
    'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
    'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating QC', 'Electrical',
    'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Finish',
    'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence',
]

# Continuous numeric columns.
continuous = [
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1',
    'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF',
    '2nd Flr SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF',
    'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
    'Pool Area', 'Misc Val',
]

# Discrete numeric columns (counts, years, months).
discrete = [
    'Year Built', 'Year Remod/Add', 'Bsmt Full Bath', 'Bsmt Half Bath',
    'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
    'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars',
    'Mo Sold', 'Yr Sold',
]

In [5]:
# Collapse Alley to a binary flag: 1 for either listed surface type, 0 when missing.
alley_flag_by_type = {'Grvl': 1, 'Pave': 1, np.nan: 0}
test['Alley'] = test['Alley'].map(alley_flag_by_type)

In [6]:
# Fill missing garage types with the most frequent value found in this frame.
garage_type_mode = test['Garage Type'].mode()[0]
test['Garage Type'] = test['Garage Type'].replace(np.nan, garage_type_mode)


In [7]:
# Fill missing masonry veneer types with the most frequent value found in this frame.
mas_vnr_type_mode = test['Mas Vnr Type'].mode()[0]
test['Mas Vnr Type'] = test['Mas Vnr Type'].replace(np.nan, mas_vnr_type_mode)


In [8]:
# Preview only: the mode-filled Series is displayed but never assigned back,
# so `test` is left unchanged by this cell.
bsmt_qual = test[ordinal]['Bsmt Qual']
bsmt_qual.replace(np.nan, bsmt_qual.mode()[0])

0      Fa
1      Gd
2      Gd
3      TA
4      Gd
       ..
873    TA
874    TA
875    TA
876    TA
877    TA
Name: Bsmt Qual, Length: 878, dtype: object

In [9]:
# Missing values in the ordinal columns become the literal string 'NA'.
ordinal_filled = test[ordinal].replace(np.nan, 'NA')
test[ordinal] = ordinal_filled


In [10]:
# Impute missing continuous values from the 5 nearest rows, then rebuild a
# frame that carries test's own index so it can be assigned back by label.
imputer = KNNImputer(n_neighbors=5)
cont_transformed = pd.DataFrame(
    imputer.fit_transform(test[continuous]),
    columns=continuous,
    index=test.index,
)

In [11]:
# Overwrite the continuous columns with their KNN-imputed values.
test[continuous]=cont_transformed

In [12]:
# Impute missing discrete values with each column's most frequent value and
# rebuild a frame that carries test's own index.
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent', copy=False)
ont_transformed = pd.DataFrame(
    imp_mode.fit_transform(test[discrete]),
    columns=discrete,
    index=test.index,
)

In [13]:
# Overwrite the discrete columns with their mode-imputed values.
test[discrete]=ont_transformed

In [14]:
# Remove the two columns that are not carried into the model matrix.
test = test.drop(columns=['Misc Feature', 'Pool QC'])


In [15]:
# Keep only rows whose continuous and discrete values all lie within 3
# standard deviations of their column mean.
# NOTE(review): this removes rows from the test split (the recorded outputs
# show 878 rows before and 649 after) - confirm that predictions are not
# required for the dropped rows.
numeric_cols = continuous + discrete
abs_z = np.abs(stats.zscore(test[numeric_cols]))
within_three_sigma = (abs_z < 3).all(axis=1)
test = test[within_three_sigma]


In [16]:
# Ordinal columns still present in the frame ('Pool QC' is no longer listed).
ordinal = [
    'Lot Shape', 'Utilities', 'Land Slope', 'Overall Qual', 'Overall Cond',
    'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
    'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating QC', 'Electrical',
    'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Finish',
    'Garage Qual', 'Garage Cond', 'Paved Drive', 'Fence',
]

In [17]:
# Encode the ordinal string categories as integers, column by column.
# Values that are not listed for a column are left untouched by replace().
# Fix: 'IR3' was coded 2 (the same as 'IR1'); it is now 0 so that the codes
# follow Reg > IR1 > IR2 > IR3.
# NOTE(review): the training data must use the same codes - confirm the
# training notebook's 'Lot Shape' mapping matches.
test = test.replace({
    'Lot Shape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    'Utilities': {'AllPub': 1, 'NoSewr': 0},
    'Land Slope': {'Gtl': 0, 'Mod': 1, 'Sev': 2},
    'Exter Qual': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0},
    'Exter Cond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Bsmt Qual': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Bsmt Cond': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Bsmt Exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
    'BsmtFin Type 1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
    'BsmtFin Type 2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
    'Heating QC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'Electrical': {'SBrkr': 5, 'FuseA': 4, 'FuseF': 3, 'FuseP': 2, 'Mix': 1, 'NA': 0},
    'Kitchen Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'Functional': {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1},
    'Fireplace Qu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'Garage Finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0},
    'Garage Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'Garage Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    'Paved Drive': {'Y': 3, 'P': 2, 'N': 1},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0},
})

In [18]:
# Nominal columns that will be one-hot encoded (raw column names).
nominal = [
    'MS Zoning', 'Street', 'Land Contour', 'Lot Config', 'Neighborhood',
    'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style',
    'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation',
    'Heating', 'Central Air', 'Garage Type', 'Sale Type',
]

In [19]:
def convert_to_snake_case(df):
    """Rename the columns of ``df`` to snake_case, in place.

    Each lowercase-then-uppercase letter pair is split with a space, spaces
    become underscores, and the result is lowercased (for example
    'MS SubClass' becomes 'ms_sub_class').

    The frame is modified in place and the function returns None.
    """
    camel_boundary = re.compile(r"([a-z]{1})([A-Z]{1})")
    renamed = {}
    for original in df.columns:
        spaced = camel_boundary.sub(r"\1 \2", original)
        renamed[original] = spaced.replace(" ", "_").lower()
    # rename(..., inplace=True) yields None, which is what callers receive.
    return df.rename(columns=renamed, inplace=True)
# from B~


In [20]:
# Renames the columns of `test` in place; the call itself returns None.
convert_to_snake_case(test)

In [21]:
# Ordinal column names after the snake_case rename.
ordinal = [
    'lot_shape', 'utilities', 'land_slope', 'overall_qual', 'overall_cond',
    'exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
    'bsmt_fin_type_1', 'bsmt_fin_type_2', 'heating_qc', 'electrical',
    'kitchen_qual', 'functional', 'fireplace_qu', 'garage_finish',
    'garage_qual', 'garage_cond', 'paved_drive', 'fence',
]

In [22]:
# Nominal column names after the snake_case rename.
nominal = [
    'ms_zoning', 'street', 'land_contour', 'lot_config', 'neighborhood',
    'condition_1', 'condition_2', 'bldg_type', 'house_style', 'roof_style',
    'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type', 'foundation',
    'heating', 'central_air', 'garage_type', 'sale_type',
]

In [23]:
# Continuous column names after the snake_case rename.
continuous = [
    'lot_frontage', 'lot_area', 'mas_vnr_area', 'bsmt_fin_sf_1',
    'bsmt_fin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf', '1st_flr_sf',
    '2nd_flr_sf', 'gr_liv_area', 'garage_area', 'wood_deck_sf',
    'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch',
    'pool_area', 'misc_val',
]

# Discrete column names after the snake_case rename.
discrete = [
    'year_built', 'year_remod/add', 'bsmt_full_bath', 'bsmt_half_bath',
    'full_bath', 'half_bath', 'bedroom_abv_gr', 'kitchen_abv_gr',
    'tot_rms_abv_grd', 'fireplaces', 'garage_yr_blt', 'garage_cars',
    'mo_sold', 'yr_sold',
]

In [24]:
# Listed column names that are missing from `test` (expected: empty set).
# The names are compared directly; indexing test[...] with the lists first
# could only ever yield an empty set or raise KeyError.
set(ordinal + nominal + continuous + discrete) - set(test.columns)

set()

In [25]:
# Columns of `test` that belong to none of the four groups.
# The names are compared directly so a misspelt list entry cannot raise
# KeyError here before the difference is shown.
set(test.columns) - set(ordinal + nominal + continuous + discrete)

{'alley', 'id', 'low_qual_fin_sf', 'ms_sub_class', 'pid'}

In [27]:
# Total number of missing cells left in the frame (per-column counts, then summed).
test.isna().sum().sum()

0

In [28]:
# One-hot encode the remaining categorical columns, dropping the first level of each.
# NOTE(review): the levels come from this frame alone, so the dummy columns can
# differ from the training matrix; the cells below reconcile the two column sets.
test = pd.get_dummies(test, drop_first=True)

In [None]:
# Nominal columns that no longer exist under their own name after get_dummies.
# Names are compared against test.columns; selecting test[nominal] would raise
# KeyError for any nominal column that has been replaced by dummy columns.
set(nominal) - set(test.columns)

In [30]:
# (rows, columns) after outlier filtering and one-hot encoding.
test.shape

(649, 173)

In [33]:
# Load the previously saved training matrix; it is used below to align column sets with `test`.
Z_train = pd.read_csv('cleaned_datasets/Z_train.csv')


In [34]:
# Columns present in Z_train but absent from test.
set(Z_train.columns) - set(test.columns)

{'condition_2_Feedr', 'heating_Wall', 'sale_type_Con'}

In [37]:
# Drop the dummy columns that have no counterpart in test.
train_only_columns = ['condition_2_Feedr', 'heating_Wall', 'sale_type_Con']
Z_train = Z_train.drop(columns=train_only_columns)

In [35]:
# Columns present in test but absent from Z_train.
set(test.columns) - set(Z_train.columns)

{'condition_1_RRNe',
 'condition_1_RRNn',
 'exterior_1st_BrkComm',
 'exterior_1st_PreCast',
 'exterior_2nd_Brk Cmn',
 'exterior_2nd_Other',
 'exterior_2nd_PreCast',
 'heating_GasA',
 'house_style_2.5Fin',
 'mas_vnr_type_CBlock',
 'neighborhood_NPkVill',
 'roof_matl_WdShake',
 'roof_matl_WdShngl',
 'roof_style_Shed'}

In [38]:
# Drop every column of test that Z_train does not have. The list is derived
# from the two frames instead of being copied by hand from a displayed set
# difference, so it stays correct (and cannot raise KeyError) if the data changes.
test_only_columns = sorted(set(test.columns) - set(Z_train.columns))
test = test.drop(columns=test_only_columns)


In [36]:
# Load the previously saved Z_test matrix for the column-set comparisons below.
Z_test=pd.read_csv('cleaned_datasets/Z_test.csv')

In [45]:
# Remove from Z_test the same three dummy columns that are dropped from Z_train.
z_test_extra_columns = ['condition_2_Feedr', 'heating_Wall', 'sale_type_Con']
Z_test = Z_test.drop(columns=z_test_extra_columns)


In [39]:
# Columns in Z_train that Z_test lacks (expected: empty set).
set(Z_train.columns) - set(Z_test.columns)

set()

In [47]:
# Columns in Z_test that Z_train lacks (expected: empty set).
set(Z_test.columns) - set(Z_train.columns)

set()

In [46]:
# Columns in Z_test that test lacks (expected: empty set).
set(Z_test.columns) - set(test.columns)

set()

In [41]:
# Columns in test that Z_test lacks (expected: empty set).
set(test.columns) - set(Z_test.columns)

set()

In [43]:
# Final check: columns in Z_train that test lacks (expected: empty set).
set(Z_train.columns) - set(test.columns)

set()

In [44]:
# Final check: columns in test that Z_train lacks (expected: empty set).
set(test.columns) - set(Z_train.columns)

set()