In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
# Load the raw data set from 'test.csv' (path is relative to the working directory).
df = pd.read_csv('test.csv')

In [3]:
# Preview the first rows of the raw frame to check that it loaded as expected.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [4]:
# Work on a copy so the raw frame `df` stays untouched by the preprocessing below.
df1 = df.copy()

In [5]:
# Remove the 'Id' column. The result is derived from the raw frame `df`
# instead of mutating df1 in place: an in-place drop raises KeyError when this
# cell is executed a second time, because 'Id' is already gone by then.
df1 = df.drop('Id', axis=1)

In [6]:
# Collect the numeric columns of df1.
# The dtype predicate is used rather than `dtype != 'O'`, because the latter is
# also true for categorical, datetime and dedicated string dtypes, which would
# then be imputed with a median further down. Note that is_numeric_dtype also
# counts boolean columns as numeric.
numerical_features = [
    column for column in df1.columns
    if pd.api.types.is_numeric_dtype(df1[column])
]

print(f'total {len(numerical_features)} numerical features')

total 36 numerical features


In [7]:
# Collect every non-numeric column of df1 as a categorical feature.
# Testing for "not numeric" instead of `dtype == 'O'` keeps text columns in
# this list even when pandas stores them with a dedicated string dtype rather
# than the generic object dtype.
categorical_features = [
    column for column in df1.columns
    if not pd.api.types.is_numeric_dtype(df1[column])
]

print(f'total {len(categorical_features)} categorical features')

total 43 categorical features


In [8]:
# Numeric columns whose name contains 'Yr' or 'Year'.
year_feature = [
    name for name in numerical_features
    if any(token in name for token in ('Yr', 'Year'))
]
len(year_feature)

4

In [9]:
# Numeric, non-year columns with fewer than 27 distinct values are treated as
# discrete features.
discrete_features = [
    name for name in numerical_features
    if name not in year_feature and df1[name].nunique() < 27
]
len(discrete_features)

17

In [10]:
# Whatever numeric column is neither discrete nor a year column is continuous.
continuous_features = [
    name for name in numerical_features
    if name not in discrete_features and name not in year_feature
]
len(continuous_features)

15

In [11]:
# Replace NaN in every categorical column with the sentinel label 'missing',
# so that an absent value becomes a category of its own.

df1[categorical_features] = df1[categorical_features].fillna(value='missing')

In [12]:
# Turn each categorical column into integer codes: the column is cast to the
# pandas 'category' dtype and then replaced by its category codes.
# NOTE(review): the codes are derived from the labels present in this frame
# only; if a model was fitted on codes built from another file, confirm that
# both mappings agree.

for feature in categorical_features:
    df1[feature] = df1[feature].astype('category').cat.codes

In [13]:
# Express every year column except 'YrSold' as the number of years between
# that event and the sale ('YrSold' minus the column).
# The subtraction reads from the raw frame `df`, not from df1, so the cell
# gives the same result however often it is run; subtracting from df1 itself
# would turn the ages back into years on a second execution.
for feature in year_feature:
    if feature != 'YrSold':
        df1[feature] = df['YrSold'] - df[feature]

In [14]:
# Number of missing entries in each year column.
df1[year_feature].isna().sum()

YearBuilt        0
YearRemodAdd     0
GarageYrBlt     78
YrSold           0
dtype: int64

In [15]:
# For every year column that still has gaps: record where the gaps were in a
# boolean '<column>_na' flag, then fill them with the column median.
for feature in year_feature:
    if not df1[feature].isnull().any():
        continue  # nothing to impute for this column
    df1[feature + '_na'] = df1[feature].isnull()
    df1[feature] = df1[feature].fillna(df1[feature].median())

In [16]:
# Number of missing entries in each discrete column.
df1[discrete_features].isna().sum()

MSSubClass      0
OverallQual     0
OverallCond     0
LowQualFinSF    0
BsmtFullBath    2
BsmtHalfBath    2
FullBath        0
HalfBath        0
BedroomAbvGr    0
KitchenAbvGr    0
TotRmsAbvGrd    0
Fireplaces      0
GarageCars      1
3SsnPorch       0
PoolArea        0
MiscVal         0
MoSold          0
dtype: int64

In [17]:

# Fill the gaps of every discrete column with that column's median.
# The filled column is assigned back to df1 explicitly: calling
# fillna(inplace=True) on df1[feature] acts on an intermediate object, which
# leaves df1 unchanged under pandas Copy-on-Write and triggers a chained
# assignment warning in pandas 2.x.
for feature in discrete_features:
    if df1[feature].isnull().any():
        df1[feature] = df1[feature].fillna(df1[feature].median())

In [18]:
# Number of missing entries in each continuous column.
df1[continuous_features].isna().sum()

LotFrontage      227
LotArea            0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
GrLivArea          0
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
ScreenPorch        0
dtype: int64

In [19]:
# Impute 'LotFrontage' and 'MasVnrArea': first store a 0/1 '<column>_na' flag
# marking the rows that were missing, then fill the gaps with the column median.
# The flag must be created before the fill, otherwise it would be all zeros.
# The filled column is assigned back explicitly; fillna(inplace=True) on
# df1[column] acts on an intermediate object, which leaves df1 unchanged under
# pandas Copy-on-Write and triggers a chained assignment warning in pandas 2.x.

for feature in ('LotFrontage', 'MasVnrArea'):
    df1[feature + '_na'] = np.where(df1[feature].isnull(), 1, 0)
    df1[feature] = df1[feature].fillna(df1[feature].median())

In [20]:
# Print the column dtypes and non-null counts to see what is still left to impute.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 82 columns):
MSSubClass        1459 non-null int64
MSZoning          1459 non-null int8
LotFrontage       1459 non-null float64
LotArea           1459 non-null int64
Street            1459 non-null int8
Alley             1459 non-null int8
LotShape          1459 non-null int8
LandContour       1459 non-null int8
Utilities         1459 non-null int8
LotConfig         1459 non-null int8
LandSlope         1459 non-null int8
Neighborhood      1459 non-null int8
Condition1        1459 non-null int8
Condition2        1459 non-null int8
BldgType          1459 non-null int8
HouseStyle        1459 non-null int8
OverallQual       1459 non-null int64
OverallCond       1459 non-null int64
YearBuilt         1459 non-null int64
YearRemodAdd      1459 non-null int64
RoofStyle         1459 non-null int8
RoofMatl          1459 non-null int8
Exterior1st       1459 non-null int8
Exterior2nd       1459 non-null i

In [21]:
# Fill the remaining gaps of every continuous column with that column's median.
# The filled column is assigned back to df1 explicitly: calling
# fillna(inplace=True) on df1[feature] acts on an intermediate object, which
# leaves df1 unchanged under pandas Copy-on-Write and triggers a chained
# assignment warning in pandas 2.x.
for feature in continuous_features:
    if df1[feature].isnull().any():
        df1[feature] = df1[feature].fillna(df1[feature].median())

In [22]:
# Print the column dtypes and non-null counts again to verify the result of the imputation.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 82 columns):
MSSubClass        1459 non-null int64
MSZoning          1459 non-null int8
LotFrontage       1459 non-null float64
LotArea           1459 non-null int64
Street            1459 non-null int8
Alley             1459 non-null int8
LotShape          1459 non-null int8
LandContour       1459 non-null int8
Utilities         1459 non-null int8
LotConfig         1459 non-null int8
LandSlope         1459 non-null int8
Neighborhood      1459 non-null int8
Condition1        1459 non-null int8
Condition2        1459 non-null int8
BldgType          1459 non-null int8
HouseStyle        1459 non-null int8
OverallQual       1459 non-null int64
OverallCond       1459 non-null int64
YearBuilt         1459 non-null int64
YearRemodAdd      1459 non-null int64
RoofStyle         1459 non-null int8
RoofMatl          1459 non-null int8
Exterior1st       1459 non-null int8
Exterior2nd       1459 non-null i

In [23]:
# Write the preprocessed frame to 'x_test1.csv'; index=False leaves the row index out of the file.
df1.to_csv('x_test1.csv',index=False)