In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold

warnings.filterwarnings('ignore')

In [3]:
# Load train.csv from the raw-data folder (path is relative to the notebook's working directory).
df = pd.read_csv("../data/raw/train.csv")

In [4]:
# (rows, columns) of the frame as loaded.
df.shape

(1460, 81)

In [5]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [6]:
# Boolean Series: True for rows whose Id repeats an earlier row's Id
# (with the default keep='first', the first occurrence is not flagged).
dup = df.duplicated('Id')

In [8]:
# Number of rows flagged as duplicate Ids (True counts as 1).
dup.sum()

0

In [22]:
def return_meta(df):
    """
    Summarise missing values and dtypes for every column of a dataframe.

    Args:
        df: The dataframe to describe.

    Returns:
        meta_df: A dataframe indexed by the columns of ``df`` with three
            columns: ``nullity`` (count of missing values), ``null_pct``
            (that count as a percentage of the number of rows) and
            ``dtype`` (the column's dtype).
    """
    null_counts = df.isna().sum()
    row_count = len(df)

    # Build all three summary columns in one go; they share the index of
    # ``null_counts`` (the column names of ``df``).
    meta_df = pd.DataFrame(
        {
            'nullity': null_counts,
            'null_pct': null_counts * 100 / row_count,
            'dtype': df.dtypes,
        }
    )
    return meta_df

In [23]:
# Build the per-column summary (null count, null percentage, dtype) for df and display it.
meta = return_meta(df)
meta

Unnamed: 0,nullity,null_pct,dtype
Id,0,0.000000,int64
MSSubClass,0,0.000000,int64
MSZoning,0,0.000000,object
LotFrontage,259,17.739726,float64
LotArea,0,0.000000,int64
...,...,...,...
MoSold,0,0.000000,int64
YrSold,0,0.000000,int64
SaleType,0,0.000000,object
SaleCondition,0,0.000000,object


In [34]:
# Drop every column whose missing-value percentage (per `meta`) exceeds 80.
# The drop is non-mutating and uses errors='ignore' so the cell can be re-run:
# the previous in-place drop raised KeyError on a second execution because the
# labels in `to_drop` were already gone from `df`.
to_drop = meta[meta['null_pct'] > 80].index
df = df.drop(columns=to_drop, errors='ignore')
df.shape

(1460, 77)

In [35]:
# Print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [49]:
# Columns whose null percentage in `meta` is above 1, each mapped to the name of
# its companion indicator column "<column>_nan".
# NOTE: the variable names say "5pct"/"null5", but the cut-off applied here is 1 (percent).
ls_nan_5pct = meta.loc[meta['null_pct'] > 1].index.tolist()
ls_null5 = {source_col: source_col + '_nan' for source_col in ls_nan_5pct}

In [50]:
# For each mapped column still present in df, add a 0/1 indicator column that is
# 1 where the source value is missing. Columns listed in ls_null5 but no longer
# in df are skipped.
for source_col, flag_col in ls_null5.items():
    if source_col in df.columns:
        df[flag_col] = df[source_col].isna().astype('int64')

In [51]:
# Print the 0/1 distribution of every indicator column, in df column order.
flag_columns = set(ls_null5.values())
for column in df.columns:
    if column not in flag_columns:
        continue
    print(column)
    print("--------------")
    print(df[column].value_counts())
    print('\n\n')

LotFrontage_nan
--------------
0    1201
1     259
Name: LotFrontage_nan, dtype: int64



BsmtQual_nan
--------------
0    1423
1      37
Name: BsmtQual_nan, dtype: int64



BsmtCond_nan
--------------
0    1423
1      37
Name: BsmtCond_nan, dtype: int64



BsmtExposure_nan
--------------
0    1422
1      38
Name: BsmtExposure_nan, dtype: int64



BsmtFinType1_nan
--------------
0    1423
1      37
Name: BsmtFinType1_nan, dtype: int64



BsmtFinType2_nan
--------------
0    1422
1      38
Name: BsmtFinType2_nan, dtype: int64



FireplaceQu_nan
--------------
0    770
1    690
Name: FireplaceQu_nan, dtype: int64



GarageType_nan
--------------
0    1379
1      81
Name: GarageType_nan, dtype: int64



GarageYrBlt_nan
--------------
0    1379
1      81
Name: GarageYrBlt_nan, dtype: int64



GarageFinish_nan
--------------
0    1379
1      81
Name: GarageFinish_nan, dtype: int64



GarageQual_nan
--------------
0    1379
1      81
Name: GarageQual_nan, dtype: int64



GarageCond_nan
-----

In [52]:
# Boolean frame with the same shape as df: True where a value is present, False where it is missing.
df2 = df.notna()

In [53]:
# Fit a zero-threshold VarianceThreshold on the presence mask (df2), not on the
# raw feature values. A mask column has zero variance when every entry is the
# same (all present or all missing), so the columns that survive are the ones
# containing a mix of missing and non-missing values.
# VarianceThreshold is imported with the other imports at the top of the notebook.
col = df2.columns
var_rem = VarianceThreshold(threshold=0)
data = var_rem.fit_transform(df2)

In [56]:
# Compare shapes: selector output vs. the presence mask vs. the working frame.
data.shape, df2.shape, df.shape

((1460, 15), (1460, 106), (1460, 106))

In [67]:
# Show the variance the selector computed for each column of the presence mask.
for column_name, variance in zip(col, var_rem.variances_):
    print(column_name, variance)

Id 0.0
MSSubClass 0.0
MSZoning 0.0
LotFrontage 0.14592747232126102
LotArea 0.0
Street 0.0
LotShape 0.0
LandContour 0.0
Utilities 0.0
LotConfig 0.0
LandSlope 0.0
Neighborhood 0.0
Condition1 0.0
Condition2 0.0
BldgType 0.0
HouseStyle 0.0
OverallQual 0.0
OverallCond 0.0
YearBuilt 0.0
YearRemodAdd 0.0
RoofStyle 0.0
RoofMatl 0.0
Exterior1st 0.0
Exterior2nd 0.0
MasVnrType 0.005449427659973729
MasVnrArea 0.005449427659973729
ExterQual 0.0
ExterCond 0.0
Foundation 0.0
BsmtQual 0.02470022518296116
BsmtCond 0.02470022518296116
BsmtExposure 0.025349971852129848
BsmtFinType1 0.02470022518296116
BsmtFinSF1 0.0
BsmtFinType2 0.02534997185212985
BsmtFinSF2 0.0
BsmtUnfSF 0.0
TotalBsmtSF 0.0
Heating 0.0
HeatingQC 0.0
CentralAir 0.0
Electrical 0.0006844623756802397
1stFlrSF 0.0
2ndFlrSF 0.0
LowQualFinSF 0.0
GrLivArea 0.0
BsmtFullBath 0.0
BsmtHalfBath 0.0
FullBath 0.0
HalfBath 0.0
BedroomAbvGr 0.0
KitchenAbvGr 0.0
KitchenQual 0.0
TotRmsAbvGrd 0.0
Functional 0.0
Fireplaces 0.0
FireplaceQu 0.249249390129480

In [71]:
# Boolean mask over the fitted columns: True for the columns the selector keeps.
sel=var_rem.get_support()

In [73]:
# Names of the columns kept by the selector (mask applied to the column Index), then display them.
col_sel = col[sel]
col_sel

Index(['LotFrontage', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageQual', 'GarageCond'],
      dtype='object')