In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_info_columns", 300)


In [2]:
def load_test_train_data() :

    print("\n\n*****\nLoading test and train data....")
    df_train = pd.read_csv("../data/raw/train.csv",index_col=0)
    df_test = pd.read_csv("../data/raw/test.csv", index_col=0)
    df_train["data_type"] = "train"
    df_test["data_type"] = "test"
    df_test["SalePrice"] = 0

    df = pd.concat([df_train, df_test], sort=False)
#     df = df.reset_index(drop=True)

    print("Train Data Shape :", df_train.shape)
    print("Test Data Shape : ", df_test.shape)
    print("Combined Data Shape : ", df.shape)

    return (df_train, df_test, df)


In [3]:
def write_processed_files(df):
    
    print("\n\n*****\nGenerating processed data files...")

    train = df.loc[df.data_type_train == 1]
    test = df.loc[df.data_type_test == 1]    

    train = train.drop("data_type_train", axis=1)
    train = train.drop("data_type_test", axis=1)    
    test = test.drop("data_type_train", axis=1)
    test = test.drop("data_type_test", axis=1)
    test = test.drop("SalePrice", axis=1)
    
    df.to_csv("../data/processed_data/full_processed.csv")
    train.to_csv("../data/processed_data/train_processed.csv")
    test.to_csv("../data/processed_data/test_processed.csv")

    return df



In [4]:
def get_column_names_by_type(df):
    numeric_columns = df.describe().columns
    non_numeric_columns = [i for i in df.columns if i not in numeric_columns]
    return (numeric_columns, non_numeric_columns)
    

In [5]:
#missing value counter
def na_counter(data):
    coln = list(data.columns)

    if data.isnull().any().sum() == 0:
        return pd.DataFrame(columns=['NA Count',"PERCENT"])
    
    na_dict ={}
    for col in coln:
        if data[col].isnull().any()==True:
            na_count = data[col].isnull().sum()
            na_percent = data[col].isnull().sum()/len(data)
            na_dict[col]=[na_count, na_percent]
    df_na =pd.DataFrame(na_dict).T
    df_na.columns = ['NA Count',"PERCENT"]
    df_na.index.name = "Column"
    return df_na

In [6]:
def impute_null_fields(df):
    
    print("\n\n*****\nImputing Null values....")
    print("Null counts before imputing:")
    print(na_counter(df))
    

    """ ?? """
    df["MSZoning"] = df["MSZoning"].fillna("RM")
    df["LotFrontage"] = df["LotFrontage"].fillna(df.groupby("Neighborhood")["LotFrontage"].transform("mean"))
    df["MasVnrType"] = df["MasVnrType"].fillna("None") 
    df["MasVnrArea"] = df["MasVnrArea"].fillna(0.0) 
    df["BsmtQual"] = df["BsmtQual"].fillna("None")
    df["BsmtCond"] = df["BsmtCond"].fillna("None")
    df["BsmtExposure"] = df["BsmtExposure"].fillna("NoBsmt")
    df["BsmtFinType1"] = df["BsmtFinType1"].fillna("NoBsmt")
    df["BsmtFinType2"] = df["BsmtFinType2"].fillna("NoBsmt")
    df['Exterior1st'] = df['Exterior1st'].fillna("None") 
    df['Exterior2nd'] = df['Exterior2nd'].fillna("None")
    df['BsmtFinSF1'] = df['BsmtFinSF1'].fillna(0) 
    df['BsmtFinSF2'] = df['BsmtFinSF2'].fillna(0) 
    df['BsmtUnfSF'] = df['BsmtUnfSF'].fillna(0) 
    df['TotalBsmtSF'] = df['TotalBsmtSF'].fillna(0) 
    

    """ Only one row missing the value, impute with "SBrkr" which is the most used """
    df.Electrical = df.Electrical.fillna("SBrkr")

    """ Impute with equavelant of overall house quality : Average, equivalent "TA" """
    df.KitchenQual = df.KitchenQual.fillna("None")

    df.FireplaceQu = df.FireplaceQu.fillna("None")
    
    """ First change Garage Type for row with Null area and then change Garage Area """
    df.loc[df.GarageArea.isnull(), "GarageType"] = np.nan
    df.loc[df.GarageArea.isnull(), "GarageArea"] = 0
    
        

    """ For one record with Garage Area 360 and with missing Garage values, set to average values based on Overall Quality """
    df.loc[(df.GarageArea==360) & df.GarageFinish.isnull(), "GarageFinish"] = "Unf"
    df.loc[(df.GarageArea==360) & df.GarageQual.isnull(), "GarageQual"] = "TA"
    df.loc[(df.GarageArea==360) & df.GarageCond.isnull(), "GarageCond"] = "TA"
    """ Setting the Garage built to "Year Remodelled" """
    df.loc[(df.GarageArea==360) & df.GarageYrBlt.isnull(), "GarageYrBlt"] = df.YearRemodAdd


    
    """ Set rest of the missing Garage valies to None and GarageCars to zero"""
    df.GarageFinish = df.GarageFinish.fillna("None")
    df.GarageQual = df.GarageQual.fillna("None")
    df.GarageCond = df.GarageCond.fillna("None")
    df.GarageType = df.GarageType.fillna("None")

    df.GarageCars = df.GarageCars.fillna(0)


    """ Is it correct to fill with Zero for Year built as the scale will change compared to most values in 1900""" 
    df.GarageYrBlt = df.GarageYrBlt.fillna(0)


    """ Impiute with Typ as mentioned in documentation. There does not seem to be clear relation with
        Overall quality fields
    """
    df.loc[df.Functional.isnull(), "Functional"] = "Typ"


    """Basement Fullbath and Halfbath NA values are related to  houses without basements. Soe set them to 0 """
    df.BsmtFullBath = df.BsmtFullBath.fillna(0)
    df.BsmtHalfBath = df.BsmtHalfBath.fillna(0)
    
    
    """ Other """
    df['SaleType'] = df['SaleType'].fillna("Oth")
    df['GarageQual'] = df['GarageQual'].fillna("None")
    
    df["Alley"] = df["Alley"].fillna("None")
    df["Fence"] = df["Fence"].fillna("None")

    print("\n\n\nNull counts after imputing:")
    print(na_counter(df))

    return df

In [7]:
def correct_data(df):
    
    print("\n\n*****\nCorrecting incorrect data values...")

    """Seems the year is 2207 is a typo, setting to the year remodeled"""
    df.loc[df.GarageYrBlt==2207, "GarageYrBlt"] = 2007
    return df

In [8]:

def transform_ordinal(df):
    typical_graded_cols = ["HeatingQC", "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "ExterQual",
                          "ExterCond", "BsmtQual", "BsmtCond"]
    typical_grade_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'None':0}

    for i in typical_graded_cols:
        df[i].replace(typical_grade_map, inplace=True)
    
    df['Functional'].replace({'Typ':7,'Min1':6,'Min2':5,'Mod':4,'Maj1':3,'Maj2':2,'Sev':1,'Sal':0}, inplace=True)
    df['GarageFinish'].replace({'Fin':3,'RFn':2,'Unf':1,'None':0}, inplace=True)
    
    return df

In [9]:
def drop_columns(df):
    
    print("\n\n*****\nDropping selective columns...")
    columsn_to_drop = ["Street","Utilities","Condition2", "LowQualFinSF","GarageYrBlt",
                       "PoolQC","MiscFeature", "MiscVal"]

    columsn_to_drop = columsn_to_drop + ["MoSold", "YrSold"]
    
    print("\n\n*****\nNo of columns before dropping : ", len(df.columns))
    print("No of columns to drop : ", len(columsn_to_drop))
    
    df.drop(columsn_to_drop, axis=1, inplace=True)
    print("No of columns after dropping : ", len(df.columns))

    return df

In [10]:
def value_counts(data, kind="non-numeric", threshold = 15): 
    
    continuous_col = data.describe().columns
    categorical_col =data.drop(continuous_col, axis=1).columns
    
    if (kind=="non-numeric"):
        columns = categorical_col
    elif (kind == "numeric"):
        columns = continuous_col
    else :
        columns = data.columns
    
    for col in (columns):
        if (len(data[col].value_counts())<= threshold):
            print(data[col].value_counts())
            print("")

In [11]:
def print_data_stats(df):
    print("\n\n*****\nPrinting stats...")
    print("No of rows : ", df.shape[0])
    print("No of columns : ", df.shape[1])
    
    x,y = get_column_names_by_type(df)
    print("Numeric columns : ", len(x))
    print("Non Numeric columns : ", len(y))
    
    print("\n\nNull counts :")
    print(na_counter(df))
    
    return df

In [12]:
def change_column_type(df):
    
    print("\n\n*****\n Changing some columns to category ...")
    df.MSSubClass = df.MSSubClass.astype("category")    
    return df

In [13]:
def standardize_features(df):

    print("\n\n*****\n Standardizing numeric columns ...")
    sclr = StandardScaler()
    num_cols, cat_cols = get_column_names_by_type(df)
    num_cols = num_cols.drop("SalePrice")
    
    
    df2 = sclr.fit_transform(df[num_cols])
    df.loc[:,num_cols] = df2

    return df

In [14]:
def feature_transformation(df):
    
        print("\n\n*****\n Transforming features ...")


        """ Changing Columns to Binary where majority values are one specific value and rest all had 
        similar pattern against SalePrice
        """
        df["Electrical"] = np.where(df["Electrical"].str.contains("SBrkr"), "1", "0")
        df["Heating"] = np.where(df["Heating"].str.contains("GasA"), "1", "0")
        df["RoofMatl"] = np.where(df["RoofMatl"].str.contains("CompShg|WdShngl", regex=True), "1", "0")

        df["PoolArea"]=np.where(df["PoolArea"]>1,"Yes","No")

        
        """ Dropping 1st and 2nd Floor SFT as they are collinear with GrLivArea"""
        df = df.drop([ '1stFlrSF', '2ndFlrSF'], axis=1)
        df = df.drop("TotalBsmtSF", axis=1)
        
        
        return df

In [15]:
def data_transformation(df):
    
    print("\n\n*****\n Transforming data values ...")

    df.loc[(df.OverallCond==2) &( df.SalePrice>300000), "OverallCond"] = 5
    df.loc[df.LotFrontage>250,"LotFrontage"] = 200
    df.loc[df.BsmtFinSF1>=3000, "BsmtFinSF1"] = 2200 ## No need to update TotalBsmtSF, will be dropped 
    df.loc[df.BsmtFullBath==3, "BsmtFullBath"] = 2
    df.loc[df.FullBath==4, "FullBath"] = 3
    df.loc[df.Fireplaces==4, "Fireplaces"] = 3
    df.loc[df.GarageCars==5, "GarageCars"] = 4
    df.loc[df.WoodDeckSF>100, "WoodDeckSF"] = 850
#     df.loc[df.OpenPorchSF>550, "OpenPorchSF"] = 520
#     df.loc[df.EnclosedPorch>400, "EnclosedPorch"] = 380 + np.random.randint(0,20,5)

    return df

In [16]:
def feature_generation(df):
    
    print("\n\n*****\n Generating features ...")
    
    """ Consolidate columns and drop related columns"""
    df["TotalFullBaths"] = df["BsmtFullBath"]+df["FullBath"]
    df["TotalHalfBaths"] = df["BsmtHalfBath"]+df["HalfBath"]
    df = df.drop(["BsmtFullBath","FullBath", "BsmtHalfBath", "HalfBath" ], axis=1)
    
    df["TotalPorch"] = df["OpenPorchSF"]+df["EnclosedPorch"]+df["3SsnPorch"]+df["ScreenPorch"]
    df = df.drop(["OpenPorchSF","EnclosedPorch", "3SsnPorch", "ScreenPorch" ], axis=1)
    
    return df

In [17]:
def process_raw_data(standardize = True, use_ordinal=True):
    
    print("\n\n*****\n Processing raw files to generate processed files... hold tight...")
    train_df, test_df, full_df = load_test_train_data()
    
    final_df = full_df.copy().\
        pipe(print_data_stats).\
        pipe(impute_null_fields).\
        pipe(print_data_stats).\
        pipe(correct_data).\
        pipe(print_data_stats).\
        pipe(change_column_type).\
        pipe(print_data_stats).\
        pipe(drop_columns).\
        pipe(print_data_stats).\
        pipe(feature_transformation).\
        pipe(print_data_stats).\
        pipe(data_transformation).\
        pipe(print_data_stats).\
        pipe(feature_generation)
    
    if (use_ordinal):
        final_df = transform_ordinal(final_df).\
        pipe(print_data_stats)

    if (standardize):
        final_df = standardize_features(final_df).\
        pipe(print_data_stats)
        
    final_df = final_df.pipe(pd.get_dummies).\
            pipe(print_data_stats).\
            pipe(write_processed_files)

    return(train_df, test_df, full_df,final_df)
    

In [18]:
train_df,test_df,all_df,fin_df = process_raw_data()



*****
 Processing raw files to generate processed files... hold tight...


*****
Loading test and train data....
Train Data Shape : (1460, 81)
Test Data Shape :  (1459, 81)
Combined Data Shape :  (2919, 81)


*****
Printing stats...
No of rows :  2919
No of columns :  81
Numeric columns :  37
Non Numeric columns :  44


Null counts :
              NA Count   PERCENT
Column                          
MSZoning           4.0  0.001370
LotFrontage      486.0  0.166495
Alley           2721.0  0.932169
Utilities          2.0  0.000685
Exterior1st        1.0  0.000343
Exterior2nd        1.0  0.000343
MasVnrType        24.0  0.008222
MasVnrArea        23.0  0.007879
BsmtQual          81.0  0.027749
BsmtCond          82.0  0.028092
BsmtExposure      82.0  0.028092
BsmtFinType1      79.0  0.027064
BsmtFinSF1         1.0  0.000343
BsmtFinType2      80.0  0.027407
BsmtFinSF2         1.0  0.000343
BsmtUnfSF          1.0  0.000343
TotalBsmtSF        1.0  0.000343
Electrical         1.0  0.000343
Bs

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)




*****
Printing stats...
No of rows :  2919
No of columns :  63
Numeric columns :  33
Non Numeric columns :  30


Null counts :
Empty DataFrame
Columns: [NA Count, PERCENT]
Index: []


*****
Printing stats...
No of rows :  2919
No of columns :  230
Numeric columns :  230
Non Numeric columns :  0


Null counts :
Empty DataFrame
Columns: [NA Count, PERCENT]
Index: []


*****
Generating processed data files...


In [19]:
all_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,data_type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500,train
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500,train
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500,train
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000,train
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000,train


In [20]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 81 columns):
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-null object
Exterior2nd      2918 non

In [21]:
all_df.describe().columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [22]:
fin_df.head()

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,HeatingQC,GrLivArea,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,SalePrice,TotalFullBaths,TotalHalfBaths,TotalPorch,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_C (all),...,BsmtFinType1_NoBsmt,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NoBsmt,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_0,Heating_1,CentralAir_N,CentralAir_Y,Electrical_0,Electrical_1,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PoolArea_No,PoolArea_Yes,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,data_type_test,data_type_train
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,-0.23217,-0.217879,0.646183,-0.509081,1.046258,0.896833,0.529034,1.039805,-0.230047,0.577094,0.141836,0.602047,-0.293025,-0.934165,0.885619,0.413547,0.169927,-0.207698,0.737021,0.986849,0.237316,-0.92599,-0.97883,0.316333,0.307986,0.349364,0.27738,0.267611,-0.831532,208500,1.346257,1.021796,-0.261852,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1
2,0.480759,-0.072044,-0.063185,2.191113,0.154764,-0.395604,-0.567016,-0.683756,-0.230047,0.577094,0.141836,1.216455,-0.293025,-0.629284,0.885619,-0.471891,0.169927,-0.207698,-0.766379,-0.287758,0.237316,0.625655,0.682014,0.316333,0.307986,-0.058991,0.27738,0.267611,1.235139,181500,0.005969,1.021796,-0.827273,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1
3,-0.089584,0.137197,0.646183,-0.509081,0.980221,0.848965,0.338903,1.039805,-0.230047,0.577094,0.141836,0.1051,-0.293025,-0.287999,0.885619,0.563755,0.169927,-0.207698,0.737021,-0.287758,0.237316,0.625655,0.682014,0.316333,0.307986,0.627787,0.27738,0.267611,-0.831532,223500,1.346257,1.021796,-0.437967,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1
4,-0.469814,-0.078385,0.646183,-0.509081,-1.859351,-0.682812,-0.567016,-0.683756,-0.230047,-0.527521,1.881417,-0.504791,-0.293025,-0.046824,-0.158453,0.427382,0.169927,-0.207698,0.737021,0.349546,0.237316,0.625655,1.235629,-0.798856,1.622332,0.785561,0.27738,0.267611,-0.831532,140000,0.005969,-0.808034,2.018368,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1
5,0.670874,0.518903,1.355551,-0.509081,0.947203,0.753229,1.390216,1.039805,-0.230047,0.577094,0.141836,0.486846,-0.293025,-0.160586,0.885619,1.378042,1.385655,-0.207698,0.737021,1.624153,0.237316,0.625655,0.682014,0.316333,1.622332,1.685798,0.27738,0.267611,1.235139,250000,1.346257,1.021796,-0.048661,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1


In [23]:
fin_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 230 columns):
LotFrontage              2919 non-null float64
LotArea                  2919 non-null float64
OverallQual              2919 non-null float64
OverallCond              2919 non-null float64
YearBuilt                2919 non-null float64
YearRemodAdd             2919 non-null float64
MasVnrArea               2919 non-null float64
ExterQual                2919 non-null float64
ExterCond                2919 non-null float64
BsmtQual                 2919 non-null float64
BsmtCond                 2919 non-null float64
BsmtFinSF1               2919 non-null float64
BsmtFinSF2               2919 non-null float64
BsmtUnfSF                2919 non-null float64
HeatingQC                2919 non-null float64
GrLivArea                2919 non-null float64
BedroomAbvGr             2919 non-null float64
KitchenAbvGr             2919 non-null float64
KitchenQual              2919 non-null float64