# Import Necessary Libraries

In [1]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Read the train data and generate an EDA report

In [2]:
# Load the cleaned training data from its CSV file
train_df = pd.read_csv("clean_train.csv")

In [3]:
# Show the dimensions of the dataset as (rows, columns)
train_df.shape

(1407, 76)

In [4]:
# Display the first rows of the train dataset
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition,SalePrice,AgeRemod
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,WD,Normal,208500,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,WD,Normal,181500,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,WD,Normal,223500,1
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,WD,Abnormal,140000,55
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,WD,Normal,250000,0


In [5]:
# Summarise the train dataset: per-column non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407 entries, 0 to 1406
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1407 non-null   int64  
 1   MSSubClass     1407 non-null   int64  
 2   MSZoning       1407 non-null   object 
 3   LotFrontage    1407 non-null   float64
 4   LotArea        1407 non-null   int64  
 5   Street         1407 non-null   object 
 6   Alley          88 non-null     object 
 7   LotShape       1407 non-null   object 
 8   LandContour    1407 non-null   object 
 9   Utilities      1407 non-null   object 
 10  LotConfig      1407 non-null   object 
 11  LandSlope      1407 non-null   object 
 12  Neighborhood   1407 non-null   object 
 13  Condition1     1407 non-null   object 
 14  Condition2     1407 non-null   object 
 15  BldgType       1407 non-null   object 
 16  HouseStyle     1407 non-null   object 
 17  OverallQual    1407 non-null   int64  
 18  OverallC

In [6]:
# Preprocessing indicated that the remaining NaN values stand for the literal
# category 'NA', so every NaN is replaced with that string.
# NOTE(review): this fills ALL columns; a numeric column that still held NaN
# would silently become an object column - confirm none remain.
train_df = train_df.fillna('NA')

In [7]:
# Count the columns that still contain at least one missing value (0 means none are left)
train_df.isna().any().sum()

0

In [8]:
# Drop the Id column: it is not needed and has no impact on SalePrice
train_df = train_df.drop(columns='Id')

In [9]:
# Preview the data again after dropping 'Id'
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition,SalePrice,AgeRemod
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,WD,Normal,208500,0
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,WD,Normal,181500,0
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,WD,Normal,223500,1
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,WD,Abnormal,140000,55
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,WD,Normal,250000,0


In [10]:
# There are many features, so performing EDA on every feature by hand takes time.
# The pandas-profiling library can automatically perform EDA for all features.
# Let's look into it.
report = pp.ProfileReport(train_df)

In [11]:
# Write the profiling report to report.html in the working directory
report.to_file("report.html")

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=89.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [12]:
# Go and open report.html to check it out

# Split the train data into train, cv and test sets

In [13]:
from sklearn.model_selection import train_test_split
import pickle

In [14]:
# Feature names: every column except the target
columns = train_df.columns.drop('SalePrice')
# Feature matrix and target vector as NumPy arrays
X = train_df[columns].values
y = train_df['SalePrice'].values

In [15]:
# Remove the target so that train_df holds only the feature columns
train_df = train_df.drop(columns=['SalePrice'])

In [16]:
# Record the positional index of every categorical (object-dtype) feature.
# Positions are taken from `columns`, the same Index that X was built from, so
# they always line up with X's column order instead of depending on which
# columns train_df happens to contain when this cell runs.
cat_index = [
    position
    for position, name in enumerate(columns)
    if train_df[name].dtype == 'object'
]

In [17]:
# Hold out 20% of the rows as the test set; random_state=1 fixes the split
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.2, random_state=1)
# Show the shapes of the remaining training features and targets
train_X.shape, train_y.shape

((1125, 74), (1125,))

In [18]:
# Split the remaining training rows again, holding out 20% of them as the cv set.
# NOTE: this overwrites train_X/train_y with the smaller split, so re-running this
# cell on its own shrinks the training set further - re-run the previous split first.
train_X, cv_X, train_y, cv_y = train_test_split(train_X,train_y, test_size=0.2, random_state=1)
# Show the shapes of the final training features and targets
train_X.shape, train_y.shape

((900, 74), (900,))

In [19]:
# Fit one OneHotEncoder per categorical column on the training split only and
# build the encoded training matrix.
# vec_feat maps column position -> fitted encoder, so the other splits can
# reuse exactly the same category-to-column mapping.
vec_feat = dict()
cat_positions = set(cat_index)  # constant-time membership test inside the loop
train_blocks = []

for i in range(train_X.shape[1]):
    column = train_X[:, i].reshape(-1, 1)
    if i in cat_positions:
        # handle_unknown='ignore': categories unseen during fit do not raise at transform time
        oneHot = OneHotEncoder(handle_unknown='ignore')
        vec_feat[i] = oneHot.fit(column)
        # Always densify the encoder output. Previously the first block was kept
        # sparse, which broke the len() check when column 0 was categorical.
        column = oneHot.transform(column).toarray()
    train_blocks.append(column)

# Stack once at the end instead of re-copying the growing matrix every iteration
norm_train_X = np.hstack(train_blocks) if train_blocks else np.empty((train_X.shape[0], 0))

In [20]:
# Shape of the encoded training matrix (rows, columns after one-hot expansion)
norm_train_X.shape

(900, 281)

In [21]:
# Encode the cv split with the encoders stored in vec_feat (no re-fitting), so
# its columns line up with the encoded training matrix.
cat_positions = set(cat_index)  # constant-time membership test inside the loop
cv_blocks = []

for i in range(cv_X.shape[1]):
    column = cv_X[:, i].reshape(-1, 1)
    if i in cat_positions:
        # Always densify the encoder output. Previously the first block was kept
        # sparse, which broke the len() check when column 0 was categorical.
        column = vec_feat[i].transform(column).toarray()
    cv_blocks.append(column)

# Stack once at the end instead of re-copying the growing matrix every iteration
norm_cv_X = np.hstack(cv_blocks) if cv_blocks else np.empty((cv_X.shape[0], 0))

In [22]:
# Shape of the encoded cv matrix (rows, columns after one-hot expansion)
norm_cv_X.shape

(225, 281)

In [23]:
# Encode the test split with the encoders stored in vec_feat (no re-fitting), so
# its columns line up with the encoded training matrix.
cat_positions = set(cat_index)  # constant-time membership test inside the loop
test_blocks = []

for i in range(test_X.shape[1]):
    column = test_X[:, i].reshape(-1, 1)
    if i in cat_positions:
        # Always densify the encoder output. Previously the first block was kept
        # sparse, which broke the len() check when column 0 was categorical.
        column = vec_feat[i].transform(column).toarray()
    test_blocks.append(column)

# Stack once at the end instead of re-copying the growing matrix every iteration
norm_test_X = np.hstack(test_blocks) if test_blocks else np.empty((test_X.shape[0], 0))

In [24]:
# Shape of the encoded test matrix (rows, columns after one-hot expansion)
norm_test_X.shape

(282, 281)

# Saving all normalization results (of splitting the data: train, cv and test)

In [25]:
# Persist the encoded training features
with open('train_X.pkl', 'wb') as train_X_file:
    pickle.dump(norm_train_X, train_X_file)

In [26]:
# Persist the training targets
with open('train_y.pkl', 'wb') as train_y_file:
    pickle.dump(train_y, train_y_file)

In [27]:
# Persist the encoded test features
with open('test_X.pkl', 'wb') as test_X_file:
    pickle.dump(norm_test_X, test_X_file)

In [28]:
# Persist the test targets
with open('test_y.pkl', 'wb') as test_y_file:
    pickle.dump(test_y, test_y_file)

In [29]:
# Persist the encoded cv features
with open('cv_X.pkl', 'wb') as cv_X_file:
    pickle.dump(norm_cv_X, cv_X_file)

In [30]:
# Persist the cv targets
with open('cv_y.pkl', 'wb') as cv_y_file:
    pickle.dump(cv_y, cv_y_file)

In [31]:
# Persist the positions of the categorical feature columns
with open('cat_index.pkl', 'wb') as cat_index_file:
    pickle.dump(cat_index, cat_index_file)

In [32]:
# Persist the fitted per-column encoders (keyed by column position)
with open('vec_feat.pkl', 'wb') as vec_feat_file:
    pickle.dump(vec_feat, vec_feat_file)

In [33]:
# Persist the feature column names
with open('columns.pkl', 'wb') as columns_file:
    pickle.dump(columns, columns_file)