
##  DSI-US-11 Project 2 Regression Challenge (Kaggle)

Part 2: feature engineering. Run this notebook after 01_EDA_and_Cleaning.

## This notebook can be used for both Train and Test datasets

- read, cleanup and save preprocessed data for both train and test
- drop records only for train dataset



In [43]:
# Configuration variable: True processes the train files
# (train_preproc.csv -> train_wfeature.csv), False processes the test files
# (test_preproc.csv -> test_wfeature.csv).
is_train_dataset = False

In [45]:
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn import preprocessing

%matplotlib inline

## Loading the Preprocessed Dataset

In [46]:
# Choose the preprocessed file (and its log message) for the configured dataset.
preproc_file, load_message = (
    ("../datasets/train_preproc.csv", "Loading train_preproc.csv")
    if is_train_dataset
    else ("../datasets/test_preproc.csv", "Loading test_preproc.csv")
)

# Read first, then report, so the message only appears after a successful load.
df_ames = pd.read_csv(preproc_file)
print(load_message)

df_ames.head()

Loading test_preproc.csv


Unnamed: 0.1,Unnamed: 0,Id,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Fence,Yr Sold,Sale Type
0,0,2658,RM,69.0,9142,Pave,Reg,Lvl,AllPub,Inside,...,1,440,Po,Po,Y,0,60,,2006,WD
1,1,2718,RL,69.30802,9662,Pave,IR1,Lvl,AllPub,Inside,...,2,580,TA,TA,Y,170,0,,2006,WD
2,2,2414,RL,58.0,17104,Pave,IR1,Lvl,AllPub,Inside,...,2,426,TA,TA,Y,100,24,,2006,New
3,3,1989,RM,60.0,8520,Pave,Reg,Lvl,AllPub,Inside,...,2,480,Fa,TA,N,0,0,,2007,WD
4,4,625,RL,69.127591,9500,Pave,IR1,Lvl,AllPub,Inside,...,2,514,TA,TA,Y,0,76,,2009,WD


## Feature Engineering

In [47]:
# Mean sale price per zoning class, lowest first.
# SalePrice is only read when the train dataset is loaded.
if is_train_dataset:
    mean_price_by_zone = df_ames.groupby(by="MS Zoning")['SalePrice'].mean()
    print(mean_price_by_zone.sort_values())


In [48]:
def zoning_rank(x):
    """Map an ``MS Zoning`` code to an ordinal rank.

    Ranks are RM -> 1, RH -> 2, RL -> 3, FV -> 4; anything else -> 0.
    Codes are matched by substring and tested in that order, so the first
    code found in ``x`` decides the rank.

    Parameters
    ----------
    x : str or missing value
        Zoning code of one record. Non-string input (for example the float
        NaN pandas uses for an empty cell, or None) is ranked 0 instead of
        raising ``TypeError`` on the substring test.

    Returns
    -------
    int
        Rank between 0 and 4.
    """
    # A missing cell is not a string; `'RM' in x` would raise on it.
    if not isinstance(x, str):
        return 0

    # Order matters: it reproduces the original RM, RH, RL, FV test sequence.
    ranked_codes = (('RM', 1), ('RH', 2), ('RL', 3), ('FV', 4))
    for code, rank in ranked_codes:
        if code in x:
            return rank
    return 0

In [49]:
def lotshape_rank(x):
    """Map a ``Lot Shape`` code to an ordinal rank.

    Ranks are Reg -> 4, IR1 -> 3, IR2 -> 2, IR3 -> 1; anything else -> 0.
    Codes are matched by substring and tested in that order, so the first
    code found in ``x`` decides the rank.

    Parameters
    ----------
    x : str or missing value
        Lot shape code of one record. Non-string input (for example the
        float NaN pandas uses for an empty cell, or None) is ranked 0
        instead of raising ``TypeError`` on the substring test.

    Returns
    -------
    int
        Rank between 0 and 4.
    """
    # A missing cell is not a string; `'Reg' in x` would raise on it.
    if not isinstance(x, str):
        return 0

    # Order matters: it reproduces the original Reg, IR1, IR2, IR3 sequence.
    ranked_codes = (('Reg', 4), ('IR1', 3), ('IR2', 2), ('IR3', 1))
    for code, rank in ranked_codes:
        if code in x:
            return rank
    return 0

In [50]:
def condition_quality_rank(x):
    """Convert a quality/condition code into an integer score.

    Scores are Ex -> 5, Gd -> 4, TA -> 3, Fa -> 2, Po -> 1. Missing values
    (as detected by ``pd.isna``) and unrecognised codes score 0. Codes are
    matched by substring, tested from Ex down to Po.
    """
    if pd.isna(x):
        return 0

    # Highest grade first, mirroring the order of the substring checks.
    grade_scores = (('Ex', 5), ('Gd', 4), ('TA', 3), ('Fa', 2), ('Po', 1))
    for grade, score in grade_scores:
        if grade in x:
            return score
    return 0

In [51]:
# Rank each zoning class with zoning_rank (ordering based on the mean price per zone).
zoning_codes = df_ames['MS Zoning']
df_ames['zone_dummy'] = zoning_codes.map(zoning_rank)

In [52]:
# Sanity check: number of rows assigned to each zoning rank.
df_ames['zone_dummy'].value_counts()

3    674
1    146
4     38
2     13
0      7
Name: zone_dummy, dtype: int64

In [53]:
# Rank each lot shape with lotshape_rank: regular lots score highest,
# the most irregular score lowest.
lot_shape_codes = df_ames['Lot Shape']
df_ames['lotshape_dummy'] = lot_shape_codes.map(lotshape_rank)

In [54]:
# Sanity check: number of rows assigned to each lot-shape rank.
df_ames['lotshape_dummy'].value_counts()

4    564
3    286
2     21
1      7
Name: lotshape_dummy, dtype: int64

In [55]:
# Convert every quality/condition column to a numeric score with
# condition_quality_rank. Keys are the new columns, values the source columns;
# the dict order fixes the order in which the new columns are appended.
quality_columns = {
    'exterqual_dummy': 'Exter Qual',
    'extercond_dummy': 'Exter Cond',
    'heatqc_dummy': 'Heating QC',
    'kitchenqual_dummy': 'Kitchen Qual',
    'fireplacequal_dummy': 'Fireplace Qu',
    'garagequal_dummy': 'Garage Qual',
    'garagecond_dummy': 'Garage Cond',
    'bsmtqual_dummy': 'Bsmt Qual',
    'bsmtcond_dummy': 'Bsmt Cond',
}

for dummy_column, source_column in quality_columns.items():
    df_ames[dummy_column] = df_ames[source_column].map(condition_quality_rank)

In [56]:
# PavedDrive (paved driveway) as an ordinal score:
#   Y (paved)            -> 2
#   P (partial pavement) -> 1
#   N (dirt/gravel)      -> 0
# Values that are not keys of the mapping come out as NaN from Series.map.
paved_drive_scores = {"Y": 2, "P": 1, "N": 0}

df_ames['paveddrive_dummy'] = df_ames["Paved Drive"].map(paved_drive_scores)

In [57]:
# Binary flag: 1 when "Central Air" is 'Y', 0 for every other value.
has_central_air = df_ames["Central Air"].eq('Y')

df_ames['centralair_dummy'] = has_central_air.astype('int64')

In [58]:
# Show the distinct raw values of "Central Air" to confirm the flag above covers them.
df_ames["Central Air"].unique()

array(['N', 'Y'], dtype=object)

In [59]:
# House age at sale time: year sold minus year built.
year_sold = df_ames['Yr Sold']
year_built = df_ames['Year Built']
df_ames['HouseAge'] = year_sold.sub(year_built)

In [60]:
# Spot-check the derived HouseAge next to the year columns.
df_ames[['Yr Sold', 'Year Built', 'Year Remod/Add', 'HouseAge']].head()

Unnamed: 0,Yr Sold,Year Built,Year Remod/Add,HouseAge
0,2006,1910,1950,96
1,2006,1977,1977,29
2,2006,2006,2006,0
3,2007,1923,2006,84
4,2009,1963,1963,46


In [61]:
# Drop the raw categorical columns that have been replaced by *_dummy rank columns.
encoded_source_columns = [
    'MS Zoning',
    'Lot Shape',
    'Exter Qual',
    'Exter Cond',
    'Bsmt Qual',
    'Bsmt Cond',
    'Heating QC',
    'Kitchen Qual',
    'Fireplace Qu',
    'Garage Qual',
    'Garage Cond',
]
df_ames = df_ames.drop(columns=encoded_source_columns)

In [62]:
# Label encoding (not one-hot): every remaining object-dtype column is replaced
# by integer codes, one code per distinct value.
# NOTE(review): the encoder is refit per column on whichever dataset is loaded,
# so the same category may get a different code in train and test if the two
# files do not contain the same set of values - verify before modelling.

var_categorical = df_ames.select_dtypes(include='object').columns

le = preprocessing.LabelEncoder()

for column_name in var_categorical:
    encoded_values = le.fit_transform(df_ames[column_name])
    df_ames[column_name] = encoded_values

In [63]:
# Confirm that no object-dtype columns remain after encoding.
df_ames.select_dtypes(include = 'object').columns

Index([], dtype='object')

In [64]:
# Final check for null values: list only the columns that still contain
# missing entries, smallest count first.
null_counts = df_ames.isnull().sum()
null_counts[null_counts > 0].sort_values()

Series([], dtype: int64)

In [65]:
# Save the engineered frame to the features file for the configured dataset.
# NOTE(review): to_csv writes the index as an extra leading column by default;
# the "Unnamed: 0" columns in the frame loaded by this notebook look like the
# same effect from earlier stages. Consider index=False after checking how
# downstream notebooks read these files.
features_file = (
    "../datasets/train_wfeature.csv"
    if is_train_dataset
    else "../datasets/test_wfeature.csv"
)
df_ames.to_csv(features_file)