In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [5]:
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [16]:
# apply train test split and also apply SAME feature engineering for train and test data

# first handle the missing values

#which all categorical values have nan values

features_nan = [feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes=='O']
print(features_nan)
for feature in features_nan:
    print(" {}: {}%missing values".format(feature, np.round(df[feature].isnull().mean(),4)))


['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
 Alley: 0.9377%missing values
 MasVnrType: 0.0055%missing values
 BsmtQual: 0.0253%missing values
 BsmtCond: 0.0253%missing values
 BsmtExposure: 0.026%missing values
 BsmtFinType1: 0.0253%missing values
 BsmtFinType2: 0.026%missing values
 FireplaceQu: 0.4726%missing values
 GarageType: 0.0555%missing values
 GarageFinish: 0.0555%missing values
 GarageQual: 0.0555%missing values
 GarageCond: 0.0555%missing values
 PoolQC: 0.9952%missing values
 Fence: 0.8075%missing values
 MiscFeature: 0.963%missing values


In [17]:
# replacing nan values
# replacing missing values with new labels
def replace_cat_feature(dataset, features_nan):

    data = dataset.copy()
    # converting nan values in new category "missing"
    data[features_nan] = data[features_nan].fillna("Missing")
    return data

dataset = replace_cat_feature(df, features_nan=features_nan)

dataset[features_nan].isnull().sum()


Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [18]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,12,2008,WD,Normal,250000


In [20]:
# numerical features that contain the missing values

numerical_with_nan =[feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes!="0" ]

for feature in numerical_with_nan:
    print("{}: {}% missing values".format(feature, np.around(df[feature].isnull().mean(),4)))

LotFrontage: 0.1774% missing values
Alley: 0.9377% missing values
MasVnrType: 0.0055% missing values
MasVnrArea: 0.0055% missing values
BsmtQual: 0.0253% missing values
BsmtCond: 0.0253% missing values
BsmtExposure: 0.026% missing values
BsmtFinType1: 0.0253% missing values
BsmtFinType2: 0.026% missing values
FireplaceQu: 0.4726% missing values
GarageType: 0.0555% missing values
GarageYrBlt: 0.0555% missing values
GarageFinish: 0.0555% missing values
GarageQual: 0.0555% missing values
GarageCond: 0.0555% missing values
PoolQC: 0.9952% missing values
Fence: 0.8075% missing values
MiscFeature: 0.963% missing values


In [21]:
# a lot of outliers = replace with median or mode

for feature in numerical_with_nan:

    median_value = df[feature].median()

    # create new feature to capture nan values
    # if it has nan, put 1, if not 0
    # to get more info fom a particular feature
    # creating a new coulmn with 1 and 0

    df[feature+'nan'] = np.where(df[feature].isnull(),1,0)
    df[feature].fillna(median_value, inplace=True)
# now there should be no null values
df[numerical_with_nan].isnull().sum()


TypeError: could not convert string to float: 'Grvl'

In [26]:
# now temporal variables like date and time
# first find which are the date time variables
numerical_features = [feature for feature in df.columns if df[feature].dtypes!='0']
df[numerical_features].head()

year_feature = [feature for feature in numerical_features if "Yr" in feature or "Year" in feature]

for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    df[feature] = df['YrSold'] - df[feature]
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,0


In [27]:
num_features=['LotFrontage', 'LotArea','1stFlrSF', 'GrLivArea','SalePrice']

# if above features have 0, skip it.

for feature in num_features:
    df[feature] = np.log(df[feature])

# converting to log normal becuase of skewed values



In [30]:
#handling rare categorical features, dtype is O capital O

categorical_features = [feature for feature in df.columns if df[feature].dtypes=='O']
categorical_features


['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [33]:
# if categorical feature is > 0.015 than only consider it otherwise dispose it
for feature in categorical_features:
    tmp = df.groupby(feature)["SalePrice"].count()/len(df)
    tmp_df=tmp[tmp>0.01].index

    df[feature]= np.where(df[feature].isin(tmp_df), df[feature],'Rare_var')


In [37]:
# feature scaling!!!
feature_scale =[feature for feature in df.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() # scales down data between 0,1
#standard scaler transforms the data using Standar normal distibution
scaler.fit(df[feature_scale])
scaler.transform(df[feature_scale])

ValueError: could not convert string to float: 'RL'