In [3]:
%matplotlib inline

import csv

import scipy
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from IPython.display import FileLink, FileLinks, display, HTML

In [4]:
# Set these options to display all the columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [5]:
def write_csv(output_fn, ids, test_y):
    with open(output_fn, "w") as fp:
        writer = csv.writer(fp)
        writer.writerow(['Id', 'SalePrice'])
        for _id, y in zip(ids, test_y):
            writer.writerow([_id, y])

In [6]:
base_path = "./data"

In [7]:
def train_model(train_X, train_y, **kwargs):
    model = XGBRegressor(**kwargs)
    model.fit(train_X, train_y, verbose=True)
    return model

In [6]:
testdf = pd.read_csv(f"{base_path}/test.csv")
traindf = pd.read_csv(f"{base_path}/train.csv")

Check for null values and delete them before we proceed. We don't have any in our case

In [57]:
traindf['SalePrice'].isnull().values.any()

False

Understand some basics about the data. For convenience i split them into two parts 

In [59]:
traindf.select_dtypes(exclude=['object']).describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,6.517808,0.613014,1978.506164,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,1.625393,0.644666,24.689725,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,223.0,795.75,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1961.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,808.0,1298.25,1391.25,728.0,0.0,1776.75,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [60]:
train_data.select_dtypes(include=['object']).describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1452.0,1460,1460,1460,1423,1423,1422,1423,1422,1460,1460,1460,1459,1460,1460,770,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,8,5,8,6,8,15,16,4.0,4,5,6,4,4,4,6,6,6,5,2,5,4,7,5,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,TA,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Gd,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,1445,1220,726,1141,1434,515,504,864.0,906,1282,647,649,1311,953,430,1256,1428,741,1365,1334,735,1360,380,870,605,1311,1326,1340,3,157,49,1267,1198


Convert the string data into categorical. 

In [65]:
traindf[traindf.select_dtypes(['object']).columns] = traindf.select_dtypes(['object']).apply(lambda x: x.astype('category'))

In [78]:
y = traindf.SalePrice
X = traindf.drop(['SalePrice'], axis=1).select_dtypes(exclude=['category'])

In [115]:
model = train_model(X, y)

On our first try lets remove all the categorical data and try. 

In [84]:
test_X = testdf.select_dtypes(exclude=['object'])
test_y = model.predict(test_X)

In [108]:
write_csv("house_price_predict_1.csv", test_X['Id'], test_y)
FileLink("house_price_predict_1.csv")

After submission, we got about 0.14179 score with our test data. We are approximately at 50 percentile ! position. You might get a different score because of randomization. 

In [119]:
X1 = traindf.drop(['SalePrice'], axis=1)
X1 = pd.get_dummies(X1)
test_X1 = pd.get_dummies(testdf)

In [55]:
model_cat = train_model(X1, y)
model_cat.predict(test_X1)

NameError: name 'train_model' is not defined

In [125]:
train_total = len(traindf)
df_concat = pd.get_dummies(pd.concat([traindf.drop(['SalePrice'], axis=1), testdf], axis=0))
train_dummy = df_concat[:train_total]
test_dummy = df_concat[train_total:]

In [128]:
model_dummy = train_model(train_dummy, y)
y_dummy = model_dummy.predict(test_dummy)

In [129]:
write_csv("house_price_predict_2.csv", test_dummy['Id'], y_dummy)
FileLink("house_price_predict_2.csv")

In [130]:
model_dummy

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

Got about 0.136 which is some improvement over previous value

In [134]:
traindf.select_dtypes(['category']).columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'], dtype='object')

In [151]:
df_label = pd.concat([traindf.drop(['SalePrice'], axis=1), testdf], axis=0)
df_label[df_label.select_dtypes(['object']).columns] = df_label.select_dtypes(['object']).apply(lambda x: x.astype('category'))
cat_columns = df_label.select_dtypes(['category']).columns
df_label[cat_columns] = df_label[cat_columns].apply(lambda x: x.cat.codes)
train_label = df_label[:train_total]
test_label = df_label[train_total:]

In [153]:
train_label.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
  

In [154]:
test_label.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
  

In [158]:
model_label = train_model(train_label, y)
y_label = model_label.predict(test_label)
write_csv("house_price_predict_3.csv", test_label['Id'], y_label)
FileLink("house_price_predict_3.csv")

Got 0.13793 not an improvement with the previous numbers

In [165]:
base_score = np.median(y)

model_dummy = train_model(train_dummy, y, base_score=base_score, n_estimators=200, max_depth=4,learning_rate=0.01)
y_dummy = model_dummy.predict(test_dummy)
write_csv("house_price_predict_4.csv", test_dummy['Id'], y_dummy)
FileLink("house_price_predict_4.csv")

In [166]:
model_label = train_model(train_label, y, base_score=base_score, n_estimators=200, max_depth=4,learning_rate=0.01)
y_label = model_label.predict(test_label)
write_csv("house_price_predict_5.csv", test_label['Id'], y_label)
FileLink("house_price_predict_5.csv")

In [12]:
traindf.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
  

In [21]:
base_score = np.median(y)

cols_to_ignore = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", 
                  "LotFrontage", "GarageCond", "GarageType", "GarageYrBlt",
                  "GarageFinish", "GarageQual", "BsmtExposure", "BsmtFinType2",
                  "BsmtFinType1", "BsmtCond", "BsmtQual", "MasVnrArea", "MasVnrType"]
y = traindf.SalePrice

train_total = len(traindf)
df_concat = pd.concat([traindf.drop(["SalePrice"], axis=1), testdf], axis=0)
df_concat = df_concat.drop(cols_to_ignore, axis=1)
df_concat = pd.get_dummies(df_concat)
train_dummy = df_concat[:train_total]
test_dummy = df_concat[train_total:]

model_dummy = train_model(train_dummy, y, base_score=base_score, n_estimators=400, max_depth=4,learning_rate=0.05,  gamma=0)
y_dummy = model_dummy.predict(test_dummy)

write_csv("house_price_ignore_dummies.csv", test_dummy['Id'], y_dummy)
FileLink("house_price_ignore_dummies.csv")

In [51]:
base_score = np.median(y)

cols_to_ignore = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", 
                  "LotFrontage", "GarageCond", "GarageType", "GarageYrBlt",
                  "GarageFinish", "GarageQual", "BsmtExposure", "BsmtFinType2",
                  "BsmtFinType1", "BsmtCond", "BsmtQual", "MasVnrArea", "MasVnrType"]

# Remove outliers. 
df_train = traindf.drop(traindf[traindf['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)

y = df_train.SalePrice

train_total = len(df_train)

df_concat = pd.concat([df_train.drop(['SalePrice'], axis=1), testdf], axis=0)
df_concat = df_concat.drop(cols_to_ignore, axis=1)

# Add transformations here. 
y_log = np.log(y)
df_concat['GrLivArea'] = np.log(df_concat['GrLivArea'])

df_concat['HasBsmt'] = pd.Series(len(df_concat['TotalBsmtSF']), index=df_concat.index)
df_concat['HasBsmt'] = 0 
df_concat.loc[df_concat['TotalBsmtSF']>0,'HasBsmt'] = 1
df_concat['TotalBsmtSF'] = np.where(df_concat['TotalBsmtSF']>0, np.log(df_concat['TotalBsmtSF']), 0)
#df_concat.loc[df_concat['TotalBsmtSF']>0,'TotalBsmtSF'] = np.log(df_concat['TotalBsmtSF'])

df_concat = pd.get_dummies(df_concat)


train_dummy = df_concat[:train_total]
test_dummy = df_concat[train_total:]

model_dummy = train_model(train_dummy, y_log, base_score=base_score, n_estimators=700, max_depth=4, learning_rate=0.05, gamma=0)
y_dummy = model_dummy.predict(test_dummy)
y_pred = np.exp(y_dummy)

write_csv("house_price_ignore_dummies_trans.csv", test_dummy['Id'], y_pred)
FileLink("house_price_ignore_dummies_trans.csv")



In [35]:
print(df_concat.shape)
print(len(traindf))
print(len(testdf))
print(len(traindf)+len(testdf))

(2917, 221)
1460
1459
2919


In [46]:
print(y_pred[:20])
print(y_dummy[:20])

[132836.6  167921.19 190279.62 190279.62 222459.42 175979.89 187329.61
 167921.19 208981.28 130777.15 215615.06 105082.39 108418.06 167921.19
 150523.58 390428.44 256049.23 281215.06 276855.22 456457.1 ]
[11.796875 12.03125  12.15625  12.15625  12.3125   12.078125 12.140625
 12.03125  12.25     11.78125  12.28125  11.5625   11.59375  12.03125
 11.921875 12.875    12.453125 12.546875 12.53125  13.03125 ]


In [144]:
def show_missing_data(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1)
    display(missing_data.head(30))

show_missing_data(traindf)
show_missing_data(testdf)

Unnamed: 0,0,1
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageCond,81,0.055479
GarageType,81,0.055479
GarageYrBlt,81,0.055479
GarageFinish,81,0.055479


Unnamed: 0,0,1
PoolQC,1456,0.997944
MiscFeature,1408,0.965045
Alley,1352,0.926662
Fence,1169,0.801234
FireplaceQu,730,0.500343
LotFrontage,227,0.155586
GarageCond,78,0.053461
GarageQual,78,0.053461
GarageYrBlt,78,0.053461
GarageFinish,78,0.053461


You can see that data is missing in some columns. These columns are different for test vs train. We need to come up with some strategy to replace the null fields. For convenience we will merge the train and test sets together and apply the same transformations for both the datasets.

In [8]:
testdf = pd.read_csv(f"{base_path}/test.csv")
traindf = pd.read_csv(f"{base_path}/train.csv")

# Remove outliers. 
traindf = traindf.drop(traindf[traindf['Id'] == 1299].index)
traindf = traindf.drop(traindf[traindf['Id'] == 524].index)

y = traindf.SalePrice
df = pd.concat([traindf.drop(['SalePrice'], axis=1), testdf], axis=0)

print(f"DataFrame Shape: {df.shape}")

DataFrame Shape: (2917, 80)


In [9]:
neighborhood_lot = dict(df.groupby('Neighborhood')['LotFrontage'].mean())
cols_to_drop = ['MiscFeature', 'Utilities']

df['LotFrontage'] = df.apply(lambda row: row['LotFrontage'] if pd.notnull(row['LotFrontage']) else neighborhood_lot[row['Neighborhood']], axis=1)
df['GarageYrBlt'] = df.apply(lambda row: row['GarageYrBlt'] if pd.notnull(row['GarageYrBlt']) else row['YearBuilt'], axis=1)

print(f"DataFrame Shape: {df.shape}")

DataFrame Shape: (2917, 80)


In [10]:
cols_to_drop = ['MiscFeature', 'Utilities']
df = df.drop(cols_to_drop, axis=1)
print(f"DataFrame Shape: {df.shape}")

DataFrame Shape: (2917, 78)


In [11]:
import category_encoders
category_encoders.__file__

'/Users/pulikunt/myfiles/notebooks/category_encoders/__init__.py'

In [12]:
#from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn_pandas import DataFrameMapper

rating = [
    ("Ex",5), 
    ("Gd",4), 
    ("TA",3), 
    ("Fa",2), 
    ("Po",1),
    (np.NaN,0),
]

basement_type = [
    ("GLQ", 6),
    ("ALQ", 5),
    ("BLQ", 4),
    ("Rec", 3),
    ("LwQ", 2),
    ("Unf", 1),
    (np.NaN, 0),            
]

cols_mapping = [
    {
        "col": "PoolQC",    
        "mapping": rating
    },
    {
        "col": "FireplaceQu",
        "mapping": rating
    },
    {
        "col": "LotShape",
        "mapping": [
            ("Reg", 3),
            ("IR1", 2),
            ("IR2", 1),
            ("IR3", 0),
        ]
    },
    {
        "col": "GarageCond",
        "mapping": rating
    },
    {
        "col": "GarageQual",
        "mapping": rating
    },
    {
        "col": "BsmtQual",
        "mapping": rating
    },
    {
        "col": "BsmtCond",
        "mapping": rating
    },
    {
        "col": "BsmtExposure",
        "mapping": rating
    },
    {
        "col": "BsmtFinType1",
        "mapping": basement_type
    },
    {
        "col": "BsmtFinType2",
        "mapping": basement_type
    },
    {
        "col": "MasVnrType",
        "mapping": [
            ("Stone", 2),
            ("BrkFace", 1),
            ("BrkCmn", 0),
            (np.NaN, 0),
        ]
    },
    {
        "col": "KitchenQual",
        "mapping": rating
    },
    {
        "col": "Functional",
        "mapping": [
            (np.NaN, 0),
            ("Sal", 0), 
            ("Sev", 1), 
            ("Maj2", 2),
            ("Maj1", 3),
            ("Mod", 4),
            ("Min2", 5),
            ("Min1", 6),
            ("Typ", 7),
        ]
    },
    {
        "col": "ExterQual",
        "mapping": rating
    },
    {
        "col": "ExterCond",
        "mapping": rating
    },    
]

cols_category =  ['Alley', 'Fence', 'GarageFinish', 'LotConfig', 'GarageType', 'MSZoning', 'Exterior1st', 'Exterior2nd', 'Electrical', 'SaleType', 'SaleCondition']
cols_ordinal = [row['col'] for row in cols_mapping]
cols_numeric = ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'MasVnrArea', "1stFlrSF", "2ndFlrSF", "Id", "YearBuilt", "YearRemodAdd", "YrSold", "PoolArea", "LotArea", "GarageArea", "GarageYrBlt", "OverallCond", "OverallQual", "WoodDeckSF", "TotRmsAbvGrd", "3SsnPorch", "BedroomAbvGr", "EnclosedPorch", "Fireplaces", "FullBath", "GarageCars", "HalfBath", "KitchenAbvGr", "LotFrontage", "LowQualFinSF", "MSSubClass", "MiscVal", "MoSold", "OpenPorchSF", "ScreenPorch", "TotalBsmtSF", "GrLivArea"]
cols_ordinal_convert = ["BldgType", "CentralAir", "Condition1", "Condition2", "Foundation", "Heating", "HeatingQC", "HouseStyle", "LandContour", "LandSlope", "Neighborhood", "PavedDrive", "RoofMatl", "RoofStyle", "SaleCondition", "Street"]

col_trans = make_column_transformer(
    (
        OrdinalEncoder(mapping=cols_mapping), cols_ordinal
    ),
    (
        Pipeline([
            ('si1', SimpleImputer(strategy='constant', fill_value='NA')),
            ('oe1', OneHotEncoder(sparse=False))
        ]),
        cols_category
    ),
    (
        SimpleImputer(strategy='constant', fill_value=0), cols_numeric
    ),
    (
        Pipeline([
            ('si2', SimpleImputer(strategy='constant', fill_value='NA')),
            ('oe2', OneHotEncoder(sparse=False))
        ]),
        cols_ordinal_convert
    ),
    remainder='passthrough')



In [52]:
rating = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1,
    np.NaN: 0,
    "None": 0,
    
}

basement_type = {
    "GLQ": 6,
    "ALQ": 5,
    "BLQ": 4,
    "Rec": 3,
    "LwQ": 2,
    "Unf": 1,
    np.NaN: 0,
    "None": 0,
    
}

MAPPING = [
    ("PoolQC",  rating),
    ("FireplaceQu", rating),
    ("GarageCond", rating),
    ("GarageQual", rating),
    ("BsmtQual", rating),
    ("BsmtCond", rating),
    ("BsmtExposure", {
        "Gd": 4,
        "Av": 3,
        "Mn": 2,
        "No": 1,
        "None": 0,
        np.NaN: 0,
    }),
    ("KitchenQual", rating),
    ("ExterQual", rating),
    ("ExterCond", rating),
    ("LotShape", {
        "Reg": 3,
        "IR1": 2,
        "IR2": 1,
        "IR3": 0,
    }),
    ("BsmtFinType1", basement_type),
    ("BsmtFinType2", basement_type),
    ("MasVnrType", {
        "Stone": 2,
        "BrkFace": 1,
        "BrkCmn": 0,
        "None": 0,
        np.NaN: 0,
    }),
    ("Functional", {
        np.NaN: 0,
        "Sal": 0,
        "Sev": 1,
        "Maj2": 2,
        "Maj1": 3,
        "Mod": 4,
        "Min2": 5,
        "Min1": 6,
        "Typ": 7,
        "None": 0,        
    })
]

def encoder(mapping):
    def mapper(k, d):
        return d[k]
    
    _mapper = np.vectorize(mapper)
    _mapping = [value for key, value in mapping]
    
    def encode_mapping(arr):
        return np.stack(map(_mapper, arr.T, _mapping), axis=1)

    return encode_mapping
    
col_trans_new = DataFrameMapper([
    (cols_category, [SimpleImputer(strategy='constant', fill_value='NA'), OneHotEncoder(sparse=False)]),
    (cols_numeric, SimpleImputer(strategy='constant', fill_value=0)),
    (cols_ordinal_convert, [SimpleImputer(strategy='constant', fill_value='NA'), OneHotEncoder(sparse=False)]),
    ([key for key, value in MAPPING], FunctionTransformer(encoder(MAPPING), validate=False) )
    
], df_out=True)


In [62]:
aa = col_trans_new.fit_transform(df)



In [68]:
col_trans_new.transformed_names_

['Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x0_Grvl',
 'Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x0_NA',
 'Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x0_Pave',
 'Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x1_GdPrv',
 'Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x1_GdWo',
 'Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x1_MnPrv',
 'Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x1_MnWw',
 'Alley_Fence_GarageFinish_LotConfig_GarageType_MSZoning_Exterior1st_Exterior2nd_Electrical_SaleType_SaleCondition_x1_NA',
 '

In [13]:
col_trans_newly = DataFrameMapper([
    (cols_category, [SimpleImputer(strategy='constant', fill_value='NA'), OneHotEncoder(sparse=False)]),
    (cols_numeric, SimpleImputer(strategy='constant', fill_value=0)),
    (cols_ordinal_convert, [SimpleImputer(strategy='constant', fill_value='NA'), OneHotEncoder(sparse=False)]),
    (['FireplaceQu'], OrdinalEncoder(mapping=cols_mapping)),
])



In [16]:
%%debug


NOTE: Enter 'c' at the ipdb>  prompt to continue execution.
> [0;32m<string>[0m(2)[0;36m<module>[0;34m()[0m

ipdb> b category_encoders/ordinal.py:254
Breakpoint 1 at /Users/pulikunt/.local/share/virtualenvs/notebooks-K7RSS7Ya/lib/python3.7/site-packages/category_encoders/ordinal.py:254
ipdb> c
> [0;32m/Users/pulikunt/.local/share/virtualenvs/notebooks-K7RSS7Ya/lib/python3.7/site-packages/category_encoders/ordinal.py[0m(254)[0;36mordinal_encoding[0;34m()[0m
[0;32m    252 [0;31m                [0mcategories_dict[0m [0;34m=[0m [0mdict[0m[0;34m([0m[0mswitch[0m[0;34m.[0m[0mget[0m[0;34m([0m[0;34m'mapping'[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    253 [0;31m                [0mcolumn[0m [0;34m=[0m [0mswitch[0m[0;34m.[0m[0mget[0m[0;34m([0m[0;34m'col'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;31m1[0;32m-> 254 [0;31m                [0mtransformed_column[0m [0;34m=[0m [0mX[0m[0;34m[[0m[0mcolumn[0m[0;34m][0m

ipdb> 
[1;32m    250 [0m            [0mmapping_out[0m [0;34m=[0m [0mmapping[0m[0;34m[0m[0;34m[0m[0m
[1;32m    251 [0m            [0;32mfor[0m [0mswitch[0m [0;32min[0m [0mmapping[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m    252 [0m                [0mcategories_dict[0m [0;34m=[0m [0mdict[0m[0;34m([0m[0mswitch[0m[0;34m.[0m[0mget[0m[0;34m([0m[0;34m'mapping'[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m    253 [0m                [0mcolumn[0m [0;34m=[0m [0mswitch[0m[0;34m.[0m[0mget[0m[0;34m([0m[0;34m'col'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;31m1[0;32m-> 254 [0;31m                [0mtransformed_column[0m [0;34m=[0m [0mX[0m[0;34m[[0m[0mcolumn[0m[0;34m][0m[0;34m.[0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0mcategories_dict[0m[0;34m.[0m[0mget[0m[0;34m([0m[0mx[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m    255 [0m[0;34m[0m[0m
[1;32m    

In [34]:
output_cols = cols_ordinal + cols_category + cols_numeric + cols_ordinal_convert
print(output_cols)

['PoolQC', 'FireplaceQu', 'LotShape', 'GarageCond', 'GarageQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'KitchenQual', 'Functional', 'ExterQual', 'ExterCond', 'Alley', 'Fence', 'GarageFinish', 'LotConfig', 'GarageType', 'MSZoning', 'Exterior1st', 'Exterior2nd', 'Electrical', 'SaleType', 'SaleCondition', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'MasVnrArea', '1stFlrSF', '2ndFlrSF', 'Id', 'YearBuilt', 'YearRemodAdd', 'YrSold', 'PoolArea', 'LotArea', 'GarageArea', 'GarageYrBlt', 'OverallCond', 'OverallQual', 'WoodDeckSF', 'TotRmsAbvGrd', '3SsnPorch', 'BedroomAbvGr', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageCars', 'HalfBath', 'KitchenAbvGr', 'LotFrontage', 'LowQualFinSF', 'MSSubClass', 'MiscVal', 'MoSold', 'OpenPorchSF', 'ScreenPorch', 'TotalBsmtSF', 'GrLivArea', 'BldgType', 'CentralAir', 'Condition1', 'Condition2', 'Foundation', 'Heating', 'HeatingQC', 'HouseStyle', 'LandContour', 'LandSlope', 'Neighb

In [40]:
sum(df_trans[:,0])

36.0

In [43]:
from sklearn_pandas import DataFrameMapper

In [12]:
col_trans.get_feature_names()

AttributeError: Transformer ordinalencoder (type OrdinalEncoder) does not provide get_feature_names.

In [13]:
"""
    (
        Pipeline([
            ('si2', SimpleImputer(strategy='constant', fill_value=0)),
            ('oe2', FunctionTransformer(np.log1p, validate=True))
        ]),
        cols_log_trans
    ),    
"""
df_trans = col_trans.fit_transform(df)
print(f"Shape before splliting: {df_trans.shape}")

no_train = len(y)
X_train = df_trans[:no_train]
X_test = df_trans[no_train:]
print(f"X_train: {X_train.shape}, y: {no_train}, X_test: {X_test.shape}")

TypeError: 'list' object is not callable

In [15]:
model = train_model(X_train, y, base_score=y.mean(), n_estimators=700, max_depth=4, learning_rate=0.05)
y_pred = model.predict(X_test)

test_data_id = df[no_train:]['Id']
write_csv("house_price_after_tune.csv", test_data_id, y_pred)
FileLink("house_price_after_tune.csv")

In [19]:
model.feature_importances_

array([0.00446271, 0.03311803, 0.01691133, 0.00916031, 0.00504991,
       0.01432766, 0.00352319, 0.00728127, 0.01890781, 0.00540223,
       0.00751615, 0.00963006, 0.00869055, 0.00504991, 0.00516735,
       0.00082208, 0.00129184, 0.00152672, 0.00023488, 0.00129184,
       0.00129184, 0.        , 0.0017616 , 0.00364063, 0.        ,
       0.00317087, 0.00317087, 0.00258368, 0.00387551, 0.00352319,
       0.00305344, 0.00364063, 0.00011744, 0.00199648, 0.        ,
       0.00070464, 0.00093952, 0.00152672, 0.        , 0.00634175,
       0.00140928, 0.        , 0.00046976, 0.00082208, 0.0005872 ,
       0.0023488 , 0.        , 0.        , 0.00422783, 0.        ,
       0.0005872 , 0.0023488 , 0.        , 0.00728127, 0.        ,
       0.00246624, 0.        , 0.00011744, 0.00317087, 0.002936  ,
       0.00093952, 0.00023488, 0.        , 0.00011744, 0.00035232,
       0.        , 0.00046976, 0.00070464, 0.00011744, 0.0017616 ,
       0.        , 0.        , 0.00105696, 0.        , 0.00023

In [20]:
dir(model)

['_Booster',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_estimator_type',
 '_get_param_names',
 'apply',
 'base_score',
 'booster',
 'coef_',
 'colsample_bylevel',
 'colsample_bytree',
 'evals_result',
 'feature_importances_',
 'fit',
 'gamma',
 'get_booster',
 'get_params',
 'get_xgb_params',
 'intercept_',
 'kwargs',
 'learning_rate',
 'load_model',
 'max_delta_step',
 'max_depth',
 'min_child_weight',
 'missing',
 'n_estimators',
 'n_jobs',
 'nthread',
 'objective',
 'predict',
 'random_state',
 'reg_alpha',
 'reg_lambda',
 'save_model',
 'scale_pos_weight',
 'score',
 'seed',
 'set_params',
 'silent',
 'subsample']

After above split i got 0.13139 

After removing the outliers it is improved to 0.12993

After setting every numberic to 0 and every non numeric to 'NA' 0.12987

After log transform it increased to 0.13376. 

In [69]:
# Label encoding
??OrdinalEncoder

1. http://fastml.com/how-to-use-pd-dot-get-dummies-with-the-test-set/
1. https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data
1. https://stackoverflow.com/questions/32011359/convert-categorical-data-in-pandas-dataframe
1. https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
1. https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python 
1. https://www.kaggle.com/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda