In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Load the competition files, using the house Id as the row index
# NOTE(review): absolute local path -- adjust when running on another machine
file_path = '/Users/chuwen/Desktop/kaggle/House_pricing/data'
X = pd.read_csv(file_path+'/train.csv', index_col='Id')
X_test = pd.read_csv(file_path+'/test.csv', index_col='Id')

# Keep only the rows that have a target, then split the target off the predictors
X = X.dropna(axis=0, subset=['SalePrice'])
y = X.pop('SalePrice')

# Keep things simple: discard every column that has a gap in either the
# training data or the test data
X_missing = X.columns[X.isnull().any()].tolist()
test_missing = X_test.columns[X_test.isnull().any()].tolist()
cols_with_missing = list(set(X_missing).union(set(test_missing)))
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

# Hold out 20% of the training rows as a validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
# X_train_numeric = X_train.select_dtypes(exclude=['object'])
# X_valid_numeric = X_valid.select_dtypes(exclude=['object'])

# # Use imputer to handle missing data for numeric data
# my_imputer = SimpleImputer()
# imp_X_train_numeric = pd.DataFrame(my_imputer.fit_transform(X_train_numeric))
# imp_X_valid_numeric = pd.DataFrame(my_imputer.transform(X_valid_numeric))
# imp_X_train_numeric.columns = X_train_numeric.columns
# imp_X_valid_numeric.columns = X_valid_numeric.columns


# X_train_categoric = X_train.select_dtypes(include=['object'])
# X_valid_categoric = X_valid.select_dtypes(include=['object'])
# categoric_without_missing = [col for col in X_train_categoric.columns if not X_train_categoric[col].isnull().any()]
# #print(categoric_without_missing)
# X_train_categoric = X_train_categoric[categoric_without_missing]
# X_valid_categoric = X_valid_categoric[categoric_without_missing]
# #print(X_train_categoric.head())
# #potential improvement: drop cols with too many missing values
# X_train = pd.concat([imp_X_train_numeric, X_train_categoric], axis=1)
# X_valid = pd.concat([imp_X_valid_numeric, X_valid_categoric], axis=1)
# X_train_categoric

In [2]:
# One list per line: gaps in train, gaps in test, and their union
print(X_missing, test_missing, cols_with_missing, sep='\n')

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']
['MiscFeature', 'BsmtFullBath', 'TotalBsmtSF', 'MasVnrType', 'SaleType', 'Fence', 'BsmtFinType2', 'BsmtCond', 'Functional', 'Utilities', 'MSZoning', 'PoolQC', 'MasVnrArea', 'GarageType', 'GarageCars', 'BsmtUnfSF', 'BsmtExposure', 'GarageCond', 'BsmtFinSF1', 'KitchenQual', 'Garage

In [4]:
# Names of the test-set columns that still contain at least one missing value
a = X_test.columns[X_test.isnull().any()].tolist()
a

[]

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a random forest.

    A RandomForestRegressor with 100 trees and random_state=0 is fitted on
    (X_train, y_train); its predictions for X_valid are then scored against
    y_valid with mean_absolute_error.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

Approach 1: Drop columns with categorical data

In [6]:
# Boolean mask over the columns: True where the column holds object (text) data
s = X_train.dtypes.eq('object')
object_cols = s.index[s].tolist()

# Score the model on the numeric columns only
drop_X_train = X_train.drop(columns=object_cols)
drop_X_valid = X_valid.drop(columns=object_cols)
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
18934.51899543379


Approach 2: Ordinal encoding

In [7]:
# Categorical (object-dtype) columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# A column can be safely ordinal encoded only when every value that appears in
# the validation rows also appears in the training rows
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset.
# Built with a comprehension instead of a set difference so the list follows
# the DataFrame's column order and prints identically on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'PavedDrive', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Condition2', 'RoofMatl']


In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Leave out the categorical columns that cannot be encoded consistently
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# Learn the category -> number mapping from the training rows only, then
# apply that same mapping to both the training and the validation rows
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train[good_label_cols])
label_X_train[good_label_cols] = ordinal_encoder.transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

In [9]:
# Report the validation MAE of the ordinal-encoded data
print("MAE from Approach 2 (Ordinal Encoding):")
ordinal_mae = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
print(ordinal_mae)

MAE from Approach 2 (Ordinal Encoding):
18222.102716894977


Approach 3: One-hot encoding

In [10]:
# Count the distinct values found in each categorical column
object_nunique = [X_train[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

# Show (column, count) pairs from the fewest to the most distinct values
sorted(d.items(), key=lambda pair: pair[1])

[('Street', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('Neighborhood', 25)]

In [11]:
# Columns that will be one-hot encoded: fewer than 10 distinct training values
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset.
# Built with a comprehension instead of a set difference so the list keeps the
# order of object_cols and prints identically on every run.
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

Categorical columns that will be one-hot encoded: ['Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'PavedDrive', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Neighborhood']


In [12]:
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns a dense array and ignores unseen categories.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old name, so try the new keyword first and fall back for older
# installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Encode the low-cardinality columns (fit on the training rows only) and keep
# the original row labels so the frames line up when concatenated below
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]),
                             index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]),
                             index=X_valid.index)

# Remove every categorical column; the one-hot columns replace the encoded ones
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns are labelled 0..k-1 (ints) while the numeric columns have
# string names; recent scikit-learn rejects mixed name types, so use strings
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):
18550.40561643836


In [16]:
# Use approach 3 as final strategy: refit the one-hot encoder and train a new
# RF model on ALL the training data, then predict the test set.
# Note: this refits OH_encoder, replacing the fit on X_train.
OH_cols_full = pd.DataFrame(OH_encoder.fit_transform(X[low_cardinality_cols]),
                            index=X.index)
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]),
                            index=X_test.index)

# Remove every categorical column; the one-hot columns replace the encoded ones
num_X_full = X.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

OH_X_full = pd.concat([num_X_full, OH_cols_full], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# The one-hot columns are labelled 0..k-1 (ints) while the numeric columns have
# string names; recent scikit-learn rejects mixed name types, so use strings
OH_X_full.columns = OH_X_full.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

# build the model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_full, y)
test_preds = model.predict(OH_X_test)

# Write the submission file: one row per test house Id with its predicted price
output = pd.DataFrame({'Id':X_test.index, 'SalePrice':test_preds})
output.to_csv('one_hot_sub.csv', index=False)

# Upload one_hot_sub.csv to the home-data-for-ml-course competition using the kaggle command-line tool
!kaggle competitions submit -c home-data-for-ml-course -f one_hot_sub.csv -m "Message"

100%|██████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 32.3kB/s]
Successfully submitted to Housing Prices Competition for Kaggle Learn Users