In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt

In [16]:
# Read the raw test split with datatable and hand it over to pandas.
test_df = (
    dt.fread('dataset/test.csv')
    .to_pandas()
)

In [93]:
# Add a constant placeholder target (-1) to every row of the test frame.
# The column is written in place on test_df, so it is also present in any
# later copy of test_df.
# NOTE(review): presumably added so test_df has the same columns as train_df
# before the two are concatenated -- confirm.
test_df['SalePrice'] = -1

In [94]:
# Read the raw training split with datatable and hand it over to pandas.
train_df = (
    dt.fread('dataset/train.csv')
    .to_pandas()
)

In [95]:
# Stack the training rows on top of the test rows. The original row labels of
# both frames are kept (no ignore_index), so labels can repeat.
# NOTE(review): when the notebook is run top to bottom, the next cell rebinds
# `dataset` to a copy of test_df, so this combined frame is not used further.
dataset = pd.concat([train_df, test_df], axis='index')

In [39]:
# Work on an independent deep copy so the pre-processing below never touches
# the raw test_df frame.
dataset = test_df.copy(deep=True)

In [40]:
# PRE-PROCESSING

# Categorical columns whose missing entries are replaced by the literal
# string 'None'.
na_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
           'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Fence',
           'MiscFeature', 'PoolQC']
dataset[na_cols] = dataset[na_cols].fillna('None')

# Numeric columns whose missing entries are replaced by 0.
nan_cols = ['TotalBsmtSF', 'GarageArea', 'MasVnrArea']
dataset[nan_cols] = dataset[nan_cols].fillna(0)

# Ordinal encoding of the quality/condition ratings.
# Any label that is not a key of qual_map -- including the 'None' filler
# written above and any value that is still missing -- maps to NaN first and
# is then turned into 0 by the trailing fillna(0).
qual_map = {'Ex': 5,'Gd': 4,'TA': 3,'Fa': 2,'Po': 1, 'NA': 0}
qual_cols = ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'HeatingQC', 'GarageQual', 'GarageCond',
             'ExterQual', 'ExterCond', 'KitchenQual', 'PoolQC']
dataset[qual_cols] = dataset[qual_cols].apply(
    lambda ratings: ratings.map(qual_map).fillna(0)
)

In [41]:
# Numeric features kept for the model. The list order is the column order of
# the resulting frame. The quality/condition columns listed here were already
# converted to numbers by the pre-processing cell.
numerical_cols = [
    'LotArea', 'LotFrontage', 'OverallQual', 'OverallCond',
    'YearRemodAdd', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
    'TotalBsmtSF', 'HeatingQC', 'GrLivArea', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageArea', 'GarageQual', 'GarageCond',
    'GarageYrBlt', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'BsmtUnfSF',
    'GarageCars',
]

# Drop every other column; only the numeric features above remain.
dataset = dataset.loc[:, numerical_cols]

In [42]:
# Columns to one-hot encode. get_dummies only expands string/categorical
# columns; a column with a numeric dtype passes through unchanged.
categorical_cols = [
    'MSSubClass', 'MSZoning', 'Alley', 'Street', 'LotConfig',
    'LotShape', 'LandContour', 'LandSlope', 'Neighborhood', 'BldgType',
    'RoofStyle', 'Foundation', 'CentralAir', 'GarageType', 'GarageFinish',
]

# The categorical values are read from test_df because `dataset` was reduced
# to the numeric features in the previous cell.
# NOTE(review): test_df does not carry the 'None' fill applied to `dataset`,
# so missing values in Alley / GarageType / GarageFinish get no indicator
# column here -- confirm the training notebook encodes them the same way.
ohe = pd.get_dummies(test_df.loc[:, categorical_cols])

# Attach the indicator columns row by row (index-aligned left join).
dataset = dataset.join(ohe, how='left')

# Snapshot of the feature names at this point, used by later cells.
test_cols = list(dataset.columns)

In [35]:
import joblib
# Load the persisted collection of feature column names; the next cell compares
# it against test_cols and uses its entries as column keys of `dataset`.
# NOTE(review): presumably written by the training notebook -- confirm.
# joblib.load unpickles arbitrary objects, so only load files you trust.
training_cols = joblib.load('models/training_cols.pkl')

In [37]:
# Align the inference frame with the feature layout loaded into training_cols.
# reindex does three things in one step:
#   * adds every training column that is absent here, filled with 0
#     (e.g. a dummy level that never occurs in this data),
#   * drops columns that are not part of training_cols,
#   * puts the columns in the order given by training_cols.
# The previous loop only appended the missing columns at the end, which left
# the column order and the cached test_cols list out of step with `dataset`.
# NOTE(review): assumes training_cols is the complete, ordered list of feature
# names the scaler and model were fitted on -- confirm against the training
# notebook.
dataset = dataset.reindex(columns=training_cols, fill_value=0)

# Keep the cached column list in sync with the frame; the scaling cell uses it
# to label its output.
test_cols = dataset.columns.to_list()

In [43]:
# MinMaxScaler is not referenced by name in this cell; `metrics` is used by
# the evaluation cell further down.
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

# Load the persisted, already-fitted scaler.
# joblib.load unpickles arbitrary objects, so only load files you trust.
scaler = joblib.load("models/scaler.pkl")

# transform() returns a bare array, so wrap it in a DataFrame again. The
# labels are taken from the frame that was actually transformed rather than
# from the test_cols snapshot, which can be stale once columns have been added
# to `dataset` and would then raise a length-mismatch error.
scaled_df = pd.DataFrame(scaler.transform(dataset), columns=dataset.columns)

In [44]:
# Load the persisted regressor (the file name suggests an XGBoost model --
# confirm) and predict one value per row of the scaled feature frame.
# joblib.load unpickles arbitrary objects, so only load files you trust.
regressor = joblib.load("models/xgb.pkl")
y_pred = regressor.predict(scaled_df)

In [45]:
# Reference prices used for the comparison below, as a plain NumPy array.
# NOTE(review): these are read from submission_7.csv, which by its name looks
# like an earlier submission rather than true sale prices -- if so, the score
# below measures agreement with that submission, not accuracy.
y_test = (
    dt.fread("dataset/submission_7.csv")
    .to_pandas()['SalePrice']
    .to_numpy()
)

In [46]:
# Root mean squared error between the natural logs of the reference prices
# and of the predictions (i.e. the error is measured on the log scale).
print(
    'Root Mean Squared Error:',
    np.sqrt(
        metrics.mean_squared_error(y_true=np.log(y_test), y_pred=np.log(y_pred))
    ),
)

Root Mean Squared Error: 0.07854693819536568


In [47]:
# Pair each test Id with its predicted price and write the submission file
# without the DataFrame index.
output = pd.DataFrame(
    {
        'Id': test_df['Id'],
        'SalePrice': y_pred,
    }
)
output.to_csv('dataset/submission_9.csv', index=False)

In [84]:
# Show the current feature names of `dataset` as a plain list.
dataset.columns.tolist()

['LotArea',
 'LotFrontage',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'TotalBsmtSF',
 'HeatingQC',
 'GrLivArea',
 'TotRmsAbvGrd',
 'FullBath',
 'Fireplaces',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'MSSubClass',
 'MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'Alley_Grvl',
 'Alley_Pave',
 'Street_Grvl',
 'Street_Pave',
 'LotConfig_Corner',
 'LotConfig_CulDSac',
 'LotConfig_FR2',
 'LotConfig_FR3',
 'LotConfig_Inside',
 'LotShape_IR1',
 'LotShape_IR2',
 'LotShape_IR3',
 'LotShape_Reg',
 'LandContour_Bnk',
 'LandContour_HLS',
 'LandContour_Low',
 'LandContour_Lvl',
 'LandSlope_Gtl',
 'LandSlope_Mod',
 'LandSlope_Sev',
 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert'