In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt

In [2]:
# Read the test split with datatable's fread and convert the result to a
# pandas DataFrame for the rest of the notebook.
test_df = (
    dt.fread('dataset/test.csv')
    .to_pandas()
)

In [27]:
# Work on a separate deep copy so that `test_df` keeps the values as loaded
# (later cells still read the raw frame).
dataset = test_df.copy(deep=True)

In [28]:
# PRE-PROCESSING
# Cleans `dataset` in three passes:
#   1. missing values in the listed categorical columns become the string 'None';
#   2. missing values in the listed area columns become 0;
#   3. quality/condition grades are converted to ordinal numbers.
# The cell is safe to re-run: columns that are already numeric are not mapped
# a second time.

# Replace NaNs with the sentinel string 'None' for categorical columns
na_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
           'FireplaceQu', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Fence',
           'MiscFeature', 'PoolQC']
for col in na_cols:
    dataset[col] = dataset[col].fillna('None')

# Replace NaNs with 0 for numeric area columns
nan_cols = ['TotalBsmtSF', 'GarageArea']
for col in nan_cols:
    dataset[col] = dataset[col].fillna(0)

# Convert quality grades to ordinals (Ex=5 ... Po=1).
# Any value that is not a key of `qual_map` -- including the 'None' sentinel
# written above and remaining NaNs -- maps to NaN and is then set to 0 by
# fillna(0).
qual_map = {'Ex': 5,'Gd': 4,'TA': 3,'Fa': 2,'Po': 1, 'NA': 0}
qual_cols = ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'HeatingQC', 'GarageQual', 'GarageCond',
             'ExterQual', 'ExterCond', 'KitchenQual', 'PoolQC']
for col in qual_cols:
    if pd.api.types.is_numeric_dtype(dataset[col]):
        # Already converted (the cell was run before). Mapping numbers through
        # `qual_map` would turn every grade into NaN and then 0, so only fill
        # any remaining gaps.
        dataset[col] = dataset[col].fillna(0)
    else:
        dataset[col] = dataset[col].map(qual_map).fillna(0)

In [29]:
# Numeric and ordinal features fed to the model as-is. From here on `dataset`
# holds only these columns; every other column is dropped.
numerical_cols = [
    'LotArea', 'OverallQual', 'OverallCond', 'YearRemodAdd',
    'BsmtQual', 'BsmtCond', 'TotalBsmtSF', 'GrLivArea',
    'TotRmsAbvGrd', 'FullBath', 'Fireplaces', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
]

dataset = dataset.loc[:, numerical_cols]

In [30]:
# One-hot encode the categorical features and append the indicator columns to
# the numeric features; `test_cols` records the final column order.
# NOTE(review): the dummies are built from the raw `test_df`, not from the
# cleaned `dataset`, so the 'None' fill applied to GarageType / GarageFinish
# during pre-processing does not reach this encoding. Confirm this matches how
# the scaler and model were fitted before changing it.
categorical_cols = [
    'MSSubClass', 'MSZoning', 'LotConfig', 'Neighborhood', 'BldgType',
    'Foundation', 'CentralAir', 'GarageType', 'GarageFinish',
]
ohe = pd.get_dummies(test_df.loc[:, categorical_cols])
dataset = dataset.join(ohe)
test_cols = list(dataset.columns)

In [31]:
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

# Load the persisted scaler and apply it to the assembled feature frame.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
scaler = joblib.load("models/scaler.pkl")

# transform() returns a plain array; wrap it back into a DataFrame that
# carries the feature names.
scaled_df = pd.DataFrame(scaler.transform(dataset), columns=test_cols)

In [65]:
# Load the persisted regressor and predict on the scaled features.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
regressor = joblib.load(
    "models/xgb.pkl"
)
y_pred = regressor.predict(
    scaled_df
)

In [68]:
# Reference SalePrice values used for the comparison in the next cell.
# NOTE(review): these are read from a file named like an earlier submission,
# so they are presumably previous predictions rather than ground truth --
# confirm before interpreting the error below as model accuracy.
y_test = (
    dt.fread("dataset/submission_5.csv")
    .to_pandas()['SalePrice']
    .values
)

In [69]:
# RMSE between the natural logs of the reference values and the predictions.
log_reference = np.log(y_test)
log_predicted = np.log(y_pred)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(log_reference, log_predicted)))

Root Mean Squared Error: 0.14614065236960047


In [70]:
# Pair each test Id with its predicted SalePrice and write the result as CSV
# without the DataFrame index.
output = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_pred,
})
output.to_csv('dataset/submission_7.csv', index=False)