# Importing Data

In [1]:
import pandas as pd

# Load the training and test sets from the current working directory, using
# the 'Id' column as the row index of each frame. Both paths use the same
# './<name>.csv' form so the two files are read from the same location.
train = pd.read_csv('./train.csv', index_col = 'Id')
test = pd.read_csv('./test.csv', index_col = 'Id')

# Finding and Dropping Columns with Many Missing Values

In [2]:
# (rows, columns) of the training frame as loaded, before any columns or rows are dropped.
train.shape

(1460, 80)

In [3]:
# Count missing values per column and show the ten columns with the most, largest count first.
train.isnull().sum().sort_values(ascending = False).head(10)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
dtype: int64

In [4]:
# Rank the columns by how many values they are missing and keep the names of
# the six with the highest counts.
missing_counts = train.isnull().sum()
cols_to_drop_na = missing_counts.sort_values(ascending = False).head(6).index.tolist()

# Remove the same columns from both frames so train and test keep matching features.
train = train.drop(columns = cols_to_drop_na)
test = test.drop(columns = cols_to_drop_na)
train.shape

(1460, 74)

# Removing Outliers

In [5]:
# Print the 99th percentile of each column that is screened for outliers;
# the drop thresholds used in the following cells are chosen relative to these.
for col in ('SalePrice', 'GrLivArea', 'LotArea'):
    print(f"{col} 99%: ", train[col].quantile(0.99))

SalePrice 99%:  442567.01000000053
GrLivArea 99%:  3123.4800000000023
LotArea 99%:  37567.64000000021


Sale price outliers

In [6]:
# Flag rows whose sale price exceeds 450000 (slightly above the 99th
# percentile printed by the previous cell) and remove them from train.
is_price_outlier = train['SalePrice'] > 450000
rows_to_drop = train.loc[is_price_outlier].index
train = train.drop(index = rows_to_drop)
train.shape

(1446, 74)

Ground living area outliers

In [7]:
# Flag rows whose ground living area exceeds 3200 (slightly above the 99th
# percentile printed earlier) and remove them from train.
is_living_area_outlier = train['GrLivArea'] > 3200
rows_to_drop = train.loc[is_living_area_outlier].index
train = train.drop(index = rows_to_drop)
train.shape

(1438, 74)

Lot area outliers

In [8]:
# Flag rows whose lot area exceeds 38000 (slightly above the 99th
# percentile printed earlier) and remove them from train.
is_lot_area_outlier = train['LotArea'] > 38000
rows_to_drop = train.loc[is_lot_area_outlier].index
train = train.drop(index = rows_to_drop)
train.shape

(1426, 74)

# Preprocessing

In [9]:
# Split off the target: pop() removes the 'SalePrice' column from train and
# returns it, so train holds only features afterwards. Re-running this cell
# without reloading train raises a KeyError because the column is already gone.
y = train.pop('SalePrice')

In [10]:
# Partition the feature columns by dtype: 'object' columns are treated as
# categorical, 'int64' / 'float64' columns as numerical.
categorical_cols = []
numerical_cols = []
for cname, col_dtype in train.dtypes.items():
    if col_dtype == 'object':
        categorical_cols.append(cname)
    elif col_dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

# Keep only the recognised columns, categorical first, in the same order for
# both frames.
my_cols = categorical_cols + numerical_cols
train = train.loc[:, my_cols]
test = test.loc[:, my_cols]

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# Numerical features: replace missing entries with the constant 0.
numerical_transformer = SimpleImputer(strategy = 'constant', fill_value = 0)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown = 'ignore' keeps categories that were not seen
# during fitting from raising an error at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore')),
]
categorical_transformer = Pipeline(steps = categorical_steps)

# Route each group of columns through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers = column_transformers)

# Modeling

Finding the best model

In [12]:
from sklearn.model_selection import cross_val_score

def score_model(pipeline):
    """Cross-validate `pipeline` and print its mean absolute error.

    Runs 5-fold cross-validation on the notebook-level `train` features and
    `y` target, then prints the per-fold MAE values, the lowest fold MAE and
    the average MAE across folds. Nothing is returned.
    """
    neg_mae_per_fold = cross_val_score(pipeline, train, y,
                                       scoring = 'neg_mean_absolute_error',
                                       cv = 5)
    # The scorer yields negated errors; multiply by -1 to get positive MAE values.
    scores = -1 * neg_mae_per_fold
    print(scores)
    print('Best MAE: ', scores.min())
    print('Average MAE: ', scores.mean())

In [13]:
# Candidate 1: random forest with default settings and a fixed seed,
# chained after the shared preprocessing step.
rf = RandomForestRegressor(random_state = 0)
rf_steps = [('pre', preprocessor), ('rf', rf)]
rf_pipeline = Pipeline(steps = rf_steps)

score_model(rf_pipeline)

[17127.42765734 15454.99659649 15511.67663158 14871.62768421
 16155.02070175]
Best MAE:  14871.627684210529
Average MAE:  15824.149854275549


In [14]:
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRFRegressor

# Candidate 2: gradient boosting with default settings and a fixed seed,
# chained after the shared preprocessing step.
gbr = GradientBoostingRegressor(random_state = 0)
gbr_steps = [('pre', preprocessor), ('gbr', gbr)]
gbr_pipeline = Pipeline(steps = gbr_steps)

score_model(gbr_pipeline)

[14900.43355615 14563.60050176 14737.45512718 13682.99811779
 15103.91626968]
Best MAE:  13682.998117791432
Average MAE:  14597.680714511876


In [15]:
# Candidate 3: XGBoost random forest regressor with default settings and a
# fixed seed, chained after the shared preprocessing step.
xgbrf = XGBRFRegressor(random_state = 0)
xgbrf_steps = [('pre', preprocessor), ('xgbrf', xgbrf)]
xgbrf_pipeline = Pipeline(steps = xgbrf_steps)

score_model(xgbrf_pipeline)

[18102.1873361  17671.68289474 18062.70208333 17327.90633224
 17584.94180373]
Best MAE:  17327.906332236842
Average MAE:  17749.8840900273


Tuning the hyperparameters of the gradient boosting regressor

In [18]:
# Gradient boosting with hand-picked hyperparameters: 800 estimators, each
# fitted on a 0.3 subsample, with a learning rate of 0.03.
gbr_params = {
    'random_state': 0,
    'n_estimators': 800,
    'subsample': 0.3,
    'learning_rate': 0.03,
}
gbr = GradientBoostingRegressor(**gbr_params)

# Rebuild the pipeline so it uses the re-configured regressor.
gbr_steps = [('pre', preprocessor), ('gbr', gbr)]
gbr_pipeline = Pipeline(steps = gbr_steps)

score_model(gbr_pipeline)

[12860.30858119 13060.86178804 13596.37958971 12100.93633395
 13463.17933801]
Best MAE:  12100.936333951451
Average MAE:  13016.333126181187


# Submission

In [19]:
# Fit the gradient boosting pipeline on all remaining training rows and
# predict a sale price for every row of the test set.
predictions = gbr_pipeline.fit(train, y).predict(test)

# Pair each test Id with its prediction and write the result to
# 'submission.csv' without an extra index column.
submission_columns = {'Id' : test.index,
                      'SalePrice' : predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index = False)