In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# Read the training set from disk.
filename = './data/train.csv'
home_data = pd.read_csv(filename)

# Columns fed to the model, in the order they appear in the feature matrix.
# The same list is reused when scoring the submission file, so the column
# order here must stay in sync with whatever the model was fitted on.
# NOTE(review): presumably all of these are numeric and free of missing
# values, since they are passed to the estimator as a raw array — confirm
# against the CSV.
features = [
    'MSSubClass',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold',
]

# Plain NumPy arrays so the cross-validation loop can slice rows by
# integer position.
X = home_data[features].values
y = home_data.SalePrice.values

# Estimate out-of-sample error with shuffled 4-fold cross-validation.
# Both the splitter and the forest are seeded so the reported numbers are
# reproducible on a re-run.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# One mean-absolute-error value per fold, in fold order.
cv_mae = list()

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh model per fold so nothing learned on one split leaks into the next.
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    cv_mae.append(mean_absolute_error(y_test, preds))

    print("Test MAE for fold {:,}: {:,.0f}".format(fold, cv_mae[-1]))

# Summarise across folds; np.std uses its default ddof=0 here.
cv_mae_arr = np.array(cv_mae)
print(f'\nCross Validation MAE results: {cv_mae_arr.mean():,.0f} +/- {cv_mae_arr.std():,.0f}')

Test MAE for fold 1: 19,213
Test MAE for fold 2: 20,193
Test MAE for fold 3: 19,617
Test MAE for fold 4: 17,386

Cross Validation MAE results: 19,103 +/- 1,050


In [2]:
# -------------------------------------------------- #
# Creation of submission file
# -------------------------------------------------- #

# Load the rows that need a predicted SalePrice.
filename = './data/test.csv'
submission_data = pd.read_csv(filename)

# Refit on the full X / y arrays (no hold-out), with the same seed that was
# used for the per-fold models.
model = RandomForestRegressor(random_state=42).fit(X, y)

# Score the same feature columns, in the same order, as the fitted model saw.
submission_matrix = submission_data[features].values
preds = model.predict(submission_matrix)

# Two-column frame: Id copied from the input file, SalePrice from the model.
output = submission_data[['Id']].assign(SalePrice=preds)

# index=False keeps the row index out of the written file.
output.to_csv('./data/submission.csv', index=False)