In [17]:
import pandas as pd
from sklearn.metrics import mean_squared_log_error

In [2]:
train_data = pd.read_csv('housing-prices-competition/train.csv', index_col='Id')
test_data = pd.read_csv('housing-prices-competition/test.csv', index_col='Id')

In [3]:
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = train_data['SalePrice']
train_data.drop(['SalePrice'], axis=1, inplace=True)

In [4]:
# Separate numerical from categorical
numerical_cols = [col for col in train_data.columns
                 if train_data[col].dtype in ['int64', 'float64']]

# Get categorical cols with less than 10 unique values
categorical_cols = [col for col in train_data.columns
                   if train_data[col].dtype == 'object'
                   and train_data[col].nunique() < 10]

my_cols = numerical_cols + categorical_cols
X = train_data[my_cols].copy()
X_test = test_data[my_cols].copy()

In [5]:
X.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [6]:
# Create the Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Set up preprocessing
numerical_preprocessor = SimpleImputer(strategy='constant')

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ('numerical_preprocessor', numerical_preprocessor, numerical_cols),
    ('categorical_preprocessor', categorical_preprocessor, categorical_cols)
])

In [7]:
# Tune the model
model = RandomForestRegressor()
print(model.get_params().keys())

dict_keys(['bootstrap', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])


In [8]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=500, num=5)]
max_features = ['auto', 'sqrt']
max_depth= [int(x) for x in np.linspace(start=10, stop=110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'model__bootstrap': bootstrap,
    'model__max_depth': max_depth,
    'model__max_features': max_features,
    'model__min_samples_leaf': min_samples_leaf,
    'model__min_samples_split': min_samples_split,
    'model__n_estimators': n_estimators,
}

print(random_grid)

{'model__bootstrap': [True, False], 'model__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'model__max_features': ['auto', 'sqrt'], 'model__min_samples_leaf': [1, 2, 4], 'model__min_samples_split': [2, 5, 10], 'model__n_estimators': [100, 200, 300, 400, 500]}


In [9]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessors),
    ('model', model)
])

# pipeline.get_params().keys()

model_random_params = RandomizedSearchCV(estimator=pipeline, param_distributions=random_grid, n_iter=10, cv=3, verbose=False)
model_random_params.fit(X, y)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessor',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('numerical_preprocessor',
                                                                               SimpleImputer(add_indicator=False,
                                                                                             copy=True,
                                                                                             fill_value=None,
                                                             

In [10]:
model_random_params.best_params_

{'model__n_estimators': 200,
 'model__min_samples_split': 2,
 'model__min_samples_leaf': 2,
 'model__max_features': 'sqrt',
 'model__max_depth': 40,
 'model__bootstrap': False}

In [11]:
model_random_params.best_estimator_

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical_preprocessor',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                          

In [18]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    log_error = np.sqrt(mean_squared_log_error(test_labels, predictions))
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('Mean Sq Log Error = {:0.6f}%.'.format(log_error))
    
base_model = RandomForestRegressor(n_estimators=100, random_state=0)

base_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessors),
    ('model', base_model)
])
base_pipeline.fit(X, y)
base_accuracy = evaluate(base_pipeline, X, y)

Model Performance
Average Error: 6543.0913 degrees.
Accuracy = 96.20%.
Mean Sq Log Error = 0.059109%.


In [20]:
best_model_accuracy = evaluate(model_random_params.best_estimator_, X, y)

Model Performance
Average Error: 5222.2194 degrees.
Accuracy = 96.92%.
Mean Sq Log Error = 0.052051%.


In [21]:
# Try it on the test data:
test_predictions = model_random_params.predict(X_test)

In [22]:
# Output predictions
output = pd.DataFrame({'Id': X_test.index,
                     'SalePrice': test_predictions})
output.to_csv('submission.csv', index=False)