# Prepare Submission

## 1. Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.model_selection import GridSearchCV

import warnings
# Silence every UserWarning for the rest of the session.
# NOTE(review): this filter is broad — it also hides UserWarnings emitted by the
# libraries used below (e.g. during preprocessing); consider narrowing it.
warnings.filterwarnings('ignore', category=UserWarning)

In [3]:
# Read the training data (the file contains the SalePrice target column)
# and preview the first rows.
train_csv = './data/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Placeholder loop over the non-numeric columns. As written it does nothing;
# uncomment the print to inspect the category frequencies of each column.
for col in df.select_dtypes(exclude="number").columns:
    #print(df[col].value_counts())
    pass

## 2. Train-test split

In [6]:
# Define ids, target (y) and feature matrix (X) without mutating `df`.
# Selecting/dropping columns (instead of df.pop) keeps this cell idempotent:
# df.pop removes the column from `df`, so a second run raises KeyError.
# NOTE: `id` shadows the Python builtin; the name is kept because later cells
# assign and read a variable called `id`.
id = df['Id'].copy()
y = df['SalePrice'].copy()
X = df.drop(columns=['Id', 'SalePrice'])

In [7]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [8]:
#X_train.info()
#y_test.info()

## 3. Build Pipelines

### 3.1 Define preprocessing steps

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
# Column names by dtype, taken from the full feature frame X
# (non-numeric -> categorical pipeline, numeric -> numerical pipeline).
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Numerical pipeline: impute missing values with SimpleImputer's default
# strategy (the mean), then rescale every feature with MinMaxScaler.
numeric_pipe = Pipeline(
    steps=[('num_imputer', SimpleImputer()), 
           ('num_scaler', MinMaxScaler())
          ])

# Categorical pipeline: replace missing values with the constant "N_A", then
# one-hot encode (dense output, first category of each feature dropped).
# NOTE(review): with drop='first' and handle_unknown='ignore' together, a
# category unseen during fit is encoded as all zeros, i.e. the same as the
# dropped first category — confirm this is intended.
categoric_pipe = Pipeline(
    steps=[('cat_imputer', SimpleImputer(strategy='constant', 
                                         fill_value='N_A')), 
           ('cat_encoder', OneHotEncoder(drop='first', 
                                         sparse_output=False, 
                                         handle_unknown='ignore'))
           # alternative encoder: ('cat_encoder', OrdinalEncoder())
          ])

In [11]:
# Route each column group through its own pipeline and have the transformer
# return pandas DataFrames instead of arrays.
column_steps = [
    ('num', numeric_pipe, X_num_columns),
    ('cat', categoric_pipe, X_cat_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps)
preprocessor = preprocessor.set_output(transform='pandas')

In [12]:
# Display the assembled preprocessing transformer.
preprocessor

### 3.2 Define regressors to test

In [14]:
from sklearn.preprocessing import StandardScaler#, OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.feature_selection import SelectKBest, f_regression, RFECV, SelectFromModel
import xgboost as xgb

In [16]:
# (name, estimator) pairs to tune. Only XGBoost is active; the commented-out
# entries are alternatives that can be re-enabled. Each name is later used as
# the key into the hyperparameter grid dictionaries, so it must exist there.
regressors = [
    #('RandForest', RandomForestRegressor()),
    #('GradBoost', GradientBoostingRegressor()),
    #('AdaBoost', AdaBoostRegressor()),
    #('ExtraTrees', ExtraTreesRegressor()),
    ('XGB', xgb.XGBRegressor())
]

### 3.3 Create a pipeline for each regressor

In [17]:
# One preprocessing + model pipeline per regressor, keyed by regressor name.
pipelines = {}
for name, reg in regressors:
    print(reg)  # echo the estimator configuration
    steps = [('preprocessor', preprocessor), ('regressor', reg)]
    pipelines[name] = Pipeline(steps=steps)


XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)


In [18]:
#pipelines['RandForest']

## 4. Exploratory hyperparameter search

In [19]:
from sklearn.model_selection import GridSearchCV

### 4.1 Define exploratory hyperparameters

In [21]:
# Coarse hyperparameter grids for the exploratory search, keyed by model name.
# Parameter names carry the 'regressor__' prefix so they address the
# 'regressor' step of each pipeline.
param_grids_explore = {
    'DecisionTree': dict(
        regressor__max_depth=range(2, 50, 10),
        regressor__min_samples_leaf=range(3, 15, 3),
        regressor__min_samples_split=[2, 5, 10],
    ),
    'RandForest': dict(
        regressor__max_depth=[2, 5, 10, 20, 50],
        regressor__min_samples_leaf=[2, 5, 10, 25],
        regressor__min_samples_split=[2, 5, 10, 25],
    ),
    'GradBoost': dict(
        regressor__n_estimators=[50, 100, 150],
        regressor__learning_rate=[0.01, 0.1, 0.2],
        regressor__max_depth=[3, 5, 7, 10],
    ),
    'AdaBoost': dict(
        regressor__n_estimators=[50, 100, 150],
        regressor__learning_rate=[0.01, 0.1, 0.2],
        regressor__loss=['linear', 'square', 'exponential'],
    ),
    'ExtraTrees': dict(
        regressor__n_estimators=[50, 100, 150],
        regressor__max_depth=[3, 5, 7, 10],
        regressor__min_samples_split=[2, 5, 10],
    ),
    'XGB': dict(
        regressor__n_estimators=[50, 100, 150],
        regressor__learning_rate=[0.01, 0.1, 0.2],
        regressor__max_depth=[3, 5, 7, 10],
    ),
}


### 4.2 Fit and evaluate models

In [22]:
# Fit and evaluate each pipeline
scores_explore = {}
params_explore = {}
estimator_explore = {}

fit_models = True # for turning off costly exploratory fitting

for name in pipelines:
    print(f"\nTraining and evaluating {name}...")
    pipe = pipelines[name]
    grid_search = GridSearchCV(pipe, 
                               param_grids_explore[name], 
                               cv=5, 
                               #scoring='neg_mean_absolute_error',
                               scoring='neg_mean_squared_error',
                               #scoring='r2',
                               verbose=1, n_jobs=-1)
    
    if fit_models == True:
        grid_search.fit(X_train, y_train)
        
        #print("Best parameters found:")
        #print(grid_search.best_params_)
        
        scores_explore[name] = grid_search.best_score_
        params_explore[name] = grid_search.best_params_
        estimator_explore[name] = grid_search


Training and evaluating XGB...
Fitting 5 folds for each of 36 candidates, totalling 180 fits




In [23]:
# Best cross-validated score per model from the exploratory search
# (scoring is negated MSE, so values closer to 0 are better).
scores_explore

{'XGB': -853247806.6551933}

In [24]:
# Hyperparameter combination that achieved the best exploratory score, per model.
params_explore

{'XGB': {'regressor__learning_rate': 0.1,
  'regressor__max_depth': 3,
  'regressor__n_estimators': 150}}

In [25]:
from sklearn.metrics import r2_score

# Evaluate every fitted exploratory search on the held-out split.
# The loop runs over estimator_explore (the searches that were actually fitted)
# rather than over pipelines, so it does not raise KeyError when fitting was
# switched off via fit_models.
for name, search in estimator_explore.items():
    print(name)

    # search.score applies the search's scoring metric (negated MSE in this
    # notebook), so values closer to 0 are better.
    score = search.score(X_test, y_test)
    print("Best parameters:", search.best_params_)
    print("Best score on test data:", score)

    # A large gap between train and test R^2 indicates overfitting.
    perf_train = r2_score(y_true=y_train, y_pred=search.predict(X_train))
    perf_test = r2_score(y_true=y_test, y_pred=search.predict(X_test))
    print('train test r2:', perf_train, perf_test)

XGB
Best parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 150}
Best score on test data: -935896747.806171
train test r2: 0.974677290366537 0.8644776888168015


## 5. Refined hyperparameter search

### 5.1 Define refined hyperparameters

In [26]:
# Narrowed hyperparameter grids for the refined search, keyed by model name.
# Parameter names carry the 'regressor__' prefix so they address the
# 'regressor' step of each pipeline.
param_grids_refine = {
    'DecisionTree': {
        # 'selector__k' stays disabled: the pipelines have no 'selector' step,
        # so an active entry would make the grid search reject the parameter.
        #'selector__k': range(2, 40),
        'regressor__max_depth': range(2, 50, 10),
        'regressor__min_samples_leaf': range(3, 15, 3),
        'regressor__min_samples_split': [2, 5, 10],
    },
    'RandForest': {
        #'preprocessor__num__num_imputer__strategy': ['mean'],# 'median'],
        #'selector__k': range(28, 36, 1),
        'regressor__max_depth': range(30, 61, 5),
        'regressor__min_samples_leaf': range(2, 7, 1),
        # range(2, 7, 1), not the list [2, 7, 1]: an integer min_samples_split
        # must be at least 2, so the value 1 is invalid.
        'regressor__min_samples_split': range(2, 7, 1),
    },
    'GradBoost': {
        #'selector__k': range(2, 40),
        'regressor__n_estimators': [150, 200],
        'regressor__learning_rate': [0.1],
        'regressor__max_depth': [3, 4]
    },
    'AdaBoost': {
        #'selector__k': range(2, 40),
        'regressor__n_estimators': [150, 200],
        'regressor__learning_rate': [0.2, 0.3, 0.5],
        'regressor__loss': ['linear']
    },
    'ExtraTrees': {
        #'selector__k': range(2, 40),
        'regressor__n_estimators': [90, 100, 110],
        'regressor__max_depth': [9, 10, 11, 15],
        'regressor__min_samples_split': [4, 5, 6]
    },
    'XGB': {
        #'selector__k': range(2, 40),
        'regressor__n_estimators': [150, 200],
        'regressor__learning_rate': [0.08, 0.1, 0.12],
        'regressor__max_depth': [3, 4]
    }
}


### 5.2 Fit and evaluate models

In [27]:
# Fit and evaluate each pipeline
scores_refine = {}
params_refine = {}
estimator_refine = {}

fit_models = True # for turning off costly exploratory fitting

for name in pipelines:
    print(f"\nTraining and evaluating {name}...")
    pipe = pipelines[name]
    grid_search = GridSearchCV(pipe, 
                               param_grids_refine[name], 
                               cv=5, 
                               #scoring='neg_mean_absolute_error',
                               scoring='neg_mean_squared_error',
                               #scoring='r2',
                               verbose=1, n_jobs=-1)
    
    if fit_models == True:
        grid_search.fit(X_train, y_train)
        
        #print("Best parameters found:")
        #print(grid_search.best_params_)
        
        scores_refine[name] = grid_search.best_score_
        params_refine[name] = grid_search.best_params_
        estimator_refine[name] = grid_search


Training and evaluating XGB...
Fitting 5 folds for each of 12 candidates, totalling 60 fits




In [28]:
# Best cross-validated score per model from the refined search
# (scoring is negated MSE, so values closer to 0 are better).
scores_refine

{'XGB': -822385997.7426722}

In [29]:
# Hyperparameter combination that achieved the best refined score, per model.
params_refine

{'XGB': {'regressor__learning_rate': 0.12,
  'regressor__max_depth': 4,
  'regressor__n_estimators': 200}}

In [30]:
from sklearn.metrics import r2_score

# Evaluate every fitted refined search on the held-out split.
# The loop runs over estimator_refine (the searches that were actually fitted)
# rather than over pipelines, so it does not raise KeyError when fitting was
# switched off via fit_models.
for name, search in estimator_refine.items():
    print(name)

    # search.score applies the search's scoring metric (negated MSE in this
    # notebook), so values closer to 0 are better.
    score = search.score(X_test, y_test)
    print("Best parameters:", search.best_params_)
    print("Best score on test data:", score)

    # A large gap between train and test R^2 indicates overfitting.
    perf_train = r2_score(y_true=y_train, y_pred=search.predict(X_train))
    perf_test = r2_score(y_true=y_test, y_pred=search.predict(X_test))
    print('train test r2:', perf_train, perf_test)

XGB
Best parameters: {'regressor__learning_rate': 0.12, 'regressor__max_depth': 4, 'regressor__n_estimators': 200}
Best score on test data: -1054103867.6772674
train test r2: 0.9930367684736017 0.847360734280105


## 6. Run best model with production data

In [31]:
# load production/comp data
comp = pd.read_csv('./data/house-prices-advanced-regression-techniques/test.csv')
comp.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [32]:
#comp.info()

In [33]:
# Separate the Id column from the competition features.
# The guard keeps the cell re-runnable: after the first run `comp` no longer
# has an 'Id' column, and an unguarded pop would raise KeyError.
# NOTE: `id` shadows the Python builtin; the name is kept because the
# submission cell reads it.
if 'Id' in comp.columns:
    id = comp.pop('Id')

In [34]:
import os

# Write one submission file per fitted refined search.
# The output directory is created up front because to_csv does not create
# missing parent directories.
submission_dir = './submission/regr'
os.makedirs(submission_dir, exist_ok=True)

for n in estimator_refine:
    est = estimator_refine[n]
    submission_pred = list(est.predict(comp))

    # Build the submission frame with the two columns written below:
    # "Id" and "SalePrice".
    submission_df = pd.DataFrame()
    submission_df['Id'] = id
    submission_df['SalePrice'] = submission_pred

    submission_df.to_csv(f'{submission_dir}/{n}.csv', index=False)
