## Hyperparameter Tuning

In [1]:
import pandas as pd
import numpy as np
import itertools
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor

from functions_variables import *


# Read the processed CSV tables from disk, in this order:
# city labels, training features/target, test features/target.
cities, X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/cities.csv',
        '../data/processed/X_train.csv',
        '../data/processed/y_train.csv',
        '../data/processed/X_test.csv',
        '../data/processed/y_test.csv',
    )
)



Setup for cross validation
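We need the city labels, but NOT the city-mean price computed earlier: that column was built from the whole training set, so it has to be recomputed inside each fold from that fold's training rows only.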

In [2]:
# Place the city labels, the features and the target side by side, then discard
# the previously computed city-mean column (it is rebuilt per fold by
# custom_cross_validation).
CV_df = pd.concat([cities, X_train, y_train], axis=1).drop(
    columns='description.sold_price_city_mean'
)

Custom CV Folds

In [3]:
def custom_cross_validation(train_df, n_splits=5, random_state=42):
    '''creates n_splits sets of training and validation folds, adding a
    per-city mean sold price computed from the training fold only

    For every split, the mean 'description.sold_price' per
    'location.address.city' is computed on the training fold and attached to
    both folds as 'description.sold_price_city_mean'. Rows whose city has no
    mean available (e.g. a city that only occurs in the validation fold) get
    the overall mean sold price of the *training fold*, so that no validation
    target leaks into the encoded feature. The city column is then dropped
    from both folds.

    Args:
      train_df: the dataframe of features and target to be divided into folds.
        Must contain the columns 'location.address.city' and
        'description.sold_price'. It is not modified.
      n_splits: the number of sets of folds to be created
      random_state: seed passed to the shuffled KFold splitter

    Returns:
      A tuple of lists, where the first index is a list of the training folds,
      and the second the corresponding validation fold

    Example:
        >>> output = custom_cross_validation(train_df, n_splits = 10)
        >>> output[0][0] # The first training fold
        >>> output[1][0] # The first validation fold
        >>> output[0][1] # The second training fold
        >>> output[1][1] # The second validation fold... etc.
    '''

    city_col = 'location.address.city'
    target_col = 'description.sold_price'
    city_mean_col = 'description.sold_price_city_mean'

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    training_folds = []
    validation_folds = []

    for train_index, val_index in kf.split(train_df):
        train_fold = train_df.iloc[train_index]
        val_fold = train_df.iloc[val_index]

        # Both statistics come from the training fold only; using the full
        # frame here would leak validation targets into the feature.
        city_mean_train = train_fold.groupby(city_col)[target_col].mean()
        fallback_mean = train_fold[target_col].mean()

        def _encode_city(fold):
            # Look up each row's city mean, fall back to the training-fold
            # mean for unknown cities, then drop the raw city label.
            city_means = fold[city_col].map(city_mean_train).fillna(fallback_mean)
            return fold.assign(**{city_mean_col: city_means}).drop(columns=[city_col])

        training_folds.append(_encode_city(train_fold))
        validation_folds.append(_encode_city(val_fold))

    return training_folds, validation_folds

In [4]:
# Build the train/validation fold pairs (default n_splits=5) used by the search below.
training_folds, validation_folds = custom_cross_validation(CV_df)

Example param grid (for XGBoost) 
- Can use default for some of these and/or use more values in others
- I picked a few important parameters for XGBoost, but there are a lot more in reality. We might consider tuning them one at a time, or using a more sophisticated approach (e.g. Bayesian optimization), but this will do for now.

In [5]:
# Candidate values for the grid search; every combination of them is evaluated.
param_grid_xgb = dict(
    max_depth=[3, 5, 7, 10, 15, 20, 25],  # depth of the trees, i.e. model complexity
    gamma=[0, 1, 3, 5, 7, 10],  # minimum loss reduction needed before a split is made
    learning_rate=[0.01, 0.05, 0.1],  # step size shrinkage, guards against overfitting
)


Check to make sure we didn't mess anything up

In [6]:
# Sanity check: report, fold by fold, whether any training row still holds a NaN.
for fold_number, fold_df in enumerate(training_folds, start=1):
    rows_with_nan = fold_df.isnull().any(axis=1)
    if not rows_with_nan.any():
        print(f"Training fold {fold_number} does not contain any NaN values.")
        continue
    print(f"Training fold {fold_number} contains NaN values.")
    print(fold_df[rows_with_nan])

Training fold 1 does not contain any NaN values.
Training fold 2 does not contain any NaN values.
Training fold 3 does not contain any NaN values.
Training fold 4 does not contain any NaN values.
Training fold 5 does not contain any NaN values.


Setup a custom hyperparameter search

In [7]:
def hyperparameter_search(training_folds, validation_folds, param_grid, model_cls=None):
    '''scores every combination of hyperparameter settings in the param grid
    on the given training and validation folds, and reports the best one

    Each combination is fitted once per fold and scored with the estimator's
    own `.score()` on the matching validation fold (R2 for regressors such as
    XGBRegressor). The combination with the highest mean score wins.

    Args:
      training_folds: the list of training fold dataframes
      validation_folds: the list of validation fold dataframes, in the same
        order as training_folds
      param_grid: the dictionary of possible hyperparameter values for the chosen model
      model_cls: estimator class, instantiated as model_cls(**params) and
        expected to offer .fit(X, y) and .score(X, y). Defaults to XGBRegressor.

    Returns:
      A tuple (avg_scores, best_params):
        avg_scores: numpy array with the mean validation score of every
          combination, in itertools.product order of the grid values
        best_params: tuple with the best setting of each hyperparameter, in
          the key order of param_grid

    Raises:
      ValueError: if no folds are given, or if the two fold lists differ in length

    Example:
        >>> param_grid = {
          'max_depth': [3, 5, 7],
          'learning_rate': [0.01, 0.1]}
        >>> avg_scores, best = hyperparameter_search(output[0], output[1], param_grid = param_grid)
        # assuming 'output' is the output of custom_cross_validation()
        >>> best
        (5, 0.1) # hyperparams in the key order of param_grid
    '''

    training_folds = list(training_folds)
    validation_folds = list(validation_folds)
    if len(training_folds) != len(validation_folds):
        raise ValueError(
            "training_folds and validation_folds must have the same length, got "
            f"{len(training_folds)} and {len(validation_folds)}"
        )
    if not training_folds:
        raise ValueError("at least one training/validation fold pair is required")

    if model_cls is None:
        # Resolved lazily so the default stays the notebook's XGBoost regressor.
        model_cls = XGBRegressor

    target_col = 'description.sold_price'

    # Separate features and target once per fold rather than once per
    # hyperparameter combination.
    fold_data = [
        (
            train_fold.drop(columns=[target_col]),
            train_fold[target_col],
            val_fold.drop(columns=[target_col]),
            val_fold[target_col],
        )
        for train_fold, val_fold in zip(training_folds, validation_folds)
    ]

    param_names = list(param_grid.keys())
    all_params = list(itertools.product(*param_grid.values()))

    all_scores = []
    for params in all_params:
        model_kwargs = dict(zip(param_names, params))
        fold_scores = []

        for X_train_fold, y_train_fold, X_val_fold, y_val_fold in fold_data:
            # A fresh estimator per fold so no fitted state carries over.
            model = model_cls(**model_kwargs)
            model.fit(X_train_fold, y_train_fold)
            fold_scores.append(model.score(X_val_fold, y_val_fold))

        all_scores.append(fold_scores)

    avg_r2_scores = np.mean(all_scores, axis=1)
    best_params = all_params[np.argmax(avg_r2_scores)]

    return avg_r2_scores, best_params


In [8]:
# Evaluate every combination in param_grid_xgb on the CV folds (one fit per combination per fold).
avg_r2_scores, best_params = hyperparameter_search(training_folds, validation_folds, param_grid_xgb)

In [9]:
# Values follow the key order of param_grid_xgb: (max_depth, gamma, learning_rate).
best_params

(10, 0, 0.1)

Refit our model on the original full training set, using the best parameters found by the search

In [10]:
# Rebuild the winning configuration from the search result instead of
# hard-coding it, so this cell stays correct if the grid or the data changes.
# best_params is ordered like the keys of param_grid_xgb.
best_param_kwargs = dict(zip(param_grid_xgb.keys(), best_params))
best_model = XGBRegressor(**best_param_kwargs).fit(X_train, np.array(y_train).ravel())

y_pred = best_model.predict(X_test)
# NOTE(review): np.exp suggests the target is stored on a log scale, so the MAE
# below would be in original price units - confirm against the preprocessing.
mae = mean_absolute_error(np.exp(y_test), np.exp(y_pred))

print("Tuned R2 Score: ", best_model.score(X_test, y_test))
print("Tuned MAE: ", mae)


Tuned R2 Score:  0.9940429437935966
Tuned MAE:  3756.1948776463046


This is the score for our final model that we would report.

Let's save the model

In [11]:
# Persist the refit model; the pipeline's 'predict' step is given this same path.
with open('../models/tuned_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

## Building a Pipeline

Function arguments

In [12]:
import pandas as pd
import numpy as np
import itertools
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor

from functions_variables import *

In [13]:
# Input handed to pipeline.fit_transform(); the first pipeline step (get_dataframe) receives it.
directory = '../data'
# The processed training features define which columns the pipeline keeps
# (passed to ColumnSelector).
train = pd.read_csv('../data/processed/X_train.csv')
columns_to_keep = list(train.columns)

# Columns passed to DropMissingValues.
# NOTE(review): presumably rows missing any of these are dropped - confirm in functions_variables.
columns_to_drop_na = ['description.type', 'description.year_built',
                      'description.lot_sqft', 'description.sqft', 'location.address.coordinate.lon',
                      'location.address.coordinate.lat']

# Column -> replacement value mapping passed to FillMissingValues.
columns_fill_values = {
    'description.baths_3qtr': 0,
    'description.baths_full': 0,
    'description.baths_half': 0,
    'description.baths': 0,
    'description.garage': 0,
    'description.beds': 0,
    'description.sub_type': 'N/A',
    'location.address.city': 'N/A',
    'description.stories': 1
}

# Columns passed to LogTransform (the log-scaling step).
columns_to_log = ['description.lot_sqft', 'description.sqft']

This is each step of our preprocessing process:
- loading the dataframe
- encoding tags
- dropping/filling NAs
- transforming dtypes
- adding our mean city prices (computed from our original training data)
- log scaling
- selecting the correct columns
- scaling (with our pretrained scaler)
- predict (with our pretrained model)

Pipelines expect a class object with `.fit()` and `.transform()` methods. There are two ways to achieve this:
- `FunctionTransformer` is a wrapper around a simple function. It encodes this function as the `transform` method of a simple class
     - I did this for getting our dataframe from the JSON
- `TransformerMixin` provides a base sklearn class we can use. We can override its methods using inheritance (i.e. `class ColumnSelector(TransformerMixin):`), redefining the `__init__`, `fit` and `transform` methods as needed.
     - I did this for everything else; it's better for more complicated tasks

The actual classes are imported from `functions_variables.py`



In [14]:
# One (name, transformer) pair per preprocessing stage, in execution order.
pipeline = Pipeline(
    steps=[
        # load the dataframe from the input directory
        ('get_dataframe', FunctionTransformer(get_dataframe)),
        # encode tags
        ('encode_tags', TagsEncoder()),
        # drop / fill missing values
        ('drop_NAs', DropMissingValues(columns=columns_to_drop_na)),
        ('fill_NAs', FillMissingValues(fill_values_dict=columns_fill_values)),
        # transform dtypes
        ('transform_types', TypeTransformer()),
        # add the mean city prices computed from the original training data
        ('merge_city_means', MergeAndImputeTransformer('../data/processed/city_means.csv')),
        # log scaling
        ('log_transform', LogTransform(columns_to_log)),
        # keep only the columns seen in the processed training features
        ('select_columns', ColumnSelector(columns_to_keep)),
        # scale with the pretrained scaler
        ('scale', PretrainedMinMaxScale('../models/scaler.pkl')),
        # predict with the pretrained model
        ('predict', PredictionsFromModel('../models/tuned_model.pkl')),
    ]
)

# The last step hands back both the cleaned dataframe and the predictions.
df, pred = pipeline.fit_transform(directory)

In [15]:
# Shape of the cleaned dataframe returned by the pipeline.
df.shape

(6354, 63)

In [16]:
# Shape of the predictions returned by the pipeline; its length should match the row count of df.
pred.shape

(6354,)

#### Save the Final Pipeline

This pipeline should be able to take a directory of json data, similar to our original data, and output a cleaned dataframe, and a set of predictions

In [17]:
# Serialize the whole pipeline object to disk.
with open('../models/pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)