In [1]:
import pandas as pd
import os

HOUSING_TRAINING_DATA_PATH = os.path.join("data", "train.csv")

def load_csv_data(csv_path: str = HOUSING_TRAINING_DATA_PATH):
    """ Load data from a csv file.

    Args:
        csv_path (str): The file path of the csv file to be loaded.

    Returns:
        df (pandas.DataFrame): A Pandas DataFrame object containing the data loaded from the input csv file.
    """
    df = pd.read_csv(csv_path)
    return df

In [2]:
from sklearn.model_selection import train_test_split

housing_data = load_csv_data()
train_set, test_set = train_test_split(housing_data, test_size=0.2, random_state=42)

In [3]:
housing_data.info(max_cols=81)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
train_set.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
254,255,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,145000
1066,1067,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2009,WD,Normal,178000
638,639,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,5,2008,WD,Normal,85000
799,800,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2007,WD,Normal,175000
380,381,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,127000


## Helper Functions for Features
Below are functions defined to carry out some of the desired feature engineering and transformations performed in the data analysis stage (see data_analysis.ipynb for more)

In [5]:
import numpy as np

def replace_na_with_none(df: pd.DataFrame, features):
    """ Replace missing values with a 'none' string value.

    Args:
        df (pandas.DataFrame): The pandas dataframe object to replace missing values with 'none'.
        features (array-like object): Array-like object containing the name feature of features to apply the transformation to. 

    Returns:
        df (pandas.DataFrame): The pandas dataframe object with missing values replaces with 'none'.
    """
    df[features].fillna("none")
    return df

def createHasBsmtFullBath(df: pd.DataFrame):
    """ Create the HasBsmtFullBath categorical attribute.

    Args:
        df (pandas.DataFrame): The pandas dataframe object to create the HasBsmtFullBath attribute for.

    Returns:
        df (pandas.DataFrame): The given pandas dataframe object with the new attribute.
    """
    hasBsmtFullBath = np.zeros(df.shape[0])

    for i, numberofBsmtFullBaths in enumerate(df["BsmtFullBath"]):
        if numberofBsmtFullBaths > 0:
            hasBsmtFullBath[i] = 1

    df["HasBsmtFullBath"] = pd.Series(hasBsmtFullBath)
    df = df.drop(columns=["BsmtFullBath"])
    
    return df

def createHasHalfBath(df: pd.DataFrame):
    """ Create the HasHalfBath categorical attribute.

    Args:
        df (pandas.DataFrame): The pandas dataframe object to create the HasHalfBath attribute for.

    Returns:
        df (pandas.DataFrame): The given pandas dataframe object with the new attribute.
    """
    hasHalfBath = np.zeros(df.shape[0])

    for i, numberofHalfBaths in enumerate(df["HalfBath"]):
        if numberofHalfBaths > 0:
            hasHalfBath[i] = 1

    df["HasHalfBath"] = pd.Series(hasHalfBath)
    df = df.drop(columns=["HalfBath"])
    
    return df

def log_transform_features(df: pd.DataFrame, features):
    """ Log transform the input features

    Args:
        df (pandas.DataFrame): A pandas dataframe object containing the features used for a model.
        features (array-like object of strings): An array-like object containing the name of the features in df to be transformed. 

    Returns:
        df (pandas.DataFrame): The pandas dataframe object containing the features used for the model and with the desired features log-transformed.
    """
    try:
        features_iterator = iter(features)
        for feature in features_iterator:
            df[feature] = np.log(df[feature] + 0.001)
        
        return df

    except TypeError as error:
        print(error)

def apply_custom_transformations(df: pd.DataFrame, categorical_attributes, continuous_attributes, target_attribute: str):
    """ Apply the feature helper functions to a given pandas dataframe object to apply the custom data transformations.

    Args:
        df (pandas.DataFrame): A pandas dataframe object contiaining the dataset to be modelled.
        categorical_attributes: An array-like object containing the names of the desired categorical attributes.
        continuous attributes: An array-like object containing the names of the desired continuous attributes.
        target_attribute (string): The name of the target attribute.

    Returns:
        (X, y): A tuple of the transformed features X and the transformed target y.
    """
    partially_prepped_set = replace_na_with_none(df, categorical_attributes)
    partially_prepped_set = createHasBsmtFullBath(partially_prepped_set)
    partially_prepped_set = createHasHalfBath(partially_prepped_set)
    partially_prepped_set = log_transform_features(partially_prepped_set, continuous_attributes)

    X = partially_prepped_set.drop(columns=[target_attribute])
    y = partially_prepped_set[target_attribute]

    return X, y

## Pipeline for Data Preprocessing 
First, the helper functions are applied to perform initial feature engineering and data preprocessing. Then the data is passed through a scikit-learn pipeline for further preprocessing.

In [6]:
continuous_data_attributes = [
    "SalePrice",
    "LotFrontage",
    "MasVnrArea",
    "BsmtFinSF1",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageArea"
]

continuous_features = [
    "LotFrontage",
    "MasVnrArea",
    "BsmtFinSF1",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageArea"
]

categorical_features = [
    "MSSubClass",
    "MSZoning",
    "BldgType",
    "HouseStyle",
    "OverallQual",
    "OverallCond",
    "BsmtFullBath",
    "FullBath",
    "HalfBath",
    "TotRmsAbvGrd",
    "Fireplaces",
    "FireplaceQu",
    "GarageCars",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PavedDrive",
    "Street",
    "Alley",
    "LotShape",
    "LandContour",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "ExterQual",
    "ExterCond",
    "Foundation",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "Heating",
    "HeatingQC",
    "CentralAir",
    "Electrical",
    "PoolQC",
    "SaleType",
    "SaleCondition"
]

partially_prepped_categorical_features = [
    "MSSubClass",
    "MSZoning",
    "BldgType",
    "HouseStyle",
    "OverallQual",
    "OverallCond",
    "HasBsmtFullBath",
    "FullBath",
    "HasHalfBath",
    "TotRmsAbvGrd",
    "Fireplaces",
    "FireplaceQu",
    "GarageCars",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PavedDrive",
    "Street",
    "Alley",
    "LotShape",
    "LandContour",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "ExterQual",
    "ExterCond",
    "Foundation",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "Heating",
    "HeatingQC",
    "CentralAir",
    "Electrical",
    "PoolQC",
    "SaleType",
    "SaleCondition"
]

attributes = continuous_data_attributes + categorical_features
train_set = train_set[attributes]
test_set = test_set[attributes]

print(train_set.shape)
print(test_set.shape)

(1168, 56)
(292, 56)


In [7]:
X_train, y_train = apply_custom_transformations(train_set, categorical_features, continuous_data_attributes, "SalePrice")

print(X_train.shape)
print(y_train.shape)

(1168, 55)
(1168,)


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iterative_imputer = IterativeImputer(random_state=42)
min_max_scaler = MinMaxScaler()
feature_encoder = OneHotEncoder(handle_unknown="ignore")

continuous_pipeline = Pipeline([
    ("min_max_scaler", min_max_scaler),
    ("iterative_imputer", iterative_imputer)
])

categorical_pipeline = Pipeline([
    ("feature_encoder", feature_encoder)
])

full_pipeline = ColumnTransformer([
    ("continuous", continuous_pipeline, continuous_features),
    ("categorical", categorical_pipeline, partially_prepped_categorical_features)
])

In [11]:
X_train_prepared = full_pipeline.fit_transform(X_train)
y_train_prepared = iterative_imputer.fit_transform(y_train.to_numpy().reshape(-1, 1))

print(type(X_train_prepared))
print(type(y_train_prepared))

print(X_train_prepared.shape)
print(y_train_prepared.shape)

<class 'scipy.sparse._csr.csr_matrix'>
<class 'numpy.ndarray'>
(1168, 316)
(1168, 1)


## Baseline Model - Mean Prediction
A naive model that predicts the mean will be used as the baseline model to beat.

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

X_test, y_test = apply_custom_transformations(test_set, categorical_features, continuous_data_attributes, "SalePrice")

print(X_test.shape)
print(y_test.shape)

X_test_prepared = full_pipeline.transform(X_test)
y_test_prepared = iterative_imputer.transform(y_test.to_numpy().reshape(-1, 1))

print(X_test_prepared.shape)
print(y_test_prepared.shape)

baseline_predictions = np.array([y_test_prepared.mean() for i in range(y_test_prepared.shape[0])]).reshape(-1, 1)

error = np.sqrt(mean_squared_error(y_test_prepared, baseline_predictions))
r2 = r2_score(y_test_prepared, baseline_predictions)

print(f"Root Mean Squared Error: {error}")
print(f"R-Squared Coefficient: {r2}")

(292, 55)
(292,)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.4319878652441567
R-Squared Coefficient: 0.0


### Helper function for displaying Root Mean-squared Error scores from Cross Validation

In [13]:
def display_cv_scores(scores):
    """ Display the error scores from cross validation and basic statistics.

    Args:
        scores (array-like): An array-like object containing the error scores from cross validation.

    Returns:
        None
    """
    print(f"Scores: {scores}")
    print(f"Mean: {scores.mean()}")
    print(f"Standard Deviation: {scores.std()}")

## Model Development - Exploration

### Ridge Regression

In [14]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

ridge_reg = Ridge(random_state=42)
ridge_reg.fit(X_train_prepared, y_train_prepared)
ridge_reg_preds = ridge_reg.predict(X_train_prepared)

ridge_reg_error = np.sqrt(mean_squared_error(y_train_prepared, ridge_reg_preds))
ridge_reg_r2 = r2_score(y_train_prepared, ridge_reg_preds)
print(f"Root Mean Squared Error: {ridge_reg_error}")
print(f"R-Squared Coefficient: {ridge_reg_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.10065524185825711
R-Squared Coefficient: 0.9335398438989518


In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


ridge_reg_cv_mse_scores = cross_val_score(ridge_reg, X_train_prepared, y_train_prepared, cv=10, scoring="neg_mean_squared_error")
ridge_reg_cv_rmse_scores = np.sqrt(abs(ridge_reg_cv_mse_scores))
ridge_reg_cv_r2_scores = cross_val_score(ridge_reg, X_train_prepared, y_train_prepared, cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(ridge_reg_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(ridge_reg_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.11561654 0.15405244 0.11995224 0.18179933 0.19491573 0.152178
 0.13768553 0.12024211 0.13452398 0.09531781]
Mean: 0.14062837082965374
Standard Deviation: 0.029194910681579544


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.88629525 0.87008705 0.89828613 0.77786317 0.73886865 0.89978099
 0.86667763 0.90267376 0.85112127 0.9328614 ]
Mean: 0.8624515304417212
Standard Deviation: 0.05696837958313602


### Linear SVM

In [16]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

linear_svm = LinearSVR(random_state=42, dual=False, loss="squared_epsilon_insensitive")
linear_svm.fit(X_train_prepared, y_train_prepared.flatten())
linear_svm_preds = linear_svm.predict(X_train_prepared)

linear_svm_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), linear_svm_preds))
linear_svm_r2 = r2_score(y_train_prepared.flatten(), linear_svm_preds)
print(f"Root Mean Squared Error: {linear_svm_error}")
print(f"R-Squared Coefficient: {linear_svm_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.10438714257352695
R-Squared Coefficient: 0.9285203227513172


In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


linear_svm_cv_mse_scores = cross_val_score(linear_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
linear_svm_cv_rmse_scores = np.sqrt(abs(linear_svm_cv_mse_scores))
linear_svm_cv_r2_scores = cross_val_score(linear_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(linear_svm_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(linear_svm_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.14024312 0.17421172 0.12045698 0.19440458 0.17407939 0.16050042
 0.14531016 0.13203258 0.16128425 0.11294856]
Mean: 0.1515471758352345
Standard Deviation: 0.02457044294615533


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.83269776 0.83386159 0.89742834 0.74599106 0.79171404 0.88851955
 0.85150275 0.88265112 0.78599832 0.9057274 ]
Mean: 0.8416091942315299
Standard Deviation: 0.05106288917922683


### RBF SVM

In [18]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

rbf_svm = SVR(kernel="rbf")
rbf_svm.fit(X_train_prepared, y_train_prepared.flatten())
rbf_svm_preds = rbf_svm.predict(X_train_prepared)

rbf_svm_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), rbf_svm_preds))
rbf_svm_r2 = r2_score(y_train_prepared.flatten(), rbf_svm_preds)
print(f"Root Mean Squared Error: {rbf_svm_error}")
print(f"R-Squared Coefficient: {rbf_svm_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.08495056077503223
R-Squared Coefficient: 0.9526607837206451


In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


rbf_svm_cv_mse_scores = cross_val_score(rbf_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
rbf_svm_cv_rmse_scores = np.sqrt(abs(rbf_svm_cv_mse_scores))
rbf_svm_cv_r2_scores = cross_val_score(rbf_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(rbf_svm_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(rbf_svm_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.13346413 0.15624561 0.13404105 0.1680172  0.14894035 0.18010465
 0.15231971 0.13369679 0.13663173 0.12698797]
Mean: 0.14704492024763402
Standard Deviation: 0.01640511337532559


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.84848076 0.86636171 0.87298964 0.81026672 0.84752803 0.85962292
 0.83683063 0.87967421 0.8464194  0.88083483]
Mean: 0.854900885070531
Standard Deviation: 0.020623757369495222


### Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_prepared, y_train_prepared.flatten())
rf_preds = rf.predict(X_train_prepared)

rf_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), rf_preds))
rf_r2 = r2_score(y_train_prepared.flatten(), rf_preds)
print(f"Root Mean Squared Error: {rf_error}")
print(f"R-Squared Coefficient: {rf_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.05702459738550259
R-Squared Coefficient: 0.9786689114316451


In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


rf_cv_mse_scores = cross_val_score(rf, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
rf_cv_rmse_scores = np.sqrt(abs(rf_cv_mse_scores))
rf_cv_r2_scores = cross_val_score(rf, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(rf_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(rf_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.10987647 0.17139176 0.13087552 0.20228661 0.16377821 0.19708108
 0.16151663 0.15022191 0.13625269 0.12303185]
Mean: 0.1546312706178065
Standard Deviation: 0.028996676669735134


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.8973053  0.83919662 0.87891781 0.7249762  0.81563542 0.83191219
 0.81653176 0.84809107 0.84727034 0.88814402]
Mean: 0.8387980725346523
Standard Deviation: 0.04654152986986867


### Gradient Boosting - Gradient Boosting Regressor

In [22]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

gbrt = GradientBoostingRegressor(random_state=42)
gbrt.fit(X_train_prepared, y_train_prepared.flatten())
gbrt_preds = gbrt.predict(X_train_prepared)

gbrt_error = np.sqrt(mean_squared_error( y_train_prepared.flatten(), gbrt_preds))
gbrt_r2 = r2_score( y_train_prepared.flatten(), gbrt_preds)
print(f"Root Mean Squared Error: {gbrt_error}")
print(f"R-Squared Coefficient: {gbrt_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.0892122074980093
R-Squared Coefficient: 0.9477919905269637


In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


gbrt_cv_mse_scores = cross_val_score(gbrt, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
gbrt_cv_rmse_scores = np.sqrt(abs(gbrt_cv_mse_scores))
gbrt_cv_r2_scores = cross_val_score(gbrt, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(gbrt_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(gbrt_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.09281897 0.15672893 0.13148528 0.18663962 0.14216594 0.17078338
 0.14047582 0.12292933 0.13380592 0.10752161]
Mean: 0.1385354793564828
Standard Deviation: 0.02658828134357971


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.92671549 0.86553365 0.87778691 0.76587722 0.86108269 0.87377727
 0.86121913 0.89827498 0.8527064  0.91456898]
Mean: 0.8697542710877546
Standard Deviation: 0.041706123326991154


### Gradient Boosting - XGBoost

In [24]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

xgb_reg = XGBRegressor(random_state=42)
xgb_reg.fit(X_train_prepared, y_train_prepared.flatten())
xgb_reg_preds = xgb_reg.predict(X_train_prepared)

xgb_reg_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), xgb_reg_preds))
xgb_reg_r2 = r2_score(y_train_prepared.flatten(), xgb_reg_preds)
print(f"Root Mean Squared Error: {xgb_reg_error}")
print(f"R-Squared Coefficient: {xgb_reg_r2}")

  from pandas import MultiIndex, Int64Index


(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.014505439556389013
R-Squared Coefficient: 0.9986197735334046


In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


xgb_reg_cv_mse_scores = cross_val_score(xgb_reg, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
xgb_reg_cv_rmse_scores = np.sqrt(abs(xgb_reg_cv_mse_scores))
xgb_reg_cv_r2_scores = cross_val_score(xgb_reg, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(xgb_reg_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(xgb_reg_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.10489083 0.17869019 0.13695984 0.18394633 0.13207265 0.18568311
 0.14447382 0.13408719 0.12790194 0.11546787]
Mean: 0.14441737623442968
Standard Deviation: 0.02725422706467584


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.90641339 0.82520995 0.86739805 0.77258545 0.88010778 0.85079232
 0.8532072  0.87897047 0.86541784 0.901475  ]
Mean: 0.8601577451810624
Standard Deviation: 0.0370202500911056


### Top Candidate Models
Based on initial model exploration, below are the top 3 candidate models to further fine-tune and compare results:
- Gradient Boosting Regression
- XGBoost
- Ridge Regression

## Model Development - Fine-tuning/Optimization

### Paramater Optimization - Gradient Boosting Regression 

In [39]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

gbrt = GradientBoostingRegressor(random_state=42)
param_distributions = {
    "learning_rate": [0.001, 0.01, 0.1, 1, 10],
    "n_estimators": [100, 500, 1000, 1500, 2000],
    "max_depth": [1, 2, 3, 4],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
    "subsample": [0.5, 0.75, 1]
}
gbrt_reg = RandomizedSearchCV(gbrt, param_distributions=param_distributions, n_iter=500, scoring="neg_root_mean_squared_error", n_jobs=-1, random_state=42)
gbrt_search = gbrt_reg.fit(X_train_prepared, y_train_prepared.flatten())

 -1.58095206e-01 -2.70319514e-01 -2.37975084e-01 -2.25850922e-01
             nan -1.40896308e-01             nan -1.42011853e-01
 -2.28634711e-01 -1.63877377e-01 -1.35679329e-01 -1.55212355e-01
 -1.52252861e-01 -1.39094892e-01 -1.58386390e-01             nan
 -1.63877377e-01 -1.38782817e-01 -1.36913324e-01 -1.97237797e-01
 -2.56598726e-01 -1.51590156e-01 -1.33465895e-01 -1.60503555e-01
             nan             nan             nan -2.12018376e-01
 -1.43152682e-01 -4.93677210e-01 -3.03147097e-01 -8.04756139e+94
             nan -1.45956057e-01 -2.17847124e-01 -2.29812528e-01
 -6.17937362e+94 -1.52120968e-01 -8.06255317e+94 -2.25850922e-01
 -1.57321604e-01 -3.35846530e-01 -2.48358709e-01 -1.89760734e-01
 -1.79055710e-01             nan -2.04835614e-01 -1.78989581e-01
 -1.33781275e-01 -1.97779281e-01 -1.38590842e-01 -2.47522916e-01
 -2.48385200e-01 -2.52380273e-01 -1.58095206e-01             nan
 -3.69856470e-01 -1.39159692e-01             nan -1.44467420e-01
             nan         

In [40]:
print(f"Negative RMSE: {gbrt_search.score(X_train_prepared, y_train_prepared.flatten())}")
print(f"Parameters for the best estimator: \n{gbrt_search.best_params_}")

Negative RMSE: -0.06654574787561507
Parameters for the best estimator: 
{'subsample': 0.5, 'n_estimators': 2000, 'max_leaf_nodes': 10, 'max_depth': 3, 'learning_rate': 0.01}


In [41]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

gbrt_optimized = GradientBoostingRegressor(
    random_state=42, 
    learning_rate=0.01, 
    max_depth=3,
    max_leaf_nodes=10, 
    n_estimators=2000,
    subsample=0.5
)
gbrt_optimized.fit(X_train_prepared, y_train_prepared.flatten())
gbrt_optimized_preds = gbrt_optimized.predict(X_train_prepared)


gbrt_optimized_error = np.sqrt(mean_squared_error( y_train_prepared.flatten(), gbrt_optimized_preds))
gbrt_optimized_r2 = r2_score( y_train_prepared.flatten(), gbrt_optimized_preds)
print(f"Root Mean Squared Error: {gbrt_optimized_error}")
print(f"R-Squared Coefficient: {gbrt_optimized_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.06654574787561507
R-Squared Coefficient: 0.9709511339423144


In [42]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


gbrt_optimized_cv_mse_scores = cross_val_score(gbrt_optimized, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
gbrt_optimized_cv_rmse_scores = np.sqrt(abs(gbrt_optimized_cv_mse_scores))
gbrt_optimized_cv_r2_scores = cross_val_score(gbrt_optimized, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(gbrt_optimized_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(gbrt_optimized_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.09092648 0.14380362 0.11859793 0.18425085 0.13426207 0.16034311
 0.12992157 0.11232161 0.1266227  0.09336753]
Mean: 0.12944174783251697
Standard Deviation: 0.027188363427820276


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.92967343 0.88679778 0.90056995 0.77183185 0.87609982 0.88873797
 0.88128955 0.91507347 0.86809647 0.93558071]
Mean: 0.8853751015806182
Standard Deviation: 0.043477004431628136


With this combination of parameters obtained through hyperparameter tuning, the training performance of the Gradient Boosting Regression model has increased significantly. The increase to validation performance has increased marginally, with slight reductions in variance/standard deviation.

### Paramater Optimization - Ridge Regression 

In [30]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

ridge_optimized = RidgeCV(alphas=np.linspace(start=0.01, stop=10, num=100), cv=10)
ridge_optimized.fit(X_train_prepared, y_train_prepared)
ridge_optimized_preds = ridge_optimized.predict(X_train_prepared)

ridge_optimized_error = np.sqrt(mean_squared_error(y_train_prepared, ridge_optimized_preds))
ridge_optimized_r2 = r2_score(y_train_prepared, ridge_optimized_preds)
print(f"Root Mean Squared Error: {ridge_optimized_error}")
print(f"R-Squared Coefficient: {ridge_optimized_r2}")
print(f"Regularization parameter (alpha): {ridge_optimized.alpha_}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.11103968740973512
R-Squared Coefficient: 0.9191192770342501
Regularization parameter (alpha): 6.367272727272727


In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

ridge_optimized = Ridge(alpha=5.055454545454545, random_state=42)

ridge_optimized_cv_mse_scores = cross_val_score(ridge_optimized, X_train_prepared, y_train_prepared, cv=10, scoring="neg_mean_squared_error")
ridge_optimized_cv_rmse_scores = np.sqrt(abs(ridge_optimized_cv_mse_scores))
ridge_optimized_cv_r2_scores = cross_val_score(ridge_optimized, X_train_prepared, y_train_prepared, cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(ridge_optimized_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(ridge_optimized_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.03915932 0.04990561 0.03912958 0.05668971 0.05629608 0.05295406
 0.04350696 0.03936525 0.0414996  0.03409737]
Mean: 0.045260355289290725
Standard Deviation: 0.007645783013186892


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.87779007 0.87226452 0.89859217 0.7976319  0.79591121 0.88630474
 0.87527848 0.90226719 0.86725484 0.91950601]
Mean: 0.8692801134792203
Standard Deviation: 0.03923989584151503


With the tuned value of the regularization parameter (alpha), we see marginal decrease in training performance. There is, however, a slight increase to the validation performance and a decrease to its standard deviation/variance.

### Paramater Optimization - XGBoost

In [45]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

xgb = XGBRegressor(random_state=42)
param_distributions = {
    "n_estimators": [100, 500, 1000, 1500, 2000],
    "max_depth": [1, 2, 3, 4],
    "subsample": [0.5, 0.75, 1],
    "colsample_bytree": [0.5, 0.75, 1],
    "reg_alpha": np.linspace(start=0.01, stop=10, num=100),
    "reg_lambda": np.linspace(start=0.01, stop=10, num=100),
    "gamma": np.linspace(start=0.01, stop=10, num=100)
}
xgb_reg = RandomizedSearchCV(xgb, param_distributions=param_distributions, n_iter=500, scoring="neg_root_mean_squared_error", n_jobs=-1, random_state=42)
xgb_search = xgb_reg.fit(X_train_prepared, y_train_prepared.flatten())

In [46]:
print(f"Negative RMSE: {xgb_search.score(X_train_prepared, y_train_prepared.flatten())}")
print(f"Parameters for the best estimator: \n{xgb_search.best_params_}")

Negative RMSE: -0.09642469280211605
Parameters for the best estimator: 
{'subsample': 0.75, 'reg_lambda': 5.156363636363636, 'reg_alpha': 1.5236363636363637, 'n_estimators': 100, 'max_depth': 3, 'gamma': 0.01, 'colsample_bytree': 0.75}


In [47]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

xgb_reg_optimized = XGBRegressor(
    random_state=42,
    colsample_bytree=0.75,
    gamma=0.01,
    max_depth=3,
    n_estimators=100,
    reg_alpha=1.5236363636363637,
    reg_lambda=5.156363636363636,
    subsample=0.75
)
xgb_reg_optimized.fit(X_train_prepared, y_train_prepared.flatten())
xgb_reg_optimized_preds = xgb_reg.predict(X_train_prepared)

xgb_reg_optimized_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), xgb_reg_optimized_preds))
xgb_reg_optimized_r2 = r2_score(y_train_prepared.flatten(), xgb_reg_optimized_preds)
print(f"Root Mean Squared Error: {xgb_reg_optimized_error}")
print(f"R-Squared Coefficient: {xgb_reg_optimized_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.09642469280211605
R-Squared Coefficient: 0.9390090930561361


In [48]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


xgb_reg_optimized_cv_mse_scores = cross_val_score(xgb_reg_optimized, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
xgb_reg_optimized_cv_rmse_scores = np.sqrt(abs(xgb_reg_optimized_cv_mse_scores))
xgb_reg_optimized_cv_r2_scores = cross_val_score(xgb_reg_optimized, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(xgb_reg_optimized_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(xgb_reg_optimized_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.10436418 0.15913883 0.1457126  0.18324052 0.15091409 0.18900752
 0.16754806 0.12605592 0.11359456 0.11464463]
Mean: 0.14542208974450363
Standard Deviation: 0.028388239989789992


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.90735082 0.86136668 0.84990796 0.7743273  0.84346018 0.84540176
 0.80257359 0.89303462 0.89384309 0.90287489]
Mean: 0.8574140891935829
Standard Deviation: 0.0418068440109111


When fitted with the above combination of parameters found via RandomSearchCV, both the training and validation performance has decreased. The standard deviation/variance has also increased.

### Conclusions based of hyperparameter tuning of candidate models

- Based on the results of the hyperparameter optimized candidate models, the Gradient Boosting Regression model still maintains the highest performance in terms of the training set and validation set even though it has overfitted more than the other two candidate models. By fitting the model with the tuned hyperparameters, the variance of the Gradient Boosting Regression model had also decreased from the initial model fit prior to hyperparameter tuning.

- Tuning the Regularization parameter of the Ridge Regression model saw little changes to their performance. However, their variances were observed to decrease as a result of using tuned Regularization parameters.

- A combination of hyperparameters that would improve the existing XGBoost model could not be found with RandomSearchCV.

## Model Development - Further Ensemble Methods

### Voting Regression with top candidate models

In [55]:
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

gbrt_reg = GradientBoostingRegressor(
    random_state=42, 
    learning_rate=0.06810390304059954, 
    max_depth=3,
    max_leaf_nodes=10, 
    n_estimators=1000,
    subsample=0.8112174950633936
)
xgb_reg = XGBRegressor(random_state=42)
ridge_reg = Ridge(alpha=5.055454545454545, random_state=42)

voting_reg = VotingRegressor(
    estimators=[
        ("gbrt", gbrt_reg),
        ("xgb", xgb_reg),
        ("ridge", ridge_reg)
    ],
    n_jobs=-1
)
voting_reg.fit(X_train_prepared, y_train_prepared.flatten())
voting_reg_preds = voting_reg.predict(X_train_prepared)

voting_reg_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), voting_reg_preds))
voting_reg_r2 = r2_score(y_train_prepared.flatten(), voting_reg_preds)
print(f"Root Mean Squared Error: {voting_reg_error}")
print(f"R-Squared Coefficient: {voting_reg_r2}")

Root Mean Squared Error: 0.04586960141138212
R-Squared Coefficient: 0.9861981120852197


In [56]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


voting_reg_cv_mse_scores = cross_val_score(voting_reg, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
voting_reg_cv_rmse_scores = np.sqrt(abs(voting_reg_cv_mse_scores))
voting_reg_cv_r2_scores = cross_val_score(voting_reg, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(voting_reg_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(voting_reg_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.09699093 0.14978879 0.11752215 0.17191779 0.12975768 0.16218745
 0.11998173 0.10904191 0.11665891 0.09170372]
Mean: 0.12655510595144356
Standard Deviation: 0.025486482278121732


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.91997956 0.87717864 0.90236559 0.80135499 0.88427388 0.88616368
 0.89875894 0.91996063 0.88803841 0.93785617]
Mean: 0.8915930490503495
Standard Deviation: 0.03516326743991818


A voting ensemble model combining the tuned candidate models seems to have slightly better performance than the individual respective candidate models and has the lowest standard deviation/variance out of all the models that make up the Voting Regressor. However, interpreting which features are the most important becomes harder with using this ensemble model, i.e. no simple way of getting feature weights or feature importances by using estimator's attributes.

## Model Development - Overall Conclusions

- In terms of overall performance, the Voting Regression model that combines the hyperparameter-tuned candidate models scores the highest in terms of R-Squared Coefficient and its variance/standard deviation remains similar to that of the individual models it consists of.

- Though it scales more poorly in computational complexity compared to Ridge Regression model, the Gradient Boosting Regression model offers the best performance and still has a way to output which features are "important" to predicting house sale price. It is also less strict in its assumptions about the data's behaviour and relationships. Ridge Regression still follows the typical assumptions for Linear Regression, i.e. constant variance, normally distributed residuals, linear relationships between the dependent and indepdent variables, independent and identically distributed data, etc.

- The candidate model that will most likely scale the best in terms of computational complexity with decent prediction performance (although less than the Gradient Boosting Regression and Voting Regression models) is the Ridge Regression model. 

### Model to deploy - Gradient Boosting Regression model

- The chosen model to deploy in the web service to be built is the Gradient Boosting Regression model. 
- It has performance only marginally less than the Voting Regression model but still has some interetability in terms of determining which features are important in predicting house sale price and assumes less about the input features' behaviour.
- Once houses are built, their main features are unlikely to change regardless of how many times it is sold. For example, a house built with 2 bathrooms, 4 bedrooms and a garage is likely to retain them for a very long time, and the chances of the house losing one of them is quite low. In addition to this, houses and properties take a long time to be built and sold. This means, we are unlikely to see much drift in terms of the feature data's behaviour, relationships and distributions, so it is a reasonable assumption that we would not have to retrain and redeploy the model very frequently. 

### Model to use for Kaggle submission - Voting Regression Model

- The Voting Regression model will be used for the Kaggle competition submission as it provides the best performance (highest R-Squared Coefficient) out of all the models that were trained during model development with similar variance/standard deviation.

## Kaggle Submission

### Helper Functions

In [99]:
import pandas as pd
import numpy as np

def create_kaggle_submission(ids: np.ndarray, preds: np.ndarray, csv_path: str):
    """ Create a csv file containing the predictions of the test set data for kaggle submission.

    Args:
        ids (numpy.ndarray): An array-like object containing the record IDs of each record in the test data set for Kaggle.

        preds (numpy.ndarray): An array-like object containing the predictions of the test data set for Kaggle.

        csv_path (string): The path in which to save the Kaggle submission csv file.

    Returns:
        None
    """
    preds_df = pd.DataFrame(
        data={
            "Id": ids,
            "SalePrice": preds
        }
    )
    preds_df.set_index("Id", inplace=True)
    preds_df.to_csv(csv_path)

def prepare_test_set_data(test_set: pd.DataFrame, categorical_attributes, continuous_attributes):
    """ Apply feature transformations, engineering and scaling to the test set for kaggle submission.

    Args:
        test_set (pandas.DataFrame): The dataframe containing the test data set for kaggle submission.

        categorical_attributes: An array-like object containing the names of the desired categorical attributes.

        continuous attributes: An array-like object containing the names of the desired continuous attributes.

    Returns:
        X_prepared (pandas.DataFrame): A dataframe containing the test set after applying feature transformation, scaling and engineering.
    """
    X_partially_prepared = replace_na_with_none(test_set, categorical_attributes)
    X_partially_prepared = createHasBsmtFullBath(X_partially_prepared)
    X_partially_prepared = createHasHalfBath(X_partially_prepared)
    X_partially_prepared = log_transform_features(X_partially_prepared, continuous_attributes)
    X_prepared = full_pipeline.transform(X_partially_prepared)

    return X_prepared

def invert_min_max_norm(x: np.ndarray, lower: float, upper: float):
    """ Invert min-max normalization performed on the given input numpy array object.

    Args:
        x (ndarray): A nnmpy array containing the values that were transformed via min-max normalization.

        lower (float): The lower bound used for the min-max normalization of x.
        
        upper (float): The upper bound used for the min-max normalization of x.

    Returns:
        x_inverse (ndarray): A numpy array containing the values from the result of applying an inverse min-max normalization operation on the input array.
    """
    x_inverse = x * (upper - lower) + lower

    return x_inverse

a = np.min(y_train)
b = np.max(y_train)
log_preds = invert_min_max_norm(voting_reg_preds, lower=a, upper=b)
preds = np.exp(log_preds) - 0.001
print(f"Untransformed training set predictions:\n{preds}")


Untransformed training set predictions:
[138359.24832127 181792.86813025  87389.67938757 ... 121105.83736849
 170549.66016812 190112.30618507]


### Submission

In [100]:
import pandas as pd
import os

HOUSING_TESTING_DATA_PATH = os.path.join("data", "test.csv")
housing_test_data = load_csv_data(csv_path=HOUSING_TESTING_DATA_PATH)

print(housing_test_data.head())

     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   
1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   
2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   
3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   
4         HLS    AllPub  ...         144        0    NaN    NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  
0       0      6

In [101]:
X_test_kaggle = prepare_test_set_data(housing_test_data, categorical_features, continuous_features)

print(X_test_kaggle.shape)

(1459, 316)


In [102]:
voting_reg_preds_kaggle = voting_reg.predict(X_test_kaggle)
print(voting_reg_preds_kaggle.shape)

a = np.min(y_train)
b = np.max(y_train)
log_preds = invert_min_max_norm(voting_reg_preds_kaggle, lower=a, upper=b)
preds = np.exp(log_preds) - 0.001
print(f"Untransformed test set predictions:\n{preds}")
print(preds.shape)

(1459,)
Untransformed test set predictions:
[120554.24755041 150418.52432758 182732.49248091 ... 149753.13629367
 114591.49603669 220295.391393  ]
(1459,)


In [104]:
KAGGLE_SUBMISSION_FILEPATH = os.path.join("data", "submission.csv")
ids = housing_test_data["Id"].to_numpy()

create_kaggle_submission(ids, preds, KAGGLE_SUBMISSION_FILEPATH)