In [1]:
import pandas as pd
import os

HOUSING_TRAINING_DATA_PATH = os.path.join("data", "train.csv")

def load_csv_data(csv_path: str = HOUSING_TRAINING_DATA_PATH):
    """ Load data from a csv file.

    Args:
        csv_path (str): The file path of the csv file to be loaded.

    Returns:
        df (pandas.DataFrame): A Pandas DataFrame object containing the data loaded from the input csv file.
    """
    df = pd.read_csv(csv_path)
    return df

In [2]:
from sklearn.model_selection import train_test_split

housing_data = load_csv_data()
train_set, test_set = train_test_split(housing_data, test_size=0.2, random_state=42)

In [3]:
housing_data.info(max_cols=81)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
train_set.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
254,255,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,145000
1066,1067,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2009,WD,Normal,178000
638,639,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,5,2008,WD,Normal,85000
799,800,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2007,WD,Normal,175000
380,381,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,127000


## Helper Functions for Features
Below are functions defined to carry out some of the desired feature engineering and transformations performed in the data analysis stage (see data_analysis.ipynb for more)

In [5]:
import numpy as np

def replace_na_with_none(df: pd.DataFrame, features):
    """ Replace missing values with a 'none' string value.

    Args:
        df (pandas.DataFrame): The pandas dataframe object to replace missing values with 'none'.
        features (array-like object): Array-like object containing the name feature of features to apply the transformation to. 

    Returns:
        df (pandas.DataFrame): The pandas dataframe object with missing values replaces with 'none'.
    """
    df[features].fillna("none")
    return df

def createHasBsmtFullBath(df: pd.DataFrame):
    """ Create the HasBsmtFullBath categorical attribute.

    Args:
        df (pandas.DataFrame): The pandas dataframe object to create the HasBsmtFullBath attribute for.

    Returns:
        df (pandas.DataFrame): The given pandas dataframe object with the new attribute.
    """
    hasBsmtFullBath = np.zeros(df.shape[0])

    for i, numberofBsmtFullBaths in enumerate(df["BsmtFullBath"]):
        if numberofBsmtFullBaths > 0:
            hasBsmtFullBath[i] = 1

    df["HasBsmtFullBath"] = pd.Series(hasBsmtFullBath)
    df = df.drop(columns=["BsmtFullBath"])
    
    return df

def createHasHalfBath(df: pd.DataFrame):
    """ Create the HasHalfBath categorical attribute.

    Args:
        df (pandas.DataFrame): The pandas dataframe object to create the HasHalfBath attribute for.

    Returns:
        df (pandas.DataFrame): The given pandas dataframe object with the new attribute.
    """
    hasHalfBath = np.zeros(df.shape[0])

    for i, numberofHalfBaths in enumerate(df["HalfBath"]):
        if numberofHalfBaths > 0:
            hasHalfBath[i] = 1

    df["HasHalfBath"] = pd.Series(hasHalfBath)
    df = df.drop(columns=["HalfBath"])
    
    return df

def log_transform_features(df: pd.DataFrame, features):
    """ Log transform the input features

    Args:
        df (pandas.DataFrame): A pandas dataframe object containing the features used for a model.
        features (array-like object of strings): An array-like object containing the name of the features in df to be transformed. 

    Returns:
        df (pandas.DataFrame): The pandas dataframe object containing the features used for the model and with the desired features log-transformed.
    """
    try:
        features_iterator = iter(features)
        for feature in features_iterator:
            df[feature] = np.log(df[feature] + 0.001)
        
        return df

    except TypeError as error:
        print(error)

def apply_custom_transformations(df: pd.DataFrame, categorical_attributes, continuous_attributes, target_attribute: str):
    """ Apply the feature helper functions to a given pandas dataframe object to apply the custom data transformations.

    Args:
        df (pandas.DataFrame): A pandas dataframe object contiaining the dataset to be modelled.
        categorical_attributes: An array-like object containing the names of the desired categorical attributes.
        continuous attributes: An array-like object containing the names of the desired continuous attributes.
        target_attribute (string): The name of the target attribute.

    Returns:
        (X, y): A tuple of the transformed features X and the transformed target y.
    """
    partially_prepped_set = replace_na_with_none(df, categorical_attributes)
    partially_prepped_set = createHasBsmtFullBath(partially_prepped_set)
    partially_prepped_set = createHasHalfBath(partially_prepped_set)
    partially_prepped_set = log_transform_features(partially_prepped_set, continuous_attributes)

    X = partially_prepped_set.drop(columns=[target_attribute])
    y = partially_prepped_set[target_attribute]

    return X, y

## Pipeline for Data Preprocessing 
First, the helper functions are applied to perform initial feature engineering and data preprocessing. Then the data is passed through a scikit-learn pipeline for further preprocessing.

In [6]:
continuous_data_attributes = [
    "SalePrice",
    "LotFrontage",
    "MasVnrArea",
    "BsmtFinSF1",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageArea"
]

continuous_features = [
    "LotFrontage",
    "MasVnrArea",
    "BsmtFinSF1",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageArea"
]

categorical_features = [
    "MSSubClass",
    "MSZoning",
    "BldgType",
    "HouseStyle",
    "OverallQual",
    "OverallCond",
    "BsmtFullBath",
    "FullBath",
    "HalfBath",
    "TotRmsAbvGrd",
    "Fireplaces",
    "FireplaceQu",
    "GarageCars",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PavedDrive",
    "Street",
    "Alley",
    "LotShape",
    "LandContour",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "ExterQual",
    "ExterCond",
    "Foundation",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "Heating",
    "HeatingQC",
    "CentralAir",
    "Electrical",
    "PoolQC",
    "SaleType",
    "SaleCondition"
]

partially_prepped_categorical_features = [
    "MSSubClass",
    "MSZoning",
    "BldgType",
    "HouseStyle",
    "OverallQual",
    "OverallCond",
    "HasBsmtFullBath",
    "FullBath",
    "HasHalfBath",
    "TotRmsAbvGrd",
    "Fireplaces",
    "FireplaceQu",
    "GarageCars",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PavedDrive",
    "Street",
    "Alley",
    "LotShape",
    "LandContour",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "ExterQual",
    "ExterCond",
    "Foundation",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "Heating",
    "HeatingQC",
    "CentralAir",
    "Electrical",
    "PoolQC",
    "SaleType",
    "SaleCondition"
]

attributes = continuous_data_attributes + categorical_features
train_set = train_set[attributes]
test_set = test_set[attributes]

print(train_set.shape)
print(test_set.shape)

(1168, 56)
(292, 56)


In [7]:
X_train, y_train = apply_custom_transformations(train_set, categorical_features, continuous_data_attributes, "SalePrice")

print(X_train.shape)
print(y_train.shape)

(1168, 55)
(1168,)


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

continuous_pipeline = Pipeline([
    ("min_max_scaler", MinMaxScaler()),
    ("imputer", IterativeImputer(random_state=42))
])

categorical_pipeline = Pipeline([
    ("feature_encoder", OneHotEncoder(handle_unknown="ignore"))
])

full_pipeline = ColumnTransformer([
    ("continuous", continuous_pipeline, continuous_features),
    ("categorical", categorical_pipeline, partially_prepped_categorical_features)
])

In [9]:
X_train_prepared = full_pipeline.fit_transform(X_train)
y_train_prepared = continuous_pipeline.fit_transform(y_train.to_numpy().reshape(-1, 1))

print(type(X_train_prepared))
print(type(y_train_prepared))

print(X_train_prepared.shape)
print(y_train_prepared.shape)

<class 'scipy.sparse._csr.csr_matrix'>
<class 'numpy.ndarray'>
(1168, 316)
(1168, 1)


## Baseline Model - Mean Prediction
A naive model that predicts the mean will be used as the baseline model to beat.

In [10]:
from sklearn.metrics import mean_squared_error, r2_score

X_test, y_test = apply_custom_transformations(test_set, categorical_features, continuous_data_attributes, "SalePrice")

print(X_test.shape)
print(y_test.shape)

X_test_prepared = full_pipeline.transform(X_test)
y_test_prepared = continuous_pipeline.transform(y_test.to_numpy().reshape(-1, 1))

print(X_test_prepared.shape)
print(y_test_prepared.shape)

baseline_predictions = np.array([y_test_prepared.mean() for i in range(y_test_prepared.shape[0])]).reshape(-1, 1)

error = np.sqrt(mean_squared_error(y_test_prepared, baseline_predictions))
r2 = r2_score(y_test_prepared, baseline_predictions)

print(f"Root Mean Squared Error: {error}")
print(f"R-Squared Coefficient: {r2}")

(292, 55)
(292,)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.1411311175039301
R-Squared Coefficient: 0.0


### Helper function for displaying Root Mean-squared Error scores from Cross Validation

In [11]:
def display_cv_scores(scores):
    """ Display the error scores from cross validation and basic statistics.

    Args:
        scores (array-like): An array-like object containing the error scores from cross validation.

    Returns:
        None
    """
    print(f"Scores: {scores}")
    print(f"Mean: {scores.mean()}")
    print(f"Standard Deviation: {scores.std()}")

## Model Development - Exploration

### Ridge Regression

In [16]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

ridge_reg = Ridge(random_state=42)
ridge_reg.fit(X_train_prepared, y_train_prepared)
ridge_reg_preds = ridge_reg.predict(X_train_prepared)

ridge_reg_error = np.sqrt(mean_squared_error(y_train_prepared, ridge_reg_preds))
ridge_reg_r2 = r2_score(y_train_prepared, ridge_reg_preds)
print(f"Root Mean Squared Error: {ridge_reg_error}")
print(f"R-Squared Coefficient: {ridge_reg_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.032866471260588906
R-Squared Coefficient: 0.9336115920541752


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


ridge_reg_cv_mse_scores = cross_val_score(ridge_reg, X_train_prepared, y_train_prepared, cv=10, scoring="neg_mean_squared_error")
ridge_reg_cv_rmse_scores = np.sqrt(abs(ridge_reg_cv_mse_scores))
ridge_reg_cv_r2_scores = cross_val_score(ridge_reg, X_train_prepared, y_train_prepared, cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(ridge_reg_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(ridge_reg_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.03776917 0.05032468 0.03920957 0.05939019 0.06366347 0.04971921
 0.04502589 0.03928121 0.04401541 0.0311406 ]
Mean: 0.045953939942091265
Standard Deviation: 0.009531511949242335


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.8863129  0.87011025 0.89817716 0.77789265 0.73899829 0.89977123
 0.86641783 0.90268408 0.8506723  0.93286089]
Mean: 0.8623897585058501
Standard Deviation: 0.05693732216279896


### Linear SVM

In [17]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

linear_svm = LinearSVR(random_state=42, dual=False, loss="squared_epsilon_insensitive")
linear_svm.fit(X_train_prepared, y_train_prepared.flatten())
linear_svm_preds = linear_svm.predict(X_train_prepared)

linear_svm_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), linear_svm_preds))
linear_svm_r2 = r2_score(y_train_prepared.flatten(), linear_svm_preds)
print(f"Root Mean Squared Error: {linear_svm_error}")
print(f"R-Squared Coefficient: {linear_svm_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.03238732455020944
R-Squared Coefficient: 0.9355331805416471


In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


linear_svm_cv_mse_scores = cross_val_score(linear_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
linear_svm_cv_rmse_scores = np.sqrt(abs(linear_svm_cv_mse_scores))
linear_svm_cv_r2_scores = cross_val_score(linear_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(linear_svm_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(linear_svm_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.03861523 0.0505713  0.03950778 0.06106219 0.06621297 0.04893098
 0.04806584 0.0392905  0.04624776 0.03037083]
Mean: 0.046887536818622545
Standard Deviation: 0.010233988174234793


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.88116252 0.86883406 0.89662246 0.76521074 0.71767525 0.90292401
 0.84777115 0.90263802 0.83514122 0.93613912]
Mean: 0.8554118550636565
Standard Deviation: 0.06412726794384478


### RBF SVM

In [19]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

rbf_svm = SVR(kernel="rbf")
rbf_svm.fit(X_train_prepared, y_train_prepared.flatten())
rbf_svm_preds = rbf_svm.predict(X_train_prepared)

rbf_svm_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), rbf_svm_preds))
rbf_svm_r2 = r2_score(y_train_prepared.flatten(), rbf_svm_preds)
print(f"Root Mean Squared Error: {rbf_svm_error}")
print(f"R-Squared Coefficient: {rbf_svm_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.051464994008846784
R-Squared Coefficient: 0.8372166616186614


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


rbf_svm_cv_mse_scores = cross_val_score(rbf_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
rbf_svm_cv_rmse_scores = np.sqrt(abs(rbf_svm_cv_mse_scores))
rbf_svm_cv_r2_scores = cross_val_score(rbf_svm, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(rbf_svm_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(rbf_svm_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.0579433  0.06731077 0.05438561 0.06119562 0.05991809 0.07540789
 0.06511418 0.05900817 0.05399559 0.05584723]
Mean: 0.06101264597197378
Standard Deviation: 0.006310471368164176


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.7324268  0.767629   0.80410243 0.7641835  0.76880483 0.76944343
 0.72063297 0.78039686 0.77527705 0.78406403]
Mean: 0.7666960910683441
Standard Deviation: 0.022973021144320767


### Random Forest

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_prepared, y_train_prepared.flatten())
rf_preds = rf.predict(X_train_prepared)

rf_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), rf_preds))
rf_r2 = r2_score(y_train_prepared.flatten(), rf_preds)
print(f"Root Mean Squared Error: {rf_error}")
print(f"R-Squared Coefficient: {rf_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.018525086698494293
R-Squared Coefficient: 0.9789085423118229


In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


rf_cv_mse_scores = cross_val_score(rf, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
rf_cv_rmse_scores = np.sqrt(abs(rf_cv_mse_scores))
rf_cv_r2_scores = cross_val_score(rf, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(rf_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(rf_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.03598903 0.05766059 0.04279861 0.0651481  0.05400511 0.0641665
 0.0522516  0.04887763 0.04423385 0.04037314]
Mean: 0.05055041584045972
Standard Deviation: 0.009392934929445212


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.89677697 0.82948169 0.87868339 0.73273812 0.81218402 0.83305992
 0.82010329 0.84932733 0.84918649 0.88714869]
Mean: 0.8388689917573702
Standard Deviation: 0.044648790414122945


### Gradient Boosting - Gradient Boosting Regressor

In [23]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

gbrt = GradientBoostingRegressor(random_state=42)
gbrt.fit(X_train_prepared, y_train_prepared.flatten())
gbrt_preds = gbrt.predict(X_train_prepared)

gbrt_error = np.sqrt(mean_squared_error( y_train_prepared.flatten(), gbrt_preds))
gbrt_r2 = r2_score( y_train_prepared.flatten(), gbrt_preds)
print(f"Root Mean Squared Error: {gbrt_error}")
print(f"R-Squared Coefficient: {gbrt_r2}")

(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.029145769018466383
R-Squared Coefficient: 0.9477919905269635


In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


gbrt_cv_mse_scores = cross_val_score(gbrt, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
gbrt_cv_rmse_scores = np.sqrt(abs(gbrt_cv_mse_scores))
gbrt_cv_r2_scores = cross_val_score(gbrt, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(gbrt_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(gbrt_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.03032411 0.05113177 0.04272181 0.06050483 0.04684522 0.0557952
 0.04581899 0.04016334 0.04375031 0.03512748]
Mean: 0.04521830514786847
Standard Deviation: 0.008608875830918608


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.92671549 0.86591059 0.87911836 0.76947734 0.85868334 0.87377727
 0.86167044 0.89826418 0.85246571 0.91456898]
Mean: 0.8700651695666057
Standard Deviation: 0.04089706783233382


### Gradient Boosting - XGBoost

In [25]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print(X_train_prepared.shape)
print(y_train_prepared.shape)
print(X_test_prepared.shape)
print(y_test_prepared.shape)

xgb_reg = XGBRegressor(random_state=42)
xgb_reg.fit(X_train_prepared, y_train_prepared.flatten())
xgb_reg_preds = xgb_reg.predict(X_train_prepared)

xgb_reg_error = np.sqrt(mean_squared_error(y_train_prepared.flatten(), xgb_reg_preds))
xgb_reg_r2 = r2_score(y_train_prepared.flatten(), xgb_reg_preds)
print(f"Root Mean Squared Error: {xgb_reg_error}")
print(f"R-Squared Coefficient: {xgb_reg_r2}")

  from pandas import MultiIndex, Int64Index


(1168, 316)
(1168, 1)
(292, 316)
(292, 1)
Root Mean Squared Error: 0.0036241775138005075
R-Squared Coefficient: 0.9991927556377475


In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


xgb_reg_cv_mse_scores = cross_val_score(xgb_reg, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="neg_mean_squared_error")
xgb_reg_cv_rmse_scores = np.sqrt(abs(xgb_reg_cv_mse_scores))
xgb_reg_cv_r2_scores = cross_val_score(xgb_reg, X_train_prepared, y_train_prepared.flatten(), cv=10, scoring="r2")

print("CV Scores and Statistics for RMSE:\n")
display_cv_scores(xgb_reg_cv_rmse_scores)
print("\n")
print("CV Scores and Statistics for R-Squared Coefficient:\n")
display_cv_scores(xgb_reg_cv_r2_scores)

CV Scores and Statistics for RMSE:

Scores: [0.03998845 0.06043425 0.04509955 0.06781546 0.04883675 0.05829147
 0.04681353 0.04452062 0.04154433 0.0390204 ]
Mean: 0.049236481497167674
Standard Deviation: 0.009203307857140321


CV Scores and Statistics for R-Squared Coefficient:

Scores: [0.87256013 0.81268217 0.86528825 0.71040508 0.84641235 0.86223025
 0.85560015 0.87499231 0.86696854 0.89458435]
Mean: 0.8461723570695316
Standard Deviation: 0.04956779948666943


### Top Candidate Models
Based on initial model exploration, below are the top 3 candidate models to further fine-tune and compare results:
- Gradient Boosting Regression
- Ridge Regression
- Linear SVM

Random Forest and XGBoost seem to be more heavily prone to overfitting than the above, but could potentially be worth fine-tuning to see if they can be generlized further.

## Model Development - Fine-tuning/Optimization

### Helper functions

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    """ Plot the learning curves for the training set error and the validation set error

    Args:
        model: The model to fit onto a training set that will be generated.
        X: The features of the data set used predict the target attribute.
        y: The target attribute to predict.

    Returns:
        None
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    train_errors, val_errors = [], []

    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])

        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)

        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")