# KAGGLE COMPETITION
Seth Peterson

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load the raw training data. `_df` itself is never modified: later cells work
# on copies and read the original (unscaled) SalePrice back from it.
_df = pd.read_csv('data/train.csv')
_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# KEY OBSERVATION:

<b>There are null and otherwise missing values.</b> The columns listed below are the ones we need to pay special attention to as we clean our data:

Incomplete Observations:

 3   LotFrontage    1201 non-null   float64

 6   Alley          91 non-null     object 

 30  BsmtQual       1423 non-null   object 

 31  BsmtCond       1423 non-null   object 

 32  BsmtExposure   1422 non-null   object 

 33  BsmtFinType1   1423 non-null   object 

 35  BsmtFinType2   1422 non-null   object 

 42  Electrical     1459 non-null   object 

 57  FireplaceQu    770 non-null    object 

 58  GarageType     1379 non-null   object 

 59  GarageYrBlt    1379 non-null   float64

 60  GarageFinish   1379 non-null   object

 72  PoolQC         7 non-null      object 

 73  Fence          281 non-null    object 
 
 74  MiscFeature    54 non-null     object

# SECTION 1: CLEANING THE DATA

In [2]:
def clean_data(df : pd.DataFrame) -> pd.DataFrame:
    """
    Clean the data and return a new dataframe.

    The input frame is left untouched; every change is applied to a copy.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to clean.

    Returns
    -------
    pd.DataFrame
        The cleaned dataframe.

    Raises
    ------
    KeyError
        If one of the columns handled below is absent from ``df``.

    Methodology
    -----------
    1. Convert MSSubClass to string
    2. Convert MoSold to string
    3. Fill LotFrontage with 0
    4. Fill MasVnrArea with 0
    5. Fill GarageYrBlt with -1
    6. Fill NaN with the literal string 'NA' in: Alley, BsmtQual, BsmtCond,
       BsmtExposure, BsmtFinType1, BsmtFinType2, FireplaceQu, GarageType,
       GarageFinish, PoolQC, Fence, MiscFeature.
       Columns not listed here keep whatever NaN they contain.
    7. Drop Id and Utilities
    """
    df = df.copy()

    # Categorical features that are stored as integers
    df['MSSubClass'] = df['MSSubClass'].astype(str)
    df['MoSold'] = df['MoSold'].astype(str)

    # Numerical features: value used in place of NaN
    fill_values = {
        'LotFrontage': 0,
        'MasVnrArea': 0,
        'GarageYrBlt': -1,
    }

    # Should be NA not NAN
    na_category_columns = [
        'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
        'PoolQC', 'Fence', 'MiscFeature',
    ]
    fill_values.update({column: 'NA' for column in na_category_columns})

    # Assign the filled column back explicitly. The previous
    # `df.col.fillna(..., inplace=True)` form operates on a temporary Series
    # and stops updating `df` once pandas copy-on-write is active.
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    df = df.drop(['Id', 'Utilities'], axis=1)

    return df

In [3]:
def get_numerical_features(df : pd.DataFrame) -> list:
    """
    Collect the names of the numerical columns of a dataframe.

    A column counts as numerical when its dtype is exactly ``int64`` or
    ``float64``; any other dtype is skipped.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe whose columns are inspected.

    Returns
    -------
    list
        Column names with an int64/float64 dtype, in column order.
    """
    accepted = ['int64', 'float64']
    return [name for name in df.columns if df[name].dtype in accepted]

In [4]:
def preprocess(df : pd.DataFrame) -> tuple:
    """
    Standardize the numerical columns and one-hot encode the remaining ones.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to preprocess.

    Returns
    -------
    tuple
        ``(processed, numeric)`` where ``processed`` is the dataframe of
        scaled numerical columns followed by the dummy columns, and
        ``numeric`` is the list of numerical column names that were scaled.

    Methodology
    -----------
    1. Get numerical features
    2. Scale numerical features
    3. One-hot encode categorical features
    4. Concatenate numerical and categorical features

    Notes
    -----
    The scaler is fitted on the frame passed in, so every call uses the
    statistics of its own input. Every int64/float64 column is scaled,
    which includes a numeric target column if one is present.
    """
    numeric = get_numerical_features(df)
    num = df[numeric]

    # Fit and apply the scaler on the numerical block, then restore the
    # original row index and column names on the resulting array.
    mapper = DataFrameMapper([(num.columns, StandardScaler())]).fit(num)
    scaled_features = mapper.transform(num)
    num = pd.DataFrame(scaled_features, index=num.index, columns=num.columns)

    # Everything that is not numerical is treated as categorical.
    cat = df[df.columns[~df.columns.isin(numeric)]]
    cat = pd.get_dummies(cat)

    df = pd.concat([num, cat], axis=1)
    return df, numeric

In [5]:
def XY_split(df : pd.DataFrame, unscaled_df : pd.DataFrame = None) -> tuple:
    """
    Split the dataframe into X and y.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to split. Must contain a ``SalePrice`` column.
    unscaled_df : pd.DataFrame, optional
        If given, ``SalePrice`` is taken from this frame instead of ``df``
        (e.g. to keep the target in its original units), by default None

    Returns
    -------
    tuple
        ``(X, y)``: ``df`` without the ``SalePrice`` column, and the
        ``SalePrice`` series.
    """
    # Read the target BEFORE dropping the column: the fallback branch used to
    # look up SalePrice on the already-dropped frame and always failed.
    target_source = unscaled_df if isinstance(unscaled_df, pd.DataFrame) else df
    y = target_source['SalePrice']
    X = df.drop('SalePrice', axis=1)
    return X, y

In [6]:
# Clean -> preprocess -> split into X and y.
# clean_data works on its own copy, so `_df` stays untouched and can still
# supply the unscaled SalePrice as the target.
df, numeric = preprocess(clean_data(_df))
X, y = XY_split(df, _df)

In [7]:
# No null observations!
# Count missing cells over the whole frame; avoids positional `[0]` indexing
# on a Series whose index holds column names.
assert X.isnull().sum().sum() == 0, "X still contains missing values"

In [9]:
# Building features
def has_features(df : pd.DataFrame) -> pd.DataFrame:
    """
    Add 0/1 indicator features to a copy of the dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to add features to.

    Returns
    -------
    pd.DataFrame
        The dataframe with added features.

    Methodology
    -----------
    1. For each (indicator, source) pair below, set indicator to 1 where the
       source value is strictly greater than 0 and to 0 otherwise (NaN -> 0):
        - 3SsnPorch, ScreenPorch, Pool, Garage, EnclosedPorch, 2ndFloor,
          WoodDeck, OpenPorch, Bsmt, Fireplace
    2. 3SsnPorch, ScreenPorch and EnclosedPorch are overwritten in place by
       their indicator; the other indicators are new columns.

    Notes
    -----
    The threshold is a literal 0, so the indicator only means "the house has
    this feature" when the source columns hold raw values. On standardized
    columns the same test means "above the column mean".
    """
    df = df.copy()

    # (indicator column, source column), in the order the columns are written
    indicator_sources = [
        ('3SsnPorch', '3SsnPorch'),
        ('ScreenPorch', 'ScreenPorch'),
        ('Pool', 'PoolArea'),
        ('Garage', 'GarageArea'),
        ('EnclosedPorch', 'EnclosedPorch'),
        ('2ndFloor', '2ndFlrSF'),
        ('WoodDeck', 'WoodDeckSF'),
        ('OpenPorch', 'OpenPorchSF'),
        ('Bsmt', 'TotalBsmtSF'),
        ('Fireplace', 'Fireplaces'),
    ]
    for indicator, source in indicator_sources:
        # Vectorized comparison instead of a per-element Python lambda
        df[indicator] = (df[source] > 0).astype('int64')

    return df

def interaction_terms(df : pd.DataFrame, cols : list) -> pd.DataFrame:
    """
    Add pairwise interaction (product) terms to a copy of the dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to add interaction terms to.
    cols : list
        The list of columns to add interaction terms for.

    Returns
    -------
    pd.DataFrame
        The dataframe with added interaction terms.

    Methodology
    -----------
    1. Iterate through every ordered pair (i, j) of distinct columns in cols,
       so both 'i*j' and 'j*i' are produced.
    2. Compute the element-wise product of the two columns.
    3. Store it under the name 'i*j': a column that already carries that name
       is overwritten in place, new names are appended in iteration order.
    """
    # The number of pairs grows quadratically with len(cols), so callers pass
    # only the numerical columns rather than the whole dataset.
    df = df.copy()

    products = {}
    total = len(cols)
    for index, i in enumerate(cols):
        print("Progress: {}/{}".format(index + 1, total), end='\r')
        for j in cols:
            if i != j:
                # Whole-column multiply; replaces a Python call per row.
                products[i + '*' + j] = df[i] * df[j]

    # Keep pre-existing columns at their current position.
    for name in [name for name in products if name in df.columns]:
        df[name] = products.pop(name)

    # Append all genuinely new columns in one go instead of one insert each.
    if products:
        df = pd.concat([df, pd.concat(products, axis=1)], axis=1)
    return df

In [10]:
# Add features
# NOTE(review): X already holds standardized values here (preprocess ran in an
# earlier cell), so the `> 0` test inside has_features means "above the column
# mean" rather than "feature is present" — confirm that this is intended.
X = has_features(X)

# Add interaction terms
# `numeric` still lists 3SsnPorch, ScreenPorch and EnclosedPorch, which
# has_features has just overwritten with 0/1 indicators, so their products
# are built from the indicator values.
X = interaction_terms(X, [n for n in numeric if n != 'SalePrice'])
X['MSSubClass_150'] = 0 # There's 150 in the test set, but not the train set.

Progress: 34/34

In [11]:
# Split the data into train and test sets
# random_state pins the shuffle so the split is the same on every run.
# Note: X_test / y_test are a held-out slice of the labelled training data,
# not the Kaggle test file that is loaded further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,YrSold*GarageCars,YrSold*GarageArea,YrSold*WoodDeckSF,YrSold*OpenPorchSF,YrSold*EnclosedPorch,YrSold*3SsnPorch,YrSold*ScreenPorch,YrSold*PoolArea,YrSold*MiscVal,MSSubClass_150
1023,-0.421999,-0.735111,0.651479,-0.5172,1.117235,1.024029,-0.493261,-0.937926,-0.288653,1.726812,...,0.04326,-0.023362,0.0684,-0.055861,0.0,0.0,0.0,-0.009533,-0.012169,0
810,0.588031,-0.037766,-0.071836,0.381743,0.090492,0.684854,-0.022788,0.481115,2.049128,-1.284176,...,-0.426332,-0.070515,-1.863865,0.96349,-0.0,-0.0,-0.0,-21.971841,0.119927,0
1384,0.068587,-0.146006,-0.071836,-0.5172,-1.068734,-1.689368,-0.57075,-0.525593,-0.288653,-0.478227,...,-0.91595,-0.805389,-0.670936,-0.628395,0.0,0.0,0.0,-0.061273,-0.078217,0
626,-1.662893,0.182922,-0.795151,-0.5172,-0.373198,-0.33267,-0.57075,-0.973018,-0.288653,0.929919,...,0.630941,0.537533,0.462166,0.432862,-0.614439,-0.0,-0.0,0.042207,-0.689464,0
813,0.501457,-0.076853,-0.071836,0.381743,-0.43944,-1.30174,0.774247,0.360485,-0.288653,0.603917,...,0.630941,0.494411,0.462166,0.432862,-0.614439,-0.0,-0.0,0.042207,-0.565574,0


In [12]:
# Sanity check on one interaction term.
# The product of two standardized (mean-zero, variance-one) features is NOT
# standardized in general: its mean equals the correlation between the two
# features, and its variance depends on their joint higher moments.
X['YrSold*GarageYrBlt'].describe()

count    1460.000000
mean       -0.011134
std         0.974549
min        -6.778807
25%        -0.185936
50%         0.028080
75%         0.212920
max         5.635189
Name: YrSold*GarageYrBlt, dtype: float64

In [13]:
# LASSO time!
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

# Candidate regularisation strengths: 20 log-spaced values from 1e1 to 1e5
alpha_grid = {'alpha': np.logspace(1, 5, 20)}
lasso_grid = GridSearchCV(Lasso(), alpha_grid, cv=5, return_train_score=True, n_jobs=-1)

# fit() hands back the search object, so best_lasso and lasso_grid are the same thing
best_lasso = lasso_grid.fit(X_train, y_train)
lasso_choice = best_lasso.best_estimator_.get_params()
print("Best alpha: ", lasso_choice['alpha'])

# Score on the training split first ...
lasso_train_score = lasso_grid.score(X_train, y_train)
print("Train set score: {:.2f}".format(lasso_train_score))

# ... then on the held-out split
lasso_test_score = lasso_grid.score(X_test, y_test)
print("Test set score: {:.2f}".format(lasso_test_score))

# Mean squared error of the held-out predictions
y_pred = lasso_grid.predict(X_test)
lasso_mse = mean_squared_error(y_test, y_pred)
print("MSE: {:.2f}".format(lasso_mse))

Best alpha:  14384.498882876629
Train set score: 0.73
Test set score: 0.75
MSE: 1741945153.26


In [28]:
# Time for more ML models!
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid for the forest: a fixed number of trees and three depth limits
forest_param_grid = {'n_estimators': [200], 'max_depth': [10, 15, 20]}
# random_state pins the forest's internal sampling so a rerun reproduces the
# same scores (same seed as the train/test split).
forest_grid = GridSearchCV(RandomForestRegressor(random_state=42), forest_param_grid, cv=5, return_train_score=True, n_jobs=-1, verbose=3)
best_forest = forest_grid.fit(X_train, y_train)
print("Best Max Depth: ", best_forest.best_estimator_.get_params()['max_depth'])
print("Train set score: {:.2f}".format(forest_grid.score(X_train, y_train)))

# Now score the test set
print("Test set score: {:.2f}".format(forest_grid.score(X_test, y_test)))

# And the MSE:
y_pred = forest_grid.predict(X_test)
print("MSE: {:.2f}".format(mean_squared_error(y_test, y_pred)))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Max Depth:  10
Train set score: 0.97
Test set score: 0.88
MSE: 871191523.78


In [15]:
# Gradient Boosting (Hist)
from sklearn.ensemble import HistGradientBoostingRegressor

# Search grid: three depth limits x three learning rates = 9 candidates
alpha_grid = {'max_depth': [3, 5, 10], 'learning_rate': [0.01, 0.1, 1]}
gradient_grid = GridSearchCV(
    HistGradientBoostingRegressor(),
    alpha_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=3,
)

# fit() hands back the search object itself
best_gradient = gradient_grid.fit(X_train, y_train)
gradient_choice = best_gradient.best_estimator_.get_params()
print("Best Max Depth: ", gradient_choice['max_depth'])
print("Best Learning Rate: ", gradient_choice['learning_rate'])

# Score on the training split first ...
gradient_train_score = gradient_grid.score(X_train, y_train)
print("Train set score: {:.2f}".format(gradient_train_score))

# ... then on the held-out split
gradient_test_score = gradient_grid.score(X_test, y_test)
print("Test set score: {:.2f}".format(gradient_test_score))

# Mean squared error of the held-out predictions
y_pred = gradient_grid.predict(X_test)
gradient_mse = mean_squared_error(y_test, y_pred)
print("MSE: {:.2f}".format(gradient_mse))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Max Depth:  3
Best Learning Rate:  0.1
Train set score: 0.96
Test set score: 0.89
MSE: 759595297.52


In [16]:
# Let's combine all of the models into an ensemble
from sklearn.ensemble import VotingRegressor

# Fixed blend weights, in the order (lasso, forest, gradient); they sum to 1
ensemble_weights = [.15, .375, .475]

# Hyper-parameters selected by each grid search
lasso_best = best_lasso.best_params_
forest_best = best_forest.best_params_
gradient_best = best_gradient.best_params_

# Rebuild fresh estimators from the selected hyper-parameters.
# random_state on the forest makes the ensemble reproducible across reruns.
voting = VotingRegressor(
    [('lasso', Lasso(alpha=lasso_best['alpha'])),
     ('forest', RandomForestRegressor(n_estimators=forest_best['n_estimators'],
                                      max_depth=forest_best['max_depth'],
                                      random_state=42)),
     ('gradient', HistGradientBoostingRegressor(max_depth=gradient_best['max_depth'],
                                                learning_rate=gradient_best['learning_rate']))],
    weights=ensemble_weights,
    n_jobs=-1,
    verbose=3)

voting.fit(X_train, y_train)

# Now score the test set
print("Test set score: {:.2f}".format(voting.score(X_test, y_test)))

# And the MSE:
y_pred = voting.predict(X_test)
print("MSE: {:.2f}".format(mean_squared_error(y_test, y_pred)))


Test set score: 0.88
MSE: 819388586.42


In [17]:
# Testing time!
# Load the raw competition test file; `_test_df` is kept unmodified and the
# next cell works on a copy of it.
_test_df = pd.read_csv('data/test.csv')
_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [27]:
# Redo the same preprocessing steps as before
# NOTE(review): preprocess fits a new scaler on the frame it is given, so the
# test features are standardized with test-set statistics rather than the ones
# learned from the training data — confirm this is acceptable.
test_df = _test_df.copy()
test_df = clean_data(test_df)
test_df, test_numeric = preprocess(test_df)

test_X = has_features(test_df)
# Add any columns that exist in the training set but not in the test set, filled with 0.
# This covers dummy variables for categories that only occur in the training data.
# It also pre-creates the interaction-term columns (they are part of X_train.columns);
# those placeholders are overwritten with real products in the next step.
test_X[list(set(X_train.columns) - set(test_X.columns))] = 0

# Add the interaction terms
test_X = interaction_terms(test_X, [n for n in test_numeric if n != 'SalePrice'])

# Fill in the missing values with 0
test_X.fillna(0, inplace=True)

# Rearrange the columns so that they match the training set
# (this also drops any column that exists only in the test set)
test_X = test_X[X_train.columns]

test_X.head(3)

Progress: 34/34

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,YrSold*GarageCars,YrSold*GarageArea,YrSold*WoodDeckSF,YrSold*OpenPorchSF,YrSold*EnclosedPorch,YrSold*3SsnPorch,YrSold*ScreenPorch,YrSold*PoolArea,YrSold*MiscVal,MSSubClass_150
0,0.684849,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.563316,0.063273,0.517171,-0.650396,...,-1.69278,2.0319,0.628452,-1.202524,0.0,0.0,1.713905,-0.098082,-0.158097,0
1,0.715852,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.047057,1.063027,-0.2978,-0.339262,...,-1.69278,-1.269933,4.024022,-0.306491,0.0,0.0,0.0,-0.098082,33.816104,0
2,0.498831,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.563316,0.772989,-0.2978,-0.954667,...,0.516775,0.072918,1.59478,-0.35627,0.0,0.0,0.0,-0.098082,-0.158097,0


In [19]:
# Count missing cells over the whole frame; avoids positional `[0]` indexing
# on a Series whose index holds column names.
assert test_X.isnull().sum().sum() == 0, "test_X still contains missing values"

In [24]:
# Predict the test set
# The ensemble used here was fitted on X_train only (the held-out split was
# not added back before predicting).
pred = voting.predict(test_X)

In [25]:
# Get the ID list from the sample submission
# NOTE(review): this assumes the sample submission lists the ids in the same
# row order as data/test.csv — confirm, since predictions are matched by position.
SAMPLE = pd.read_csv('data/sample_submission.csv')
ids = SAMPLE.Id

In [26]:
# Create the submission file
# The prediction array becomes a one-column frame indexed by Id; note that
# `pred` is rebound from the raw array to this dataframe.
pred = pd.DataFrame(pred, index=ids, columns=['SalePrice'])
pred.to_csv('voting_submission.csv')

# That's all!

Overall, this model did okay. I really wanted to investigate how important interaction terms are in a machine learning problem. The plethora of interaction terms certainly added a lot to the runtime of each training algorithm, but I'm honestly not certain they accounted for much of the MSE reduction. I'm quite happy with the ensemble voting model's test score of .88; I feel as though we captured a lot of the variance. On Kaggle, my final submission has a public score of .15575, which is alright. I feel as though I could better tune this model with more data cleaning and exploration, but I'm very happy with how it all went.