In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
import joblib
import dill
warnings.filterwarnings("ignore")

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer, PowerTransformer
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, SGDRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_validate, StratifiedKFold, KFold, GridSearchCV, LearningCurveDisplay
from sklearn.metrics import PredictionErrorDisplay, root_mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree

from preprocessors import missing_imputers, feat_engg

import optuna
from plotly.io import show


from utils import plot_outlier_analysis, residual_plots

# Render floats in displayed frames with three decimal places.
pd.options.display.float_format = '{:.3f}'.format


In [3]:
# Read the raw train / test CSV files from the working directory.
dataset = pd.read_csv('train.csv')
fulltestset = pd.read_csv('test.csv')

# Target is kept as a one-column DataFrame; a log1p-transformed copy is also
# prepared (the cells visible in this notebook fit on the untransformed `y`).
y = dataset.loc[:, ['SalePrice']]
log1p_y = np.log1p(y)

# Feature frames: remove the target and the 'Id' column from the training
# data, and the 'Id' column from the test data. `fulltestset` keeps 'Id'.
X = dataset.drop(columns=['SalePrice', 'Id'])
testset = fulltestset.drop(columns=['Id'])


In [4]:
# Columns that this notebook casts to 'object' dtype.
chg_to_category = [
    'MSSubClass', 'MSZoning', 'Street',
    'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir',
    'Electrical', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'MiscFeature',
    'SaleType', 'SaleCondition',
]

# Columns that this notebook casts to 'float64' dtype.
chg_to_numer = [
    'LotFrontage', 'LotArea', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal',
    'YrSold', 'MoSold',
]

# dtype-based column selectors: numeric = int64/float64, categorical = object.
numerical_selector = make_column_selector(dtype_include=['int64', 'float64'])
cat_selector = make_column_selector(dtype_include=['object'])

# Apply the same dtype casts to the training features and the test features.
for _frame in (X, testset):
    _frame[chg_to_numer] = _frame[chg_to_numer].astype('float64')
    _frame[chg_to_category] = _frame[chg_to_category].astype('object')

# Sub-groups of the numeric columns listed in `chg_to_numer`.
continous_num = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                 '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
                 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']

time_based_num = ['MoSold', 'YrSold', 'GarageYrBlt', 'YearRemodAdd', 'YearBuilt']

# Every remaining numeric column. Built with an order-preserving comprehension
# instead of a set difference: iterating a set of strings yields an order that
# depends on hash randomisation, so the list could differ between kernel
# sessions. The elements are the same; the order now follows `chg_to_numer`.
discrete_num = [
    col for col in chg_to_numer
    if col not in continous_num and col not in time_based_num
]


In [None]:


# Helpers from the project-local `preprocessors` module.
imputer = missing_imputers()
data_prep = feat_engg(trainset=X, testset=testset)

# Build the `prep8` preprocessing step from the numeric selector and the list
# of categorical column names.
cols_list = [numerical_selector, chg_to_category]
prep8 = data_prep.prep8(cols_list[0], cols_list[1])

# Baseline model: preprocessing followed by a decision tree with default
# hyper-parameters (seeded via random_state).
dtr = Pipeline(
    steps=[
        ('prep', prep8),
        ('tree', DecisionTreeRegressor(random_state=0)),
    ]
)
dtr.fit(X, y)

# Predictions for the test features, shown as a one-column frame.
test_predictions = dtr.predict(testset)
tree_preds = pd.DataFrame(test_predictions, columns=['SalePrice'])
tree_preds


In [None]:
# 5-fold cross-validation of the baseline pipeline, scored with negative MSE;
# train scores are returned as well so train and test can be compared per fold.
cv_results = cross_validate(
    estimator=dtr,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
pd.DataFrame(cv_results)

In [None]:
pd.set_option('display.max_rows', 50)

# Cost-complexity pruning path of the pipeline's tree step, computed on the
# output of the pipeline's preprocessing step (which is re-fitted on X here).
X_prepared = dtr[0].fit_transform(X)
pruning_path = dtr[1].cost_complexity_pruning_path(X_prepared, y)

# Plot total leaf impurity against effective alpha. The last entry of the path
# (the largest alpha) is left off the plot.
fig, ax = plt.subplots()
ax.plot(pruning_path.ccp_alphas[:-1], pruning_path.impurities[:-1],
        marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

# Tabular view of the path, ordered by alpha. `sort_values` returns a new frame
# rather than sorting in place, so its result has to be assigned.
path = pd.DataFrame(pruning_path).sort_values(by=['ccp_alphas'])
ccp_alphas, impurities = path.ccp_alphas, path.impurities


In [None]:
# to optimise
# ccp_alphas, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None

def objective(trial):
    """Return the mean 5-fold CV score for one decision-tree configuration.

    The trial proposes values for the tree step of the pipeline:
    ``tree__ccp_alpha`` (one of the alphas in the global ``ccp_alphas``),
    ``tree__max_depth`` (5..35, step 5), ``tree__min_samples_split``
    (30..90, step 10) and ``tree__min_samples_leaf`` (20..90, step 10).
    The parameter names match the pipeline's ``set_params`` keys.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean test-fold ``neg_mean_squared_error`` (higher is better).
    """
    # Plain Python floats: Optuna warns about categorical choices that are not
    # None/bool/int/float/str, and accepts any sequence only by accident.
    ccp_alpha_choices = np.asarray(ccp_alphas, dtype=float).tolist()

    ccp_alpha_trial = trial.suggest_categorical('tree__ccp_alpha', ccp_alpha_choices)
    # Same value grids as before, but declared as stepped integer ranges so the
    # sampler knows the values are ordered and best_params holds Python ints.
    max_depth_trial = trial.suggest_int('tree__max_depth', 5, 35, step=5)
    min_samples_split_trial = trial.suggest_int('tree__min_samples_split', 30, 90, step=10)
    min_samples_leaf_trial = trial.suggest_int('tree__min_samples_leaf', 20, 90, step=10)

    candidate = Pipeline(
        [
            ('prep', prep8),
            ('tree', DecisionTreeRegressor(random_state=0,
                                           ccp_alpha=ccp_alpha_trial,
                                           max_depth=max_depth_trial,
                                           min_samples_split=min_samples_split_trial,
                                           min_samples_leaf=min_samples_leaf_trial)),
        ]
    )

    # Only the test-fold scores feed the objective, so train scores are not
    # computed.
    cv_results = cross_validate(candidate,
                                X, y,
                                cv=5,
                                scoring='neg_mean_squared_error')

    return float(cv_results['test_score'].mean())  # An objective value linked with the Trial object.

# Scores are negative MSE, hence 'maximize'. The sampler is seeded so that a
# fresh Restart & Run All reproduces the same sequence of trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=100)  # Invoke optimization of the objective function.


In [None]:
# Apply the best hyper-parameters found by the study to the pipeline.
dtr.set_params(**study.best_params)

# set_params only changes the configuration; the tree fitted earlier stays in
# place until fit() is called again. Re-fit so that `tree_` (and any later
# prediction or saved model) reflects the tuned parameters.
dtr.fit(X, y)

# Number of training samples reaching each internal node of the tuned tree
# (children_left == -1 is scikit-learn's marker for a leaf).
fitted_tree = dtr[1].tree_
internal_nodes = np.where(fitted_tree.children_left != -1)[0]
pd.DataFrame(fitted_tree.n_node_samples[internal_nodes]).plot()


In [None]:
# Contour plots of the study's objective over the four tuned pipeline
# parameters.
contour_params = [
    "tree__max_depth",
    "tree__min_samples_split",
    "tree__min_samples_leaf",
    "tree__ccp_alpha",
]
fig = optuna.visualization.plot_contour(study, params=contour_params)
show(fig)

# SAVING MODELS AND PREDICTING FOR SUBMISSIONS

In [99]:
# Disabled export cell: serialises the `dtr` pipeline with dill and writes the
# test-set predictions, joined to the test 'Id' column, as a submission CSV.
# NOTE(review): before re-enabling, check that `dtr` has been fitted after its
# final set_params call, otherwise the saved tree is the earlier fit.
# NOTE(review): DataFrame.to_csv writes the index as an extra first column by
# default; pass index=False if the file should hold only 'Id' and 'SalePrice'.
# NOTE(review): assumes the 'models/' and 'submissions/' directories already
# exist - confirm.
# dtr[1].get_params()
# with open('models/base_decisiontree_1_pre_post_pruning.pkl', 'wb') as f: 
#     dill.dump(dtr, f)

# tree_preds = pd.DataFrame(dtr.predict(testset), columns=['SalePrice'])
# fulltestset[['Id']].join(tree_preds)
# fulltestset[['Id']].join(tree_preds).to_csv('submissions/base_decisiontree_1_pre_post_pruning.csv')
