In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
import joblib
import dill
warnings.filterwarnings("ignore")

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer, PowerTransformer
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, SGDRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_validate, StratifiedKFold, KFold, GridSearchCV, LearningCurveDisplay
from sklearn.metrics import PredictionErrorDisplay, root_mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree

from utils import plot_outlier_analysis, residual_plots

# Render floats with three decimal places wherever pandas formats them for display.
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# Training data: predictors in X (target and Id removed), target kept as a
# one-column DataFrame in y, plus a log1p-transformed copy of the target.
dataset = pd.read_csv('train.csv')
X = dataset.drop(columns=['SalePrice', 'Id'])
y = dataset[['SalePrice']]
log1p_y = np.log1p(y)

# Test data: the full frame keeps Id; the modelling frame drops it.
fulltestset = pd.read_csv('test.csv')
testset = fulltestset.drop(columns=['Id'])


In [3]:
# Columns that are cast to the 'object' dtype further down in this cell, for
# both X and testset, which is the dtype cat_selector is configured to pick up.
chg_to_category = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
                   'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 
                   'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
                   'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                   'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 
                   'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                   'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 
                   'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
                   'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 
                   'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 
                'SaleType', 'SaleCondition']

# Columns that are cast to 'float64' further down in this cell, for both X and
# testset, which is one of the dtypes numerical_selector is configured to pick up.
chg_to_numer = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
                'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
                'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
                'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 
                'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 
                'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 
                'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold',
                'MoSold'
                ]

# dtype-based column selectors: numeric columns vs. object (categorical) columns.
numerical_selector = make_column_selector(dtype_include=['int64', 'float64'])
cat_selector = make_column_selector(dtype_include=['object'])

# Apply the same dtype normalisation to the training and the test predictors.
for frame in (X, testset):
    frame[chg_to_numer] = frame[chg_to_numer].astype('float64')
    frame[chg_to_category] = frame[chg_to_category].astype('object')

# Numeric columns this notebook treats as continuous.
continous_num = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
]

# Numeric columns this notebook treats as time-based.
time_based_num = ['MoSold', 'YrSold', 'GarageYrBlt', 'YearRemodAdd', 'YearBuilt']

# Every remaining numeric column. Filtering chg_to_numer in place of
# list(set(...) - set(...)) keeps the same members but gives them a stable
# order: iterating a set of strings depends on hash randomisation, so the old
# list could come out in a different order after every kernel restart.
_not_discrete = set(continous_num) | set(time_based_num)
discrete_num = [col for col in chg_to_numer if col not in _not_discrete]


In [None]:
# Closer outlier analysis for making robust trees: one plot per continuous
# feature against the sale price.
sale_price = y['SalePrice']
for col in continous_num:
    plot_outlier_analysis(X[col], sale_price)


In [None]:

# Correlation screen between the numeric predictors and the target.

# Dummy-encoded copy of all predictors joined with the target. Neither object
# is referenced again in this cell.
X_encoded = pd.get_dummies(X, drop_first=True)
data = pd.concat([X_encoded, y], axis=1)

# Numeric predictors side by side with the target, then pairwise correlations.
data2 = pd.concat([X[chg_to_numer], y], axis=1)
corr_matrix = data2.corr()

# For each column, count how many correlations exceed 0.8 and list the columns
# from most to fewest. A column's correlation with itself is part of the count.
corr_matrix.gt(0.8).sum().sort_values(ascending=False)


In [None]:
# To pick up edits to the local module without restarting the kernel:
# import preprocessors
# from importlib import reload
# reload(preprocessors)

from preprocessors import missing_imputers, feat_engg

imputer = missing_imputers()
feat_engg = feat_engg(trainset=X, testset=testset)

# Preprocessing step built by the project's feature-engineering helper from the
# numeric selector and the list of categorical column names.
cols_list = [numerical_selector, chg_to_category]
prep8 = feat_engg.prep8(*cols_list)

# Baseline: unpruned tree behind the preprocessing step.
# random_state is fixed so the fitted tree is reproducible and matches the
# seeded trees fitted later in the notebook; the pruning path computed from
# this estimator would otherwise not be guaranteed to line up with them.
dtr = Pipeline(
    [
        ('prep', prep8),
        ('tree', DecisionTreeRegressor(random_state=0))
    ]
)

dtr.fit(X, y)

# joblib.dump(dtr, 'models/base_decisiontree.pkl')

# Predictions for the test predictors, shown as the cell output.
tree_preds = pd.DataFrame(dtr.predict(testset), columns=['SalePrice'])
tree_preds


In [None]:
pd.set_option('display.max_rows', 50)

# Cost-complexity pruning path of the tree step, computed on the training data
# after it has gone through the pipeline's preprocessing step.
path = dtr[1].cost_complexity_pruning_path(dtr[0].fit_transform(X), y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Impurity as a function of alpha; the final (largest) alpha is left out of the plot.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

# Keep the path as a DataFrame ordered by alpha. sort_values returns a new
# frame, so the result has to be assigned; calling it bare had no effect.
path = pd.DataFrame(path)
path = path.sort_values(by=['ccp_alphas'])
# path = path[path['impurities'] <= 100000000]
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
from tqdm import tqdm

# Fit one preprocessing + tree pipeline per candidate alpha and collect them in
# the same order as ccp_alphas.
clfs = []
for ccp_alpha in tqdm(ccp_alphas):
    pruned_tree = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    dtr = Pipeline(steps=[('prep', prep8), ('tree', pruned_tree)])
    # Pipeline.fit returns the pipeline itself, so fit and store in one step.
    clfs.append(dtr.fit(X, y))

In [None]:
# Size statistics of every fitted tree: step 1 of each pipeline is the tree
# estimator, whose tree_ attribute exposes node_count and max_depth.
fitted_trees = [pipeline[1].tree_ for pipeline in clfs]
node_counts = [tree.node_count for tree in fitted_trees]
depth = [tree.max_depth for tree in fitted_trees]


In [None]:
# Drop the last (largest) alpha and its pipeline before plotting.
# Both are derived from `path`, not sliced from themselves, so re-running
# this cell does not strip one more element on every execution.
ccp_alphas = path.ccp_alphas.iloc[:-1]
clfs = clfs[:len(ccp_alphas)]

# Step 1 of each pipeline is the fitted tree estimator.
node_counts = [pipeline[1].tree_.node_count for pipeline in clfs]
depth = [pipeline[1].tree_.max_depth for pipeline in clfs]

# Tree size (top) and tree depth (bottom) as step functions of alpha.
fig, ax = plt.subplots(2, 1)
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()


In [None]:
### run gridsearch over values of alpha {0, 100000000}

# Tree parameters recorded from an earlier inspection of one fitted pipeline,
# kept for reference (author's annotations preserved):
# 'tree__ccp_alpha': 0.8561643835616438,
#  'tree__criterion': 'squared_error',
#  'tree__max_depth': None, full tree 25
#  'tree__max_features': None,
#  'tree__max_leaf_nodes': None, <3000
#  'tree__min_impurity_decrease': 0.0,
#  'tree__min_samples_leaf': 1,
#  'tree__min_samples_split': 2,
#  'tree__min_weight_fraction_leaf': 0.0,
#  'tree__monotonic_cst': None,
#  'tree__random_state': 0,
#  'tree__splitter': 'best'

# Fresh, unfitted pipeline used as the search template.
dtr = Pipeline(
    [
        ('prep', prep8),
        ('tree', DecisionTreeRegressor(random_state=0))
    ]
)

# Only ccp_alpha varies; the other tree settings are pinned to a single value.
params = {
    'tree__ccp_alpha': ccp_alphas.to_list(),
    'tree__max_depth': [15],
    'tree__min_samples_split': [50],
    'tree__min_samples_leaf': [10],
    'tree__random_state': [0],
}

# Score by negative MSE, keep train scores for the train/test comparison later
# on, and refit the best candidate on all of X, y.
gridcv_res = GridSearchCV(
    estimator=dtr,
    param_grid=params,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    refit=True,
    cv=None,
    verbose=1,
    return_train_score=True,
)

gridcv_res.fit(X, y)


In [None]:
# pd.DataFrame(gridcv_res.cv_results_).sort_values(by = ['rank_test_score'])
# Show the parameter combination stored on the fitted grid search as its best result.
gridcv_res.best_params_

In [None]:
# Configure the template pipeline with the parameters chosen by the grid search.
dtr.set_params(**gridcv_res.best_params_)
# Manual overrides that were tried instead:
# dtr.set_params(tree__ccp_alpha = 3764042.0495960824, tree__min_samples_split= 50)
# dtr.set_params(tree__ccp_alpha = 13981182.229896478)
# dtr.set_params(tree__ccp_alpha = 699162.1819960608)
# dtr.set_params(tree__ccp_alpha = 0)

# 5-fold cross-validation of that configuration, scored by negative MSE.
cv_res_best_model = cross_validate(
    dtr,
    X, y,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
# A bare expression is only rendered when it is the last line of a cell, so the
# table has to be displayed explicitly here. `display` is made available by the
# IPython kernel without an import.
display(pd.DataFrame(cv_res_best_model))

# Refit on all training data and inspect the residuals.
dtr.fit(X, y)

residual_plots(dtr, X, y)


In [None]:
# Grid-search results ordered by test rank, reduced to alpha and the
# mean/std train and test scores.
df = (
    pd.DataFrame(gridcv_res.cv_results_)
    .sort_values(by=['rank_test_score'])
    .loc[:, [
        'param_tree__ccp_alpha',
        'mean_test_score',
        'mean_train_score',
        'std_test_score',
        'std_train_score',
        'rank_test_score',
    ]]
)
df

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Mean train and test score from the grid search as a function of ccp_alpha.
plt.figure(figsize=(10, 6))

# Train curve first, then test curve, each with its own legend entry.
for score_col, legend_label in (
    ('mean_train_score', 'Mean Train Score'),
    ('mean_test_score', 'Mean Test Score'),
):
    sns.lineplot(x='param_tree__ccp_alpha', y=score_col, data=df, label=legend_label, marker='o')

plt.title('Effect of param_tree__ccp_alpha on Mean Train and Test Scores')
plt.xlabel('param_tree__ccp_alpha')
plt.ylabel('Mean Score')
plt.legend()

plt.show()
df

# SAVING MODELS AND PREDICTING FOR SUBMISSIONS

In [99]:
# Disabled: persist the tuned pipeline with dill and write the test-set
# predictions, joined to their Id, to a submission CSV. Uncomment to run.
# dtr[1].get_params()
# with open('models/base_decisiontree_1_pre_post_pruning.pkl', 'wb') as f: 
#     dill.dump(dtr, f)

# tree_preds = pd.DataFrame(dtr.predict(testset), columns=['SalePrice'])
# fulltestset[['Id']].join(tree_preds)
# NOTE(review): to_csv also writes the DataFrame index as a column unless
# index=False is passed - confirm the expected submission layout before enabling.
# fulltestset[['Id']].join(tree_preds).to_csv('submissions/base_decisiontree_1_pre_post_pruning.csv')
