<a href="https://colab.research.google.com/github/reygaferdiansyah/Decision_Tree/blob/main/HousePrices_DecisionTree_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


plt.rcParams["figure.figsize"] = (20, 10)

In [None]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
df_train.shape

(1460, 81)

In [None]:
df_test.shape

(1459, 80)

In [None]:
def get_missed_values_stat(df):
    missed_stat = df.isna().sum().sort_values(ascending=False).reset_index()
    missed_stat.columns = ['feature', 'NaN count']
    missed_stat['NaN share'] = missed_stat['NaN count'] / df.shape[0]
    return missed_stat[missed_stat['NaN count'] > 0]

In [None]:
def get_common_missed_data(df_train, df_test):
    missed_train = get_missed_values_stat(df_train)
    missed_test = get_missed_values_stat(df_test)

    missed_data = missed_train.merge(missed_test, how='outer', on='feature', suffixes=['_train', '_test'])

    return missed_data

In [None]:
missed_data = get_common_missed_data(df_train, df_test)
missed_data

Unnamed: 0,feature,NaN count_train,NaN share_train,NaN count_test,NaN share_test
0,PoolQC,1453.0,0.995205,1456.0,0.997944
1,MiscFeature,1406.0,0.963014,1408.0,0.965045
2,Alley,1369.0,0.937671,1352.0,0.926662
3,Fence,1179.0,0.807534,1169.0,0.801234
4,FireplaceQu,690.0,0.472603,730.0,0.500343
5,LotFrontage,259.0,0.177397,227.0,0.155586
6,GarageYrBlt,81.0,0.055479,78.0,0.053461
7,GarageCond,81.0,0.055479,78.0,0.053461
8,GarageType,81.0,0.055479,76.0,0.05209
9,GarageFinish,81.0,0.055479,78.0,0.053461


In [None]:
columns_to_drop = missed_data.loc[missed_data['NaN share_train'] > 0.1, 'feature'].values
columns_to_drop

array(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
       'LotFrontage'], dtype=object)

In [None]:
df_train = df_train.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [None]:
df_train.shape, df_test.shape

((1460, 75), (1459, 74))

In [None]:
missed_data = get_common_missed_data(df_train, df_test)
missed_data

Unnamed: 0,feature,NaN count_train,NaN share_train,NaN count_test,NaN share_test
0,GarageType,81.0,0.055479,76.0,0.05209
1,GarageCond,81.0,0.055479,78.0,0.053461
2,GarageYrBlt,81.0,0.055479,78.0,0.053461
3,GarageFinish,81.0,0.055479,78.0,0.053461
4,GarageQual,81.0,0.055479,78.0,0.053461
5,BsmtFinType2,38.0,0.026027,42.0,0.028787
6,BsmtExposure,38.0,0.026027,44.0,0.030158
7,BsmtFinType1,37.0,0.025342,42.0,0.028787
8,BsmtCond,37.0,0.025342,45.0,0.030843
9,BsmtQual,37.0,0.025342,44.0,0.030158


In [None]:
garage_cat_features = ['GarageType', 'GarageCond', 'GarageFinish', 'GarageQual']

In [None]:
for feature in garage_cat_features:
    df_train[feature] = df_train[feature].fillna('None')
    df_test[feature] = df_test[feature].fillna('None')

In [None]:
garage_num_features = ['GarageYrBlt', 'GarageCars', 'GarageArea']

In [None]:
for feature in garage_num_features:
    df_train[feature] = df_train[feature].fillna(0)
    df_test[feature] = df_test[feature].fillna(0)

In [None]:
missed_data = get_common_missed_data(df_train, df_test)
missed_data

Unnamed: 0,feature,NaN count_train,NaN share_train,NaN count_test,NaN share_test
0,BsmtFinType2,38.0,0.026027,42.0,0.028787
1,BsmtExposure,38.0,0.026027,44.0,0.030158
2,BsmtFinType1,37.0,0.025342,42.0,0.028787
3,BsmtQual,37.0,0.025342,44.0,0.030158
4,BsmtCond,37.0,0.025342,45.0,0.030843
5,MasVnrType,8.0,0.005479,16.0,0.010966
6,MasVnrArea,8.0,0.005479,15.0,0.010281
7,Electrical,1.0,0.000685,,
8,MSZoning,,,4.0,0.002742
9,BsmtHalfBath,,,2.0,0.001371


In [None]:
bsmt_cat_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

In [None]:
for feature in bsmt_cat_features:
    df_train[feature] = df_train[feature].fillna('None')
    df_test[feature] = df_test[feature].fillna('None')

In [None]:
bsmt_num_features = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']

In [None]:
for feature in bsmt_num_features:
    df_train[feature] = df_train[feature].fillna(0)
    df_test[feature] = df_test[feature].fillna(0)

In [None]:
missed_data = get_common_missed_data(df_train, df_test)
missed_data

Unnamed: 0,feature,NaN count_train,NaN share_train,NaN count_test,NaN share_test
0,MasVnrType,8.0,0.005479,16.0,0.010966
1,MasVnrArea,8.0,0.005479,15.0,0.010281
2,Electrical,1.0,0.000685,,
3,MSZoning,,,4.0,0.002742
4,Utilities,,,2.0,0.001371
5,Functional,,,2.0,0.001371
6,Exterior1st,,,1.0,0.000685
7,SaleType,,,1.0,0.000685
8,Exterior2nd,,,1.0,0.000685
9,KitchenQual,,,1.0,0.000685


In [None]:
df_train[missed_data['feature']]

Unnamed: 0,MasVnrType,MasVnrArea,Electrical,MSZoning,Utilities,Functional,Exterior1st,SaleType,Exterior2nd,KitchenQual
0,BrkFace,196.0,SBrkr,RL,AllPub,Typ,VinylSd,WD,VinylSd,Gd
1,,0.0,SBrkr,RL,AllPub,Typ,MetalSd,WD,MetalSd,TA
2,BrkFace,162.0,SBrkr,RL,AllPub,Typ,VinylSd,WD,VinylSd,Gd
3,,0.0,SBrkr,RL,AllPub,Typ,Wd Sdng,WD,Wd Shng,Gd
4,BrkFace,350.0,SBrkr,RL,AllPub,Typ,VinylSd,WD,VinylSd,Gd
...,...,...,...,...,...,...,...,...,...,...
1455,,0.0,SBrkr,RL,AllPub,Typ,VinylSd,WD,VinylSd,TA
1456,Stone,119.0,SBrkr,RL,AllPub,Min1,Plywood,WD,Plywood,TA
1457,,0.0,SBrkr,RL,AllPub,Typ,CemntBd,WD,CmentBd,Gd
1458,,0.0,FuseA,RL,AllPub,Typ,MetalSd,WD,MetalSd,Gd


In [None]:
df_train['MasVnrArea'] = df_train['MasVnrArea'].fillna(0)
df_test['MasVnrArea'] = df_test['MasVnrArea'].fillna(0)

In [None]:
df_train['Electrical'].mode()[0]

'SBrkr'

In [None]:
df_train['Electrical'] = df_train['Electrical'].fillna(df_train['Electrical'].mode()[0])

In [None]:
missed_data = get_common_missed_data(df_train, df_test)
missed_data

Unnamed: 0,feature,NaN count_train,NaN share_train,NaN count_test,NaN share_test
0,MasVnrType,8.0,0.005479,16,0.010966
1,MSZoning,,,4,0.002742
2,Utilities,,,2,0.001371
3,Functional,,,2,0.001371
4,Exterior2nd,,,1,0.000685
5,SaleType,,,1,0.000685
6,Exterior1st,,,1,0.000685
7,KitchenQual,,,1,0.000685


In [None]:
for feature in missed_data['feature']:
    print(feature, df_test[feature].mode()[0])
    df_test[feature] = df_test[feature].fillna(df_test[feature].mode()[0])

MasVnrType None
MSZoning RL
Utilities AllPub
Functional Typ
Exterior2nd VinylSd
SaleType WD
Exterior1st VinylSd
KitchenQual TA


In [None]:
missed_data = get_common_missed_data(df_train, df_test)
missed_data

Unnamed: 0,feature,NaN count_train,NaN share_train,NaN count_test,NaN share_test
0,MasVnrType,8,0.005479,,


In [None]:
num_to_cat_features = ['MSSubClass', 'OverallQual', 'OverallCond']

In [None]:
for feature in num_to_cat_features:
    df_train[feature] = df_train[feature].astype(str)
    df_test[feature] = df_test[feature].astype(str)

In [None]:
TARGET = 'SalePrice'
COLUMNS_TO_DROP = ['Id', 'SalePrice']

def preprocess_data(data, columns_to_drop, target):
    X = data.drop(columns=columns_to_drop)
    y = data[target]
    return X, y

X_train, y_train = preprocess_data(df_train, COLUMNS_TO_DROP, TARGET)
ohe = OneHotEncoder(handle_unknown='ignore')
X_train = ohe.fit_transform(X_train)

In [None]:
def log_rmse(y_true, y_pred, **kwargs):
    return mean_squared_error(np.log(y_true), np.log(y_pred), squared=False)

In [None]:
log_rmse_scorer = make_scorer(log_rmse, greater_is_better=False)

In [None]:
best_score = None
best_depth = None

for max_depth in np.arange(1, 20):
    reg = DecisionTreeRegressor(max_depth=max_depth, random_state=42)

    scores = -cross_val_score(reg, X_train, y_train, scoring=log_rmse_scorer)

    current_score = np.mean(scores)

    if best_score is None or current_score < best_score:
        best_score = current_score
        best_depth = max_depth


print(best_depth, best_score)

9 0.23539015068165803


In [None]:
param_grid = [{'criterion': ['squared_error', 'friedman_mse'], 'max_depth': range(1, 20, 1)},
              {'criterion': ['squared_error', 'friedman_mse'], 'min_samples_leaf': range(1, 5, 1)}]

In [None]:
reg = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid=param_grid, scoring=log_rmse_scorer)

In [None]:
reg.fit(X_train, y_train)

In [None]:
print(reg.best_params_)
print(reg.best_score_)

{'criterion': 'squared_error', 'min_samples_leaf': 4}
-0.22790944082063377


In [None]:
lin_reg_param_grid = {'alpha': [0.001, 0.01, 0.1]}

In [None]:
lin_reg = GridSearchCV(Ridge(), param_grid=lin_reg_param_grid, scoring=log_rmse_scorer)

In [None]:
lin_reg.fit(X_train, y_train)

In [None]:
print(lin_reg.best_params_)
print(lin_reg.best_score_)

{'alpha': 0.1}
-0.17070034506948337


In [None]:
X_test = df_test.drop(columns=COLUMNS_TO_DROP, errors='ignore')
X_test = ohe.transform(X_test)

In [None]:
y_test_pred = lin_reg.predict(X_test)

In [None]:
y_test_pred

array([121045.25485794, 148479.71350149, 195021.12914627, ...,
       167818.59686053, 104643.10526456, 245455.33130642])

In [None]:
output = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': y_test_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
