In [1]:
import pandas as pd
# Load the house-prices CSV; "?" entries are parsed as missing values.
target_name = "SalePrice"
ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?")

# Separate the column to predict from the remaining feature columns.
target = ames_housing[target_name]
data = ames_housing.drop(columns=target_name)

In [2]:
# Columns used as numerical inputs by the first models below.
# The order is kept as-is because it determines the column order of
# `data_numerical`.
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1",
    "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
    "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

# Restrict the feature frame to the columns listed above.
data_numerical = data[numerical_features]

In [3]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Linear model on the numerical features: standardize, fill in missing
# values, then fit a linear regression.
model = make_pipeline(
    StandardScaler(),
    SimpleImputer(),
    LinearRegression(),
)

# 10-fold cross-validation scored with R2; print mean +/- std of the
# per-fold test scores.
cv_result = cross_validate(
    estimator=model,
    X=data_numerical,
    y=target,
    scoring="r2",
    cv=10,
)
print(f"{cv_result['test_score'].mean():.3f} +/- {cv_result['test_score'].std():.3f}")

0.719 +/- 0.141


In [4]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree model on the numerical features: fill in missing values,
# then fit a regression tree.
# random_state is fixed because the tree permutes the features at every
# split, so an unseeded tree can give different scores on each run. The
# seed matches the DecisionTreeRegressor(random_state=0) used for the
# full-feature model in this notebook.
model = make_pipeline(
    SimpleImputer(),
    DecisionTreeRegressor(random_state=0),
)

# 10-fold cross-validation scored with R2; print mean +/- std of the
# per-fold test scores.
cv_result = cross_validate(model, data_numerical, target,
                           cv=10, scoring='r2')
print(f"{cv_result['test_score'].mean():.3f} +/- {cv_result['test_score'].std():.3f}")

0.637 +/- 0.086


In [5]:
#Q2
from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate tree depths 1..15, addressed through the pipeline step name.
param_grid = {"decisiontreeregressor__max_depth": np.arange(1, 16)}

# 10-fold grid search scored with R2 over the current `model` pipeline.
# fit() returns the fitted search object, so construction and fitting
# are chained.
tree_reg = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="r2",
    cv=10,
).fit(data_numerical, target)

tree_reg.cv_results_

{'mean_fit_time': array([0.004879  , 0.0057874 , 0.00673494, 0.00770848, 0.00858746,
        0.00946727, 0.01028898, 0.01106436, 0.01184275, 0.01252937,
        0.01316512, 0.01371272, 0.01426158, 0.01455791, 0.01488969]),
 'std_fit_time': array([2.77734313e-04, 4.45091729e-05, 2.98101796e-05, 3.72798612e-05,
        1.68486886e-05, 4.96012879e-05, 6.83994981e-05, 7.53226931e-05,
        7.92964890e-05, 6.08074889e-05, 5.72114431e-05, 1.11823400e-04,
        1.41618436e-04, 8.43003762e-05, 6.58793678e-05]),
 'mean_score_time': array([0.00154414, 0.00149415, 0.00147495, 0.00147669, 0.00147712,
        0.00147903, 0.00148141, 0.00149875, 0.00147288, 0.00147693,
        0.00149264, 0.00149515, 0.00148573, 0.00148146, 0.00148582]),
 'std_score_time': array([4.30286026e-05, 1.48473531e-05, 6.29732028e-06, 4.73493536e-06,
        6.42691764e-06, 3.35018403e-06, 6.28652424e-06, 2.44654640e-05,
        4.86636792e-06, 6.33857215e-06, 1.76172455e-05, 1.91693430e-05,
        1.79999553e-05, 7.10

In [7]:
# Summarize the grid search: one row per candidate, best mean test score
# first, keeping only the searched parameter columns and the mean/std of
# the test score.
results = (
    pd.DataFrame(tree_reg.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
    .pipe(
        lambda frame: frame[
            [name for name in frame.columns if name.startswith("param_")]
            + ["mean_test_score", "std_test_score"]
        ]
    )
)

In [8]:
# Display the grid-search summary table (sorted by mean test score,
# highest first).
results

Unnamed: 0,param_decisiontreeregressor__max_depth,mean_test_score,std_test_score
5,6,0.704576,0.045693
6,7,0.698767,0.059153
4,5,0.684881,0.089453
8,9,0.665907,0.07812
3,4,0.662526,0.104945
9,10,0.648911,0.084595
7,8,0.647331,0.113612
11,12,0.632029,0.078489
13,14,0.622802,0.095768
10,11,0.608375,0.100468


In [9]:
#Q4
from sklearn.compose import make_column_selector as selector

# Split the feature columns by dtype: object-dtype columns are treated
# as categorical, every other column as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Apply each selector to the feature frame to get the column-name lists.
categorical_columns = categorical_columns_selector(data)
numerical_columns = numerical_columns_selector(data)

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill in missing values with SimpleImputer's defaults.
num_transformer = make_pipeline(SimpleImputer())

# Categorical columns: fill in missing values with the most frequent
# category, then one-hot encode. handle_unknown='ignore' keeps
# categories that only appear in a test fold from raising at transform
# time.
cat_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each group of columns to its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num-preprocessor", num_transformer, numerical_columns),
    ('cat-preprocessor', cat_transformer, categorical_columns),
])

model = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=0))

# Request R2 explicitly, as the other cross-validation calls in this
# notebook do. It is also the default score of a regressor, so the
# numbers are unchanged.
cv_result = cross_validate(model, data, target, cv=10, scoring='r2')

scores1 = cv_result["test_score"]
# The reported metric is R2, not a classification accuracy.
print("The mean cross-validation R2 score is: "
      f"{scores1.mean():.3f} +/- {scores1.std():.3f}")

The mean cross-validation accuracy is: 0.743 +/- 0.129
