#### **Step 1: Import Libraries**

In [None]:
import pandas as pd
import sklearn

#### **Step 2: Load the Data**

In [None]:
# House Prices Dataset from Kaggle.
# The path is relative, so it is resolved against the notebook's working directory.
iowa_file_path = "data/Housing-Prices-Competition-for Kaggle-Learn-Users/train.csv" 
# Read the training CSV into a DataFrame
home_data = pd.read_csv(iowa_file_path)

#### **Step 3: Review the Data**

In [35]:
# Preview the first rows to confirm the file loaded and to see which columns are available
home_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


#### **Step 4: Basic Data Cleaning**

In [119]:
# Drop every column that contains at least one missing value (NaN).
# axis="columns" is the named form of axis=1; axis="index" (axis=0) would drop rows instead.
home_data = home_data.dropna(axis="columns")

#### **Step 5: Specify Feature Matrix (X) and Prediction Target (y)**

In [71]:
# Prediction target: the sale price column (attribute access, home_data.SalePrice, is equivalent)
y = home_data['SalePrice']

# Columns used as model inputs
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Feature matrix: one column per entry in `features`
X = home_data[features]

#### **Step 6: Split the Dataset into Training and Validation Sets**

In [38]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for validation.
# A fixed random_state makes the split identical on every run.
(
    X_train,
    X_valid,
    y_train,
    y_valid,
) = train_test_split(X, y, random_state=1)

#### **Step 7: Specify and fit the model**
(fit = train = learn the patterns in the training data)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default hyperparameters; only random_state is set, for reproducible results
dtree_model = DecisionTreeRegressor(random_state=1)

# Fit on the training split only, so the validation split stays unseen for evaluation
dtree_model.fit(X_train, y_train)

#### **Step 8: Make Predictions on the Validation Set (Out-of-Sample)**

In [79]:
# Predict y for the validation features (X_valid), which the model did not see during fitting
y_pred = dtree_model.predict(X_valid) 
# Display the first ten predictions together with the shape of the prediction array
y_pred[:10], y_pred.shape

(array([186500., 184000., 130000.,  92000., 164500., 220000., 335000.,
        144152., 215000., 262000.]),
 (365,))

#### **Step 9: Model Evaluation**

In [40]:
from sklearn.metrics import mean_absolute_error

# scikit-learn metrics take the ground truth first and the predictions second.
# MAE is symmetric, so the printed value is unchanged, but this order matches the
# other mean_absolute_error calls in this notebook and stays correct if an
# asymmetric metric is substituted later.
val_mae = mean_absolute_error(y_valid, y_pred)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

Validation MAE when not specifying max_leaf_nodes: 29,653


#### **Step 10: Model Tuning - Approach 1: Use function-loop-list**

##### **Step 10.1: For Decision Tree Model - max_leaf_nodes (tree size)** 

In [41]:
def dtree_compare_max_leaf_nodes(max_leaf_nodes=100):
    """Fit a decision tree with the given leaf limit and return its validation MAE.

    Reads the notebook-level X_train, y_train, X_valid and y_valid.

    Parameters
    ----------
    max_leaf_nodes : int, default=100
        Value passed to DecisionTreeRegressor(max_leaf_nodes=...).

    Returns
    -------
    float
        Mean absolute error on the validation split, rounded to 2 decimals.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_valid)
    return round(mean_absolute_error(y_valid, predictions), 2)


# Validation MAE for each candidate tree size (20, 40, ..., 180).
# Note: despite its name, this list stores the MAE values, not the leaf counts.
list_max_leaf_nodes = []
for i in range(20, 200, 20):
    # Train once per candidate and reuse the score for both printing and storing
    candidate_mae = dtree_compare_max_leaf_nodes(i)
    print(candidate_mae)
    list_max_leaf_nodes.append(candidate_mae)
print(f"Best Validation MAE using the best max_leaf_nodes = {min(list_max_leaf_nodes)}")

28707.31
28106.18
27110.9
27389.89
27282.51
27322.73
27093.34
27159.35
27425.13
Best Validation MAE using the best max_leaf_nodes = 27093.34


##### **Step 10.2: For Random Forest Model - n_estimators** 

In [42]:
from sklearn.ensemble import RandomForestRegressor


def rf_compare_n_estimators(n_estimators=100):
    """Fit a random forest with the given number of trees and return its validation MAE.

    Reads the notebook-level X_train, y_train, X_valid and y_valid.

    Parameters
    ----------
    n_estimators : int, default=100
        Value passed to RandomForestRegressor(n_estimators=...).

    Returns
    -------
    float
        Mean absolute error on the validation split, rounded to 2 decimals.
    """
    # random_state is fixed to 1 so repeated runs give the same forest
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=1)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_valid)
    return round(mean_absolute_error(y_valid, predictions), 2)


# Validation MAE for each candidate forest size (20, 40, ..., 180).
# Note: despite its name, this list stores the MAE values, not the n_estimators values.
list_n_estimators = []
for i in range(20, 200, 20):
    # Train once per candidate and reuse the score for both printing and storing
    candidate_mae = rf_compare_n_estimators(i)
    print(candidate_mae)
    list_n_estimators.append(candidate_mae)
print(f"Best Validation MAE using the best n_estimators = {min(list_n_estimators)}")

# Later iterations take longer to train because n_estimators grows each time


21924.85
22108.63
21891.25
21758.53
21857.16
21832.84
21844.24
21929.09
21973.68
Best Validation MAE using the best n_estimators = 21758.53


#### **Step 11: Other Approaches to evaluate, test, and tune hyperparameters**

In [None]:
# Other approaches to find the best hyperparameters

def get_mae(max_leaf_nodes, X_train, X_valid, y_train, y_valid):
    """Return the validation MAE of a decision tree limited to `max_leaf_nodes` leaves.

    The tree is fitted on (X_train, y_train) and scored on (X_valid, y_valid).
    Unlike the helpers that read notebook-level variables, the data splits are
    passed in explicitly.
    """
    # NOTE(review): random_state=0 here, while the earlier cells use random_state=1 - confirm this is intended
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

In [None]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Method 1: Write a loop to find the ideal tree size from candidate_max_leaf_nodes
scores = {}
for leaf_size in candidate_max_leaf_nodes:
    mae = get_mae(leaf_size, X_train, X_valid, y_train, y_valid)

    # dict[key] = value  ->  {tree size: validation MAE}
    scores[leaf_size] = mae

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500).
# min() iterates over the dict keys; key=scores.get ranks each key by its stored MAE.
best_tree_size = min(scores, key=scores.get)

# ---------------------------------------------------------------------------------------
print(f"Best Tree Size = {best_tree_size}\n")

# Reuse the MAE already computed in the loop instead of retraining the same model
# (get_mae uses a fixed random_state, so a second fit would give the same value).
val_mae_dtree_best_size = scores[best_tree_size]
print("Printing method 2: MAE: {:,.02f}\n".format(val_mae_dtree_best_size))

Best Tree Size = 100

Printing method 2: MAE: 27,282.51



In [117]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Method 2: create the dictionary in one line with {key: value for item in iterable}
scores = {
    leaf_size: get_mae(leaf_size, X_train, X_valid, y_train, y_valid)
    for leaf_size in candidate_max_leaf_nodes
}

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500).
# min() iterates over the dict keys; key=scores.get ranks each key by its stored MAE.
best_tree_size = min(scores, key=scores.get)

# ---------------------------------------------------------------------------------------
print(f"Best Tree Size = {best_tree_size}\n")

# Reuse the MAE already stored in `scores` instead of retraining the same model
# (get_mae uses a fixed random_state, so a second fit would give the same value).
val_mae_dtree_best_size = scores[best_tree_size]

print(f"Printing method 3: MAE: {val_mae_dtree_best_size:,.2f}\n")

Best Tree Size = 100

Printing method 3: MAE: 27,282.51



In [None]:
# Note: 3 ways of printing the same MAE value
# 1) round() inside an f-string: rounds to 2 decimals, no thousands separator
print(f"1: MAE: {round(val_mae_dtree_best_size, 2)}")

# 2) str.format() with the ",.02f" spec: thousands separator and exactly 2 decimals
print("2: MAE: {:,.02f}".format(val_mae_dtree_best_size))

# 3) f-string with the ",.2f" spec: same text as 2) in a more compact form
print(f"3: MAE: {val_mae_dtree_best_size:,.2f}")

3 Ways of Printing:

1: MAE: 27282.51
2: MAE: 27,282.51
3: MAE: 27,282.51


### **Python Tips:**

In [104]:
# Good to know 1: datetime is part of the standard library
from datetime import datetime

# Current calendar year according to the local system clock
current_year = datetime.now().year

# Years elapsed since 2010
# NOTE(review): 2010 is presumably the most recent build year in the data - confirm
newest_home_age = current_year - 2010

newest_home_age

15

In [121]:
# Good to know 2: build a list from one delimited string with str.split()
feature_names = (
    'LotArea * YearBuilt * 1stFlrSF * 2ndFlrSF * FullBath * BedroomAbvGr * TotRmsAbvGrd'
    .split(' * ')  # the separator includes the surrounding spaces, so items come out trimmed
)
feature_names

['LotArea',
 'YearBuilt',
 '1stFlrSF',
 '2ndFlrSF',
 'FullBath',
 'BedroomAbvGr',
 'TotRmsAbvGrd']