In [None]:
# Mount Google Drive at /content/drive so later cells can copy the data files from it.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime;
# skip this cell (and supply the CSV yourself) when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the datafiles
!mkdir AmesIowaHousingData
!cp drive/MyDrive/cs32-public/chap17/AmesIowaHousingData/AmesHousing.csv ./AmesIowaHousingData

### chap17/ames.ipynb

This interactive Python notebook creates an ML model based on the `DecisionTreeRegressor` from `scikit-learn`. You set the training data by changing the input CSV file in the following code block.

In [None]:
# Specify the input CSV file for building and testing the model.
# The path is relative to the working directory; change it to train on a different file.
csv_file = 'AmesIowaHousingData/AmesHousing.csv'

### Working with `pandas`

Before we begin to build our machine learning model, let's practice using a few of the important `DataFrame` functions and methods.

In [None]:
import pandas as pd

In [None]:
# Read the CSV data named by csv_file into a DataFrame; every later cell works from df
df = pd.read_csv(csv_file)

# Show a statistical summary of the data (last expression, so the notebook renders it as a table)
df.describe()

If you want to see more of the columns, you can raise the `pandas` display option for the maximum number of columns displayed. The default is 20.

In [None]:
# Cap the number of columns pandas renders for a DataFrame; raise the value to see more
pd.set_option('display.max_columns', 20)

In [None]:
# Review only the statistics for number of bedrooms.
# Selecting the column first lets pandas summarise a single Series rather than
# describing the whole DataFrame and then discarding all but one column.
df['Bedroom AbvGr'].describe()

### Setting up the prediction model

In [None]:
# Setting the prediction target: the sale price of each home.
# (Attribute access, df.SalePrice, selects the same column.)
y = df['SalePrice']
y

### Picking some features

In [None]:
# List every column label in the data so we can choose input features from them
df.columns

In [None]:
# Columns the model receives as inputs (one name per line so edits are easy to review)
feature_names = [
    "Lot Area",
    "Year Built",
    "1st Flr SF",
    "2nd Flr SF",
    "Full Bath",
    "Bedroom AbvGr",
    "TotRms AbvGrd",
]

# Feature matrix: just the chosen columns of df
X = df[feature_names]

# Summary statistics for the selected features
X.describe()


In [None]:
# Peek at the first few rows of the feature matrix
X.head()

### Fit the model to our data

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Create an untrained model (random_state is pinned so re-runs use the same seed)
my_model = DecisionTreeRegressor(random_state=42)

# Fit the model to ALL of the data: every row of X and y is used for training,
# so nothing is held back here for checking the predictions.
my_model.fit(X, y)

### Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Split both features and target data into training and validation sets.
# random_state is pinned so re-running the cell reproduces the same split.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)

# Fit a fresh model using only the training data
my_model = DecisionTreeRegressor(random_state=42)
my_model.fit(train_X, train_y)

# Feed the model the test data and capture the resulting predictions
predictions = my_model.predict(test_X)

# Compare up to the first 5 predictions against actual sale prices.
# zip() over slices simply stops early when the test split has fewer than
# 5 rows (e.g. after switching to a small CSV); indexing with range(5)
# would raise IndexError in that case.
actuals = test_y.to_list()
for predicted, a in zip(predictions[:5], actuals[:5]):
    p = int(predicted)  # truncate to whole dollars for display
    d = p - a           # positive => the model over-estimated
    print(f'prediction = ${p}; actual = ${a}; diff = ${d:>6}')

### Model validation

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the held-out sale prices and the model's predictions
test_mae = mean_absolute_error(y_true=test_y, y_pred=predictions)
print(f'MAE = ${int(test_mae)}')

# Put the error in context: express it as a share of the average home price
test_mean = test_y.mean()
print(f'Mean price = ${int(test_mean)}')
print(f'Percentage of price = {int(100 * test_mae / test_mean)}%')

### Finding a good fit

In [None]:
# Find the best model by varying the size of the decision tree
best_num_leaves = 0
# Start from +infinity so the first candidate always becomes the running best,
# whatever the scale of the target; a fixed sentinel such as 9999999.9 would
# never be beaten if every MAE happened to be larger than it.
lowest_mae = float('inf')

# NOTE(review): the same test split is used here to pick the tree size and
# elsewhere to report MAE, so the winning MAE is likely optimistic — consider
# a separate validation split if an unbiased figure is needed.

# Run the experiment
print('Leaves\tMAE')
for leaves in [4, 16, 64, 128, 1024, 16384, 131072]:
    # Loop-local names: my_model / predictions / test_mae from the earlier cells
    # are left untouched, so re-running those cells' reports still describes
    # the model they trained rather than the last candidate tried here.
    candidate_model = DecisionTreeRegressor(max_leaf_nodes=leaves, random_state=42)
    candidate_model.fit(train_X, train_y)
    candidate_predictions = candidate_model.predict(test_X)
    candidate_mae = mean_absolute_error(test_y, candidate_predictions)
    print(f'{leaves}\t${int(candidate_mae)}')

    if candidate_mae < lowest_mae:
        # Update best
        best_num_leaves = leaves
        lowest_mae = candidate_mae

# Report best
print(f'\nBEST model uses {best_num_leaves} leaves')

### Try a different regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Will a different regressor produce an even better model?
# Train a random forest on the same training split (fit() hands back the fitted estimator).
another_model = RandomForestRegressor(random_state=42).fit(train_X, train_y)

# Score it on the same held-out rows used for the decision tree
predictions2 = another_model.predict(test_X)
test_mae = mean_absolute_error(y_true=test_y, y_pred=predictions2)
print(f'MAE (Random Forest) = ${int(test_mae)}')

# Compare the MAE against the average home price (test_mean comes from the model-validation cell)
print(f'Mean price = ${int(test_mean)}')
print(f'Percentage of price = {int(100 * test_mae / test_mean)}%')