In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error


In [2]:
df = pd.read_csv('./src/csv/melb_data.csv')
# df.columns.values
df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [3]:
y = df["Price"]
df["YearBuilt"] = df["YearBuilt"].fillna(0)
feature_columns = ['Landsize', 'YearBuilt', 'Bedroom2', 'Bedroom2', 'Lattitude', 'Longtitude']
X = df[feature_columns]
X.head()

Unnamed: 0,Landsize,YearBuilt,Bedroom2,Bedroom2.1,Lattitude,Longtitude
0,202.0,0.0,2.0,2.0,-37.7996,144.9984
1,156.0,1900.0,2.0,2.0,-37.8079,144.9934
2,134.0,1900.0,3.0,3.0,-37.8093,144.9944
3,94.0,0.0,3.0,3.0,-37.7969,144.9969
4,120.0,2014.0,3.0,3.0,-37.8072,144.9941


A tree's depth is a measure of how many splits it makes before coming to a prediction.

In [4]:
# Specify Model
iowa_model = DecisionTreeRegressor()
# Fit model
iowa_model.fit(X,y)

print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [1480000. 1035000. 1465000.  850000. 1600000.]
Actual target values for those homes: [1480000.0, 1035000.0, 1465000.0, 850000.0, 1600000.0]


In [5]:
train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=1)
dtr = DecisionTreeRegressor(random_state=1)
dtr.fit(train_X,train_y)
prediction = dtr.predict(val_X)
print("prediccion", prediction)
print("Actual:", val_y.head().tolist())

prediccion [1402000.  387000. 2300000. ... 1400000. 1200000.  650000.]
Actual: [1640000.0, 675000.0, 2800000.0, 615000.0, 2700000.0]


In [6]:
val_mse = mean_absolute_error(val_y, prediction)
print(val_mse)

244277.00186548848


# Underfitting and Overfitting

- The **max_leaf_nodes** argument provides a very sensible way to control overfitting vs underfitting.

In [7]:
def get_mae(max_leaf_node, train_X, val_X, train_y, val_y):
    dt = DecisionTreeRegressor(max_leaf_nodes=max_leaf_node, random_state=0)
    dt.fit(train_X, train_y)
    prediction = dt.predict(val_X)
    val_mae = mean_absolute_error(val_y, prediction)
    return(val_mae)

In [9]:
for max_leaf_nodes in [5, 50, 100, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  359423
Max leaf nodes: 50  		 Mean Absolute Error:  262370
Max leaf nodes: 100  		 Mean Absolute Error:  250219
Max leaf nodes: 500  		 Mean Absolute Error:  225670
Max leaf nodes: 5000  		 Mean Absolute Error:  240711


Of the options listed, 500 is the optimal number of leaves.

**Overfitting:** capturing spurious patterns that won't recur in the future, leading to less accurate predictions, or
<br>
**Underfitting:** failing to capture relevant patterns, again leading to less accurate predictions.

In [13]:
dtr = DecisionTreeRegressor(max_leaf_nodes=500, random_state=0)
dtr.fit(X,y)

prediction_real = dtr.predict(val_X)
mae = mean_absolute_error(val_y, prediction_real)
print(f"mae: {mae}")

mae: 163902.08369090268
