In [None]:

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Location of the training CSV for the Iowa housing data
iowa_file_path = '../input/home-data-for-ml-course/train.csv'

# Load the CSV into a DataFrame that the rest of the notebook works from
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)

## Using a decision tree model, I wish to perform regression to predict the price of houses in Iowa based on several pertinent features. I will optimize my model using a decision tree regressor.

In [None]:
# Print the dataset's column labels so the name of the prediction target
# (the column the model should learn to predict) can be looked up.
print(home_data.columns)

In [None]:
# Prediction target: the SalePrice column of the loaded data
y = home_data['SalePrice']


In [None]:
# Columns used as model inputs
feature_names = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Feature table: only the columns named in feature_names
X = home_data[feature_names]


In [None]:
# Review data
# Print summary statistics for X. describe is a method and must be called;
# printing the bare attribute only shows the bound-method repr.
print(X.describe())

# Print the top few rows (head must likewise be called)
print(X.head())

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Specify the model: a decision tree regressor with a fixed seed so that
# repeated fits build the same tree
iowa_model = DecisionTreeRegressor(random_state=1)

# Train the tree on the full feature table and target
iowa_model.fit(X=X, y=y)


In [None]:
# Make predictions with the fitted model. X is the same feature table the
# model was fit on, so these are in-sample predictions rather than an
# estimate of how the model does on unseen houses.
predictions = iowa_model.predict(X)
print(predictions)


In [None]:
# Compare the model's output for the first five rows (head) against the
# recorded sale prices for those same rows
print("Predictions for the first 5 houses:", iowa_model.predict(X.head()), sep="\n")
print("Actual values:", y.head().tolist(), sep="\n")

## Improving our predictions by training/testing & model validation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Split the data into training and validation sets. The fixed random_state
# makes the split identical on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Define the model. The tree is seeded as well: without a random_state the
# fitted tree (and therefore the printed error) can differ between runs,
# which makes the result impossible to reproduce or compare.
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit on the training split only, so the validation rows stay unseen
iowa_model.fit(train_X, train_y)

# Predict prices for the held-out validation rows
val_predictions = iowa_model.predict(val_X)

# Mean absolute error between the true and predicted validation prices
print(mean_absolute_error(val_y, val_predictions))

Can we improve our model?

Since I am using a decision tree model, we can add more splits for the tree to increase the number of leaves we will have and the number of groups we are splitting up the houses into.

Overfitting: Too many leaves let the model match the training data almost perfectly, but it then performs poorly on new, unfamiliar data.
Underfitting: Too few leaves mean the model misses important patterns, so it performs poorly even on the training data.

# Optimization of our Model
Let's optimize our model to account for underfitting and overfitting.

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Helper for comparing tree sizes: the validation mean absolute error tells us
# which leaf limit works best. max_leaf_nodes is a constructor parameter of
# DecisionTreeRegressor that caps how many leaves the tree may have.
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a tree limited to ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (seeded with random_state=0) is fit on
    ``train_X``/``train_y``, used to predict ``val_X``, and scored against
    ``val_y`` with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [None]:
# Try a range of leaf limits and report the validation error for each one
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# %d formatting prints the error truncated to a whole number
report_line = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d"
for max_leaf_nodes in candidate_max_leaf_nodes:
    optimization_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(report_line % (max_leaf_nodes, optimization_mae))

Among the candidates tried, 100 leaf nodes gives the lowest validation error, so it appears to be the optimum tree size.

In [None]:
# Choose the candidate leaf limit with the lowest validation MAE instead of
# copying the number by hand from the printed table, so the choice stays
# correct if the data, features, or candidate list change.
# On a tie, min keeps the first (smallest-listed) candidate.
best_tree_size = min(
    candidate_max_leaf_nodes,
    key=lambda leaf_count: get_mae(leaf_count, train_X, val_X, train_y, val_y),
)

Now we will fit the model using all of our data in the set.

In [None]:
# Final model: a tree capped at the chosen size, trained on every row
# (training and validation rows together)
best_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1).fit(X, y)

# Show its output for the first five rows next to the recorded sale prices
print("Predictions for the first 5 houses using our new model:", best_model.predict(X.head()), sep="\n")
print("Actual values:", y.head().tolist(), sep="\n")



In [None]:
# Score on the validation data. For a regressor, .score() reports R^2
# (coefficient of determination), not classification accuracy.
# best_model was fit on all of X, which includes the validation rows, so
# scoring it on val_X would be optimistically biased. Score a tree of the same
# size that has only seen the training split instead.
validation_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
validation_model.fit(train_X, train_y)
validation_model.score(val_X, val_y)