# Predictions for House Prices competition

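This notebook is part of the Machine Learning course. It compares a decision tree and a random forest for predicting house sale prices, scoring each model by mean absolute error on a held-out validation split.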

# Decision Tree Model

In [24]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Read the training CSV into a DataFrame
main_file_path = '../input/train.csv'
iowa_data = pd.read_csv(main_file_path)

# Columns used as model inputs
predictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = iowa_data[predictors]
# Prediction target: the SalePrice column
y = iowa_data['SalePrice']

# Divide both the predictors and the target into a training part and a
# validation part. The rows are assigned by a random number generator, so a
# fixed random_state is passed to get the same split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# get mae using decision tree regressor 
def get_mae_for_decision_tree(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree and return its mean absolute error on validation data.

    Parameters:
        max_leaf_nodes: forwarded to DecisionTreeRegressor as ``max_leaf_nodes``.
        train_X: predictors the tree is fitted on.
        val_X: predictors the fitted tree makes predictions for.
        train_y: target values the tree is fitted on.
        val_y: true target values the predictions are compared against.

    Returns:
        The value of mean_absolute_error(val_y, predictions for val_X).
    """
    # random_state is fixed so repeated calls with the same data build the same tree
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))
    
print("Model using decision tree regressor.")
# Try several tree sizes and report the validation MAE obtained with each
candidate_leaf_counts = [5, 50, 500, 5000]
report_line = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d"
for max_leaf_nodes in candidate_leaf_counts:
    my_mae = get_mae_for_decision_tree(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d drops the fractional part of the MAE when printing
    print(report_line % (max_leaf_nodes, my_mae))


Model using decision tree regressor.
Max leaf nodes: 5  		 Mean Absolute Error:  35190
Max leaf nodes: 50  		 Mean Absolute Error:  27825
Max leaf nodes: 500  		 Mean Absolute Error:  32662
Max leaf nodes: 5000  		 Mean Absolute Error:  33382


# Random Forest Model

In [25]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Read the training CSV into a DataFrame. This repeats the loading and
# splitting steps of the decision tree cell, so this cell carries everything
# it uses.
main_file_path = '../input/train.csv'
iowa_data = pd.read_csv(main_file_path)

# Columns used as model inputs
predictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = iowa_data[predictors]
# Prediction target: the SalePrice column
y = iowa_data['SalePrice']

# Divide both the predictors and the target into a training part and a
# validation part. The rows are assigned by a random number generator, so a
# fixed random_state is passed to get the same split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

print("Model using random forest regressor.")
# Seed the forest so its internal randomness, and therefore the reported MAE,
# is the same on every run. The train/validation split and the decision tree
# model are already seeded with random_state=0; without a seed here the number
# printed below changes from run to run.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)
# Predict on the held-out validation predictors
iowa_predicted_vals = forest_model.predict(val_X)
# %d drops the fractional part of the MAE when printing
print("Mean Absolute Error: %d" %(mean_absolute_error(val_y, iowa_predicted_vals)))


Model using random forest regressor.
Mean Absolute Error: 24065
