# Learning from Kaggle 1
## Melbourne Housing

In [18]:
import pandas as pd

# Path to the Melbourne housing CSV (a relative path, so it is resolved from the working directory)
melbourne_file_path = 'melb_data.csv'
# Read the whole file into a DataFrame
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)
# Show the available column names so a target and features can be picked
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [19]:
# Discard every row that has a missing value in any column
melbourne_data = melbourne_data.dropna(axis="index", how="any")

### Selecting the Prediction Target

In [20]:
# The sale price is the value the model will learn to predict
y = melbourne_data["Price"]

### Choosing "Features"
Features are the columns the model uses as inputs when predicting the target.

In [21]:
# Input columns for the model (the spellings match the dataset's own column names)
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [22]:
# Keep all rows, but only the chosen feature columns
X = melbourne_data.loc[:, melbourne_features]

In [23]:
# Summary statistics per feature, with the default quartiles written out explicitly
X.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [24]:
# Peek at the first five feature rows
X.head(n=5)

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


## Building the Model

Use the scikit-learn library to build the model. In code, the library is imported as `sklearn`.

In [25]:
from sklearn.tree import DecisionTreeRegressor

# Build the regressor; fixing random_state gives the same results on every run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Train on the selected features X against the sale prices y
melbourne_model.fit(X=X, y=y)

Make predictions for the first few rows of the training data to see how the `predict` function works.

In [26]:
# Show the first five feature rows, followed by the prices the model predicts for them
first_houses = X.head()
print("Making predictions for the following 5 houses:")
print(first_houses)
print(melbourne_model.predict(first_houses))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
[1035000. 1465000. 1600000. 1876000. 1636000.]


## Model Validation
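Calculate the Mean Absolute Error (MAE).

For each house, the prediction error is $\text{error} = \text{actual} - \text{predicted}$. MAE is the average of the absolute values of these errors.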


In [27]:
from sklearn.metrics import mean_absolute_error

# Score the model on the same rows it was trained on (in-sample error)
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

1115.7467183128902

The validation above is not reliable, because the same data was used both to train the model and to validate it. For an honest estimate of accuracy, split the data in two: one part to train the model and a separate part to validate it and calculate the Mean Absolute Error (MAE).
To do this, use the `train_test_split` function from the scikit-learn library.

In [28]:
from sklearn.model_selection import train_test_split

# Split data into training and validation data, for both features (X) and target (y).
# random_state pins the shuffle so the same rows land in each split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Define model. Seed it as well: without random_state, tie-breaking between
# equally good splits can vary, so the validation MAE would change from run to run.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit model on the training split only
melbourne_model.fit(train_X, train_y)

# Get predicted prices on validation data (rows the model was not trained on)
val_prediction = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_prediction))

272253.33225737035


## Overfitting and Underfitting
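- **Overfitting:** too many leaves. The tree matches the training data very closely but predicts poorly on new data.
- **Underfitting:** too few leaves. The tree fails to capture important patterns, so it predicts poorly even on the training data.

The goal is to find the tree size that gives the lowest Mean Absolute Error on the validation data.
Use a utility function to help compare MAE scores for different values of `max_leaf_nodes`.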

In [29]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X``/``train_y``; its predictions for ``val_X`` are then compared
    with ``val_y`` using ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    fitted_tree = tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, fitted_tree.predict(val_X))

In [30]:
# Sweep tree sizes from very small to very large and report the validation MAE for each
candidate_max_leaf_nodes = [5, 50, 500, 5000]
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    report = "Max leaf nodes: %d \t\t Mean Absolute Error: %d" % (max_leaf_nodes, my_mae)
    print(report)

Max leaf nodes: 5 		 Mean Absolute Error: 385696
Max leaf nodes: 50 		 Mean Absolute Error: 279794
Max leaf nodes: 500 		 Mean Absolute Error: 261718
Max leaf nodes: 5000 		 Mean Absolute Error: 271320


In [31]:
# Derive the best tree size from the validation scores instead of hard-coding it,
# so the choice stays correct if the data, features, or candidate sizes change.
candidate_max_leaf_nodes = [5, 50, 500, 5000]
mae_by_leaf_count = {
    leaf_count: get_mae(leaf_count, train_X, val_X, train_y, val_y)
    for leaf_count in candidate_max_leaf_nodes
}
# The dict keys are the candidate sizes; pick the one with the lowest validation MAE
best_tree_size = min(mae_by_leaf_count, key=mae_by_leaf_count.get)

## Fit Model Using All Data
Now that the best tree size is known, refit the model on all of the data (training and validation together) to get more accurate results.

In [32]:
# Rebuild the tree with the chosen leaf limit and train it on every available row
final_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=best_tree_size)
final_model.fit(X=X, y=y)

## Random Forests
A random forest uses many trees and makes a prediction by averaging the predictions of each component tree.
This time, use the `RandomForestRegressor` class instead of `DecisionTreeRegressor`.

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Train a forest on the training split; random_state=1 fixes its randomness
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)

# Score the forest on the validation split
melb_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(y_true=val_y, y_pred=melb_preds)
print(forest_mae)

207190.6873773146
