In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
df=pd.read_csv('melb_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

# Droping all rows with null values (dropna(axis=0))

In [5]:
df=df.dropna(axis=0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6196 entries, 1 to 12212
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         6196 non-null   object 
 1   Address        6196 non-null   object 
 2   Rooms          6196 non-null   int64  
 3   Type           6196 non-null   object 
 4   Price          6196 non-null   float64
 5   Method         6196 non-null   object 
 6   SellerG        6196 non-null   object 
 7   Date           6196 non-null   object 
 8   Distance       6196 non-null   float64
 9   Postcode       6196 non-null   float64
 10  Bedroom2       6196 non-null   float64
 11  Bathroom       6196 non-null   float64
 12  Car            6196 non-null   float64
 13  Landsize       6196 non-null   float64
 14  BuildingArea   6196 non-null   float64
 15  YearBuilt      6196 non-null   float64
 16  CouncilArea    6196 non-null   object 
 17  Lattitude      6196 non-null   float64
 18  Longtit

# select the column we want to predict & Choosing "Features"

In [7]:
y=df.Price
features=['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
x=df[features]

# Scikitlearn Decision Tree


In [31]:
from sklearn.tree import DecisionTreeRegressor

tree_model=DecisionTreeRegressor(random_state=12)
tree_model.fit(x,y)

DecisionTreeRegressor(random_state=12)

In [32]:
print("Making predictions for the following 5 houses:")
print(x.head())
print("The predictions are")
print(tree_model.predict(x.head()))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [33]:
y.head()

1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64

# mean absolute error mean(abs(actual-predicted))

In [34]:
from sklearn.metrics import mean_absolute_error
predicted_price=tree_model.predict(x)
mean_absolute_error(y,predicted_price)

1115.7467183128902

# train_test_split from sklearn.model_selection

In [51]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1)
des_tree=DecisionTreeRegressor(random_state=1)
des_tree.fit(x_train,y_train)
predicted=des_tree.predict(x_test)
print(predicted[:10])
print(y_test.head(10))

[ 503000. 1857000.  760000. 1395000. 4250000. 1195000.  905000. 2300000.
  810000.  710000.]
6048      620000.0
9186     2320000.0
3991      750000.0
5829     1120000.0
3616     6500000.0
2409      870000.0
10632     760000.0
5666     2600000.0
4287      830000.0
4246     1280000.0
Name: Price, dtype: float64


In [53]:
#mean absolute error
mae=mean_absolute_error(y_test,predicted)
mae

251688.7630729503

# max_leaf_nodes
 the max_leaf_nodes argument provides a very sensible way to control **overfitting vs underfitting**. The **more leaves we allow the model to make**, the more we move from the underfitting area in the above graph to the overfitting area.

In [60]:
def get_mae(max_leaf_nodes,train_x,train_y,test_x,test_y):
    model=DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes,random_state=0)
    model.fit(train_x,train_y)
    predicted_val=model.predict(test_x)
    mae=mean_absolute_error(test_y,predicted_val)
    return mae

train_x,test_x,train_y,test_y=train_test_split(x,y,random_state=0)

dic={}

for max_leaf in [5,50,25,100,150,200,250,500,550,700,750,5000]:
    new_mae=get_mae(max_leaf,train_x,train_y,test_x,test_y)
    print('max leaf node: %d \t\t mae: %d '%(max_leaf,new_mae))
    dic[max_leaf]=round(new_mae)
dic

max leaf node: 5 		 mae: 385696 
max leaf node: 50 		 mae: 279794 
max leaf node: 25 		 mae: 307919 
max leaf node: 100 		 mae: 269191 
max leaf node: 150 		 mae: 267918 
max leaf node: 200 		 mae: 270633 
max leaf node: 250 		 mae: 269945 
max leaf node: 500 		 mae: 261718 
max leaf node: 550 		 mae: 260545 
max leaf node: 700 		 mae: 261664 
max leaf node: 750 		 mae: 261713 
max leaf node: 5000 		 mae: 271996 


{5: 385697,
 50: 279795,
 25: 307920,
 100: 269192,
 150: 267918,
 200: 270633,
 250: 269945,
 500: 261718,
 550: 260545,
 700: 261664,
 750: 261713,
 5000: 271996}

#### Now we   know that max_leaf_nodes  is best at 550 by using that we fit the model to main x and y

In [62]:
final_model=DecisionTreeRegressor(max_leaf_nodes =550,random_state=0)
final_model.fit(x,y)

DecisionTreeRegressor(max_leaf_nodes=550, random_state=0)

.

.

# ########...........congratz you made your first ML model............########

.

.

.

# Random forest
Decision trees leave you with a difficult decision. A deep tree with lots of leaves will overfit because each prediction is coming from historical data from only the few houses at its leaf. But a shallow tree with few leaves will perform poorly because it fails to capture as many distinctions in the raw data.

Even today's most sophisticated modeling techniques face this tension between underfitting and overfitting. But, many models have clever ideas that can lead to better performance. We'll look at the random forest as an example.

The random forest uses many trees, and it makes a prediction by averaging the predictions of each component tree. It generally has much better predictive accuracy than a single decision tree and it works well with default parameters. If you keep modeling, you can learn more models with even better performance, but many of those are sensitive to getting the right parameters.

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split    
# Load data
#melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv'
melbourne_data = pd.read_csv('melb_data.csv') 
# Filter rows with missing values
melbourne_data = melbourne_data.dropna(axis=0)
# Choose target and features
y = melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]



# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)

In [65]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

191669.7536453626


 #### max_leaf_nodes is best at *550* and mae=*260545*,   in random forest the mae=*191669*