In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
melbourne_data = pd.read_csv('C:\\Users\\seba_\\Documents\\melb_data.csv',index_col=0)
#pd.set_option("display.max_rows", 5)
melbourne_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [3]:
# hasnans permite saber si hay algun valor nan en la columna
melbourne_data.Price.hasnans

False

In [4]:
# target (predecir los precios)
y = melbourne_data.Price

In [5]:
imputed = pd.DataFrame()

imputed['Bathroom'] = melbourne_data.Bathroom.fillna(melbourne_data.Bathroom.mean())
imputed['Landsize'] = melbourne_data.Landsize.fillna(melbourne_data.Landsize.mean())
imputed['BuildingArea'] = melbourne_data.BuildingArea.fillna(melbourne_data.BuildingArea.mean())
imputed['YearBuilt'] = melbourne_data.YearBuilt.fillna(round(melbourne_data.YearBuilt.mean()))
imputed['Lattitude'] = melbourne_data.Lattitude.fillna(melbourne_data.Lattitude.mean())
imputed['Longtitude'] = melbourne_data.Longtitude.fillna(melbourne_data.Longtitude.mean())
imputed['Distance'] = melbourne_data.Distance.fillna(melbourne_data.Distance.mean())
imputed['Car'] = melbourne_data.Car.fillna(round(melbourne_data.Car.mean()))
imputed.head()

Unnamed: 0,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Distance,Car
1,1.0,202.0,151.220219,1966.0,-37.7996,144.9984,2.5,1.0
2,1.0,156.0,79.0,1900.0,-37.8079,144.9934,2.5,0.0
4,2.0,134.0,150.0,1900.0,-37.8093,144.9944,2.5,0.0
5,2.0,94.0,151.220219,1966.0,-37.7969,144.9969,2.5,1.0
6,1.0,120.0,142.0,2014.0,-37.8072,144.9941,2.5,2.0


In [6]:
typeDummies = pd.get_dummies(melbourne_data.Type,prefix='type')

In [7]:
regionNameDummies = pd.get_dummies(melbourne_data.Regionname,prefix='regionName')

In [8]:
#method = pd.get_dummies(melbourne_data.Method,prefix='method')
#no mejora el R2

In [9]:
X = pd.concat([imputed, typeDummies, regionNameDummies], axis=1)

In [10]:
xe, xv, ye, yv = train_test_split(X, y, train_size = 0.5, test_size = 0.5)

In [11]:
print('Dimensiones de X:',X.shape)
print('Corresponde al 0.5% de X:',xe.shape)

Dimensiones de X: (18396, 19)
Corresponde al 0.7% de X: (9198, 19)


In [12]:
# Define model
melbourne_model = DecisionTreeRegressor()

# Fit model
melbourne_model.fit(xe, ye)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [13]:
zv = melbourne_model.predict(xv)
print('MAE:',mean_absolute_error(yv, zv))
print('R2:',r2_score(yv,zv))

MAE: 247921.59208228838
R2: 0.568400603749394


In [14]:
forest_model = RandomForestRegressor()
forest_model.fit(xe, ye)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:
zv2 = forest_model.predict(xv)
print('MAE:',mean_absolute_error(yv, zv2))
print('R2:',r2_score(yv,zv2))

MAE: 199950.12276997208
R2: 0.7087966999266533


In [16]:
X.describe()

Unnamed: 0,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Distance,Car,type_h,type_t,type_u,regionName_Eastern Metropolitan,regionName_Eastern Victoria,regionName_Northern Metropolitan,regionName_Northern Victoria,regionName_South-Eastern Metropolitan,regionName_Southern Metropolitan,regionName_Western Metropolitan,regionName_Western Victoria
count,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0,18396.0
mean,1.538492,558.116371,151.220219,1965.941563,-37.809849,144.996338,10.389986,1.690259,0.65748,0.108991,0.233529,0.108447,0.00424,0.288487,0.003425,0.036965,0.344803,0.211296,0.002283
std,0.62088,3428.730081,337.236125,25.827948,0.073436,0.09626,6.008886,0.871371,0.474565,0.311636,0.423087,0.310953,0.064979,0.453071,0.058422,0.18868,0.475317,0.408239,0.047729
min,0.0,0.0,0.0,1196.0,-38.18255,144.43181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,250.75,140.0,1966.0,-37.8454,144.950952,6.3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.538492,558.116371,151.220219,1966.0,-37.809849,144.996338,9.7,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,596.0,151.220219,1970.0,-37.7674,145.0469,13.3,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
max,8.0,433014.0,44515.0,2018.0,-37.40853,145.52635,48.1,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
