In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [10]:
# Location of the raw King County house-data CSV on GitHub.
url = (
    'https://raw.githubusercontent.com/mGalarnyk/Tutorial_Data/'
    'master/King_County/kingCountyHouseData.csv'
)

In [11]:
# Fetch the CSV from `url` and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [12]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(21613, 21)

In [13]:
# Preview the first five rows to see which columns are available.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [14]:
# Columns kept for modelling: five predictors followed by the target, price.
columns = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'price',
]

In [15]:
# Keep every row but only the columns listed in `columns`.
df_pred = df.loc[:, columns]

In [16]:
# Preview the reduced frame to confirm the column selection.
df_pred.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,price
0,3,1.0,1180,5650,1.0,221900.0
1,3,2.25,2570,7242,2.0,538000.0
2,2,1.0,770,10000,1.0,180000.0
3,4,3.0,1960,5000,1.0,604000.0
4,3,2.0,1680,8080,1.0,510000.0


In [17]:
# Predictor columns (the target, price, is deliberately left out).
features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
]
# Feature matrix: all rows, predictor columns only.
X = df_pred.loc[:, features]

In [18]:
# Target: price. The list selector ['price'] keeps y as a one-column DataFrame
# rather than a Series.
y = df_pred.loc[:, ['price']]

In [19]:
# Display the feature matrix (pandas abbreviates the middle rows of a long frame).
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors
0,3,1.00,1180,5650,1.0
1,3,2.25,2570,7242,2.0
2,2,1.00,770,10000,1.0
3,4,3.00,1960,5000,1.0
4,3,2.00,1680,8080,1.0
...,...,...,...,...,...
21608,3,2.50,1530,1131,3.0
21609,4,2.50,2310,5813,2.0
21610,2,0.75,1020,1350,2.0
21611,3,2.50,1600,2388,2.0


In [20]:
# Display the target frame (pandas abbreviates the middle rows of a long frame).
y

Unnamed: 0,price
0,221900.0
1,538000.0
2,180000.0
3,604000.0
4,510000.0
...,...
21608,360000.0
21609,400000.0
21610,402101.0
21611,400000.0


In [21]:
# Split into 75% training / 25% test rows; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=0,
)

In [22]:
# Size of the training features as (rows, columns).
X_train.shape

(16209, 5)

In [23]:
# Check what type the test features have after the split.
type(X_test)

pandas.core.frame.DataFrame

#### 2. MAKE AN INSTANCE OF THE MODEL

In [24]:
# Regression tree limited to depth 2, with a fixed random_state for repeatable fits.
reg = DecisionTreeRegressor(
    random_state=0,
    max_depth=2,
)

#### 3. TRAIN THE MODEL ON THE DATA

In [25]:
# Fit the tree on the training features and their prices.
reg.fit(X_train, y_train)

#### 4. PREDICT LABELS OF UNSEEN TEST DATA

In [26]:
# Predict prices for the first ten rows of the test set (positional slice).
reg.predict(X_test.iloc[:10])

array([ 406622.58288211, 1095030.54807692,  406622.58288211,
        406622.58288211,  657115.94280443,  406622.58288211,
        406622.58288211,  657115.94280443,  657115.94280443,
       1095030.54807692])

In [27]:
# First test row, shown as a one-row DataFrame.
X_test.head(1)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors
17384,2,1.5,1430,1650,3.0


In [28]:
# Same row as a Series; the mixed int/float columns come back as float64.
X_test.iloc[0]

bedrooms          2.0
bathrooms         1.5
sqft_living    1430.0
sqft_lot       1650.0
floors            3.0
Name: 17384, dtype: float64

In [29]:
# Predict a single observation.
# Selecting with a list indexer (`iloc[[0]]`) keeps the row as a one-row
# DataFrame. The model was fitted on a DataFrame, so passing a bare NumPy
# array (`.values.reshape(1, -1)`) makes scikit-learn warn that X does not
# have valid feature names. The list indexer also avoids upcasting the
# integer columns to float64 and needs no manual reshape.
reg.predict(X_test.iloc[[0]])



array([406622.58288211])

In [30]:
# Score the fitted tree on the held-out test set (R^2 for a regressor).
score = reg.score(X_test, y_test)

In [31]:
# Display the test-set score.
score

0.4380405655348807

# Evaluate the model using additional metrics

In [32]:
# Predict a price for every row of the test set; the metric cells below reuse y_pred.
y_pred = reg.predict(X_test)

In [33]:
# R-squared (coefficient of determination) of the test-set predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared: {r2:.4f}')

R-squared: 0.4380


In [34]:
# Mean absolute error of the test-set predictions, in the same units as price.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error: {mae:.2f}')


Mean Absolute Error: 177434.03


In [35]:
# Mean squared error of the test-set predictions (squared price units).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse:.2f}')

Mean Squared Error: 74651021633.69


In [36]:
# Root mean squared error: the square root of the MSE computed above, which
# brings the error back to the same units as price.
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 273223.39


In [37]:
# Recap: print all four test-set metrics computed above in one place.
print(f'R-squared: {r2:.4f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'Mean Squared Error: {mse:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')

R-squared: 0.4380
Mean Absolute Error: 177434.03
Mean Squared Error: 74651021633.69
Root Mean Squared Error: 273223.39
