In [1]:
import pandas as pd
from data_prep import preprocess_data

In [2]:
# Read the Melbourne house price CSV (first column becomes the index) and
# pass it straight through the project's preprocessing step.
df = preprocess_data(
    pd.read_csv('datasets/melbourne_house_price.csv', index_col=0)
)

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) of the preprocessed frame.
df.describe()

Unnamed: 0,Rooms,Distance,Postcode,Bathroom,Car,Lattitude,Longtitude,Propertycount,Price_above_median,Sales_week,...,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
count,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,...,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0,24194.0
mean,2.979954,11.069984,3112.256303,1.576678,1.688214,-29.446075,112.92771,7553.879474,0.49938,30.078532,...,0.035174,0.002232,0.117136,0.005704,0.288129,0.005415,0.046995,0.320369,0.212945,0.003307
std,0.96028,6.644837,107.622672,0.616044,0.862154,15.692217,60.180056,4496.632709,0.50001,12.285655,...,0.184223,0.047192,0.321589,0.07531,0.452901,0.073386,0.211633,0.466628,0.409398,0.057409
min,1.0,0.0,3000.0,0.0,0.0,-38.1856,0.0,83.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,6.4,3046.0,1.0,1.0,-37.8433,144.769485,4294.0,0.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,10.3,3088.0,1.576678,1.688214,-37.773925,144.96878,6567.0,0.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,13.9,3152.0,2.0,2.0,-37.663553,145.0478,10331.0,1.0,41.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
max,16.0,48.1,3978.0,9.0,18.0,0.0,145.52635,21650.0,1.0,49.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Data Partitioning
#### a) Describe training, validation & test dataset. What is the purpose of them?
The training set is used to fit the model. The validation set is data held out from fitting and is used to compare candidate models (or hyperparameter settings) and pick the best one. The test set is the final barrier for the model: it is used only after model selection, to estimate how well the chosen model performs on data it has never seen.
#### b) What is k-fold cv. What is the advantage and disadvantage?
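K-fold cross-validation splits the training/validation data into k folds and iterates k times: in each iteration one fold is held out as the validation set and the remaining k-1 folds are used as the training set, and the k validation scores are averaged. Advantage: every observation is used for both training and validation, so the performance estimate and the resulting model choice are more robust than with a single split. Disadvantage: the model has to be fitted k times, so it takes roughly k times longer.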
#### c) What does it mean by stratification?
Stratification means splitting the data so that each subset keeps the same proportion of the target variable's classes as the full data set, so that training isn't performed on a biased sample of the data.
#### d) What does random state do?
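It provides a seed for the random number generator used by the split or the model, so every run produces the same result. This is useful for a reproducible (non-random) run of the model, e.g. when results need to be inspected or compared visually.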
#### e) Set rs to 0. Split df into X & Y, then split the data into train and test sets of 70/30.

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [5]:
# Fixed seed so the partition below is the same on every run.
rs = 0

# Separate the target column from the predictors.
x = df.drop(columns='Price_above_median')
y = df['Price_above_median']
x_mat = x.values  # plain array for scikit-learn; column names remain available on `x`

# 70/30 train/test split, stratified on the target.
xtr, xtst, ytr, ytst = train_test_split(
    x_mat,
    y,
    test_size=0.3,
    stratify=y,
    random_state=rs,
)

## Decision Tree
#### a) Import and build a DT classifier. Fit against training data.
#### b) What is performance of model against training data & test data? Overfitting?
Training Data:  0.9995866548568054 <br />
Test Data:  0.8424025347844055 <br />
The training accuracy is almost 100%, which by itself suggests the tree has memorised the training set rather than learned a general pattern. In addition, the test accuracy is significantly lower than the training accuracy, which is a clear sign that the model is overfitting the training data.
#### c) What are the top 5 important features in the model?
Type_u  :  0.16587355094932293<br />
Distance  :  0.1388911594402163<br />
Regionname_Southern Metropolitan  :  0.13293456893487882<br />
Regionname_Eastern Metropolitan  :  0.07265460902203001<br />
Longtitude  :  0.06177192290987284<br />
#### d) Find best hyperparams using GridSearchCV.

In [6]:
# Baseline decision tree with default hyper-parameters.
# The estimator is seeded with `rs`: scikit-learn permutes features at each
# split, so ties between equally good splits are broken at random and an
# unseeded tree gives slightly different scores on every run.
model = DecisionTreeClassifier(random_state=rs)
model.fit(xtr, ytr)

# Accuracy on the data the tree was fitted on vs. the held-out test split.
print('Training Data: ', model.score(xtr, ytr))
print('Test Data: ', model.score(xtst, ytst))

Training Data:  0.9995866548568054
Test Data:  0.8424025347844055


In [7]:
importances = model.feature_importances_
feature_names = x.columns

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the five highest-ranked features with their importance scores.
top5 = indices[:5]
for name, weight in zip(feature_names[top5], importances[top5]):
    print(name, ' : ', weight)

Type_u  :  0.16587355094932293
Distance  :  0.1388911594402163
Regionname_Southern Metropolitan  :  0.13293456893487882
Regionname_Eastern Metropolitan  :  0.07265460902203001
Longtitude  :  0.06177192290987284


In [None]:
# Exhaustive hyper-parameter search for the decision tree, scored with
# 10-fold cross-validation on the training split only.
# 2 criteria x 5 depths x 4 leaf sizes = 40 candidates, i.e. 400 fits, which
# was too slow to finish serially on a laptop; n_jobs=-1 runs the fits on all
# available cores.
from sklearn.model_selection import GridSearchCV

params = {'criterion': ['gini', 'entropy'],
          'max_depth': range(2, 7),
          'min_samples_leaf': range(20, 60, 10)}

# Seed the estimator so the search result is reproducible between runs.
cv = GridSearchCV(param_grid=params,
                  estimator=DecisionTreeClassifier(random_state=rs),
                  cv=10,
                  n_jobs=-1)
cv.fit(xtr, ytr)

# With the default refit=True, predict() uses the best candidate refitted on
# the whole training split. The original cell used `y_pred` without ever
# defining it, which raised a NameError.
y_pred = cv.predict(xtst)
print(classification_report(ytst, y_pred))
print(cv.best_params_)