In [None]:
# Import libraries

# First, let’s import all of the modules, functions and objects we are going to use in this tutorial.

# Pandas for data handling
import pandas # https://pandas.pydata.org/

# NumPy for numerical computing
import numpy as np # https://numpy.org/

# MatPlotLib for visualization
import matplotlib.pyplot as pl  # https://matplotlib.org/

# assessment
from sklearn import model_selection # for model comparisons
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# algorithms
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Standard example problem for this notebook:
# can 'callSign' be predicted from the three features 'Depth', 'Temperature' and 'Salinity'?

# Read the raw table from disk.
print('Loading data from file ...')
dataset = pandas.read_csv('floats.csv')

# dropna() with no arguments discards every row that has a missing value in any column.
print('Removing rows with missing data ...')
dataset = dataset.dropna()

# Feature columns (X) and target column (y).
print('Reading list of problem variables X and y...')
X_name = ['Depth', 'Temperature', 'Salinity']
y_name = 'callSign'
X = dataset[X_name]
y = dataset[y_name]

# A fixed seed makes the random split below repeatable.
seed = 42
# Fraction of rows held out for the summative test (0.20 means 20 percent).
test_size = 0.20

print('Partitioning data into parts: formative (for development) and summative (for testing) ...')
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    random_state=seed,
    test_size=test_size,
)

print('done \n')

In [None]:
# Choose the formative scoring method.
# This scorer name is handed to GridSearchCV (scoring=...) when tuning hyperparameters.
scoring = "f1_micro"

In [None]:
# Choose the algorithm and tune some hyperparameters.
#
# The classifier is seeded with the notebook-wide `seed` so the grid search is
# repeatable: a decision tree permutes the features at every split and breaks
# ties between equally good splits at random, so without a fixed random_state
# the scores (and possibly the selected hyperparameters) can change between runs.
selected_model = DecisionTreeClassifier(random_state=seed)

# Every combination of these values is evaluated (4 depths x 2 criteria).
hyperparameters = {'max_depth': [4, 5, 6, 7], 'criterion': ['gini', 'entropy']}

print("Now tuning hyperparameters...")
# 5-fold cross-validation on the training partition only; the test partition is not touched here.
clf = GridSearchCV(selected_model, hyperparameters, cv=5, scoring=scoring, verbose=4)
clf.fit(X_train, y_train)

print("Best hyperparameters found on development set:")
print(clf.best_params_)
print("Grid scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
# One line per hyperparameter combination: mean CV score and twice the
# standard deviation across folds.
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print('done \n')

# Keep the estimator found by the search for the later evaluation cell.
tuned_model = clf.best_estimator_

In [None]:
# Summative evaluation: score the tuned model on the held-out test partition.
print("Now testing the tuned model on the separate test set...")
print("Detailed classification report:")
print('\n')

# Ground-truth labels and the tuned model's predictions for the test rows.
y_true = y_test
y_pred = tuned_model.predict(X_test)
print(classification_report(y_true, y_pred))
print('done \n')

# Report the size of the fitted tree structure.
tuned_tree = tuned_model.tree_
print(f'Tuned decision tree has {tuned_tree.node_count} nodes with maximum depth {tuned_tree.max_depth}.')

In [None]:
# Compare to the default, unrestricted tree.
#
# Only random_state is set (to the notebook-wide `seed`) so that this baseline
# is repeatable; every other hyperparameter keeps its default value. Without a
# fixed random_state the fitted tree (node count, depth, predictions) can
# differ from run to run because ties between splits are broken at random.
estimator = DecisionTreeClassifier(random_state=seed)
estimator.fit(X_train, y_train)
print(f'Default Decision tree has {estimator.tree_.node_count} nodes with maximum depth {estimator.tree_.max_depth}.')

# Evaluate the baseline on the same held-out test partition.
print("Detailed classification report:")
print('\n')
y_true, y_pred = y_test, estimator.predict(X_test)
print(classification_report(y_true, y_pred))
print('done \n')