In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [2]:
# Load the golf dataset from the CSV file next to this notebook.
golf_data = pd.read_csv(filepath_or_buffer='golf_dataset.csv')

In [3]:
# Report the row count and the (rows, columns) shape of the loaded frame.
print('Dataset length: {}'.format(len(golf_data)))
print('Data shape: {}'.format(golf_data.shape))

Dataset length: 14
Data shape: (14, 7)


In [4]:
# Preview the first five rows of the raw data.
golf_data.head(n=5)

Unnamed: 0,outlook,temp,temperature,hum,humidity,windy,play
0,overcast,83,hot,86,high,False,yes
1,overcast,64,cool,65,normal,True,yes
2,overcast,72,mild,90,high,True,yes
3,overcast,81,hot,75,normal,False,yes
4,rainy,70,mild,96,high,False,yes


In [5]:
from sklearn import preprocessing

# One independent encoder per categorical column, each kept under its own name.
le_outlook = preprocessing.LabelEncoder()
le_temp = preprocessing.LabelEncoder()
le_hum = preprocessing.LabelEncoder()
le_windy = preprocessing.LabelEncoder()
le_play = preprocessing.LabelEncoder()

# Fit every encoder on the distinct values of its column, then encode the full
# column. The raw numeric 'temp' and 'hum' columns are deliberately left out.
le_golf_data = pd.DataFrame({
    column: encoder.fit(golf_data[column].unique()).transform(golf_data[column])
    for column, encoder in (
        ('outlook', le_outlook),
        ('temperature', le_temp),
        ('humidity', le_hum),
        ('windy', le_windy),
        ('play', le_play),
    )
})
le_golf_data

Unnamed: 0,outlook,temperature,humidity,windy,play
0,0,1,0,0,1
1,0,0,1,1,1
2,0,2,0,1,1
3,0,1,1,0,1
4,1,2,0,0,1
5,1,0,1,0,1
6,1,0,1,1,0
7,1,2,1,0,1
8,1,2,0,1,0
9,2,1,0,0,0


In [6]:
# Column holding the label, and the encoded columns used as model inputs.
target = 'play'
features = ['outlook', 'temperature', 'humidity', 'windy']

# Feature matrix (DataFrame) and label vector (Series) taken from the encoded frame.
x = le_golf_data.loc[:, features]
y = le_golf_data.loc[:, target]

In [7]:
# Decision tree that splits on Gini impurity; the fixed random_state keeps the
# fitted tree reproducible between runs.
clf_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=1,
    random_state=100,
)
clf_gini.fit(x, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [8]:
# Encoded sample in `features` order: outlook=2, temperature=0 (cool),
# humidity=0 (high), windy=1 (True). Outlook code 2 is presumably 'sunny' --
# confirm with le_outlook.classes_.
# The sample is wrapped in a DataFrame carrying the training column names so the
# model receives input shaped like the data it was fitted on; passing a bare
# list makes recent scikit-learn warn about missing feature names.
clf_gini.predict(pd.DataFrame([[2, 0, 0, 1]], columns=features))

array([0])

In [9]:
# Encoded sample in `features` order: outlook=2, temperature=1 (hot),
# humidity=1 (normal), windy=0 (False). Outlook code 2 is presumably 'sunny' --
# confirm with le_outlook.classes_.
# The sample is wrapped in a DataFrame carrying the training column names so the
# model receives input shaped like the data it was fitted on; passing a bare
# list makes recent scikit-learn warn about missing feature names.
clf_gini.predict(pd.DataFrame([[2, 1, 1, 0]], columns=features))

array([1])

In [10]:
# Same tree configuration as the Gini model, but splits are chosen by
# information gain (entropy).
clf_entropy = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=1,
    random_state=100,
)
clf_entropy.fit(x, y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [11]:
# Encoded sample in `features` order: outlook=2, temperature=0 (cool),
# humidity=0 (high), windy=1 (True). Outlook code 2 is presumably 'sunny' --
# confirm with le_outlook.classes_.
# The sample is wrapped in a DataFrame carrying the training column names so the
# model receives input shaped like the data it was fitted on; passing a bare
# list makes recent scikit-learn warn about missing feature names.
clf_entropy.predict(pd.DataFrame([[2, 0, 0, 1]], columns=features))

array([0])

In [12]:
# Encoded sample in `features` order: outlook=2, temperature=1 (hot),
# humidity=1 (normal), windy=0 (False). Outlook code 2 is presumably 'sunny' --
# confirm with le_outlook.classes_.
# The sample is wrapped in a DataFrame carrying the training column names so the
# model receives input shaped like the data it was fitted on; passing a bare
# list makes recent scikit-learn warn about missing feature names.
clf_entropy.predict(pd.DataFrame([[2, 1, 1, 0]], columns=features))

array([1])

In [13]:
!pip install graphviz
#!pip3 install pydotplus

In [14]:
# Render the entropy tree to a PNG, recolouring each child node by branch side.
import graphviz
import pydotplus
import collections

# DOT source for the fitted tree; out_file=None makes export_graphviz return it
# as a string instead of writing a file.
dot_data = tree.export_graphviz(
    clf_entropy,
    out_file=None,
    feature_names=features,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('turquoise', 'orange')
edges = collections.defaultdict(list)

# Collect the destination node ids of every edge under its source node.
for link in graph.get_edge_list():
    edges[link.get_source()].append(int(link.get_destination()))

# Sort each node's children by id: the lower id is filled with colors[0] and
# the higher id with colors[1].
for children in edges.values():
    children.sort()
    for position, fill in enumerate(colors):
        child = graph.get_node(str(children[position]))[0]
        child.set_fillcolor(fill)

graph.write_png('golf_tree_01.png')

True