# Using a Library (scikit-learn)


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pydotplus
from IPython.display import Image
# Train and visualise a scikit-learn decision tree on the golf dataset.
data = pd.read_csv('data.csv')

feature_columns = ['Temperature', 'Outlook', 'Humidity', 'Windy']
target_column = 'Play Golf?'

# Fail early with a readable message when the CSV does not have the expected
# columns, instead of an opaque KeyError from the selections below.
missing_columns = [col for col in feature_columns + [target_column] if col not in data.columns]
if missing_columns:
    raise KeyError(f"data.csv is missing expected column(s): {missing_columns}")

y = data[target_column]

# One-hot encode every feature in a single step so X is never left holding the
# raw, un-encoded frame if the cell is interrupted or re-run.
X = pd.get_dummies(data[feature_columns], columns=feature_columns)
print(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the classifier as well as the split: sklearn permutes features at each
# split, so an unseeded tree can differ between runs when candidate splits tie.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

results = X_test.copy()
results['Predicted_Play Golf?'] = y_pred
print(results)

# feature_names is passed as a plain list, and class_names is taken from the
# fitted estimator so the labels always match the classes actually seen in
# training (in the sorted order sklearn uses).
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=list(X.columns),
                           class_names=[str(label) for label in clf.classes_],
                           filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

KeyError: ignored

# From Scratch

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
# Load the dataset with 'Windy' cast to int, then separate the target column
# from the remaining columns and hold out 20% of the rows for evaluation.
df = pd.read_csv('data.csv').astype({'Windy': int})
y = df['Play Golf?']
X = df.drop(columns=['Play Golf?'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Unnamed: 0,Day,Temperature,Outlook,Humidity,Windy,Play Golf?
0,5-Jul,hot,sunny,high,False,no
1,6-Jul,hot,sunny,high,True,no
2,7-Jul,hot,overcast,high,False,yes
3,9-Jul,cool,rain,normal,False,yes
4,10-Jul,cool,overcast,normal,True,yes
5,12-Jul,mild,sunny,high,False,no
6,14-Jul,cool,sunny,normal,False,yes
7,15-Jul,mild,rain,normal,False,yes
8,20-Jul,mild,sunny,normal,True,yes
9,21-Jul,mild,overcast,high,True,yes


In [None]:
# List the columns available in the loaded frame.
print('column names:', df.columns)

coumn names: Index(['Day', 'Temperature', 'Outlook', 'Humidity', 'Windy', 'Play Golf?'], dtype='object')


In [None]:
class DecisionTree:
    """ID3-style decision tree over categorical features, stored as nested dicts.

    A built tree is either a bare class label (a leaf) or a dict of the form
    ``{feature: {feature_value: subtree}}``. ``build_tree`` returns that
    structure; ``__init__`` only creates an empty ``tree`` attribute for the
    caller to fill in.
    """

    def __init__(self):
        # Placeholder for a built tree; no method of this class assigns it.
        self.tree = {}

    def entropy(self, target_col):
        """Return the Shannon entropy, in bits, of the values in ``target_col``.

        An empty or single-valued column has entropy 0.
        """
        _, counts = np.unique(target_col, return_counts=True)
        probabilities = counts / counts.sum()
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, data, feature, target):
        """Return the entropy reduction in ``target`` from splitting on ``feature``.

        Args:
            data: DataFrame containing both the ``feature`` and ``target`` columns.
            feature: Name of the column to split on.
            target: Name of the label column.

        Returns:
            Entropy of ``target`` over all rows minus the size-weighted mean
            entropy of ``target`` within each distinct value of ``feature``.
        """
        total_entropy = self.entropy(data[target])
        values, counts = np.unique(data[feature], return_counts=True)
        weights = counts / counts.sum()
        weighted_entropy = np.sum([
            weight * self.entropy(data.loc[data[feature] == value, target])
            for value, weight in zip(values, weights)
        ])
        return total_entropy - weighted_entropy

    def build_tree(self, data, features, target):
        """Recursively build the nested-dict tree for ``data``.

        Args:
            data: DataFrame holding the feature columns and the ``target`` column.
            features: Sequence of column names still available for splitting.
            target: Name of the label column.

        Returns:
            A class label when the rows are pure or no features remain (the most
            frequent label; ties resolve to the smallest label in sort order),
            otherwise ``{best_feature: {value: subtree, ...}}``.

        Raises:
            ValueError: If ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError("cannot build a decision tree from an empty dataset")

        labels = pd.unique(data[target])
        if len(labels) == 1:
            return labels[0]

        if len(features) == 0:
            # Features exhausted but labels still mixed: fall back to majority vote.
            return data[target].mode().iloc[0]

        best_feature = max(features, key=lambda name: self.information_gain(data, name, target))
        remaining_features = [name for name in features if name != best_feature]

        branches = {}
        # Missing values of the chosen feature are not routed to any branch.
        for val in pd.unique(data[best_feature].dropna()):
            # Boolean-mask selection keeps column dtypes intact and does not
            # discard rows that merely have NaN in some other column.
            sub_data = data[data[best_feature] == val]
            branches[val] = self.build_tree(sub_data, remaining_features, target)

        return {best_feature: branches}


In [None]:
dt = DecisionTree()
# Positional slice: skips the first column and the last (target) column.
features = df.columns[1:-1]
target = 'Play Golf?'
# Fit on the training rows only. Building the tree from the full frame would
# let it memorise the held-out rows and make the test accuracy meaningless.
train_df = df.loc[X_train.index]
dt.tree = dt.build_tree(train_df, features, target)

In [None]:
# A single hand-written observation used to query the tree.
sample_input = {
    'Temperature': 'hot',
    'Outlook': 'sunny',
    'Humidity': 'high',
    'Windy': 0,
}


In [None]:
def predict(input, tree):
    """Walk a nested-dict decision tree and return the label for one observation.

    Args:
        input: Mapping of feature name to value for a single observation.
            (The name shadows the builtin but is kept so existing calls work.)
        tree: Either a bare label (leaf) or ``{feature: {value: subtree}}``.

    Returns:
        The leaf label reached, or the string "Unable to make a prediction."
        when the observation lacks the feature a node splits on, or carries a
        value that node has no branch for.
    """
    failure = "Unable to make a prediction."
    node = tree
    # Descend until a leaf (anything that is not a dict) is reached.
    while isinstance(node, dict):
        # First feature of the observation that this node splits on.
        feature = next((key for key in input if key in node), None)
        if feature is None:
            return failure
        try:
            node = node[feature][input[feature]]
        except (KeyError, TypeError):
            # Unseen value, unhashable value, or a malformed branch table.
            return failure
    return node

In [None]:
# Echo the query first, then the label the hand-built tree assigns to it.
print("Sample input:", sample_input)
sample_prediction = predict(sample_input, dt.tree)
print("Prediction:", sample_prediction)

Sample input: {'Temperature': 'hot', 'Outlook': 'sunny', 'Humidity': 'high', 'Windy': 0}
Prediction: no


In [None]:
from sklearn.metrics import accuracy_score

def calculate_accuracy(X_test, y_test, tree):
    """Score the nested-dict tree against a labelled hold-out set.

    Each row of ``X_test`` is converted to a ``{column: value}`` dict and passed
    to ``predict`` together with ``tree``; the collected predictions are then
    compared with ``y_test`` via ``accuracy_score`` and that score is returned.
    """
    predicted_labels = []
    for row_position in range(len(X_test)):
        row_as_dict = X_test.iloc[row_position].to_dict()
        predicted_labels.append(predict(row_as_dict, tree))
    return accuracy_score(y_test, predicted_labels)


In [None]:
# Evaluate the hand-built tree on the held-out rows.
accuracy = calculate_accuracy(X_test, y_test, dt.tree)
# Report the raw score under the plain "Accuracy:" label, the same way the
# library-based section does, so the two numbers are directly comparable.
print("Accuracy:", accuracy)

Accuracy: 1.0
