# Decision Tree - Golf Playing Dataset

In [12]:
import pandas as pd
import numpy as np
import math
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Create dataset
# Each key is a column; every list holds the 14 observations in row order.
data = {
    'Outlook': [
        'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Sunny', 'Overcast',
        'Rainy', 'Rainy', 'Sunny', 'Rainy', 'Overcast', 'Overcast', 'Sunny',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Windy': [
        'FALSE', 'TRUE', 'FALSE', 'FALSE', 'FALSE', 'TRUE', 'TRUE',
        'FALSE', 'FALSE', 'FALSE', 'TRUE', 'TRUE', 'FALSE', 'TRUE',
    ],
    'Play_Golf': [
        'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No',
        'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}
df = pd.DataFrame(data)

# Class balance of the target, reported alongside the raw table.
yes_count = (df['Play_Golf'] == 'Yes').sum()
no_count = (df['Play_Golf'] == 'No').sum()
print("Golf Dataset:")
print(df)
print(f"\nTotal: {len(df)} | Yes: {yes_count}, No: {no_count}")

# Step 2: Encode categorical data
# One encoder per column; they stay bound to module-level names because
# predict_golf() reuses them to encode new inputs and decode predictions.
le_outlook, le_temp, le_humidity, le_windy, le_play = (LabelEncoder() for _ in range(5))
column_encoders = {
    'Outlook': le_outlook,
    'Temperature': le_temp,
    'Humidity': le_humidity,
    'Windy': le_windy,
    'Play_Golf': le_play,
}
# NOTE(review): LabelEncoder maps categories to integer codes, so the tree
# splits on thresholds over those codes (e.g. "Outlook <= 1.50") rather than
# on category membership -- confirm this ordinal treatment is acceptable.
for column_name, column_encoder in column_encoders.items():
    df[f'{column_name}_Enc'] = column_encoder.fit_transform(df[column_name])

encoded_feature_columns = ['Outlook_Enc', 'Temperature_Enc', 'Humidity_Enc', 'Windy_Enc']
X = df[encoded_feature_columns]
y = df['Play_Golf_Enc']

# Step 3: Train Decision Tree
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X, y)

# Predictions are made on the same rows the tree was fitted on, so the
# score below measures fit to the training data, not generalisation.
y_pred = clf.predict(X)
accuracy = accuracy_score(y, y_pred)

print(f"\nModel Accuracy: {accuracy*100:.2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(y, y_pred))

# Step 4: Show tree rules
# Human-readable names are supplied in the same order as the columns of X.
tree_rules = export_text(
    clf,
    feature_names=['Outlook', 'Temperature', 'Humidity', 'Windy'],
)
print("\nDecision Tree Rules:")
print(tree_rules)

# Step 5: Calculate entropy and information gain
def entropy(data):
    """Return the Shannon entropy (in bits) of the values in a pandas Series.

    Missing values (NaN/None) are ignored: ``value_counts()`` drops them, so
    the probabilities are computed over the non-missing observations only and
    always sum to 1.

    Args:
        data: pandas Series of class labels.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed classes, or 0 when
        there are no non-missing observations.
    """
    counts = data.value_counts()
    # Use the sum of the counts (not len(data)) as the denominator so that
    # dropped missing values do not deflate every class probability.
    total = counts.sum()
    if total == 0:
        return 0
    ent = 0
    for count in counts:
        p = count / total
        # Zero counts can appear (e.g. unused categories of a categorical
        # Series); skip them because log2(0) is undefined.
        if p > 0:
            ent -= p * math.log2(p)
    return ent

def information_gain(df, feature, target):
    """Return the information gain obtained by splitting ``df`` on ``feature``.

    The gain is the entropy of the ``target`` column minus the weighted
    average entropy of ``target`` within each distinct value of ``feature``,
    where each weight is that value's share of the rows.

    Args:
        df: DataFrame holding both columns.
        feature: Name of the column to split on.
        target: Name of the label column.

    Returns:
        Entropy of ``target`` before the split minus the weighted entropy
        after it.
    """
    baseline = entropy(df[target])
    n_rows = len(df)

    remainder = 0
    # Distinct values are visited in order of first appearance.
    for level in df[feature].unique():
        branch_labels = df[df[feature] == level][target]
        remainder += (len(branch_labels) / n_rows) * entropy(branch_labels)

    return baseline - remainder

# Entropy of the target column before any split.
total_ent = entropy(df['Play_Golf'])
print(f"\nTotal Entropy: {total_ent:.4f}")

# Information gain of each candidate feature with respect to the target.
print("\nInformation Gain:")
features = ['Outlook', 'Temperature', 'Humidity', 'Windy']
gains = {}
for name in features:
    gains[name] = information_gain(df, name, 'Play_Golf')
for feature, gain in gains.items():
    print(f"{feature}: {gain:.4f}")

# The feature with the largest gain is reported as the best root split
# (the first one wins on ties).
best_feature = max(gains.items(), key=lambda item: item[1])[0]
print(f"\nBest Root: {best_feature} (IG={gains[best_feature]:.4f})")

# Step 6: Prediction function
def predict_golf(outlook, temperature, humidity, windy):
    """Predict the Play_Golf label for a single set of conditions.

    Each argument is encoded with the module-level encoder fitted on the
    matching column, the fitted tree ``clf`` classifies the encoded row, and
    the predicted code is decoded back to its original label with ``le_play``.

    Args:
        outlook: Outlook category, e.g. 'Sunny'.
        temperature: Temperature category, e.g. 'Mild'.
        humidity: Humidity category, e.g. 'Normal'.
        windy: Windy flag as the string used in the data, e.g. 'FALSE'.

    Returns:
        The decoded prediction (a value of the Play_Golf column).

    Note:
        Inputs are presumably required to be values the encoders saw during
        fitting; verify how unseen values are handled before relying on it.
    """
    # Column names and order mirror the frame the classifier was trained on.
    encoded_row = {
        'Outlook_Enc': le_outlook.transform([outlook])[0],
        'Temperature_Enc': le_temp.transform([temperature])[0],
        'Humidity_Enc': le_humidity.transform([humidity])[0],
        'Windy_Enc': le_windy.transform([windy])[0],
    }
    input_data = pd.DataFrame([encoded_row])
    predicted_code = clf.predict(input_data)[0]
    return le_play.inverse_transform([predicted_code])[0]

# Test examples
# Each tuple is (outlook, temperature, humidity, windy), in the argument
# order expected by predict_golf().
print("\nTest Examples:")
tests = [
    ('Sunny', 'Cool', 'High', 'TRUE'),
    ('Overcast', 'Mild', 'Normal', 'FALSE'),
    ('Rainy', 'Hot', 'High', 'FALSE'),
    ('Sunny', 'Mild', 'Normal', 'FALSE'),
]
for i, sample in enumerate(tests, 1):
    print(f"{i}. {', '.join(sample)} -> {predict_golf(*sample)}")

print("\nModel ready. Use predict_golf(outlook, temperature, humidity, windy)")

Golf Dataset:
     Outlook Temperature Humidity  Windy Play_Golf
0      Rainy         Hot     High  FALSE        No
1      Rainy         Hot     High   TRUE        No
2   Overcast         Hot     High  FALSE        No
3      Sunny        Mild     High  FALSE       Yes
4      Sunny        Cool   Normal  FALSE       Yes
5      Sunny        Cool   Normal   TRUE       Yes
6   Overcast        Cool   Normal   TRUE        No
7      Rainy        Mild     High  FALSE       Yes
8      Rainy        Cool   Normal  FALSE        No
9      Sunny        Mild   Normal  FALSE       Yes
10     Rainy        Mild   Normal   TRUE       Yes
11  Overcast        Mild     High   TRUE       Yes
12  Overcast         Hot   Normal  FALSE       Yes
13     Sunny        Mild     High   TRUE        No

Total: 14 | Yes: 8, No: 6

Model Accuracy: 100.00%

Confusion Matrix:
[[6 0]
 [0 8]]

Decision Tree Rules:
|--- Temperature <= 1.50
|   |--- Outlook <= 1.50
|   |   |--- Outlook <= 0.50
|   |   |   |--- Windy <= 0.50
|  

In [13]:
# Custom prediction - change values and run this cell
# predict_golf() and the fitted model come from the previous cell, so that
# cell must have been run first.
custom_outlook = 'Sunny'
custom_temperature = 'Mild'
custom_humidity = 'Normal'
custom_windy = 'FALSE'

result = predict_golf(custom_outlook, custom_temperature, custom_humidity, custom_windy)

# Echo the inputs next to the prediction, one item per line.
print(
    "Custom Prediction:",
    f"Outlook: {custom_outlook}",
    f"Temperature: {custom_temperature}",
    f"Humidity: {custom_humidity}",
    f"Windy: {custom_windy}",
    f"Result: {result}",
    sep="\n",
)

Custom Prediction:
Outlook: Sunny
Temperature: Mild
Humidity: Normal
Windy: FALSE
Result: Yes
