In [8]:
#-------1.WEATHER DATASET USING C4.5 ALGORITHM-------#


import math
import pandas as pd

# --- Step 1: Create the dataset ---
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temp': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak',
             'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Decision': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No',
                 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}
df = pd.DataFrame(data)

# --- Step 2: Entropy calculation ---
def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy_val = 0
    for i in range(len(elements)):
        p = counts[i]/np.sum(counts)
        entropy_val += -p * math.log2(p)
    return entropy_val

# --- Step 3: Information gain ratio ---
def info_gain_ratio(data, split_attribute_name, target_name="Decision"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    
    # Weighted entropy
    weighted_entropy = 0
    for i in range(len(vals)):
        subset = data[data[split_attribute_name] == vals[i]]
        weighted_entropy += (counts[i]/np.sum(counts)) * entropy(subset[target_name])
        
    info_gain = total_entropy - weighted_entropy
    
    # Split info (for gain ratio)
    split_info = 0
    for i in range(len(vals)):
        p = counts[i]/np.sum(counts)
        split_info += -p * math.log2(p)
        
    if split_info == 0:
        return 0  # avoid division by zero
    gain_ratio = info_gain / split_info
    return gain_ratio

# --- Step 4: Recursive tree builder (C4.5) ---
def c45_build_tree(data, target_name="Decision"):
    if len(np.unique(data[target_name])) == 1:
        return np.unique(data[target_name])[0]
    
    if len(data.columns) == 1:
        return data[target_name].mode()[0]
    
    gain_ratios = [info_gain_ratio(data, attr, target_name) for attr in data.columns if attr != target_name]
    best_attr = data.columns[:-1][np.argmax(gain_ratios)]
    
    tree = {best_attr: {}}
    
    for val in np.unique(data[best_attr]):
        sub_data = data[data[best_attr] == val].drop(columns=[best_attr])
        subtree = c45_build_tree(sub_data, target_name)
        tree[best_attr][val] = subtree
        
    return tree

# --- Step 5: Build and display the decision tree ---
import numpy as np
tree = c45_build_tree(df)
print("C4.5 Decision Tree:\n", tree)

# --- Step 6: Classify a new sample ---
def classify(tree, sample):
    for key in tree.keys():
        value = sample[key]
        if value in tree[key]:
            if isinstance(tree[key][value], dict):
                return classify(tree[key][value], sample)
            else:
                return tree[key][value]
    return None

# Example test case
sample = {'Outlook': 'Sunny', 'Temp': 72, 'Humidity': 90, 'Wind': 'Weak'}
print("\nPredicted Decision for sample:", classify(tree, sample))


C4.5 Decision Tree:
 {'Temp': {np.int64(64): 'Yes', np.int64(65): 'No', np.int64(68): 'Yes', np.int64(69): 'Yes', np.int64(70): 'Yes', np.int64(71): 'No', np.int64(72): {'Outlook': {'Overcast': 'Yes', 'Sunny': 'No'}}, np.int64(75): 'Yes', np.int64(80): 'No', np.int64(81): 'Yes', np.int64(83): 'Yes', np.int64(85): 'No'}}

Predicted Decision for sample: No


In [9]:
#-------2.WEATHER DATASET USING CART ALGORITHM-------#
import pandas as pd
import numpy as np

# --- Step 1: Dataset ---
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temp': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak',
             'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Decision': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No',
                 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)

# --- Step 2: Gini impurity ---
def gini_impurity(y):
    classes, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return 1 - np.sum(probs ** 2)

# --- Step 3: Weighted Gini for a split ---
def gini_index(groups, classes):
    n_instances = sum([len(group) for group in groups])
    gini = 0.0
    for group in groups:
        if len(group) == 0:
            continue
        size = len(group)
        score = 0.0
        _, counts = np.unique(group, return_counts=True)
        for count in counts:
            p = count / size
            score += p * p
        gini += (1 - score) * (size / n_instances)
    return gini

# --- Step 4: Split dataset based on attribute and value ---
def test_split(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if isinstance(value, (int, float)):  # numeric
            if row[index] <= value:
                left.append(row)
            else:
                right.append(row)
        else:  # categorical
            if row[index] == value:
                left.append(row)
            else:
                right.append(row)
    return left, right

# --- Step 5: Find the best split ---
def get_best_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = None, None, 999, None
    for index in range(len(dataset[0]) - 1):  # skip target column
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < best_score:
                best_index, best_value, best_score, best_groups = index, row[index], gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

# --- Step 6: Create a terminal node value ---
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# --- Step 7: Recursive split ---
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_best_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_best_split(right)
        split(node['right'], max_depth, min_size, depth + 1)

# --- Step 8: Build the tree ---
def build_tree(train, max_depth, min_size):
    root = get_best_split(train)
    split(root, max_depth, min_size, 1)
    return root

# --- Step 9: Make prediction ---
def predict(node, row):
    if isinstance(row[node['index']], (int, float)):
        if row[node['index']] <= node['value']:
            if isinstance(node['left'], dict):
                return predict(node['left'], row)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return predict(node['right'], row)
            else:
                return node['right']
    else:
        if row[node['index']] == node['value']:
            if isinstance(node['left'], dict):
                return predict(node['left'], row)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return predict(node['right'], row)
            else:
                return node['right']

# --- Step 10: Train the CART tree ---
dataset = df.values.tolist()
tree = build_tree(dataset, max_depth=3, min_size=1)

print("CART Decision Tree (dictionary form):")
print(tree)

# --- Step 11: Test sample ---
test_sample = ['Sunny', 72, 90, 'Weak']  # new record
prediction = predict(tree, test_sample)
print("\nPredicted Decision for sample:", prediction)


CART Decision Tree (dictionary form):
{'index': 3, 'value': 'Weak', 'left': {'index': 0, 'value': 'Sunny', 'left': {'index': 1, 'value': 72, 'left': 'No', 'right': 'No'}, 'right': {'index': 0, 'value': 'Overcast', 'left': 'Yes', 'right': 'Yes'}}, 'right': {'index': 0, 'value': 'Overcast', 'left': {'index': 1, 'value': 64, 'left': 'Yes', 'right': 'Yes'}, 'right': {'index': 0, 'value': 'Sunny', 'left': 'No', 'right': 'No'}}}

Predicted Decision for sample: No


In [10]:
#-----3.A.LOAN DATASET-----#
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder

# Dataset
df = pd.DataFrame({
    'Income': ['Low', 'Low', 'Medium', 'Medium', 'High', 'High'],
    'Credit': ['Good', 'Bad', 'Good', 'Bad', 'Good', 'Bad'],
    'Loan Approved': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No']
})

# -----------------------------------------------------------
# 🔹 C4.5 Implementation (Information Gain Ratio)
# -----------------------------------------------------------
def entropy(col):
    elements, counts = np.unique(col, return_counts=True)
    entropy_val = 0
    for i in range(len(elements)):
        p = counts[i]/np.sum(counts)
        entropy_val += -p * math.log2(p)
    return entropy_val

def info_gain_ratio(data, split_attribute_name, target_name="Loan Approved"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    
    weighted_entropy = 0
    for i in range(len(vals)):
        subset = data[data[split_attribute_name] == vals[i]]
        weighted_entropy += (counts[i]/np.sum(counts)) * entropy(subset[target_name])
        
    info_gain = total_entropy - weighted_entropy
    
    split_info = 0
    for i in range(len(vals)):
        p = counts[i]/np.sum(counts)
        split_info += -p * math.log2(p)
    
    if split_info == 0:
        return 0
    return info_gain / split_info

def c45_build_tree(data, target_name="Loan Approved"):
    if len(np.unique(data[target_name])) == 1:
        return np.unique(data[target_name])[0]
    
    if len(data.columns) == 1:
        return data[target_name].mode()[0]
    
    gain_ratios = [info_gain_ratio(data, attr, target_name) for attr in data.columns if attr != target_name]
    best_attr = data.columns[:-1][np.argmax(gain_ratios)]
    
    tree = {best_attr: {}}
    for val in np.unique(data[best_attr]):
        sub_data = data[data[best_attr] == val].drop(columns=[best_attr])
        subtree = c45_build_tree(sub_data, target_name)
        tree[best_attr][val] = subtree
    return tree

c45_tree = c45_build_tree(df)
print("C4.5 Decision Tree:")
print(c45_tree)


C4.5 Decision Tree:
{'Credit': {'Bad': {'Income': {'High': 'No', 'Low': 'No', 'Medium': 'Yes'}}, 'Good': 'Yes'}}


In [11]:
#3.B. CART Implementation
# Gini Impurity
def gini_impurity(y):
    classes, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return 1 - np.sum(probs ** 2)

def gini_index(groups, classes):
    n_instances = sum([len(group) for group in groups])
    gini = 0.0
    for group in groups:
        if len(group) == 0:
            continue
        size = len(group)
        score = 0.0
        _, counts = np.unique(group, return_counts=True)
        for count in counts:
            p = count / size
            score += p * p
        gini += (1 - score) * (size / n_instances)
    return gini

def test_split(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if row[index] == value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def get_best_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = None, None, 999, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < best_score:
                best_index, best_value, best_score, best_groups = index, row[index], gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_best_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_best_split(right)
        split(node['right'], max_depth, min_size, depth + 1)

def build_tree(train, max_depth, min_size):
    root = get_best_split(train)
    split(root, max_depth, min_size, 1)
    return root

dataset = df.values.tolist()
cart_tree = build_tree(dataset, max_depth=2, min_size=1)
print("\nCART Decision Tree (dictionary form):")
print(cart_tree)



CART Decision Tree (dictionary form):
{'index': 1, 'value': 'Good', 'left': {'index': 0, 'value': 'Low', 'left': 'Yes', 'right': 'Yes'}, 'right': {'index': 0, 'value': 'Medium', 'left': 'Yes', 'right': 'No'}}


In [12]:
#Using scikit learn for both C4.5 and CART

from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder

# Encode categorical data
le = LabelEncoder()
df_encoded = df.apply(le.fit_transform)

X = df_encoded[['Income', 'Credit']]
y = df_encoded['Loan Approved']

# --- C4.5 approximation (using entropy) ---
c45_clf = DecisionTreeClassifier(criterion='entropy')
c45_clf.fit(X, y)
print("\nC4.5 (scikit-learn) tree:")
print(export_text(c45_clf, feature_names=['Income', 'Credit']))

# --- CART (using Gini) ---
cart_clf = DecisionTreeClassifier(criterion='gini')
cart_clf.fit(X, y)
print("\nCART (scikit-learn) tree:")
print(export_text(cart_clf, feature_names=['Income', 'Credit']))



C4.5 (scikit-learn) tree:
|--- Credit <= 0.50
|   |--- Income <= 1.50
|   |   |--- class: 0
|   |--- Income >  1.50
|   |   |--- class: 1
|--- Credit >  0.50
|   |--- class: 1


CART (scikit-learn) tree:
|--- Credit <= 0.50
|   |--- Income <= 1.50
|   |   |--- class: 0
|   |--- Income >  1.50
|   |   |--- class: 1
|--- Credit >  0.50
|   |--- class: 1

