In [1]:
import pandas as pd
import numpy as np

1. Write a python function program to demonstrate the working of the decision tree based C4.5 algorithms
without using scikit-learn library. Use following data set for building the decision tree and apply this
knowledge to classify a new sample.
The dataset has three attributes: Outlook (Sunny, Overcast, Rainy), Temperature, Humidity and Wind (Weak,
Strong). The target attribute is Play Tennis (Yes/No).

In [2]:
df=pd.read_csv('data.csv').iloc[:,1:]
df

Unnamed: 0,Outlook,Temp.,Humidity,Wind,Decision
0,Sunny,85,85,Weak,No
1,Sunny,80,90,Strong,No
2,Overcast,83,78,Weak,Yes
3,Rain,70,96,Weak,Yes
4,Rain,68,80,Weak,Yes
5,Rain,65,70,Strong,No
6,Overcast,64,65,Strong,Yes
7,Sunny,72,95,Weak,No
8,Sunny,69,70,Weak,Yes
9,Rain,75,80,Weak,Yes


In [3]:
def entropy(data, col):
    vals,cnts = np.unique(data[col],return_counts=True)
    e=sum((-cnt/len(data[col]))*np.log2(cnt/len(data[col])) for cnt in cnts)
    return e

In [4]:
def information_gain(data, split_col, target):
    e = entropy(data,target)
    vals, cnts = np.unique(data[split_col], return_counts=True)
    weighted_e = sum((cnts[i] / np.sum(cnts)) * entropy(data[data[split_col] == vals[i]], target) 
                        for i in range(len(vals)))    
    return e - weighted_e

In [5]:
def gain_ratio(data, split_col, target):
    gain=information_gain(data, split_col, target)
    info=entropy(data, split_col)
    return gain/(info + 1e-9)

In [6]:
def best_feature(data, target_col):
    features = data.columns[:-1]  
    gains = {feature: gain_ratio(data, feature, target_col) for feature in features}
    return max(gains, key=gains.get)
best_feature(df,'Decision')

'Temp.'

In [7]:
def C45_tree(data, target_col):
    if len(data[target_col].unique()) == 1:
        return data[target_col].iloc[0]
    if data.shape[1] == 1:
        return data[target_col].mode()[0]

    feature = best_feature(data, target_col)
    tree = {feature: {}}

    if data[feature].dtype == 'object':
        for val in data[feature].unique():
            subset = data[data[feature] == val]
            tree[feature][val] = C45_tree(subset.drop(columns=feature), target_col)
    else: 
        thresholds = np.sort(data[feature].unique())
        for split in thresholds[:-1]:
            left = data[data[feature] <= split]
            right = data[data[feature] > split]
            if not left.empty:
                tree[feature][f"<= {split}"] = C45_tree(left.drop(columns=feature), target_col)
            if not right.empty:
                tree[feature][f"> {split}"] = C45_tree(right.drop(columns=feature), target_col)

    return tree

In [8]:
c45tree=C45_tree(df,'Decision')
c45tree

{'Temp.': {'<= 64': 'Yes',
  '> 64': {'Outlook': {'Sunny': {'Humidity': {'<= 70': 'Yes',
      '> 70': 'No',
      '<= 85': {'Wind': {'Weak': 'No', 'Strong': 'Yes'}},
      '> 85': 'No',
      '<= 90': {'Wind': {'Weak': 'No', 'Strong': 'No'}},
      '> 90': 'No'}},
    'Overcast': 'Yes',
    'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}},
  '<= 65': {'Outlook': {'Rain': 'No', 'Overcast': 'Yes'}},
  '> 65': {'Humidity': {'<= 70': 'Yes',
    '> 70': {'Outlook': {'Sunny': 'No',
      'Overcast': 'Yes',
      'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}},
    '<= 75': 'Yes',
    '> 75': {'Outlook': {'Sunny': 'No',
      'Overcast': 'Yes',
      'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}},
    '<= 78': 'Yes',
    '> 78': {'Outlook': {'Sunny': 'No',
      'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}},
      'Overcast': 'Yes'}},
    '<= 80': {'Wind': {'Weak': 'Yes',
      'Strong': {'Outlook': {'Sunny': 'Yes', 'Rain': 'No'}}}},
    '> 80': {'Outlook': {'Sunny': 'No', 'R

In [9]:
def C45_predict(tree, sample):
    if not isinstance(tree, dict):
        return tree 
    feature = next(iter(tree)) 
    value = sample[feature]      
    if isinstance(value, (np.int64, np.float64)):
        for condition in tree[feature]:
            threshold = float(condition.split('<= ')[-1].split('> ')[-1])  
            if '<=' in condition and value <= threshold:
                return C45_predict(tree[feature][condition], sample)
            elif '>' in condition and value > threshold:
                return C45_predict(tree[feature][condition], sample)
    else: 
        if value in tree[feature]:
            return C45_predict(tree[feature][value], sample)
    return "Unknown" 

In [10]:
sample = {'Temp.': 75, 'Outlook': 'Overcast', 'Humidity': 70, 'Wind': 'Weak'}
sample_df = pd.DataFrame([sample])

prediction = C45_predict(c45tree, sample_df.iloc[0])
print(f'Prediction for the sample: {prediction}')

Prediction for the sample: Yes


2. Write a python function program to demonstrate the working of the decision tree based CART algorithms
without using scikit-learn library. Use Q. No. 1 data set for building the decision tree and apply this knowledge
to classify a new sample.

In [11]:
def gini(y):
    probs = y.value_counts(normalize=True)
    return 1 - (probs ** 2).sum()

In [12]:
def best_split(data, target):
    best_gini, best_feature, best_threshold = float('inf'), None, None
    for feature in data.columns[:-1]:
        for threshold in np.unique(data[feature]):
            left, right = data[data[feature] <= threshold], data[data[feature] > threshold]
            if len(left) == 0 or len(right) == 0: continue
            gini_split = (len(left) * gini(left[target]) + len(right) * gini(right[target])) / len(data)
            if gini_split < best_gini:
                best_gini, best_feature, best_threshold = gini_split, feature, threshold
    return best_feature, best_threshold
best_split(df,'Decision')

('Outlook', 'Overcast')

In [13]:
def cart_tree(data, target):
    if len(data[target].unique()) == 1:
        return data[target].iloc[0]
    if data.shape[1] == 1:
        return data[target].mode()[0]

    feature, threshold = best_split(data, target)
    if feature is None:
        return data[target].mode()[0]

    left = data[data[feature] <= threshold] if isinstance(threshold, (int, float)) else data[data[feature] == threshold]
    right = data[data[feature] > threshold] if isinstance(threshold, (int, float)) else data[data[feature] != threshold]

    return {
        feature: {
            f'<= {threshold}': cart_tree(left.drop(columns=feature), target),
            f'> {threshold}': cart_tree(right.drop(columns=feature), target)
        }
    }

In [14]:
carttree=cart_tree(df,'Decision')
carttree

{'Outlook': {'<= Overcast': 'Yes',
  '> Overcast': {'Temp.': {'<= 75': 'Yes',
    '> 75': {'Wind': {'<= Strong': 'No',
      '> Strong': {'Humidity': {'<= 80': 'Yes', '> 80': 'No'}}}}}}}}

In [15]:
def cart_predict(tree, sample):
    if not isinstance(tree, dict):
        return tree  
    feature = next(iter(tree))  
    if isinstance(sample[feature], (np.int64, np.float64)):
        thresholds = list(tree[feature].keys())
        threshold = float(thresholds[0].split(' ')[-1]) 
        if sample[feature] <= threshold:
            return cart_predict(tree[feature]['<= ' + str(threshold)], sample)
        else:
            return cart_predict(tree[feature]['> ' + str(threshold)], sample)
    value = sample[feature]
    if value in tree[feature]:
        return cart_predict(tree[feature][value], sample)

    return "Unknown"


In [16]:
prediction = cart_predict(carttree, sample_df.iloc[0])
print(f'Prediction for the sample: {prediction}')

Prediction for the sample: Unknown


3. Write a python function program to demonstrate the working of the decision tree based C4.5 and CART
algorithms without and with using scikit-learn library. Using the following dataset, apply aforementioned
algorithms. The attributes are Income (Low, Medium, High) and Credit (Good, Bad), and the target is Loan
Approved (Yes/No).

In [22]:
data=pd.read_csv('loan.csv')
data

Unnamed: 0,Income,Credit,Loan Approved
0,Low,Good,Yes
1,Low,Bad,No
2,Medium,Good,Yes
3,Medium,Bad,Yes
4,High,Good,Yes
5,High,Bad,No


In [26]:
sample = {'Income': 'Low', 'Credit': 'Good'}
sample_df = pd.DataFrame([sample])

In [27]:
c45tree=C45_tree(data,'Loan Approved')
c45tree

{'Credit': {'Good': 'Yes',
  'Bad': {'Income': {'Low': 'No', 'Medium': 'Yes', 'High': 'No'}}}}

In [28]:
prediction = C45_predict(c45tree, sample_df.iloc[0])
print(f'Prediction for the sample: {prediction}')

Prediction for the sample: Yes


In [32]:
carttree=cart_tree(data,'Loan Approved')
carttree

{'Credit': {'<= Bad': {'Income': {'<= Low': 'No', '> Low': 'No'}},
  '> Bad': 'Yes'}}

In [33]:
prediction = cart_predict(carttree, sample_df.iloc[0])
print(f'Prediction for the sample: {prediction}')

Prediction for the sample: Unknown
