In [14]:
# Import libraries
import math
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [15]:
# Read the diabetes dataset from the CSV file in the working directory
data = pd.read_csv('diabetes_data.csv')

# Show which columns the file provides
print("Columns:", list(data.columns))

# Column holding the label to predict (assumed to be 'class').
TARGET = 'class'


Columns: ['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']


In [16]:
# Binning numerical attributes (e.g., Age)
def bin_age(age):
    """Map a numeric age onto one of three labels.

    Returns '<30' for ages below 30, '30-50' for ages from 30 through 50
    (inclusive), and '>50' for everything else.
    """
    if age < 30:
        return '<30'
    if age <= 50:
        return '30-50'
    return '>50'

# Apply binning
# Only bin while 'Age' is still numeric: this cell overwrites the column with
# string labels, so without the dtype check a second run of the cell would
# raise a TypeError when bin_age compares a label such as '<30' with 30.
if 'Age' in data.columns and pd.api.types.is_numeric_dtype(data['Age']):
    data['Age'] = data['Age'].apply(bin_age)

# If other numerical features exist, bin similarly if needed


Split the data into training and test sets

In [17]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs. train_test_split is imported in the import
# cell at the top of the notebook.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

Implement the entropy function

In [18]:
# Function to calculate entropy
def entropy(column):
    """Return the base-2 Shannon entropy of the values in ``column``.

    Every distinct value with relative frequency p contributes
    -p * log2(p) to the result. An empty ``column`` yields 0.
    """
    frequencies = Counter(column)
    n_items = len(column)

    # Accumulate with a local that does not shadow the function's own name.
    result = 0
    for n_occurrences in frequencies.values():
        p = n_occurrences / n_items
        result -= p * math.log2(p)
    return result


Implement the information gain function

In [19]:
# Function to calculate information gain
def information_gain(data, split_attr, target_attr=TARGET):
    """Return the entropy reduction obtained by splitting ``data`` on ``split_attr``.

    The result is the entropy of ``data[target_attr]`` minus the weighted sum
    of the target entropies of the partitions induced by each distinct value
    of ``split_attr``; each partition is weighted by its share of the rows.
    """
    targets = data[target_attr]
    split_column = data[split_attr]
    n_rows = len(data)

    # Expected entropy remaining after the split.
    remainder = 0
    for level in split_column.unique():
        partition_targets = targets[split_column == level]
        share = len(partition_targets) / n_rows
        remainder += share * entropy(partition_targets)

    return entropy(targets) - remainder


Implement the ID3 algorithm

In [20]:
# ID3 algorithm
def id3(data, original_data, features, target_attr=TARGET, parent_node_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : DataFrame
        Rows that reach the current node.
    original_data : DataFrame
        Not read by this function; it is only forwarded unchanged to the
        recursive calls and is kept so the signature stays compatible.
    features : list
        Column names still available for splitting.
    target_attr : str
        Name of the label column.
    parent_node_class :
        Label returned when ``data`` has no rows.

    Returns
    -------
    A class label for a leaf, otherwise ``{feature: {value: subtree, ...}}``.
    Branches are created only for feature values that occur in ``data``.
    """
    # Check for an empty node first: the checks below call .mode()[0], which
    # fails on an empty column.
    if len(data) == 0:
        return parent_node_class

    labels = data[target_attr]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Most common label at this node; used as the leaf value when no features
    # remain and handed to children as their fallback class.
    majority_class = labels.mode()[0]

    if len(features) == 0:
        return majority_class

    # Score each candidate against the requested target column (not the
    # module-level default) and keep the one with the highest gain.
    best_feature = max(
        features,
        key=lambda feature: information_gain(data, feature, target_attr),
    )

    # The chosen feature is not available to the subtrees below this node.
    remaining_features = [f for f in features if f != best_feature]

    branches = {}
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        branches[value] = id3(
            subset, original_data, remaining_features, target_attr, majority_class
        )

    return {best_feature: branches}


Prediction Function

In [21]:
def predict(query, tree, default=None):
    """Walk ``tree`` using the feature values in ``query`` and return the leaf reached.

    ``tree`` is either a leaf (any non-dict value, returned as-is) or a dict
    whose first key is a feature name mapping to ``{value: subtree}``.
    ``default`` is returned when the feature is absent from ``query`` or when
    the query's value has no branch in the tree.
    """
    node = tree
    while isinstance(node, dict):
        # The feature tested at this node is the dict's first key.
        feature = list(node.keys())[0]
        if feature not in query:
            return default

        branches = node[feature]
        value = query[feature]
        if value not in branches:
            return default

        # Descend into the branch matching the query's value.
        node = branches[value]
    return node


Implement the testing function

In [22]:
def test(data, tree):
    """Print and return the fraction of rows in ``data`` that ``tree`` classifies correctly.

    For each row, the TARGET entry is removed from the row's dict and compared
    with the result of ``predict`` on the remaining entries.
    """
    n_rows = len(data)
    hits = 0

    for _, record in data.iterrows():
        feature_values = record.to_dict()
        expected = feature_values.pop(TARGET)
        if predict(feature_values, tree) == expected:
            hits += 1

    accuracy = hits / n_rows
    print(f"Accuracy: {accuracy:.2%}")
    return accuracy

The remaining code below has been implemented for you

In [23]:
#  tree visualization
def print_tree(tree, indent=""):
    """Print ``tree`` as an indented outline.

    A non-dict ``tree`` is a leaf and is printed as ``-> <value>``. For a
    dict, each branch is printed as ``[attr = value]`` followed by its subtree
    indented two further spaces.
    """
    if isinstance(tree, dict):
        child_indent = indent + "  "
        for attr in tree:
            for value, subtree in tree[attr].items():
                print(indent + f"[{attr} = {value}]")
                print_tree(subtree, child_indent)
    else:
        print(indent + "->", tree)


In [24]:
# Prepare features list: every column except the target
features = list(data.columns)
features.remove(TARGET)

# Build the decision tree from the training split
tree = id3(train_data, train_data, features)

# Display the tree
print_tree(tree)


# Measure accuracy on the held-out test split
test(test_data, tree)

# Predict on a single instance (the first row of the test split) as an example
example_query = dict((col, test_data[col].iloc[0]) for col in features)
print("Example prediction:", predict(example_query, tree))


[Polyuria = Yes]
  [Polydipsia = Yes]
    -> Positive
  [Polydipsia = No]
    [Itching = Yes]
      [Genital thrush = No]
        [visual blurring = Yes]
          -> Negative
        [visual blurring = No]
          -> Positive
      [Genital thrush = Yes]
        [Obesity = No]
          -> Positive
        [Obesity = Yes]
          [sudden weight loss = No]
            -> Positive
          [sudden weight loss = Yes]
            -> Negative
    [Itching = No]
      -> Positive
[Polyuria = No]
  [Gender = Male]
    [Irritability = No]
      [Polydipsia = No]
        [weakness = Yes]
          [Itching = No]
            [Alopecia = Yes]
              [sudden weight loss = Yes]
                -> Negative
              [sudden weight loss = No]
                -> Positive
            [Alopecia = No]
              -> Negative
          [Itching = Yes]
            [Alopecia = Yes]
              -> Negative
            [Alopecia = No]
              [Age = 30-50]
                -> Positiv