<a href="https://colab.research.google.com/github/sathvik-ujwal/Lab-Codes/blob/main/MACHINE%20LEARNING/Codes/ID3_decisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
import math
from collections import Counter

In [14]:
df = pd.DataFrame({
    "age": [30, 45, 50, 35, 40, 60, 25, 30, 55, 45],
    "blood_pressure": [30, 45, 50, 35, 40, 60, 25, 30, 55, 45],
    "cholesterol": [1, 0, 1, 0, 1, 1, 0, 1, 0, 0],
    "diagnosis": [1, 0, 1, 0, 1, 1, 0, 1, 0, 0]
})

In [6]:
df.head()

Unnamed: 0,Age,blood_pressure,cholesterol,diagnosis
0,30,30,1,1
1,45,45,0,0
2,50,50,1,1
3,35,35,0,0
4,40,40,1,1


In [8]:
def calculate_entropy(y):
    total = len(y)
    class_counts = Counter(y)
    entropy = 0

    for count in class_counts.values():
        p_class = count/total
        if p_class > 0:
            entropy -= p_class * np.log(p_class)

    return entropy

In [9]:
def calculate_information_gain(feature, y):
    total_entropy = calculate_entropy(y)
    unique_values = set(feature)
    weighted_entropy = 0

    for value in unique_values:
        subset_y = [y[i] for i in range(len(y)) if value == feature[i]]
        weight = len(subset_y)/len(y)
        weighted_entropy += weight * calculate_entropy(subset_y)
    information_gain = total_entropy = weighted_entropy

    return information_gain

In [10]:
def best_feature_to_split(features, y):
    gains = []
    for feature in features:
        gain = calculate_information_gain(feature, y)
        gains.append(gain)
    best_feature_index = gains.index(max(gains))
    return best_feature_index, max(gains)

In [11]:
def predict_using_best_feature(value, feature, y):
    unique_values = set(feature)
    value_class_map ={}

    for val in unique_values:
        subset_y = [y[i] for i in range(len(y)) if feature[i] == val]
        majority_class = max(set(subset_y), key=subset_y.count)
        value_class_map[val] = majority_class
    return value_class_map.get(value, 'Unknown')

In [16]:
entropy_diagnosis = calculate_entropy(df['diagnosis'])
print(f"Entropy of diagnosis: {entropy_diagnosis}")

age_gain = calculate_information_gain(df['age'], df['diagnosis'])
bp_gain = calculate_information_gain(df['blood_pressure'], df['diagnosis'])
cholesterol_gain = calculate_information_gain(df['cholesterol'], df['diagnosis'])
print(f"Information gain for age: {age_gain}")
print(f"Information gain for blood pressure: {bp_gain}")
print(f"Information gain for cholesterol: {cholesterol_gain}")

Entropy of diagnosis: 0.6931471805599453
Information gain for age: 0.0
Information gain for blood pressure: 0.0
Information gain for cholesterol: 0.0


In [36]:
features = df.columns[:-1]
y = df.columns[-1]
best_feature_index, max_gain = best_feature_to_split([df[feature] for feature in features], df.iloc[:,-1])
print(f'Best feature to split on: {features[best_feature_index]}')


Best feature to split on: age


In [42]:
# Making prediction on new data
new_patient = [50, 0, 0]
prediction_feature = features[best_feature_index]

predicted_class = predict_using_best_feature(new_patient[best_feature_index], df[features[best_feature_index]], df.iloc[:,-1])
print(f"The new patient {new_patient[0]} years old with {new_patient[1]} blood pressure and {new_patient[2]} cholesterol is classified as {'Sick' if predicted_class == 1 else 'Healthy'}")

The new patient 50 years old with 0 blood pressure and 0 cholesterol is classified as Sick


In [39]:
features[best_feature_index]

'age'