**Question 1**

**Implement a decision tree in Python that uses information gain to choose the feature to split on at each node.**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Defining the Decision Tree Class with Entropy and Information Gain

class DecisionTree:
    """Decision tree for categorical features, split by information gain.

    Nodes are plain dictionaries:

    * leaf:          ``{'predict': label}``
    * internal node: ``{'feature': column_name, value_1: subtree_1, ...}``

    Child subtrees are keyed directly by feature value, so the strings
    ``'feature'`` and ``'predict'`` are reserved and rejected as feature values.
    """

    # Keys that carry structural meaning inside a node dictionary.
    _RESERVED_KEYS = ('feature', 'predict')

    def __init__(self):
        # Filled in by fit(); None marks a model that has not been trained yet.
        self.tree = None

    def entropy(self, column):
        """Return the Shannon entropy (in bits) of the values in ``column``.

        An empty column has entropy 0.
        """
        _, counts = np.unique(column, return_counts=True)
        if counts.size == 0:
            return 0.0
        probabilities = counts / counts.sum()
        return float(np.sum(probabilities * -np.log2(probabilities)))

    def information_gain(self, data, feature_name, target_name):
        """Return the information gain of splitting ``data`` on ``feature_name``.

        Parameters
        ----------
        data : pandas.DataFrame
            Table containing both the feature and the target column.
        feature_name : column label of the candidate split feature.
        target_name : column label of the class labels.

        Returns
        -------
        float
            ``H(target) - sum_v p(v) * H(target | feature == v)``.
        """
        return self._gain(data[feature_name], data[target_name])

    def _gain(self, feature_column, target_column):
        """Information gain of ``target_column`` when partitioned by ``feature_column``."""
        # Work positionally on plain arrays so the result does not depend on
        # index labels, column names, or missing values in unrelated columns.
        feature_values = np.asarray(feature_column)
        targets = np.asarray(target_column)

        total_entropy = self.entropy(targets)
        values, counts = np.unique(feature_values, return_counts=True)
        weights = counts / counts.sum()
        # Each subset's entropy is weighted by that subset's own share of rows.
        weighted_entropy = sum(
            weight * self.entropy(targets[feature_values == value])
            for value, weight in zip(values, weights)
        )
        return float(total_entropy - weighted_entropy)

    def fit(self, X, y):
        """Build the tree from features ``X`` (DataFrame) and labels ``y`` (Series).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length, or if the training set is empty.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
            )
        self.tree = self.build_tree(X, y)

    def build_tree(self, X, y):
        """Recursively build and return the node dictionary for ``(X, y)``.

        Raises
        ------
        ValueError
            If ``y`` is empty, or if a value of the chosen split feature is one
            of the reserved node keys ``'feature'`` / ``'predict'``.
        """
        if len(y) == 0:
            raise ValueError("Cannot build a decision tree from an empty dataset")

        # Pure node: every sample carries the same label.
        if len(set(y)) == 1:
            return {'predict': y.iloc[0]}

        best_gain = 0
        best_feature = None
        for feature in X.columns:
            gain = self._gain(X[feature], y)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature

        # No feature separates the classes any further: predict the majority label.
        if best_feature is None:
            return {'predict': y.value_counts().idxmax()}

        root = {'feature': best_feature}
        for value in X[best_feature].unique():
            if isinstance(value, str) and value in self._RESERVED_KEYS:
                raise ValueError(
                    f"Feature value {value!r} in column {best_feature!r} "
                    "collides with a reserved tree key"
                )
            mask = np.asarray(X[best_feature] == value, dtype=bool)
            # A value that equals nothing (e.g. NaN) selects no rows; skip it.
            if not mask.any():
                continue
            root[value] = self.build_tree(X[mask], y[mask])

        return root

    def predict_instance(self, instance, tree):
        """Return the label ``tree`` predicts for one ``instance``.

        ``instance`` is indexed by feature name. Returns ``None`` when the
        instance holds a feature value that has no branch in the tree.
        """
        if 'predict' in tree:
            return tree['predict']

        feature = tree['feature']
        value = instance[feature]
        if value not in tree:
            return None  # No prediction for unknown value
        return self.predict_instance(instance, tree[value])

    def predict(self, X):
        """Return a list with one prediction per row of ``X``.

        Raises
        ------
        AttributeError
            If the tree has not been fitted yet.
        """
        tree = getattr(self, 'tree', None)
        if tree is None:
            raise AttributeError(
                "DecisionTree is not fitted yet; call fit(X, y) before predict(X)"
            )
        return [self.predict_instance(instance, tree) for _, instance in X.iterrows()]

In [None]:
# Toy "buys_computer" dataset, written one customer per row and then
# transposed into the column -> list-of-values mapping that pandas expects.
data = {
    column: list(values)
    for column, values in zip(
        ('age', 'income', 'student', 'credit_rating', 'buys_computer'),
        zip(*[
            ('<=30',  'high',   'no',  'fair',      'no'),
            ('<=30',  'high',   'no',  'excellent', 'no'),
            ('31-40', 'high',   'no',  'fair',      'yes'),
            ('>40',   'medium', 'no',  'fair',      'yes'),
            ('>40',   'low',    'yes', 'fair',      'yes'),
            ('>40',   'low',    'yes', 'excellent', 'no'),
            ('31-40', 'low',    'yes', 'excellent', 'yes'),
            ('<=30',  'medium', 'no',  'fair',      'no'),
            ('<=30',  'low',    'yes', 'fair',      'yes'),
            ('>40',   'medium', 'yes', 'fair',      'yes'),
            ('<=30',  'medium', 'yes', 'excellent', 'yes'),
            ('31-40', 'medium', 'no',  'excellent', 'yes'),
            ('31-40', 'high',   'yes', 'fair',      'yes'),
            ('>40',   'medium', 'no',  'excellent', 'no'),
        ]),
    )
}

# Tabular view of the same records; the bare name below displays it in the notebook.
df = pd.DataFrame(data)
df

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31-40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31-40,low,yes,excellent,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [None]:
# Separate the label column from the predictor columns
y = df['buys_computer']
X = df.drop('buys_computer', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

# Train the decision tree on the training portion only
dt = DecisionTree()
dt.fit(X_train, y_train)

In [None]:
# Predict the held-out samples, show each prediction, then report accuracy
y_pred = dt.predict(X_test)

for i, predicted in enumerate(y_pred):
    print(f"For sample {list(X_test.iloc[i])}, predicted class: {predicted}")

print(
    "\nAccuracy:",
    metrics.accuracy_score(y_test, y_pred),
)

For sample ['>40', 'medium', 'no', 'fair'], predicted class: yes
For sample ['<=30', 'medium', 'no', 'fair'], predicted class: yes
For sample ['31-40', 'low', 'yes', 'excellent'], predicted class: yes

Accuracy: 0.6666666666666666
