<a href="https://colab.research.google.com/github/srigokulavishnu/MLlab/blob/main/ID3_(Small_Dataset).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
import math
import pprint

import numpy as np
import pandas as pd
from google.colab import files


# Ask for the dataset through Colab's upload widget.
uploaded = files.upload()

# Colab stores an upload under a new name when that name already exists on
# disk (e.g. 'job_data (1).csv'), so reading the fixed path 'job_data.csv'
# can silently load a stale copy. Parse the bytes that were just uploaded
# instead; fall back to the on-disk file only when nothing was uploaded.
if uploaded:
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    df = pd.read_csv('job_data.csv')

def discretize_cgpa(cgpa):
    """Map a CGPA to its categorical bucket label.

    Args:
        cgpa: Anything ``float()`` accepts (a number or a numeric string).

    Returns:
        ``"<8"`` for values below 8, ``">=9"`` for values of 9 or more,
        ``">=8"`` for values in between, and ``"Unknown"`` when the input is
        not numeric or is NaN.
    """
    try:
        cgpa = float(cgpa)
    except (ValueError, TypeError):
        return "Unknown"
    # NaN compares False against every threshold, so without this guard it
    # would fall through to the ">=8" bucket.
    if math.isnan(cgpa):
        return "Unknown"
    if cgpa < 8:
        return "<8"
    if cgpa >= 9:
        return ">=9"
    return ">=8"


def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    Args:
        target_col: Array-like of class labels.

    Returns:
        The sum of ``-p * log2(p)`` over the distinct labels, where ``p`` is
        each label's relative frequency. An empty input yields ``0``.
    """
    _, label_counts = np.unique(target_col, return_counts=True)
    n_labels = np.sum(label_counts)
    result = 0
    # Walk the relative frequencies directly instead of dividing per count.
    for prob in label_counts / n_labels:
        result -= prob * math.log2(prob)
    return result

def attribute_entropy(data, attribute, target):
    """Return the weighted entropy of ``target`` after splitting on ``attribute``.

    Args:
        data: Table indexable by column name and by boolean row mask.
        attribute: Name of the column to split on.
        target: Name of the class-label column.

    Returns:
        The sum, over each distinct value of ``attribute``, of that value's
        share of the rows times the entropy of ``target`` within those rows.
    """
    levels, level_counts = np.unique(data[attribute], return_counts=True)
    n_rows = np.sum(level_counts)
    return sum(
        (level_count / n_rows) * entropy(data[data[attribute] == level][target])
        for level, level_count in zip(levels, level_counts)
    )

def info_gain(data, attribute, target):
    """Return the information gain from splitting ``data`` on ``attribute``.

    This is the entropy of the ``target`` column minus the weighted entropy
    that remains after partitioning the rows by ``attribute``.
    """
    return entropy(data[target]) - attribute_entropy(data, attribute, target)

def _majority_label(labels):
    """Return the most frequent value in ``labels`` (first in sorted order on ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def id3(data, original_data, features, target, parent_class=None):
    """Build an ID3 decision tree as nested dicts.

    Args:
        data: Rows to split at this node (indexable by column name and by
            boolean row mask).
        original_data: The full training table; its majority label is the
            fallback for an empty node when no ``parent_class`` is given.
        features: Names of the columns still available for splitting.
        target: Name of the class-label column.
        parent_class: Majority label of the parent node, if any.

    Returns:
        A leaf label, or ``{feature: {value: subtree, ...}}`` where every
        subtree is again a leaf label or a dict of the same shape.
    """
    # The emptiness check must come first: an empty node has zero distinct
    # labels, so the purity check below would index into an empty array.
    if len(data) == 0:
        if parent_class is not None:
            return parent_class
        return _majority_label(original_data[target])

    labels = np.unique(data[target])
    if len(labels) == 1:
        return labels[0]

    node_majority = _majority_label(data[target])
    # With no features left, label the leaf with this node's own majority
    # class; the parent's majority may disagree with the rows that reached
    # this node, and is None at the root.
    if len(features) == 0:
        return node_majority

    gains = [info_gain(data, feature, target) for feature in features]
    best_feature = features[int(np.argmax(gains))]
    # The remaining feature list is the same for every branch; build it once.
    remaining = [f for f in features if f != best_feature]

    branches = {}
    for value in np.unique(data[best_feature]):
        branch_rows = data[data[best_feature] == value]
        branches[value] = id3(
            branch_rows, original_data, remaining, target, node_majority
        )
    return {best_feature: branches}

# Column holding the class label; every other column is a candidate feature.
target_attribute = 'JobOffer'
features = df.columns[df.columns != target_attribute].tolist()

# Train on the full table (also passed as the reference dataset) and
# pretty-print the resulting nested-dict tree.
decision_tree = id3(df, df, features, target_attribute)
pprint.pprint(decision_tree)

def majority_class(data_subset, target='JobOffer'):
    """Return the most frequent value of the ``target`` column.

    Args:
        data_subset: Table indexable by column name whose column supports
            ``value_counts()``.
        target: Name of the class-label column. Defaults to ``'JobOffer'``
            so existing single-argument calls keep working.

    Returns:
        The label with the highest count in ``data_subset[target]``.
    """
    return data_subset[target].value_counts().idxmax()

def predict(tree, sample):
    """Classify ``sample`` by walking the nested-dict decision ``tree``.

    Args:
        tree: A leaf label, or a dict whose first key is the attribute to
            test and whose value maps attribute values to subtrees.
        sample: Mapping from attribute name to the sample's value.

    Returns:
        The leaf label reached. If the sample's value for a tested attribute
        has no branch, the result of ``majority_class(df)`` is returned.
    """
    node = tree
    # Descend until a non-dict (leaf) is reached.
    while isinstance(node, dict):
        split_attribute = next(iter(node))
        branches = node[split_attribute]
        observed = sample.get(split_attribute)
        if observed not in branches:
            return majority_class(df)
        node = branches[observed]
    return node

# Collect one candidate's attributes interactively.
cgpa_input = float(input("\nEnter your CGPA: "))
interactive_input = input("Are you interactive? (Yes/No): ").strip().title()
practical_input = input("Your practical knowledge? (VG/AVG/Good): ").strip().upper()
# The prompt offers "Good", which upper-cases to "GOOD", but the tree printed
# from the training data keys that level as "G". Translate spelled-out
# answers to the codes so they match a branch instead of silently falling
# back to the majority class.
practical_aliases = {"GOOD": "G", "AVERAGE": "AVG", "VERY GOOD": "VG"}
practical_input = practical_aliases.get(practical_input, practical_input)
communication_input = input("Your communication level? (G/M/P): ").strip().upper()

# Keys are the feature names the sample is looked up by during prediction.
new_sample = {
    "CGPA": discretize_cgpa(cgpa_input),
    "Interactive": interactive_input,
    "Practical": practical_input,
    "Communication": communication_input,
}

prediction = predict(decision_tree, new_sample)
# A leaf is not guaranteed to be a str (id3's parent_class defaults to None),
# so coerce before normalising instead of calling .strip() on it directly.
if str(prediction).strip().lower() in ["yes", "y"]:
    print("\nYou will get the job.")
else:
    print("\nYou will not get the job.")


Saving job_data.csv to job_data (1).csv
{'CGPA': {'<8': 'N',
          '>=8': 'Y',
          '>=9': {'Practical': {'AVG': 'N', 'G': 'Y', 'VG': 'Y'}}}}

Enter your CGPA: 7
Are you interactive? (Yes/No): yes
Your practical knowledge? (VG/AVG/Good): good
Your communication level? (G/M/P): g

You will not get the job.
