In [None]:
#Mount Google Drive into the Colab session at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Import the required libraries
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn import preprocessing

In [None]:
#Load the job-offer CSV into a DataFrame and preview its first rows
dataset = pd.read_csv(filepath_or_buffer="/content/job-offer-dataset.csv")
dataset.head()

Unnamed: 0,Age,Salary,Professional,Computer_Skill,Offer_Job
0,Young,High,No,Fair,No
1,Young,High,No,Excellent,No
2,Middle,High,No,Fair,Yes
3,Old,Medium,No,Fair,Yes
4,Old,Low,Yes,Fair,Yes


In [None]:
#Function to compute Entropy
def entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Every distinct value with relative frequency ``p`` contributes
    ``-p * log2(p)``; the contributions are summed.

    Parameters
    ----------
    col : array-like
        Values whose distribution is measured (anything ``np.unique`` accepts).

    Returns
    -------
    numpy.float64
        The entropy; 0 when ``col`` holds a single distinct value or is empty.
    """
    _, occurrences = np.unique(col, return_counts=True)
    #relative frequency of each distinct value
    proportions = occurrences / np.sum(occurrences)
    #add up the per-value terms -p*log2(p)
    return np.sum(-proportions * np.log2(proportions))

In [None]:
#Function to compute Information Gain
def infogain(df, split_col, label="Offer_Job"):
    """Return the information gain obtained by splitting ``df`` on ``split_col``.

    The gain is the entropy of the ``label`` column over all rows minus the
    weighted average entropy of ``label`` inside each group of rows that share
    one value of ``split_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate; must contain ``split_col`` and ``label``.
    split_col : str
        Name of the column whose values define the candidate split.
    label : str, default "Offer_Job"
        Name of the target column.

    Returns
    -------
    numpy.float64
        ``entropy(label) - sum(weight_v * entropy(label | split_col == v))``.
    """
    #entropy of the target before splitting
    total = entropy(df[label])
    val, count = np.unique(df[split_col], return_counts=True)
    #share of the rows that falls into each branch
    weights = count / np.sum(count)
    #Boolean indexing keeps every row of a branch. The earlier
    #where().dropna() chain also discarded rows that had a missing value in
    #any unrelated column, so branch entropies were computed on fewer rows
    #than the weights above account for.
    weighted = np.sum([
        weight * entropy(df.loc[df[split_col] == branch_value, label])
        for branch_value, weight in zip(val, weights)
    ])
    #information gain = entropy before the split - weighted entropy after it
    return total - weighted

In [None]:
def _majority_label(labels):
    """Return the most frequent value in ``labels`` (first in sorted order on ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


#Function to build the Decision Tree
def BuildDT(df, original, features, label="Offer_Job", parent=None):
    """Recursively build a decision tree represented as nested dicts.

    At each level the feature with the highest information gain is chosen,
    and one branch is created per distinct value of that feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows that reached the current node.
    original : pandas.DataFrame
        The full training set; its majority label is the answer for an
        empty ``df``.
    features : sequence of str
        Column names still available for splitting.
    label : str, default "Offer_Job"
        Name of the target column.
    parent : object, default None
        Majority label of the parent node, returned when no features remain.

    Returns
    -------
    dict or label value
        ``{feature: {value: subtree_or_label, ...}}`` for an internal node,
        or a single label value for a leaf.
    """
    #Case 1: Dataset is empty -> fall back to the majority label of the
    #original data. This has to be tested before the single-label case,
    #because an empty frame has zero unique labels and indexing [0] on
    #them raises IndexError.
    if len(df) == 0:
        return _majority_label(original[label])

    #Case 2: All rows carry the same label, return that label
    distinct_labels = np.unique(df[label])
    if len(distinct_labels) == 1:
        return distinct_labels[0]

    #Case 3: Feature space is empty, return the parent's majority label
    if len(features) == 0:
        return parent

    #Case 4: Grow the tree; this node's majority label becomes the
    #fallback answer for its children
    parent = _majority_label(df[label])

    #Select the feature with the highest information gain
    gains = [infogain(df, feature, label) for feature in features]
    best_feature = features[np.argmax(gains)]

    #Define the structure of the tree
    tree = {best_feature: {}}

    #The chosen feature is not offered again further down this path
    remaining = [name for name in features if name != best_feature]
    for value in np.unique(df[best_feature]):
        #Boolean indexing keeps rows that have missing values in other
        #columns; where().dropna() silently discarded them
        sub_data = df[df[best_feature] == value]
        #Recurse with the caller-supplied `original` rather than a
        #notebook-level global, so the function works on any frame
        tree[best_feature][value] = BuildDT(sub_data, original, remaining, label, parent)
    return tree

In [None]:
#Function to output the classification result of unlabelled data
def ClassifyInstance(input_data, tree, default=1):
    """Classify one instance by walking a nested-dict decision tree.

    Parameters
    ----------
    input_data : dict
        Maps feature names to the instance's values.
    tree : dict or label value
        Tree of the form ``{feature: {value: subtree_or_label, ...}}``;
        a bare label is treated as a leaf.
    default : object, default 1
        Returned when the tree cannot be followed: the instance lacks the
        feature a node splits on, or its value has no branch in the tree.

    Returns
    -------
    object
        The label stored at the leaf that was reached, or ``default``.
    """
    node = tree
    #Descend one level per iteration until a leaf (non-dict) is reached
    while isinstance(node, dict):
        #first feature of the instance that the current node splits on
        feature = next((key for key in input_data if key in node), None)
        if feature is None:
            return default
        try:
            node = node[feature][input_data[feature]]
        except (KeyError, TypeError):
            #value has no branch in the tree (or is unhashable)
            return default
    return node

In [None]:
#Build the tree: every column except the last one is a candidate split feature
features = dataset.columns[:-1]
decision_tree = BuildDT(df=dataset, original=dataset, features=features)
pprint(decision_tree)

{'Age': {'Middle': 'Yes',
         'Old': {'Computer_Skill': {'Excellent': 'No', 'Fair': 'Yes'}},
         'Young': {'Professional': {'No': 'No', 'Yes': 'Yes'}}}}


In [None]:
#Classify one unlabelled record with the decision tree
data = {
    'Age': 'Young',
    'Salary': 'High',
    'Professional': 'No',
    'Computer_Skill': 'Fair',
}
print("Input: ", data)
print("Offer_Job: ", ClassifyInstance(input_data=data, tree=decision_tree))

Input:  {'Age': 'Young', 'Salary': 'High', 'Professional': 'No', 'Computer_Skill': 'Fair'}
Offer_Job:  No
