<a href="https://colab.research.google.com/github/Sourabhsahu33/ML-Algorithms/blob/main/Decision_Tree_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

df = pd.read_csv("Tree_Data.csv")
print("\n Given Malware Behaviour Dataset:\n\n", df)


 Given Malware Behaviour Dataset:

    Attack Type Firewall Status Malware Detected Data Exfiltrated
0      Malware         Blocked              Yes               No
1     Phishing         Allowed               No              Yes
2         DDoS         Blocked               No               No
3   Ransomware         Blocked              Yes               No
4    Intrusion         Blocked               No               No
5      Malware         Blocked              Yes              Yes
6     Phishing         Allowed               No               No
7         DDoS         Blocked               No               No
8   Ransomware         Blocked              Yes               No
9    Intrusion         Blocked               No               No
10     Malware         Blocked              Yes              Yes
11    Phishing         Allowed               No              Yes
12        DDoS         Blocked               No               No
13  Ransomware         Blocked              Yes      

# Attribute Prediction

In [3]:
t = df.keys()[-1]
print('Target Attribute is   ➡ ', t)

# Get the attribute names from input dataset
attribute_names = list(df.keys())

#Remove the target attribute from the attribute names list
attribute_names.remove(t)

print('Predicting Attributes ➡ ', attribute_names)

Target Attribute is   ➡  Data Exfiltrated
Predicting Attributes ➡  ['Attack Type', 'Firewall Status', 'Malware Detected']


# Entropy of the Training Data Set

In [4]:
#Function to calculate the entropy of probaility of observations
# -p*log2*p

import math
def entropy(probs):
    return sum( [-prob*math.log(prob, 2) for prob in probs])

#Function to calulate the entropy of the given Datasets/List with respect to target attributes
def entropy_of_list(ls,value):
    from collections import Counter

    # Total intances associated with respective attribute
    total_instances = len(ls)  # = 14
    print("---------------------------------------------------------")
    print("\nTotal no of instances/records associated with '{0}' is ➡ {1}".format(value,total_instances))
    # Counter calculates the propotion of class
    cnt = Counter(x for x in ls)
    print('\nTarget attribute class count(Yes/No)=',dict(cnt))

    # x means no of YES/NO
    probs = [x / total_instances for x in cnt.values()]
    print("\nClasses➡", max(cnt), min(cnt))
    print("\nProbabilities of Class 'p'='{0}' ➡ {1}".format(max(cnt),max(probs)))
    print("Probabilities of Class 'n'='{0}'  ➡ {1}".format(min(cnt),min(probs)))

    # Call Entropy
    return entropy(probs)

# Information Gain of Attributes

In [5]:
def information_gain(df, split_attribute, target_attribute,battr):
    print("\n\n----- Information Gain Calculation of",split_attribute,"----- ")

    # group the data based on attribute values
    df_split = df.groupby(split_attribute)
    glist=[]
    for gname,group in df_split:
        print('Grouped Attribute Values \n',group)
        print("---------------------------------------------------------")
        glist.append(gname)

    glist.reverse()
    nobs = len(df.index) * 1.0
    df_agg1=df_split.agg({target_attribute:lambda x:entropy_of_list(x, glist.pop())})
    df_agg2=df_split.agg({target_attribute :lambda x:len(x)/nobs})

    df_agg1.columns=['Entropy']
    df_agg2.columns=['Proportion']

    # Calculate Information Gain:
    new_entropy = sum( df_agg1['Entropy'] * df_agg2['Proportion'])
    if battr !='S':
        old_entropy = entropy_of_list(df[target_attribute],'S-'+df.iloc[0][df.columns.get_loc(battr)])
    else:
        old_entropy = entropy_of_list(df[target_attribute],battr)
    return old_entropy - new_entropy

# Using ID3 Algorithm

In [6]:
def id3(df, target_attribute, attribute_names, default_class=None,default_attr='S'):

    from collections import Counter
    cnt = Counter(x for x in df[target_attribute])# class of YES /NO

    ## First check: Is this split of the dataset homogeneous?
    if len(cnt) == 1:
        return next(iter(cnt))  # next input data set, or raises StopIteration when EOF is hit.

    ## Second check: Is this split of the dataset empty? if yes, return a default value
    elif df.empty or (not attribute_names):
        return default_class  # Return None for Empty Data Set

    ## Otherwise: This dataset is ready to be devied up!
    else:
        # Get Default Value for next recursive call of this function:
        default_class = max(cnt.keys()) #No of YES and NO Class
        # Compute the Information Gain of the attributes:
        gainz=[]
        for attr in attribute_names:
            ig= information_gain(df, attr, target_attribute,default_attr)
            gainz.append(ig)
            print('\nInformation gain of','“',attr,'”','is ➡', ig)
            print("=========================================================")

        index_of_max = gainz.index(max(gainz))               # Index of Best Attribute
        best_attr = attribute_names[index_of_max]            # Choose Best Attribute to split on
        print("\nList of Gain for arrtibutes:",attribute_names,"\nare:", gainz,"respectively.")
        print("\nAttribute with the maximum gain is ➡", best_attr)
        print("\nHence, the Root node will be ➡", best_attr)
        print("=========================================================")

        # Create an empty tree, to be populated in a moment
        tree = {best_attr:{}} # Initiate the tree with best attribute as a node
        remaining_attribute_names =[i for i in attribute_names if i != best_attr]

        # Split dataset-On each split, recursively call this algorithm.Populate the empty tree with subtrees, which
        # are the result of the recursive call
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,target_attribute, remaining_attribute_names,default_class,best_attr)
            tree[best_attr][attr_val] = subtree
        return tree


# Tree formation

In [7]:
#Function to calulate the entropy of the given Dataset with respect to target attributes
def entropy_dataset(a_list):
    from collections import Counter

    # Counter calculates the propotion of class
    cnt = Counter(x for x in a_list)
    num_instances = len(a_list)*1.0    # = 14
    print("\nNumber of Instances of the Current Sub-Class is {0}".format(num_instances ))

    # x means no of YES/NO
    probs = [x / num_instances for x in cnt.values()]
    print("\nClasses➡", "'p'=",max(cnt), "'n'=",min(cnt))
    print("\nProbabilities of Class 'p'='{0}' ➡ {1}".format(max(cnt),max(probs)))
    print("Probabilities of Class 'n'='{0}'  ➡ {1}".format(min(cnt),min(probs)))

    # Call Entropy
    return entropy(probs)

# The initial entropy of the YES/NO attribute for our dataset.
print("Entropy calculation for input dataset:\n")
print(df['Data Exfiltrated'])

total_entropy = entropy_dataset(df['Data Exfiltrated'])
print("\nTotal Entropy(S) of Tree_Data Dataset➡", total_entropy)
print("=========================================================")
####################################################

from pprint import pprint
tree = id3(df,t,attribute_names)
print("\nThe Resultant Decision Tree is: ⤵\n")
pprint(tree)

attribute = next(iter(tree))
print("\nBest Attribute ➡",attribute)
print("Tree Keys      ➡",tree[attribute].keys())

Entropy calculation for input dataset:

0      No
1     Yes
2      No
3      No
4      No
5     Yes
6      No
7      No
8      No
9      No
10    Yes
11    Yes
12     No
13    Yes
14     No
15     No
Name: Data Exfiltrated, dtype: object

Number of Instances of the Current Sub-Class is 16.0

Classes➡ 'p'= Yes 'n'= No

Probabilities of Class 'p'='Yes' ➡ 0.6875
Probabilities of Class 'n'='No'  ➡ 0.3125

Total Entropy(S) of Tree_Data Dataset➡ 0.8960382325345575


----- Information Gain Calculation of Attack Type ----- 
Grouped Attribute Values 
    Attack Type Firewall Status Malware Detected Data Exfiltrated
2         DDoS         Blocked               No               No
7         DDoS         Blocked               No               No
12        DDoS         Blocked               No               No
---------------------------------------------------------
Grouped Attribute Values 
    Attack Type Firewall Status Malware Detected Data Exfiltrated
4    Intrusion         Blocked           

In [None]:
def classify(instance, tree,default=None):  # Instance of Play Tennis with Predicted
    attribute = next(iter(tree))            # Outlook/Humidity/Wind
    if instance[attribute] in tree[attribute].keys(): # Value of the attributs in  set of Tree keys
        result = tree[attribute][instance[attribute]]
        if isinstance(result, dict):                  # this is a tree, delve deeper
            return classify(instance, result)
        else:
            return result # this is a label
    else:
        return default

df_new=pd.read_csv('Dataset/Tree_Data.csv')
df_new['Predicted'] = df_new.apply(classify, axis=1, args=(tree,'?'))
print(df_new)

   Attack Type Firewall Status Malware Detected Data Exfiltrated Predicted
0      Malware         Blocked              Yes               No       Yes
1     Phishing         Allowed               No              Yes       Yes
2         DDoS         Blocked               No               No        No
3   Ransomware         Blocked              Yes               No       Yes
4    Intrusion         Blocked               No               No        No
5      Malware         Blocked              Yes              Yes       Yes
6     Phishing         Allowed               No               No       Yes
7         DDoS         Blocked               No               No        No
8   Ransomware         Blocked              Yes               No       Yes
9    Intrusion         Blocked               No               No        No
10     Malware         Blocked              Yes              Yes       Yes
11    Phishing         Allowed               No              Yes       Yes
12        DDoS         Bl