<a href="https://colab.research.google.com/github/ravi-prakash1907/Machine-Learning-for-Cyber-Security/blob/main/Classifiers/decisionTreeCore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with Decision Tree 

In [108]:
from math import log2
import pandas as pd
import numpy as np

## Data Collection

In [109]:
import requests

def downloadCSV(fileURL, saveAs='downloaded.csv'):
  req = requests.get(fileURL)
  fileURLContent = req.content
  csv_file = open(saveAs, 'wb')
  
  csv_file.write(fileURLContent)
  csv_file.close()

In [110]:
#get data
downloadCSV("https://raw.githubusercontent.com/ravi-prakash1907/Machine-Learning-for-Cyber-Security/main/Datasets/decisionTreeSample.csv?token=AJGAAOHPMO2B2C6UPVPQ5I3ARUNLI","data.csv")

## Algo Requirements

### Entropy

In [111]:
def getEntropy(df, colPredict):
  labels = df[colPredict].unique()
  total = len(df)
  entropy = 0
  for l in labels:
    tempDF = df[df[colPredict] == l]
    count = len(tempDF)
    Pi = count/total
    entropy += -Pi * log2(Pi)
  return entropy

In [112]:
# getEntropy(df,'Class')

### Gini Index

In [113]:
def getGiniIndex(df, colPredict):
  labels = df[colPredict].unique()
  total = len(df)
  partialIndex = 0
  for l in labels:
    tempDF = df[df[colPredict] == l]
    count = len(tempDF)
    Pi = count/total
    partialIndex += Pi**2
  giniIndex = 1-partialIndex
  return giniIndex

In [114]:
# getGiniIndex(df,'Class')

In [115]:
def getAttrGiniIndex(df, attr, colPredict):
  target = df[attr].unique()
  partialGiniIndex = []

  ## info gain for every val in label
  for t in target:
    tempDF = df[df[attr] == t]
    tempGiniIndex = getGiniIndex(tempDF,colPredict)
    partialGiniIndex.append(tempGiniIndex)
  
  ## final gini index for attr
  finalGiniIndex = getGiniIndex(df,colPredict) - sum(partialGiniIndex)
  return finalGiniIndex

### Information Gain

In [116]:
def getAttrEntropy(df, attr, colPredict):
  target = df[attr].unique()
  partialEntropies = []

  ## info gain for every val in label
  for t in target:
    tempDF = df[df[attr] == t]
    tempEntropy = getEntropy(tempDF,colPredict)
    tempEntropy *= len(tempDF)/len(df)
    partialEntropies.append(tempEntropy)

  return partialEntropies

In [117]:
# getAttrEntropy(df,'Age','Class')

In [118]:
def getInfoGain(df,attr,colPredict):
  avgEntropies = getAttrEntropy(df, attr, colPredict)
  infoGain = getEntropy(df,colPredict) - sum(avgEntropies)
  return infoGain

In [119]:
# getInfoGain(df,'Age','Class')

### Gain Ratio

In [120]:
def getGainRatio(df, attr, colPredict):
  infoGain = getInfoGain(df, attr, colPredict)
  entropy = getEntropy(df,colPredict)
  gainRatio = infoGain/entropy
  return gainRatio

In [121]:
# getGainRatio(df,'Age','Class')

--- 

## Algo. for selecting root _(in decision tree)_

#### loading data

In [122]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,attr-1,attr-2,attr-3,attr-4,Class
0,3,2,3,0,1
1,3,0,2,1,2
2,3,0,0,0,2
3,2,2,1,1,2
4,2,1,3,1,0


In [123]:
df.shape

(50, 5)

In [124]:
## constats
colToPredict = 'Class'
comparisionMat = pd.DataFrame(columns=['Algorithm','Root Attribute'])

In [125]:
def addRow(df, algo, root):
    #create rows for comparision
    thisRow = {"Algorithm":algo,
               "Root Attribute":root}
    thisRow = pd.Series(thisRow)
    df = df.append(thisRow,ignore_index=True)
    
    return df

### ID3

In [126]:
def id3(df, predictionCol):
  attributes = list(df.columns)
  attributes.remove(predictionCol)
  attrCount = len(attributes)

  infoGains = list(map(getInfoGain, [df]*attrCount, attributes, [predictionCol]*attrCount))
  rootAttr = df.columns[infoGains.index(max(infoGains))]

  return rootAttr

In [127]:
root = id3(df,colToPredict)
comparisionMat = addRow(comparisionMat,'ID3',root)

### CART

In [128]:
def cart(df, predictionCol):
  attributes = list(df.columns)
  attributes.remove(predictionCol)
  attrCount = len(attributes)

  giniIndex = list(map(getAttrGiniIndex, [df]*attrCount, attributes, [predictionCol]*attrCount))
  rootAttr = df.columns[giniIndex.index(max(giniIndex))]

  return rootAttr

In [129]:
root = cart(df,colToPredict)
comparisionMat = addRow(comparisionMat,'CART',root)

### C4.5

In [130]:
def c4dot5(df, predictionCol):
  attributes = list(df.columns)
  attributes.remove(predictionCol)
  attrCount = len(attributes)

  gainRatio = list(map(getGainRatio, [df]*attrCount, attributes, ['Class']*attrCount))
  rootAttr = df.columns[gainRatio.index(max(gainRatio))]

  return rootAttr

In [131]:
root = c4dot5(df,colToPredict)
comparisionMat = addRow(comparisionMat,'C4.5',root)

---

## Comparision

In [132]:
print("Given dataset (sample):\n")
df.head()

Given dataset (sample):



Unnamed: 0,attr-1,attr-2,attr-3,attr-4,Class
0,3,2,3,0,1
1,3,0,2,1,2
2,3,0,0,0,2
3,2,2,1,1,2
4,2,1,3,1,0


In [133]:
print("Comparision matrix for root selection for decision tree:\n")
comparisionMat

Comparision matrix for root selection for decision tree:



Unnamed: 0,Algorithm,Root Attribute
0,ID3,attr-3
1,CART,attr-4
2,C4.5,attr-3
