<a href="https://colab.research.google.com/github/ravi-prakash1907/Machine-Learning-for-Cyber-Security/blob/main/Classifiers/decisionTreeCore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from math import log2
import pandas as pd
import numpy as np

## Data Collection

In [2]:
import requests

def downloadCSV(fileURL, saveAs='downloaded.csv'):
  """Download the resource at ``fileURL`` and write its raw bytes to ``saveAs``.

  Args:
    fileURL: URL fetched with an HTTP GET.
    saveAs: Path of the local file to (over)write; defaults to 'downloaded.csv'.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status, so an
      error page is never silently stored as if it were the dataset.
  """
  req = requests.get(fileURL)
  # Fail loudly on HTTP errors instead of writing the error body to disk.
  req.raise_for_status()

  # The context manager closes the file even if the write itself fails.
  with open(saveAs, 'wb') as csv_file:
    csv_file.write(req.content)

In [3]:
# Fetch the dataset CSV from GitHub and store it locally as data.csv.
# NOTE(review): the URL carries a `token` query parameter, which looks like an
# access credential committed to the notebook -- confirm, and if so revoke it
# and supply it from the environment instead of hardcoding it here.
downloadCSV("https://raw.githubusercontent.com/ravi-prakash1907/Machine-Learning-for-Cyber-Security/main/Datasets/opth.csv?token=AJGAAOATXXFQUSQYLRLCYY3ARP33S","data.csv")

## Algo Requirements

### Entropy

In [4]:
def getEntropy(df, colPredict):
  """Return the base-2 Shannon entropy of the values in column ``colPredict``.

  Each distinct value contributes ``-p * log2(p)``, where ``p`` is the share
  of rows of ``df`` holding that value. A frame without rows yields 0.
  """
  column = df[colPredict]
  rowCount = len(df)

  def term(label):
    # share of the rows that carry this label
    share = int((column == label).sum()) / rowCount
    return -share * log2(share)

  return sum(term(label) for label in column.unique())

In [5]:
# getEntropy(df,'Class')

### Gini Index

In [6]:
def getGiniIndex(df, colPredict):
  """Return the Gini impurity of the values in column ``colPredict``.

  Computed as ``1 - sum(p**2)`` over the distinct values of the column, where
  ``p`` is the share of rows of ``df`` holding that value.
  """
  column = df[colPredict]
  rowCount = len(df)

  def squaredShare(label):
    share = int((column == label).sum()) / rowCount
    return share**2

  return 1 - sum(squaredShare(label) for label in column.unique())

In [7]:
# getGiniIndex(df,'Class')

In [8]:
def getAttrGiniIndex(df, attr, colPredict):
  """Return the Gini gain obtained by splitting ``df`` on attribute ``attr``.

  The gain is the Gini impurity of ``colPredict`` over the whole frame minus
  the weighted average of the impurities of the subsets produced by each
  distinct value of ``attr``. Larger values mean a better split.

  Args:
    df: DataFrame holding both ``attr`` and ``colPredict``.
    attr: Name of the column to split on.
    colPredict: Name of the column holding the class label.

  Returns:
    The reduction in Gini impurity achieved by the split.
  """
  total = len(df)
  weightedGini = 0

  for t in df[attr].unique():
    tempDF = df[df[attr] == t]
    # Weight each subset by its share of the rows (as getAttrEntropy does);
    # an unweighted sum penalises attributes merely for having more values.
    weightedGini += (len(tempDF)/total) * getGiniIndex(tempDF, colPredict)

  return getGiniIndex(df, colPredict) - weightedGini

### Information Gain

In [9]:
def getAttrEntropy(df, attr, colPredict):
  """Return the weighted entropies of ``colPredict`` for each value of ``attr``.

  For every distinct value of ``attr`` the rows holding that value are
  selected, the entropy of ``colPredict`` within them is computed and then
  scaled by the fraction of rows of ``df`` in that subset. The list follows
  the order of ``df[attr].unique()``.
  """
  subsets = (df[df[attr] == value] for value in df[attr].unique())
  return [getEntropy(part, colPredict) * (len(part)/len(df)) for part in subsets]

In [10]:
# getAttrEntropy(df,'Age','Class')

In [11]:
def getInfoGain(df,attr,colPredict):
  """Return the information gain of splitting ``df`` on ``attr``.

  This is the entropy of ``colPredict`` over the whole frame minus the sum of
  the weighted per-value entropies returned by ``getAttrEntropy``.
  """
  baseEntropy = getEntropy(df, colPredict)
  remainder = sum(getAttrEntropy(df, attr, colPredict))
  return baseEntropy - remainder

In [12]:
# getInfoGain(df,'Age','Class')

### Gain Ratio

In [13]:
def getGainRatio(df, attr, colPredict):
  """Return the gain ratio of splitting ``df`` on ``attr``.

  Gain ratio is the information gain divided by the split information, i.e.
  the entropy of the distribution of ``attr``'s own values. Dividing by the
  entropy of ``colPredict`` instead would scale every attribute by the same
  constant and reproduce the plain information-gain ranking.

  Args:
    df: DataFrame holding both ``attr`` and ``colPredict``.
    attr: Name of the column to split on.
    colPredict: Name of the column holding the class label.

  Returns:
    Information gain divided by split information, or 0.0 when the split
    information is zero (``attr`` takes a single value, so it cannot split).
  """
  infoGain = getInfoGain(df, attr, colPredict)
  # Split information: entropy of the attribute's own value distribution.
  splitInfo = getEntropy(df, attr)
  if splitInfo == 0:
    # A single-valued attribute separates nothing; avoid dividing by zero.
    return 0.0
  return infoGain/splitInfo

In [14]:
# getGainRatio(df,'Age','Class')

--- 

## Algo. for selecting root _(in decision tree)_

#### loading data

In [15]:
# Load the CSV saved as data.csv by the download step into a DataFrame.
df = pd.read_csv('data.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,Age,Eyesight,Estigmatic,UseType,Class
0,1,1,1,1,3
1,1,1,1,2,2
2,1,1,2,1,3
3,1,1,2,2,1
4,1,2,1,1,3


In [16]:
# (number of rows, number of columns) of the loaded data
df.shape

(24, 5)

In [17]:
## constants
# Name of the column holding the label that the algorithms below predict.
colToPredict = 'Class'
# Empty results table; one row per algorithm is added to it through addRow().
comparisionMat = pd.DataFrame(columns=['Algorithm','Root Attribute'])

In [18]:
def addRow(df, algo, root):
    """Return a copy of ``df`` with one comparison row appended.

    Args:
        df: Comparison table; the new row is aligned to it by the column
            names "Algorithm" and "Root Attribute".
        algo: Name of the algorithm, stored under "Algorithm".
        root: Attribute chosen as root, stored under "Root Attribute".

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and, like append, returns a new frame.
    thisRow = pd.DataFrame([{"Algorithm": algo, "Root Attribute": root}])
    return pd.concat([df, thisRow], ignore_index=True)

### ID3

In [19]:
def id3(df, predictionCol):
  """Return the attribute ID3 would place at the root of the tree.

  Every column of ``df`` except ``predictionCol`` is scored with
  ``getInfoGain``; the attribute with the highest score wins (the first one
  in column order on ties).

  Args:
    df: DataFrame with the candidate attributes and the label column.
    predictionCol: Name of the label column; must be a column of ``df``.

  Returns:
    The name of the selected attribute.

  Raises:
    ValueError: if ``predictionCol`` is not a column of ``df`` or there is no
      other column to choose from.
  """
  attributes = list(df.columns)
  attributes.remove(predictionCol)
  if not attributes:
    raise ValueError("no candidate attributes besides the prediction column")

  infoGains = [getInfoGain(df, attr, predictionCol) for attr in attributes]
  # Index into `attributes`, not `df.columns`: the two only line up when the
  # prediction column happens to be the last column of the frame.
  return attributes[infoGains.index(max(infoGains))]

In [20]:
# Pick the root attribute with ID3 and record it in the comparison table.
# Re-running this cell adds another 'ID3' row, because comparisionMat is
# replaced by the table returned from addRow().
root = id3(df,colToPredict)
comparisionMat = addRow(comparisionMat,'ID3',root)

### CART

In [21]:
def cart(df, predictionCol):
  """Return the attribute CART would place at the root of the tree.

  Every column of ``df`` except ``predictionCol`` is scored with
  ``getAttrGiniIndex``; the attribute with the highest score wins (the first
  one in column order on ties).

  Args:
    df: DataFrame with the candidate attributes and the label column.
    predictionCol: Name of the label column; must be a column of ``df``.

  Returns:
    The name of the selected attribute.

  Raises:
    ValueError: if ``predictionCol`` is not a column of ``df`` or there is no
      other column to choose from.
  """
  attributes = list(df.columns)
  attributes.remove(predictionCol)
  if not attributes:
    raise ValueError("no candidate attributes besides the prediction column")

  giniIndex = [getAttrGiniIndex(df, attr, predictionCol) for attr in attributes]
  # Index into `attributes`, not `df.columns`: the two only line up when the
  # prediction column happens to be the last column of the frame.
  return attributes[giniIndex.index(max(giniIndex))]

In [22]:
# Pick the root attribute with CART and record it in the comparison table.
# Re-running this cell adds another 'CART' row, because comparisionMat is
# replaced by the table returned from addRow().
root = cart(df,colToPredict)
comparisionMat = addRow(comparisionMat,'CART',root)

### C4.5

In [23]:
def c4dot5(df, predictionCol):
  """Return the attribute C4.5 would place at the root of the tree.

  Every column of ``df`` except ``predictionCol`` is scored with
  ``getGainRatio``; the attribute with the highest score wins (the first one
  in column order on ties).

  Args:
    df: DataFrame with the candidate attributes and the label column.
    predictionCol: Name of the label column; must be a column of ``df``.

  Returns:
    The name of the selected attribute.

  Raises:
    ValueError: if ``predictionCol`` is not a column of ``df`` or there is no
      other column to choose from.
  """
  attributes = list(df.columns)
  attributes.remove(predictionCol)
  if not attributes:
    raise ValueError("no candidate attributes besides the prediction column")

  # Score against the caller's label column rather than a hard-coded 'Class'.
  gainRatio = [getGainRatio(df, attr, predictionCol) for attr in attributes]
  # Index into `attributes`, not `df.columns`: the two only line up when the
  # prediction column happens to be the last column of the frame.
  return attributes[gainRatio.index(max(gainRatio))]

In [24]:
# Pick the root attribute with C4.5 and record it in the comparison table.
# Re-running this cell adds another 'C4.5' row, because comparisionMat is
# replaced by the table returned from addRow().
root = c4dot5(df,colToPredict)
comparisionMat = addRow(comparisionMat,'C4.5',root)

---

## Comparison

In [25]:
# Display the root attribute recorded for each algorithm.
comparisionMat

Unnamed: 0,Algorithm,Root Attribute
0,ID3,UseType
1,CART,UseType
2,C4.5,UseType
