### Decision Tree

In [1]:
import numpy as np
import pandas as pd

### Data set
The banknote dataset poses the task of predicting whether a banknote is authentic, based on a number of measurements taken from a photograph of it.

The dataset contains 1,372 rows with 5 numeric variables. It is a classification problem with two classes (binary classification).

The five variables in the dataset are listed below.

    variance of Wavelet Transformed image (continuous).
    skewness of Wavelet Transformed image (continuous).
    kurtosis of Wavelet Transformed image (continuous).
    entropy of image (continuous).
    class (integer).


In [19]:
# Read the banknote measurements; the path is resolved relative to the
# current working directory.
data = pd.read_csv('Banknote_Dataset.csv')
# Overwrite the column labels with short, descriptive names.
data.columns = [
    'variance',
    'skewness',
    'kurtosis',
    'entropy',
    'class',
]
# Display (rows, columns) as the cell output.
data.shape

(1372, 5)

In [20]:
# Preview the first rows to check that the renamed columns line up with the values.
data.head()

Unnamed: 0,variance,skewness,kurtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,1
1,4.5459,8.1674,-2.4586,-1.4621,1
2,3.866,-2.6383,1.9242,0.10645,1
3,3.4566,9.5228,-4.0112,-3.5944,1
4,0.32924,-4.4552,4.5718,-0.9888,1


### Gini

In [65]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    For every non-empty group the impurity is ``1 - sum(p_c ** 2)`` taken
    over ``classes``, where ``p_c`` is the fraction of rows in the group
    whose last element equals class value ``c``. The per-group impurities
    are then summed, each weighted by the group's share of all rows.

    Parameters
    ----------
    groups : sequence of sequences of rows
        The partitions produced by a split. Every row must be indexable and
        hold its class label in the last position (``row[-1]``).
    classes : sequence
        The class values to score against.

    Returns
    -------
    float
        ``0.0`` when every non-empty group contains a single class (and also
        when all groups are empty); larger values mean more mixed groups.
    """
    # Total number of rows at the split point. float() keeps the divisions
    # below true divisions regardless of integer-division semantics.
    n_instances = float(sum(len(group) for group in groups))

    gini = 0.0
    for group in groups:
        size = float(len(group))
        # An empty group contributes nothing and would divide by zero.
        if size == 0:
            continue
        # Extract the labels once per group rather than once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its relative size.
        gini += (1 - score) * size / n_instances

    return gini

In [66]:
# Worked example: each group holds one row labelled 1 and one labelled 0
# (label = last element), i.e. both groups are evenly mixed.
gini_index(
    [
        [[1, 1], [0, 0]],
        [[1, 1], [1, 0]],
    ],
    [0, 1],
)

0.5

### Splitting dataset

In [41]:
def test_split(index, value, dataset):
    left,right = [], []
    for row in dataset:
        if row[index] <value:
            left.append(row)
        else:
            right.append(row)
    return left,right