# DSCI-508 Project 5B
### Matt Snyder

In [3]:
import pandas as pd
import os  
import numpy as np
from sklearn import tree
from IPython.display import Image

## Load data set into dataframe

In [4]:
# Working directory with a trailing '/' so a file name can be appended directly
# (Python's file APIs accept the forward slash on Windows as well).
folder = f'{os.getcwd()}/'

In [5]:
# Read the training examples; the first row of the CSV supplies the column names
df = pd.read_csv(
    folder + 'animals-training.csv',
    header=0,
)
df

Unnamed: 0,Legs,Body Covering,Animal
0,0,scales,snake
1,0,scales,snake
2,2,feathers,bird
3,2,feathers,bird
4,2,feathers,bird
5,2,furry,gorilla
6,2,furry,gorilla
7,4,furry,dog
8,4,furry,dog
9,4,furry,dog


## Calculate Entropy and Gini before first split

In [85]:
def calc_node_gini_entropy(node_df, target='Animal'):
    """
        Compute each class's contribution to the entropy and Gini impurity of a node.

        Parameters
        ----------
        node_df : pandas.DataFrame
            The rows that belong to the node; must contain the `target` column.
        target : str, default 'Animal'
            Name of the class-label column the rows are grouped by.

        Returns
        -------
        pandas.DataFrame
            One row per class found in the node, with columns
            [target, 'counts', 'p', 'pnot', 'logp', 'Entropy', 'Gini'], where
            p is the class share of the node, Entropy is -p * log2(p) and
            Gini is p * (1 - p). Summing the 'Entropy' (or 'Gini') column gives
            the entropy (or Gini impurity) of the whole node.
    """
    node_size = len(node_df)
    # Number of rows per class label
    class_counts = node_df.groupby(target).size().reset_index(name='counts')
    # Intermediate quantities shared by both impurity measures
    p = class_counts.loc[:, 'counts'] / node_size
    pnot = 1 - p
    logp = np.log2(p)
    # assign() returns a new frame and appends the columns in keyword order
    return class_counts.assign(
        p=p,
        pnot=pnot,
        logp=logp,
        Entropy=-p * logp,
        Gini=p * pnot,
    )

In [87]:
# Per-class entropy/Gini contributions for the unsplit training set
grouped_df = calc_node_gini_entropy(df)
grouped_df.head(10)

Unnamed: 0,Animal,counts,p,pnot,logp,Entropy,Gini
0,bird,3,0.15,0.85,-2.736966,0.410545,0.1275
1,butterfly,7,0.35,0.65,-1.514573,0.530101,0.2275
2,caterpillar,2,0.1,0.9,-3.321928,0.332193,0.09
3,cow,1,0.05,0.95,-4.321928,0.216096,0.0475
4,dog,3,0.15,0.85,-2.736966,0.410545,0.1275
5,gorilla,2,0.1,0.9,-3.321928,0.332193,0.09
6,snake,2,0.1,0.9,-3.321928,0.332193,0.09


In [90]:
# The impurity of the node is the sum of the per-class contributions
entropy, gini = (grouped_df[measure].sum() for measure in ('Entropy', 'Gini'))
print (f'Before first split, Entropy: {round(entropy, 5)}')
print (f'Before first split, Gini:    {round(gini, 5)}')

Before first split, Entropy: 2.56387
Before first split, Gini:    0.8


## Function to Compute Optimal Split Point(s)

In [99]:
def entropy_gini_for_split_candidate(df, selector):
    """
        Score one candidate binary split of a node by weighted entropy and Gini.

        Parameters
        ----------
        df : pandas.DataFrame
            The rows of the node being split; passed on to calc_node_gini_entropy,
            which groups by the 'Animal' column.
        selector : boolean pandas.Series aligned with df
            True rows go to the "true" child node, the remaining rows to the
            "false" child node.

        Returns
        -------
        (entropy, gini) : tuple
            The entropy and Gini impurity of the two child nodes, each averaged
            with weights proportional to the number of rows in the child.
    """
    # select subset of dataframe using selector and negated selector (true and false nodes)
    true_node = df.loc[selector, :]
    false_node = df.loc[~selector, :]
    # calculate entropy and gini of true and false node
    true_node_grouped = calc_node_gini_entropy(true_node)
    false_node_grouped = calc_node_gini_entropy(false_node)
    # Weight by the size of the node being split. This used to divide by a global
    # `animals` that is not defined anywhere in the notebook, so it failed on a
    # fresh kernel and would have been wrong for any subset of the data.
    node_size = len(df)
    entropy = (len(true_node) * true_node_grouped['Entropy'].sum()
               + len(false_node) * false_node_grouped['Entropy'].sum()) / node_size
    gini = (len(true_node) * true_node_grouped['Gini'].sum()
            + len(false_node) * false_node_grouped['Gini'].sum()) / node_size
    return entropy, gini

In [100]:
def calc_split_entropy_gini(df, leg_counts=(1, 3, 5, 10)):
    """
        For each possible strategy of splitting on covering or leg count, compute entropy and gini.

        Parameters
        ----------
        df : pandas.DataFrame
            Rows of the node to split; must contain the 'Body Covering' and 'Legs'
            columns (plus whatever entropy_gini_for_split_candidate needs).
        leg_counts : iterable of numbers, default (1, 3, 5, 10)
            Thresholds t for the candidate splits "Legs <= t".

        Returns
        -------
        pandas.DataFrame
            One row per strategy with columns ['Entropy', 'Gini']. The index holds
            the strategy label: the covering value for "Body Covering == value"
            splits, followed by 'Legs <= t' for each threshold, in that order.
    """
    covering_col = df.loc[:, 'Body Covering']
    legs_col = df.loc[:, 'Legs']

    # (label, boolean selector) for every candidate: equality on each distinct
    # body covering first, then the leg-count thresholds
    candidates = [(covering, covering_col == covering)
                  for covering in covering_col.unique()]
    candidates += [(f'Legs <= {legs}', legs_col <= legs)
                   for legs in leg_counts]

    # Each score is an (entropy, gini) pair for one candidate
    scores = [entropy_gini_for_split_candidate(df, sel) for _, sel in candidates]
    return pd.DataFrame(data = scores,
                        index = [label for label, _ in candidates],
                        columns = ['Entropy', 'Gini'])

In [103]:
def calc_optimal_split_entropy_gini(df):
    """
        Pick out the best split strategies for a node.

        Every candidate is scored with calc_split_entropy_gini; the rows whose
        entropy, rounded to 5 decimals, equals the rounded minimum entropy are
        returned, so tied strategies are all kept.
    """
    all_strategies = calc_split_entropy_gini(df)
    entropy_col = all_strategies.loc[:, 'Entropy']
    # Compare at 5 decimals so floating-point noise does not break ties
    lowest_entropy = round(entropy_col.min(), 5)
    is_optimal = np.round(entropy_col, 5) == lowest_entropy
    return all_strategies.loc[is_optimal, :]

## Test the Function for Computing Optimal Split Points

In [105]:
# Keep only the strategies whose entropy ties for the minimum
optimal_split_strategies = calc_optimal_split_entropy_gini(df)
optimal_split_strategies.head()

Unnamed: 0,Entropy,Gini
scales,1.571091,0.582828
Legs <= 5,1.571091,0.582828


In [106]:
# Every returned row ties on entropy (to 5 decimals), so the first row's values are reported
print (f'Optimal values for split points: Entropy {round(optimal_split_strategies.iloc[0,0], 5)}, Gini {round(optimal_split_strategies.iloc[0,1], 5)}')

Optimal values for split points: Entropy 1.57109, Gini 0.58283


### Additionally, show Entropy and Gini for all strategies

In [108]:
# Score every candidate split, not only the optimal ones
split_results_df = calc_split_entropy_gini(df)
split_results_df.head(10)

Unnamed: 0,Entropy,Gini
scales,1.571091,0.582828
feathers,1.954025,0.641176
furry,1.629797,0.636264
hide,2.277468,0.742105
Legs <= 1,2.09487,0.688889
Legs <= 3,1.629797,0.636264
Legs <= 5,1.571091,0.582828
Legs <= 10,2.09487,0.688889
