### Author: Yifan Wang

# Part 1: ID3




In [79]:
from sklearn.datasets import load_breast_cancer
import numpy as np
from collections import Counter

In [177]:
# Helper Functions.
# Basically our tree will be searching for the best info gain at each split


def entropy(tmp_y):
    '''
    Shannon entropy (in bits) of the labels in tmp_y.

    This is the impurity measure the tree uses to score candidate splits.
    Every distinct value reported by np.unique contributes -p*log2(p), where
    p is the number of entries equal to that value divided by len(tmp_y).
    When there are no distinct values at all the result is 0.
    '''
    n_total = len(tmp_y)
    bits = 0
    for label in np.unique(tmp_y):
        members = tmp_y[tmp_y == label]
        frac = len(members) / n_total
        bits = bits - frac * np.log2(frac)
    return bits


def decide_split_data(x,y,verbose=True):
    '''
    Given a subset of X,Y, search for the best splitting node based on
    information gain.

    Every value of every column of x is tried as a threshold. Rows with
    feature < value form the left branch and rows with feature >= value form
    the right branch. The gain of a candidate is the entropy of y minus the
    size-weighted average entropy of the two branches.

    Parameters:
        x: 2-D array, one row per sample and one column per feature.
        y: labels aligned with the rows of x; must support boolean-mask
           indexing (e.g. a numpy array).
        verbose: when True (the default) print every improvement in gain as
           it is found.

    Returns:
        (split_row, split_col): indices into x of the value chosen as the
        threshold, i.e. the threshold is x[split_row, split_col]. Returns
        (None, None) when no candidate has a strictly positive gain. Ties
        are resolved in favour of the first candidate found (columns first,
        then rows).
    '''
    m,n = x.shape
    best_gain = 0
    split_row, split_col = None,None

    # Nothing to split; also keeps np.min below from failing on an empty column.
    if m == 0:
        return split_row, split_col

    previous_entropy = entropy(y)
    for col in range(n):
        tmp_vec = x[:,col].ravel()
        # Loop-invariant, so computed once per column rather than once per row.
        col_min = np.min(tmp_vec)

        for row in range(m):
            val = tmp_vec[row]
            # >= & < is my convention here, so only the column minimum is a
            # useless threshold (nothing is < min, the left branch would be
            # empty). The column maximum is a valid threshold: everything
            # below it goes left and the maximum itself goes right.
            if val == col_min:
                continue

            left_mask = tmp_vec < val
            right_mask = tmp_vec >= val
            left_y, right_y = y[left_mask], y[right_mask]

            # new entropy is the weighted average entropy from each of the subset
            new_ent = \
            (len(left_y)/len(y))*entropy(left_y) + \
            (len(right_y)/len(y))*entropy(right_y)

            info_gain = previous_entropy - new_ent

            # Strict '>' keeps the earliest candidate when gains are equal.
            if info_gain > best_gain:
                split_row, split_col = row,col
                best_gain = info_gain
                if verbose:
                    print('better gain:{}'.format(best_gain))
                    print()

    return split_row, split_col
                
                

def mode(x_list):
    '''
    Return the most frequent element of x_list.

    The answer is whatever Counter.most_common(1) ranks first.
    '''
    ranked = Counter(x_list).most_common(1)
    winner, _ = ranked[0]
    return winner
    


In [178]:
### Breast cancer data again. Only a small subset is used (first 30 samples,
### first 5 features) to keep this prototype's brute-force split search manageable.
X,y = load_breast_cancer(return_X_y=True)
X,y = X[:30,:5],y[:30,]

In [179]:
# Intended minimum number of samples per child node, as a guard against
# overfitting; a larger value may be more appropriate.
# NOTE(review): nothing in the visible code reads this value -- confirm whether
# build_tree is supposed to honour it.
least_children = 2

In [180]:
def build_tree(tmp_x,tmp_y,threshold=1):
    '''
    Recursively grow a decision tree and return it in flat, row-per-node form.

    Node format: [split_col, split_val, left_tree_relative_idx, right_tree_relative_idx]
    Leaf format: [nan, predicted_label, nan, nan]

    The two index entries are offsets relative to the node's own row: the
    left subtree always starts 1 row below, the right subtree starts right
    after the left subtree (len(left subtree) + 1 rows below). Samples with
    feature < split_val go left, samples with feature >= split_val go right.

    Parameters:
        tmp_x: 2-D numpy array of features for the current subset.
        tmp_y: numpy array of labels aligned with the rows of tmp_x.
        threshold: currently not used by the algorithm; it is only forwarded
            to the recursive calls.

    Returns:
        A plain 4-element list when the subset becomes a single leaf,
        otherwise a 2-D array stacking the root row, the left subtree and
        the right subtree (in that order).
    '''
    # Exit Condition: a single sample, or every label identical -> leaf.
    if len(tmp_y) == 1 or len(np.unique(tmp_y)) == 1:
        print('exit condition:')
        print('tmp_y:')
        print(tmp_y)

        mode_val = mode(tmp_y.flatten().tolist())
        return([np.nan, mode_val, np.nan, np.nan]) # Leaf Node

    # Otherwise Split:
    print("start....subset Y len {}".format(len(tmp_y)))
    split_row,split_col = decide_split_data(tmp_x,tmp_y)

    # Compare against None explicitly: row 0 and column 0 are valid indices,
    # and a truthiness test would mistake a split at (0, 0) for "no split".
    if split_row is None or split_col is None:
        print('no better split...return mode')
        mode_val = mode(tmp_y.flatten().tolist())
        return([np.nan, mode_val, np.nan, np.nan])

    print("split on:")
    print(split_row,split_col)
    split_vec = tmp_x[:,split_col]
    split_val = tmp_x[split_row,split_col]

    # Recursively Split to left and right branches:
    left_ind = np.where(split_vec<split_val)[0].tolist()
    right_ind = np.where(split_vec>=split_val)[0].tolist()
    left_dat,left_y = tmp_x[left_ind,:],tmp_y[left_ind,]
    right_dat,right_y = tmp_x[right_ind,:],tmp_y[right_ind,]

    left_tree = build_tree(left_dat,left_y,threshold)
    right_tree = build_tree(right_dat,right_y,threshold)

    # A leaf comes back as a list (1 row); a subtree comes back as a 2-D array.
    if isinstance(left_tree, list):
        len_l_tree = 1
    else:
        len_l_tree = left_tree.shape[0]
    print('len left tree {}'.format(len_l_tree))

    # Right subtree sits immediately after the left subtree's rows.
    root = [split_col,split_val,1,len_l_tree+1]

    return(np.vstack([root,left_tree,right_tree]))

In [181]:
# Fit the tree on the data subset; build_tree prints its progress as it recurses.
tree = build_tree(X,y)

start....subset Y len 30
better gain:0.054824648581652036

better gain:0.11901271691308318

better gain:0.23910902851462254

better gain:0.2689955935892812

split on:
8 2
start....subset Y len 6
better gain:0.19087450462110944

better gain:0.4591479170272448

better gain:1.0

split on:
2 4
exit condition:
tmp_y:
[1 1 1]
exit condition:
tmp_y:
[0 0 0]
len left tree 1
exit condition:
tmp_y:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
len left tree 3


#### This is what our tree looks like; spend some time staring at it

* Each row is a node

* The rows with 3 nan are leaves, meaning they have no children

* The format is [split_col, split_val, left_tree_relative_idx, right_tree_relative_idx]

In [183]:
# Display the fitted tree (one row per node).
tree

array([[ 2.    , 87.5   ,  1.    ,  4.    ],
       [ 4.    ,  0.1186,  1.    ,  2.    ],
       [    nan,  1.    ,     nan,     nan],
       [    nan,  0.    ,     nan,     nan],
       [    nan,  0.    ,     nan,     nan]])

In [191]:
def query(tree,tmp_test_array):
    '''
    Predict the label of a single example by walking the flat tree.

    Each tree row is [split_col, split_val, left_tree_relative_idx,
    right_tree_relative_idx]; a row whose first, third and fourth entries
    are all nan is a leaf and its second entry is the prediction. At an
    internal node the example goes left when its feature value is
    < split_val and right otherwise; the relative index is added to the
    current row position to reach the chosen child.

    Parameters:
        tree: the 2-D node array, or the bare 4-element leaf list that
            describes a tree consisting of a single leaf.
        tmp_test_array: 2-D array; only its first row is looked at.

    Returns:
        The prediction stored in the leaf that the example reaches.

    Raises:
        ValueError: if an internal node carries a jump smaller than 1, which
            could never make progress through the tree.
    '''
    assert len(tmp_test_array.shape) == 2, "Make sure your test data is 2d array"

    # A one-leaf tree may arrive as a flat list; promote it to a single row
    # so that the row indexing below works for every tree.
    nodes = np.atleast_2d(tree)

    # Walk iteratively instead of recursing on slices, so that very deep
    # trees cannot hit Python's recursion limit.
    pos = 0
    while True:
        node = nodes[pos]
        test_feat,test_val,left_tree_jump,right_tree_jump = node[0],node[1],node[2],node[3]

        # Exit Condition:
        if np.isnan(test_feat) and np.isnan(left_tree_jump) and np.isnan(right_tree_jump):
            return test_val

        # Test: < goes to the left branch, >= goes to the right branch.
        if tmp_test_array[0,int(test_feat)] < test_val:
            jump_loc = int(left_tree_jump)
        else:
            jump_loc = int(right_tree_jump)

        # Children always sit below their parent; anything else is a broken tree.
        if jump_loc < 1:
            raise ValueError("Malformed tree: non-positive jump at row {}".format(pos))
        pos += jump_loc
    
    
    
def predict(tree,tmp_test_array):
    '''
    Predict a label for every row of a 2-D array of examples.

    Each row is reshaped to (1, n_features) and handed to query; the
    predictions are returned as a list in row order.
    '''
    assert len(tmp_test_array.shape) == 2, "Make sure your test data is 2d array"

    n_examples = tmp_test_array.shape[0]
    return [
        query(tree, tmp_test_array[idx,:].reshape(1,-1))
        for idx in range(n_examples)
    ]
        
        
        
    

In [194]:
# In-sample prediction: we predict on the same rows the tree was built from,
# so this only checks that the tree reproduces its own training labels.
pred = predict(tree,X)

In [198]:
import pandas
# Show each in-sample prediction next to its true label.
print(
    pandas.DataFrame(
        list(zip(pred, y)),
        columns=['in_sample_prediction','label'],
    )
)

    in_sample_prediction  label
0                    0.0      0
1                    0.0      0
2                    0.0      0
3                    0.0      0
4                    0.0      0
5                    0.0      0
6                    0.0      0
7                    0.0      0
8                    0.0      0
9                    0.0      0
10                   0.0      0
11                   0.0      0
12                   0.0      0
13                   0.0      0
14                   0.0      0
15                   0.0      0
16                   0.0      0
17                   0.0      0
18                   0.0      0
19                   1.0      1
20                   1.0      1
21                   1.0      1
22                   0.0      0
23                   0.0      0
24                   0.0      0
25                   0.0      0
26                   0.0      0
27                   0.0      0
28                   0.0      0
29                   0.0      0


Yay All correct!!

### Next: 

* Perfect Ensemble Random Trees
* Random Forest
* Gradient Boosting