<a href="https://colab.research.google.com/github/stefanoridolfi/ML_From_scratch/blob/master/test_ricorsivo_split_random_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
from random import seed
from random import randrange
seed(1)

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Sequence of row lists (one per side of the split); the class
            label is the last element of every row.
        classes: Class labels to score each group against.

    Returns:
        The sum, over the non-empty groups, of ``(1 - sum(p_c * p_c))``
        weighted by the group's share of all rows, where ``p_c`` is the
        fraction of rows in the group labelled ``c``. The result is 0.0 when
        every non-empty group is pure (or when all groups are empty).
    """
    total_rows = float(sum(len(member) for member in groups))
    weighted_impurity = 0.0
    for member in groups:
        member_size = float(len(member))
        if member_size == 0:
            # An empty side contributes nothing; skipping it also avoids a
            # division by zero below.
            continue
        labels = [row[-1] for row in member]
        squared_shares = 0.0
        for label in classes:
            share = labels.count(label) / member_size
            squared_shares += share * share
        # Impurity of this group, scaled by its relative size.
        weighted_impurity += (1.0 - squared_shares) * (member_size / total_rows)
    return weighted_impurity


# get classes in dataset
def get_classes(dataset):
    """Return the distinct class labels found in the last column of ``dataset``.

    The labels are de-duplicated through a set, so the order of the returned
    list is whatever order the set yields.
    """
    distinct_labels = set()
    for row in dataset:
        distinct_labels.add(row[-1])
    return list(distinct_labels)

def split_in2(dataset, level):
    """Find the best binary split of ``dataset`` by exhaustive search.

    Every value of every feature column (all columns except the last one,
    which holds the class label) is tried as a threshold. The candidate with
    the lowest weighted Gini index wins; on ties the first one found is kept.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label.
        level: Depth of the node being created; stored in the result as-is.

    Returns:
        A dict describing the node:
        'level', 'index' (feature column of the split), 'b_value' (threshold;
        rows with a strictly smaller value go left), 'len_left', 'len_right',
        'groups' (``[left, right]``), 'pred_l' / 'pred_r' (majority class of
        each side, or "-" for an empty side) and 'gini' (score of the split).

    Raises:
        ValueError: if the rows have no feature column, so no candidate split
            exists (this used to surface as an UnboundLocalError).
    """
    class_values = get_classes(dataset)
    b_index = b_value = b_groups = None
    # Any real Gini index is <= 1, so the first candidate always beats this.
    b_score = float("inf")
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            left, right = split_value(index, row[index], dataset)
            groups = [left, right]
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_groups, b_score = index, row[index], groups, gini

    if b_groups is None:
        raise ValueError(
            "dataset rows need at least one feature column besides the label"
        )

    # The majority vote is only needed for the winning split, so it is
    # computed once here instead of for every candidate.
    best_left, best_right = b_groups
    b_pred_l = to_terminal(best_left) if len(best_left) > 0 else "-"
    b_pred_r = to_terminal(best_right) if len(best_right) > 0 else "-"
    return {
        'level': level,
        'index': b_index,
        'b_value': b_value,
        'len_left': len(best_left),
        'len_right': len(best_right),
        'groups': b_groups,
        'pred_l': b_pred_l,
        'pred_r': b_pred_r,
        'gini': b_score,
    }
  
  
#Split a dataset based on an attribute and an attribute value
def split_value(index, value, dataset):
    """Partition ``dataset`` around a threshold on one column.

    Rows whose ``row[index]`` is strictly less than ``value`` go to the first
    list, every other row goes to the second; row order is preserved.

    Returns:
        A ``(left, right)`` tuple of lists.
    """
    below, rest = [], []
    for record in dataset:
        bucket = below if record[index] < value else rest
        bucket.append(record)
    return below, rest

def get_split(node, n_levels):
    """Recursively grow the subtree below ``node``, in place.

    A side of ``node['groups']`` is split again (via ``split_in2``) and stored
    under ``node['left']`` / ``node['right']`` when it holds more than one row
    and the node's level is below ``n_levels``. Nothing is added when either
    side of the node is empty.

    Args:
        node: Node dict as produced by ``split_in2``; reads 'groups' and
            'level', may gain 'left' and 'right'.
        n_levels: Maximum level a node may have. Nodes at this level are not
            split further, so the deepest node created has level ``n_levels``.

    Returns:
        None; the tree is built by mutating ``node``.
    """
    left, right = node['groups']
    if len(left) == 0 or len(right) == 0:
        return
    level = node['level']
    if level >= n_levels:
        return
    # 'level' is an absolute depth, so the limit is passed down unchanged.
    # Decrementing it as well (as was done before) counted every step twice
    # and stopped the tree at about half the requested depth.
    if len(left) > 1:
        node['left'] = split_in2(left, level + 1)
        get_split(node['left'], n_levels)
    if len(right) > 1:
        node['right'] = split_in2(right, level + 1)
        get_split(node['right'], n_levels)

def build_tree(node, n_levels):
    """Grow a full decision tree from an already computed root split.

    Args:
        node: Root node as returned by ``split_in2``; it is expanded in place.
            When None, the root is computed from the module-level ``dataset``,
            which is what this function always did before it honoured the
            ``node`` argument.
        n_levels: Depth limit forwarded to ``get_split``.

    Returns:
        The root node with its 'left' / 'right' subtrees attached.
    """
    # Use the caller's root instead of silently re-splitting a global.
    root = node if node is not None else split_in2(dataset, 0)
    get_split(root, n_levels)
    return root

def print_tree(node, listone, ident):
    """Flatten a tree into ``listone``, one entry per node, in pre-order.

    Each entry is ``[level, index, b_value, pred_l, pred_r, path]``. ``path``
    is ``str(ident)`` of the root followed by one "L" or "R" per branch taken
    to reach the node (e.g. "0LR" = root, left child, then its right child).

    Args:
        node: Node dict (see ``split_in2``); anything that is not a dict is
            ignored.
        listone: List the entries are appended to (mutated in place).
        ident: Path label of ``node`` itself.

    Returns:
        ``listone``, for convenience.
    """
    if not isinstance(node, dict):
        return listone

    if node['level'] == 0:
        # Children are recorded by their parent, so only the root has to
        # record itself.
        listone.append([0, node['index'], node['b_value'],
                        node['pred_l'], node['pred_r'], str(ident)])

    for branch, suffix in (('left', "L"), ('right', "R")):
        if branch not in node:
            continue
        child = node[branch]
        # Derive the child's path from this node's own label. Reusing one
        # variable for both branches made right children inherit the "L" of
        # their left sibling ("...LR" instead of "...R").
        child_ident = str(ident) + suffix
        listone.append([child['level'], child['index'], child['b_value'],
                        child['pred_l'], child['pred_r'], child_ident])
        print_tree(child, listone, child_ident)
    return listone
  

def to_terminal(group):
    """Return the most frequent class label in ``group``.

    The label is the last element of each row and is converted with ``int``
    before counting. An empty ``group`` raises ValueError (from ``max``).
    """
    votes = []
    for row in group:
        votes.append(int(row[-1]))
    candidates = set(votes)
    return max(candidates, key=votes.count)

def max_in_listone(listone):
    """Return the largest first element among the rows of ``listone``.

    The result never drops below 0: an empty list, or one whose first
    elements are all negative, yields 0.
    """
    # Seeding the candidates with 0 reproduces the "start from 0" floor.
    return max([0] + [entry[0] for entry in listone])

#print("max=",max)
########################

# Toy training set: each row is [feature 1, feature 2, class label (0 or 1)].
# The same 9 points are repeated four times, for 36 rows in total.
dataset = [[2.771244718,1.784783929,0],[1.728571309,1.169761413,0],[3.678319846,2.81281357,0],[3.961043357,2.61995032,0],[7.497545867,3.162953546,1],[9.00220326,3.339047188,1],
[7.444542326,0.476683375,1],[10.12493903,3.234550982,1],[6.642287351,3.319983761,1],[2.771244718,1.784783929,0],[1.728571309,1.169761413,0],[3.678319846,2.81281357,0],[3.961043357,2.61995032,0],[7.497545867,3.162953546,1],[9.00220326,3.339047188,1],
[7.444542326,0.476683375,1],[10.12493903,3.234550982,1],[6.642287351,3.319983761,1],[2.771244718,1.784783929,0],[1.728571309,1.169761413,0],[3.678319846,2.81281357,0],[3.961043357,2.61995032,0],[7.497545867,3.162953546,1],[9.00220326,3.339047188,1],
[7.444542326,0.476683375,1],[10.12493903,3.234550982,1],[6.642287351,3.319983761,1],[2.771244718,1.784783929,0],[1.728571309,1.169761413,0],[3.678319846,2.81281357,0],[3.961043357,2.61995032,0],[7.497545867,3.162953546,1],[9.00220326,3.339047188,1],
[7.444542326,0.476683375,1],[10.12493903,3.234550982,1],[6.642287351,3.319983761,1]]


#print("dataset",dataset)
n_levels = 5

# Root split over the whole dataset, then the full tree grown from it.
node = split_in2(dataset, 0)
tree = build_tree(node, n_levels)
print("tree", tree, "\n")

# Flatten the tree into [level, index, value, pred_l, pred_r, path] rows.
listone = print_tree(tree, [], 0)
print("test outcomes", to_terminal(dataset))
maximum = max_in_listone(listone)
print("listone: level, variable index, variable values,pred_left, pred:_right", listone)

# Report the splits level by level, from the root downwards.
for i in range(maximum + 1):
    for row in listone:
        if row[0] != i:
            continue
        print("livello:", row[0], "X" + str((row[1] + 1)), "<", row[2], row[3], row[4])

def print_loop(di):
    """Print the level of every 'left' / 'right' child reachable from ``di``.

    Keys are visited in the dict's own iteration order; each child is
    descended into right after its level has been printed. Returns None.
    """
    headings = {'left': "level di left", 'right': "level di right"}
    for key in di:
        if key not in headings:
            continue
        child = di[key]
        print(headings[key], child['level'])
        print_loop(child)
    
# print_loop only prints and has no return statement, so `test` is always None.
test=print_loop(tree)


tree {'level': 0, 'index': 0, 'b_value': 6.642287351, 'len_left': 16, 'len_right': 20, 'groups': [[[2.771244718, 1.784783929, 0], [1.728571309, 1.169761413, 0], [3.678319846, 2.81281357, 0], [3.961043357, 2.61995032, 0], [2.771244718, 1.784783929, 0], [1.728571309, 1.169761413, 0], [3.678319846, 2.81281357, 0], [3.961043357, 2.61995032, 0], [2.771244718, 1.784783929, 0], [1.728571309, 1.169761413, 0], [3.678319846, 2.81281357, 0], [3.961043357, 2.61995032, 0], [2.771244718, 1.784783929, 0], [1.728571309, 1.169761413, 0], [3.678319846, 2.81281357, 0], [3.961043357, 2.61995032, 0]], [[7.497545867, 3.162953546, 1], [9.00220326, 3.339047188, 1], [7.444542326, 0.476683375, 1], [10.12493903, 3.234550982, 1], [6.642287351, 3.319983761, 1], [7.497545867, 3.162953546, 1], [9.00220326, 3.339047188, 1], [7.444542326, 0.476683375, 1], [10.12493903, 3.234550982, 1], [6.642287351, 3.319983761, 1], [7.497545867, 3.162953546, 1], [9.00220326, 3.339047188, 1], [7.444542326, 0.476683375, 1], [10.1249390