In [1]:
import math
import random

In [2]:
# Toy two-feature dataset. Each row is [feature_0, feature_1, label]; the
# functions below read the last column (row[-1]) as the class label.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

In [3]:
def test_split(index,value,data):
    left, right = list(), list()
    for row in data:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
            
    return left, right

In [4]:
def gini_index(groups, class_values):
    """Compute the size-weighted Gini impurity of a candidate split.

    For each non-empty group the impurity is ``sum(p * (1 - p))`` over the
    given class values, where ``p`` is the fraction of rows in the group
    whose last element equals that class. Each group's impurity is then
    weighted by the group's share of all rows, so that a small impure
    group does not count as much as a large one.

    Args:
        groups: Sequence of groups; each group is a list of rows whose
            last element is the class label.
        class_values: Iterable of the class labels to score.

    Returns:
        The weighted Gini score as a float; ``0.0`` means every non-empty
        group is pure. Returns ``0.0`` when all groups are empty.
    """
    groups = list(groups)
    n_instances = float(sum(len(group) for group in groups))
    if n_instances == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        labels = [row[-1] for row in group]
        impurity = 0.0
        for class_val in class_values:
            proportion = labels.count(class_val) / float(size)
            impurity += proportion * (1.0 - proportion)
        # Weight by relative group size (standard CART split criterion).
        gini += impurity * (size / n_instances)
    return gini

In [17]:
def get_split(data, n_features):
    """Find the best split of ``data`` over a random subset of features.

    Distinct feature columns are drawn at random (the last column of each
    row is the class label and is never drawn). Every value that a drawn
    feature takes in ``data`` is tried as a threshold, and the split with
    the lowest ``gini_index`` score is kept.

    Args:
        data: Non-empty list of rows; the last element of each row is the
            class label.
        n_features: Number of feature columns to consider. Values larger
            than the number of available feature columns are capped, since
            no more distinct columns can be drawn.

    Returns:
        A dict with keys ``'index'`` (chosen column), ``'value'`` (chosen
        threshold) and ``'groups'`` (the ``(left, right)`` row groups).

    Raises:
        ValueError: If no feature can be selected (``n_features`` is less
            than 1 or the rows have no feature columns).
    """
    class_values = list(set(row[-1] for row in data))
    # class_values = [0,1]
    n_available = len(data[0]) - 1
    # Cap the request: asking for more distinct columns than exist would
    # make the sampling loop below spin forever.
    n_wanted = min(n_features, n_available)
    if n_wanted <= 0:
        raise ValueError(
            'get_split needs at least one feature to evaluate '
            '(n_features=%r, available=%d)' % (n_features, n_available)
        )

    features = list()
    while len(features) < n_wanted:
        index = random.randrange(n_available)
        if index not in features:
            features.append(index)

    b_index, b_value, b_score, b_groups = None, None, float('inf'), None
    for index in features:
        for row in data:
            groups = test_split(index, row[index], data)
            gini = gini_index(groups, class_values)

            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups

    return {'index' : b_index, 'value' : b_value, 'groups' : b_groups}

In [6]:
def to_terminal(group):
    """Return the most frequent class label in ``group``.

    The label of each row is its last element. The winner is chosen with
    ``max`` over the set of distinct labels, keyed by occurrence count.
    """
    labels = []
    for row in group:
        labels.append(row[-1])
    distinct_labels = set(labels)
    return max(distinct_labels, key=labels.count)

In [7]:
def split(node, max_depth, min_size, n_features, depth):
    """Recursively grow the tree below ``node`` in place.

    ``node`` must be a dict produced by ``get_split`` that still holds its
    ``'groups'`` entry. The groups are removed from the node and replaced
    by ``'left'`` and ``'right'`` entries, each of which is either a class
    label (terminal) or another node dict that has been split in turn.

    Args:
        node: Node dict with a ``'groups'`` key holding ``(left, right)``.
        max_depth: Depth at which both children are forced to be terminal.
        min_size: A child with this many rows or fewer becomes terminal.
        n_features: Number of features passed on to ``get_split``.
        depth: Depth of ``node`` itself.

    Returns:
        None; ``node`` is modified in place.
    """
    left, right = node.pop('groups')

    # One side is empty: there was no real split, so both children share
    # the majority label of whatever rows exist.
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return

    # Depth limit reached: stop growing on both sides.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return

    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features)
        # Recurse into the new child; the parent no longer has 'groups'.
        split(node['left'], max_depth, min_size, n_features, depth + 1)

    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth + 1)

In [8]:
def build_tree(sample, max_depth, min_size, n_features):
    """Build one decision tree from ``sample`` and return its root node.

    The root is created with ``get_split`` and then grown in place by
    ``split``, starting at depth 1.
    """
    tree = get_split(sample, n_features)
    split(tree, max_depth, min_size, n_features, depth=1)
    return tree

In [9]:
def predict(node, row):
    """Walk the tree from ``node`` and return the label predicted for ``row``.

    At each node dict, ``row[node['index']]`` is compared with
    ``node['value']``: strictly smaller goes left, otherwise right. The
    walk stops at the first child that is not a dict and returns it.
    """
    current = node
    while True:
        goes_left = row[current['index']] < current['value']
        child = current['left'] if goes_left else current['right']
        if not isinstance(child, dict):
            return child
        current = child

In [10]:
def subsample(dataset, ratio):
    """Draw a random sample of rows from ``dataset`` with replacement.

    Args:
        dataset: Sequence of rows to draw from.
        ratio: Sample size as a fraction of ``len(dataset)``; the target
            count is ``round(len(dataset) * ratio)``.

    Returns:
        A new list of rows (the same row may appear more than once).
    """
    target = round(len(dataset) * ratio)
    return [dataset[random.randrange(len(dataset))] for _ in range(target)]

In [11]:
def bagging_predict(trees, row):
    """Return the label most often predicted for ``row`` across ``trees``.

    Each tree votes via ``predict``; the winner is picked with ``max``
    over the set of distinct votes, keyed by vote count.
    """
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    return max(set(votes), key=votes.count)

In [12]:
def random_forest(dataset, max_depth, min_size, sample_size, n_trees, n_features):
    """Train a forest on ``dataset`` and predict every row of that dataset.

    Args:
        dataset: List of rows; the last element of each row is the label.
        max_depth: Passed to ``build_tree``.
        min_size: Passed to ``build_tree``.
        sample_size: Ratio passed to ``subsample`` for each tree's sample.
        n_trees: Number of trees to build.
        n_features: Passed to ``build_tree``.

    Returns:
        A list with one bagged prediction per row of ``dataset``, in order.
    """
    # Each tree is grown on its own random sample drawn just before it.
    forest = [
        build_tree(subsample(dataset, sample_size), max_depth, min_size, n_features)
        for _ in range(n_trees)
    ]
    return [bagging_predict(forest, row) for row in dataset]

In [13]:
# predictions = random_forest()
# print(predictions)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Fraction of the dataset to size a subset with.
# NOTE(review): presumably a hold-out/test share — confirm the intended use.
threshold = 0.20
# Row count that fraction corresponds to (shown as this cell's output).
len(dataset) * threshold

2.0

In [None]:
# Hyperparameters for the random forest run.
max_depth = 10
min_size = 1
sample_size = 1.0
# The last column of each row is the class label, not a feature. Counting it
# asks get_split for more distinct feature columns than exist, which makes
# its sampling loop run forever.
n_features = len(dataset[0]) - 1
trees = [1,5,10]
for n_trees in trees:
    predictions = random_forest(dataset,max_depth,min_size,sample_size,n_trees,n_features)
    # Show the result for each forest size instead of discarding it.
    print('Trees: %d' % n_trees)
    print('Predictions: %s' % predictions)