In [1]:
import pandas as pd

In [2]:
my_data=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]

In [3]:
my_data = pd.DataFrame(my_data, columns=["source", "country", "yes_no", "pages","tier"])

In [4]:
def split_dataset(dataset, column, value):
    if isinstance(value,int) or isinstance(value,float):
        df1 = dataset[dataset[column]>=value]
        df2 = dataset[dataset[column]< value]
    else:
        df1 = dataset[dataset[column]== value]
        df2 = dataset[dataset[column]!= value]
    return (df1,df2)

In [5]:
def unique_counts(dataset, class_column="tier"):
    return dataset[class_column].value_counts().to_dict()

In [6]:
from math import log
def entropy(dataset):
    log2 = lambda x: log(x)/log(2)
    results = unique_counts(dataset)
    ent = 0.0
    for k,v in results.items():
        p = float(v)/len(dataset)
        ent = ent - p*log2(p)
    return ent

In [7]:
class tree_node:
    def __init__(self,col=-1,value=None,leftn=None, rightn=None, leaf=None):
        self.col=col
        self.value=value
        self.leftn=leftn
        self.rightn=rightn
        self.leaf=leaf

In [8]:
class DecisionTree:
    def __init__(self, tree=None):
        self.tree = tree
    def fit(self, features, target, score_f=entropy):
        label = target.name
        features[label] = target.values
        self.tree = self.build_tree(features, label, score_f)
    def predict(self, features):
        predictions = []
        for row in range(0,len(features)):
            predictions.append(self.classify(features.iloc[row], self.tree))
        return predictions
    def classify(self, features, tree):
        if tree.leaf != None:
            return list(tree.leaf.keys())[0]
        else:
            v = features[tree.col]
            branch = None
            if isinstance(v,int) or isinstance(v,float):
                if v>= tree.value:
                    branch = tree.leftn
                else:
                    branch = tree.rightn
            else:
                if v==tree.value:
                    branch = tree.leftn
                else:
                    branch = tree.rightn 
            return self.classify(features, branch)
    def build_tree(self, dataset,label="tier",score_f=entropy):
        if len(dataset) == 0:
            return tree_node()
        current_score = score_f(dataset)

        best_gain = 0.0
        best_col_val = None
        best_dfs = None

        columns = list(dataset.columns)
        columns.remove(label)
        for col in columns:
            unique_vals = list(dataset[col].unique())
            for val in unique_vals:
                (df1,df2) = split_dataset(dataset, col, val)
                p = float(len(df1)/len(dataset))
                infg = current_score - p*score_f(df1) - (1-p)*score_f(df2)
                if infg > best_gain and len(df1)>0 and len(df2)>0:
#                     print("GAIN: {}, COL: {}, VAL: {}".format(infg,col,val))
                    best_gain = infg
                    best_col_val = (col,val)
                    best_dfs = (df1,df2)
        if best_gain>0:
            leftn = self.build_tree(best_dfs[0])
            rightn = self.build_tree(best_dfs[1])
            return tree_node(best_col_val[0], best_col_val[1], leftn, rightn)
        else:
            return tree_node(leaf=unique_counts(dataset))
        
def printtree(tree, indent=''):

    # Is this a leaf node?
    if tree.leaf!=None:
        print(str(tree.leaf))
    else:
        print(str(tree.col)+':'+str(tree.value)+'? ')
        # Print the branches
        print(indent+'T->', end=" ")
        printtree(tree.leftn,indent+'  ')
        print(indent+'F->', end=" ")
        printtree(tree.rightn,indent+'  ')

In [9]:
dt = DecisionTree()
test1 = dt.fit(my_data[["source", "country", "yes_no", "pages"]], my_data["tier"])

In [10]:
printtree(dt.tree)

source:google? 
T-> pages:18? 
  T-> yes_no:no? 
    T-> {'None': 1}
    F-> {'Basic': 1}
  F-> {'Premium': 3}
F-> source:slashdot? 
  T-> {'None': 3}
  F-> yes_no:yes? 
    T-> {'Basic': 4}
    F-> pages:21? 
      T-> {'Basic': 1}
      F-> {'None': 3}


In [11]:
predicted = dt.predict(my_data[["source", "country", "yes_no", "pages"]])

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(my_data['tier'], predicted)

1.0

In [13]:
def generate_trees(train, features, label, m=9):
    dt_list = []
    for i in range(0,m):
        sample = train.sample(len(train), replace=True)
        dt = DecisionTree()
        dt.fit(sample[features], sample[label])
        dt_list.append(dt)
    return dt_list

In [14]:
generate_trees(my_data, ["source", "country", "yes_no", "pages"], "tier")

[<__main__.DecisionTree at 0x1549811a160>,
 <__main__.DecisionTree at 0x15498202cc0>,
 <__main__.DecisionTree at 0x15498ab6be0>,
 <__main__.DecisionTree at 0x15498109e10>,
 <__main__.DecisionTree at 0x1549767a160>,
 <__main__.DecisionTree at 0x15498202128>,
 <__main__.DecisionTree at 0x15498aebef0>,
 <__main__.DecisionTree at 0x1549821ac50>,
 <__main__.DecisionTree at 0x15498aebe80>]

In [15]:
from collections import Counter
def bagging(train, test, features, label, m):
    predictions = []
    dt_list = generate_trees(train, features, label, m)
    df = pd.DataFrame()
    for i in range(0, len(dt_list)):
        dt = dt_list[i]
        predictions = dt.predict(test)
        df[i] = pd.Series(predictions)
    for row in df.iterrows():
        c = Counter([pred for pred in row[1]])
        predicted.append(c.most_common()[0][0])
    return predictions

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(my_data.drop('tier', 1), my_data['tier'], test_size=0.2)
x_train['tier']  = y_train
x_test['tier'] = y_test

bagging(x_train, x_test, ["source", "country", "yes_no", "pages"], 'tier', 9)

['None', 'None', 'None', 'None']