# importing all the libraries

In [1]:
from random import seed
from random import randrange
from csv import reader
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline


# loading the binary class Data Set

In [3]:
import io
data = pd.read_csv('BinaryClass_Mobile_Price_train.csv')
print(data)

      battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0               842     0          2.2         0   1       0           7   
1              1021     1          0.5         1   0       1          53   
2               563     1          0.5         1   2       1          41   
3               615     1          2.5         0   0       0          10   
4              1821     1          1.2         0  13       1          44   
5              1859     0          0.5         1   3       0          22   
6              1821     0          1.7         0   4       1          10   
7              1954     0          0.5         1   0       0          24   
8              1445     1          0.5         0   0       0          53   
9               509     1          0.6         1   2       1           9   
10              769     1          2.9         1   0       0           9   
11             1520     1          2.2         0   5       1          33   
12          

#  random forest

In [4]:
# Split a dataset into k folds and does a cross validation split
def crossvalidator(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
 
# Calculate accuracy percentage on the test and train data
def accuracycalculator(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
 
# Evaluate an algorithm using a cross validation split
scores=[]
def algorithm(dataset, algorithm, n_folds, *args):
  dataset_copy=list(dataset)
  train_set=dataset_copy[:int(len(dataset_copy)*(7/10))] #Training data is first 70%
  test_set=dataset_copy[int(len(dataset_copy)*(7/10)):] #Test data is last 30%
  predicted = algorithm(train_set, test_set, *args)
  actual = [row[-1] for row in test_set]
  accuracy = accuracycalculator(actual, predicted)
  scores.append(accuracy)
  return scores
 
# Split a dataset based on an attribute and an attribute value
def datasetsplitter(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
 
# Calculate the Gini index for a split dataset
def giniindexcalculator(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini
 
# Select the best split point for a dataset
def bestsplit(dataset, n_features):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    features = list()
    while len(features) < n_features:
        index = randrange(len(dataset[0])-1)
        if index not in features:
            features.append(index)
    for index in features:
        for row in dataset:
            groups = datasetsplitter(index, row[index], dataset)
            gini = giniindexcalculator(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}
 
# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
 
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = bestsplit(left, n_features)
        split(node['left'], max_depth, min_size, n_features, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = bestsplit(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)
 
# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    root = bestsplit(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root
 
# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
 
# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while(len(sample) < n_sample):
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample
 
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)
 
# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    trees = list()
    for i in range(n_trees):
        sample = subsample(train, sample_size)
        tree = build_tree(sample, max_depth, min_size, n_features)
        trees.append(tree)
        predictions = [bagging_predict(trees, row) for row in test]
    return(predictions)

# Accuracy 

In [5]:
dataset = data.values.tolist()
# evaluate algorithm
max_depth = 10
min_size = 1
n_folds = 5
features = int(sqrt(len(dataset[0])-1))
for trees in [1, 5, 10]:
    scores = algorithm(dataset, random_forest, n_folds, max_depth, min_size, 1.0, trees, features)
    print('Trees: '  , trees)
    print('Scores: '  , scores)
    print('Mean Accuracy: ' , (sum(scores)/float(len(scores))))

Trees:  1
Scores:  [89.83333333333333]
Mean Accuracy:  89.83333333333333
Trees:  5
Scores:  [89.83333333333333, 92.33333333333333]
Mean Accuracy:  91.08333333333333
Trees:  10
Scores:  [89.83333333333333, 92.33333333333333, 93.0]
Mean Accuracy:  91.72222222222221
