In [1]:
# Random Forest Algorithm Implementation
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Calculate accuracy of the model
def accuracy_score(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# split a dataframe based on feature and feature value
def split_node(index, value, dataset):
	left, right = list(), list()
	for col in dataset:
		if col[index] < value:
			left.append(col)
		else:
			right.append(col)
	return left, right

# Calculate the gini index for a best split dataframe
def gini_index(groups, classes):
	# count all samples at get_split point
	total_instances = float(sum([len(group) for group in groups]))
	# sum weighted Gini index for each group
	gini = 0.0
	for group in groups:
		size = float(len(group))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# score the group based on the score for each class
		for class_val in classes:
			p = [col[-1] for col in group].count(class_val) / size
			score += p * p
		# weight the group score by its relative size
		gini += (1.0 - score) * (size / total_instances)
	return gini

# Select the best split for a dataframe
def best_get_split(dataset, k_features):
	class_values = list(set(col[-1] for col in dataset))
	tree_index, tree_value, tree_score, tree_groups = 9999, 9999, 9999, None
	features = list()
	while len(features) < k_features:
		index = randrange(len(dataset[0])-1)
		if index not in features:
			features.append(index)
	for index in features:
		for col in dataset:
			groups = split_node(index, col[index], dataset)
			gini = gini_index(groups, class_values)
			if gini < tree_score:
				tree_index, tree_value, tree_score, tree_groups = index, col[index], gini, groups
	return {'index':tree_index, 'value':tree_value, 'groups':tree_groups}

# Create a terminal node 
def terminal_node(group):
	outcomes = [col[-1] for col in group]
	return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal
def get_split(node, max_depth, min_size, k_features, depth):
	left, right = node['groups']
	del(node['groups'])
	# check for a no get_split
	if not left or not right:
		node['left'] = node['right'] = terminal_node(left + right)
		return
	# check for max depth
	if depth >= max_depth:
		node['left'], node['right'] = terminal_node(left), terminal_node(right)
		return
	# process left child
	if len(left) <= min_size:
		node['left'] = terminal_node(left)
	else:
		node['left'] = best_get_split(left, k_features)
		get_split(node['left'], max_depth, min_size, k_features, depth+1)
	# process right child
	if len(right) <= min_size:
		node['right'] = terminal_node(right)
	else:
		node['right'] = best_get_split(right, k_features)
		get_split(node['right'], max_depth, min_size, k_features, depth+1)

# Build a decision tree
def tree_build(train, max_depth, min_size, k_features):
	root = best_get_split(train, k_features)
	get_split(root, max_depth, min_size, k_features, 1)
	return root

# Make a prediction with a decision tree
def forecast(node, col):
	if col[node['index']] < node['value']:
		if isinstance(node['left'], dict):
			return forecast(node['left'], col)
		else:
			return node['left']
	else:
		if isinstance(node['right'], dict):
			return forecast(node['right'], col)
		else:
			return node['right']

# Create a random subsample from the dataset with replacement
def subsample(dataframe, ratio):
	sample = list()
	n_sample = round(len(dataframe) * ratio)
	while len(sample) < n_sample:
		index = randrange(len(dataframe))
		sample.append(dataframe[index])
	return sample

# Make a prediction with a list of bagged trees
def bagging_prediction(trees, col):
	predictions = [forecast(tree, col) for tree in trees]
	return max(set(predictions), key=predictions.count)

# Random Forest Algorithm
def random_forest_algo(train, test, max_depth, min_size, sample_size, n_trees, k_features):
	trees = list()
	for i in range(n_trees):
		sample = subsample(train, sample_size)
		tree = tree_build(sample, max_depth, min_size, k_features)
		trees.append(tree)
	predictions = [bagging_prediction(trees, col) for col in test]
	return(predictions)

# OOB forecast Algorithm
def OOB_forecast(train, test, max_depth, min_size, sample_size, n_trees, k_features):
    trees = list()
    OOB_scores=list()
    for i in range(n_trees):
        sample = subsample(train, sample_size)
        OOB_sample=[i for i in train if i not in sample]
        OOB_actual=[col[-1] for col in OOB_sample]
        tree = tree_build(sample, max_depth, min_size, k_features)
        trees.append(tree)
        OOB_predict=[bagging_prediction(trees,col) for col in OOB_sample]
        OOB_score=accuracy_score(OOB_actual,OOB_predict)
        OOB_scores.append(OOB_score)
    #print("OOB Scores",OOB_scores)
    return(sum(OOB_scores)/len(OOB_scores))



In [2]:
#importing the spam dataset
import pandas as pd
df=pd.read_csv("spam.data.txt",header=None, delimiter=" ")
df=df.values.tolist()

In [52]:
#defining the hyper-parameters
max_depth = 10
min_size = 10
sample_size = 0.66
k_features = int(sqrt(len(df[0])-1))
trees=[1,5,10,15,20]
scores=list()
test=list()
OOB_scores=list()

In [53]:
#Running the defined Random Forest model and calculating OOB errors
from sklearn.model_selection import train_test_split
for n_trees in trees:
    train,test= train_test_split(df, test_size=0.3, random_state=3250)
    actual = [col[-1] for col in test]
    predicted=random_forest_algo(train, test, max_depth, min_size, sample_size, n_trees, k_features)
    OOB_score=OOB_forecast(train, test, max_depth, min_size, sample_size, n_trees, k_features)
    OOB_scores.append(OOB_score)
    score=accuracy_score(actual, predicted)
    scores.append(score)
    print('Trees: %d' % n_trees)
    print('Scores: %s' % score)
    print('OOB Scores: %s' % OOB_score)

Trees: 1
Scores: 89.50036205648081
OOB Scores: 86.21815806662312
Trees: 5
Scores: 93.70021723388848
OOB Scores: 89.77394390698862
Trees: 10
Scores: 93.62780593772628
OOB Scores: 92.56036118660295
Trees: 15
Scores: 94.6415640839971
OOB Scores: 93.00215839201638
Trees: 20
Scores: 95.07603186097032
OOB Scores: 93.1606641431746


In [6]:
#checking the sensitivity when changing the m parameters

#defining the hyper-parameters
max_depth = 10
min_size = 10
sample_size = 0.66
m_paramter=[1,5,10,15,20]
scores=list()
test=list()
OOB_scores=list()

#Running the defined Random Forest model and calculating OOB errors
from sklearn.model_selection import train_test_split
for m in m_paramter:
    train,test= train_test_split(df, test_size=0.3, random_state=3250)
    actual = [col[-1] for col in test]
    predicted=random_forest_algo(train, test, max_depth, min_size, sample_size, 5, m)
    OOB_score=OOB_forecast(train, test, max_depth, min_size, sample_size, 5, m)
    OOB_scores.append(OOB_score)
    score=accuracy_score(actual, predicted)
    scores.append(score)
    print('No. of features(m): %d' % m)
    print('Scores: %s' % score)
    print('OOB Scores: %s' % OOB_score)

No. of features(m): 1
Scores: 88.34178131788559
OOB Scores: 79.1377793217569
No. of features(m): 5
Scores: 93.9174511223751
OOB Scores: 88.33371518335733
No. of features(m): 10
Scores: 94.27950760318609
OOB Scores: 90.82312485559571
No. of features(m): 15
Scores: 93.19333816075309
OOB Scores: 91.68319823114079
No. of features(m): 20
Scores: 94.1346850108617
OOB Scores: 92.17811496919562


In [29]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
df=pd.read_csv("spam.data.txt",header=None, delimiter=" ")
y=df[df.columns[-1]]
x=df.drop(df.columns[-1],axis=1)
x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.3, random_state=3250)

In [47]:
#using Scikit Learn Random Forecast for classification
rf_sklearn=RandomForestClassifier()
rf_sklearn.fit(x_train,y_train)
predicted=rf_sklearn.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predicted)


0.9601737871107893