# ML- Random Forest Code 

## Breast Cancer Diagnosis


Import the needed Libraries and packages 

In [28]:
import numpy as np 
from random import seed
import random 
from random import randrange
from csv import reader
from math import sqrt
from sklearn.metrics import confusion_matrix, f1_score 
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd 
from google.colab import drive
import os
from random import sample

Mount Drive 

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


Set file path and change directory to our specified directory 

In [3]:
# filepath 
filepath= "/content/drive/MyDrive/ML-project/data_final.csv"

#change directory 
os.chdir("/content/drive/MyDrive/ML-project/")

Preparing the Dataset

In [4]:
# function to load our csv data file
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset

In [25]:
# function to convert the string column entries to floats
def str_column_to_float(dataset, column):
	# loop over dataset
	for row in dataset:
		# convert the string to float 
		row[column] = float(row[column].strip())

# function to convert the string column entries to integers
def str_column_to_int(dataset, column):
    # create set of all vals in the col
    values = set(row[column] for row in dataset)
    # create dictionary mapping each val to integer index
    value_map = {value: i for i, value in enumerate(values)}
    # iterate over each row in the dataset and replace the string value with its corresponding integer value
    for row in dataset:
        row[column] = value_map[row[column]]

Functions for Data Separation and Cross Validation

In [26]:
# function to split the dataset into k specified folds
def cross_validation_split(dataset, n_folds): 
    # copy the dataset 
    dataset_copy = dataset.copy()
    # get size of each fold
    fold_size = len(dataset) // n_folds
    # empty list to store the folds
    folds = []
    # loop through the k folds 
    for i in range(n_folds):
        # sample without replacement to get the datapoints in this fold 
        fold = sample(dataset_copy, fold_size)
        # remove the samples from the dataset so they don't appear in other folds
        dataset_copy = [data for data in dataset_copy if data not in fold]
        # add the samples to the list of folds
        folds.append(fold)
    # return the list of folds
    return folds

In [36]:
# function to split our dataset based on a feature and its corresaponding value
def test_split(index, value, dataset):
    # split dataset into left and right nodes 
		# based on the specified feature and value
		left = [row for row in dataset if row[index] < value]
		right = [row for row in dataset if row[index] >= value]
		# return the left and right splits 
		return left, right

In [37]:
# function to create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    # get size of subsample
    n_sample = round(len(dataset) * ratio)
    # empty list to store subsample
    sample = []
    # loop through the dataset 
    for i in range(n_sample):
        # select an index randomly 
        index = random.randrange(len(dataset))
        # add the row to the subsample 
        sample.append(dataset[index])
    return sample

Functions for Random Forest Construction 

In [48]:
# function to build the decision tree
def construct_tree(train, max_depth, min_size, n_features):
	# make the root 
	root = create_split_point(train, n_features)
	# split the re4st of the dataset recursively to make the whole tree 
	actual_split(root, max_depth, min_size, n_features, 1)
	return root

In [46]:
# function to select the best split point
def create_split_point(dataset, n_features):
	# get list of unique class vals
	class_values = list(set(row[-1] for row in dataset))
	# variables to store the best split point found so far
	best_index, best_value, best_score, best_groups = 999, 999, 999, None 
	features = []
	# list of features to consider for split
	while len(features) < n_features:
		index = random.randrange(len(dataset[0])-1)
		if index not in features:
			features.append(index)
	# loop through each feature and each row to find best split point
	for index in features:
		for row in dataset:
			# split dataset based on the current feature and row 
			groups = test_split(index, row[index], dataset)
			# get the gini index of the split
			gini = gini_index(groups, class_values)
			# update the best split point if the current split has a lower gini index
			if gini < best_score:
				best_index, best_value, best_score, best_groups = index, row[index], gini, groups
	# return a dictionary with the best split point found
	return {'index': best_index, 'value': best_value, 'groups': best_groups}

In [44]:
# function that creates child splits for a node or makes the node terminal
def actual_split(node, max_depth, min_size, n_features, depth):
	left, right = node['groups']
	del(node['groups'])
	# if no split, then the node is a terminal node 
	if not left or not right:
		# make node left and righ terminal 
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# check for max depth
	# if more than max depth, stop expamding th etree and make the nodes terminal 
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return
	# if the length is less than the max depth, continue divoding the tree 
	# left child
	# if size is less than min size needed for a node, make it terminal 
	if len(left) <= min_size:
		node['left'] = to_terminal(left)
	else:
		# if size is large enough for datapoints under this node, choose an attribute to split on and split 
		node['left'] = create_split_point(left, n_features)
		actual_split(node['left'], max_depth, min_size, n_features, depth+1)
	# right child
	# if size is less than min size needed for a node, make it terninal 
	if len(right) <= min_size:
		node['right'] = to_terminal(right)
	else:
		# if size is large enough for datapoints under this node, choose an attribute to split on and split 
		node['right'] = create_split_point(right, n_features)
		actual_split(node['right'], max_depth, min_size, n_features, depth+1)

In [49]:
# function to create a terminal node value
def to_terminal(subset):
	# set the node label as the majoity label of the corresponding rows in the subset
	outcomes = [row[-1] for row in subset]
	return max(set(outcomes), key=outcomes.count)

In [51]:
# function to calculate the gini index for the dataset
def gini_index(groups, classes):
	# get all samples at split point
	n_instances = float(sum([len(group) for group in groups]))
	# sum the weighted gini index for each group
	gini = 0.0
	# gor each group 
	for g in groups:
		# get size of group
		size = float(len(g))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# get the score of the group based on the score for each class
		for val in classes:
			p = [row[-1] for row in g].count(val) / size
			score += p * p
		# weight the group score by its relative size
		gini += (1.0 - score) * (size / n_instances)
	# return the gini index 
	return gini

Codes For Plotting, Visualization, and Calculating Performance  

In [52]:
# function to make a prediction with a decision tree
def predict(node, row):
    # check if feature val of given row is less than the node split value 
		# here, feature value less than node -> go to left 
    if row[node['index']] < node['value']:
        # if left child of the node is a dictionary, we already have a subtree at this node 
				# recursively call the predict function with the left child as the new node
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        # if not, no left subtree
				# return the predicted class label = value of the left child
        else:
            return node['left']
    # here, feature value greater than or equal to the split node -> go right 
    else:
        # if right child of the node is a dictionary, we already have a subtree at this node 
				# recursively call the predict function with the left child as the new node
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        # if not, no right subtree
				# return the predicted class label = value of the right child
        else:
            return node['right']

In [15]:
# function to make a prediction with a list of bagged trees
# def bagging_predict(trees, row):
# 	# make predictions for each tree in trees 
# 	predictions = [predict(tree, row) for tree in trees]
# 	# return the majority prediction from all trees 
# 	return max(set(predictions), key=predictions.count)

def bagging_predict(trees, row):
    # list to store the different tree predictions 
    predictions = []
    # make predictions for each tree in trees
    for tree in trees:
        prediction = predict(tree, row)
        predictions.append(prediction)
    # count the occurrence of each prediction (0 or 1)
    counts = [predictions.count(p) for p in set(predictions)]
    # get the index of the prediction with the max count
    max_index = counts.index(max(counts))
    # return the prediction with the max count
    return list(set(predictions))[max_index]

In [16]:
# function to calculate the accuracy 
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

In [17]:
# function to evaluate our algorithm using cross validation 
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores_test = list()
	scores_train = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predictions_test, predictions_train = algorithm(train_set, test_set, *args)
	
		actual_test = [row[-1] for row in fold]
		accuracy_test = accuracy_metric(actual_test, predictions_test)
		f1_score_test = f1_score(actual_test, predictions_test)
		# plot_confusion_matrix(actual_test, predictions_test,[0.,1.])
	
		actual_train = [row[-1] for row in train_set]
		accuracy_train = accuracy_metric(actual_train, predictions_train)
		f1_score_train = f1_score(actual_train, predictions_train)
		
		scores_test.append(accuracy_test)
		scores_train.append(accuracy_train)
	

	return scores_test, scores_train,f1_score_test,f1_score_train

In [18]:
#function to plot the confusion matrix 
def plot_confusion_matrix(actual_classes : np.array, predicted_classes : np.array, sorted_labels : list):
		matrix = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)
		matrix= matrix.astype('float') / matrix.sum(axis=1)[:,np.newaxis]
		plt.figure(figsize=(12.8,6))
		sns.heatmap(matrix, annot=True, xticklabels=sorted_labels, yticklabels=sorted_labels, cmap="RdPu", fmt="g")
		plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix')
		# if title == None:
		# 	plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix')
		# else:
		# 	plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title(title)
		plt.show()

Running Random Forest

In [19]:
# the function for running the random forest algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
	trees = list()
	for i in range(n_trees):
		sample = subsample(train, sample_size)
		tree = construct_tree(sample, max_depth, min_size, n_features)
		trees.append(tree)
	predictions_train = [bagging_predict(trees, row) for row in train]
	predictions_test = [bagging_predict(trees, row) for row in test]

	return(predictions_test,predictions_train)

Our test of Random Forest 

In [47]:
# Test the random forest algorithm
seed(2)

# load and prepare data
filename = '/content/drive/MyDrive/ML-project/data_final.csv'
dataset = load_csv(filename)

# convert string attributes to integers
for i in range(0, len(dataset[0])-1):
	str_column_to_float(dataset, i)

# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

# evaluate algorithm
# set the number of folds for cross validation, the tree max depth and min size, the number of features at each level, and initialize the array of accuracies 
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
n_features = int(sqrt(len(dataset[0])-1))
num_trees_lower = 5
num_trees_upper = 10 + 1
all_mean_accuracies = [0]*(num_trees_upper-num_trees_lower)
i = 0

# loop over the range of forests with n_trees in each forest
for n_trees in range(num_trees_lower,num_trees_upper):
  # for each forest size within the rage, get test,train, f1 scores 
	scores_test, scores_train,f1_score_test,f1_score_train = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
	# get the mean accuracy 
	mean_test_accuracy = sum(scores_test)/float(len(scores_test))
	all_mean_accuracies[i] = mean_test_accuracy
	
	print('Number of Trees in our Forest: %d' % n_trees)
	print('Testing Scores: %s' % scores_test)
	print('Testing Mean Accuracy: %.3f%%' % (mean_test_accuracy))
	print('F1 Score: %.3f%%' % (f1_score_test*100))
	print('Training Scores: %s' % scores_train)
	print('Training Mean Accuracy: %.3f%%' % (sum(scores_train)/float(len(scores_train))))
	print('______________')
	i += 1

Number of Trees in our Forest: 5
Testing Scores: [93.80530973451327, 93.80530973451327, 94.69026548672566, 94.69026548672566, 93.80530973451327]
Testing Mean Accuracy: 94.159%
F1 Score: 95.238%
Training Scores: [98.45132743362832, 99.33628318584071, 99.11504424778761, 98.67256637168141, 99.77876106194691]
Training Mean Accuracy: 99.071%
______________


KeyboardInterrupt: ignored