In [None]:
##########################################################
# 1. IMPORT ALL PACKAGES
##########################################################
import pandas as pd
import numpy as np
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import math
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt #for plotting
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import multilabel_confusion_matrix
# importing mean()
from statistics import mean

In [None]:
##########################################################
# 2. LOAD DATASET
##########################################################
# header=0: the first row of the CSV holds the column names.
# The unused "Unnamed: 32" and "id" columns are dropped right after loading;
# drop() returns a new frame, so no in-place mutation is needed.
data = pd.read_csv(
    "../input/breast-cancer-wisconsin-data/data.csv", header=0
).drop(columns=["Unnamed: 32", "id"])
# Encode the label: M (malignant) = 1, everything else (B = benign) = 0.
# The vectorized comparison yields the same values as a per-row loop.
data["diagnosis"] = (data["diagnosis"] == "M").astype("int64")
# Select the malignant rows
m_data = data.loc[data["diagnosis"] == 1]
# Select the benign rows
b_data = data.loc[data["diagnosis"] == 0]
# Preview a few benign rows instead of dumping the whole subset
b_data.head()

In [None]:

# Check all classes/labels present in the dataset (this runs before the
# train/test split, so it covers every row)
all_label = set(data['diagnosis'].tolist())
print("All labels: {0}".format(all_label))


# Data distribution for each class
dst_data = Counter(data['diagnosis'])
print(dst_data)
# Plot the distribution. The column is passed by keyword: seaborn's first
# positional parameter is `data`, so a positionally passed Series is not
# interpreted as the x variable.
ax = sns.countplot(x="diagnosis", data=data)
ax.set(
    title="Class distribution",
    xlabel="diagnosis (0 = benign, 1 = malignant)",
    ylabel="Number of samples",
);


In [None]:
##########################################################
# 3. SPLIT INTO TRAIN AND TEST DATA
##########################################################
# Target: the diagnosis label as a plain list.
y = data['diagnosis'].tolist()
# Features: every column after the first one.
x = data.iloc[:, 1:]
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(x, y, test_size=0.25, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# Show the feature values of the first row
x.iloc[:1].values

In [None]:
##########################################################
# 4. TRAIN RANDOM FOREST ALGORITHM
##########################################################
# Settings for the RandomForestClassifier:
#   n_estimators (default=100)  = the number of trees in the forest
#   max_depth    (default=None) = the maximum depth of each tree
#   random_state                = fixed seed so the forest is reproducible
rf_settings = {"n_estimators": 500, "max_depth": 2, "random_state": 0}
model_clf = RandomForestClassifier(**rf_settings)

# Fit the forest on the training split
model_clf.fit(x_train, y_train)

In [None]:

##########################################################
# 5. APPLY THE TRAINED LEARNER TO NEW TEST DATA
##########################################################
# Apply the trained random forest (model_clf) to predict the labels of the
# held-out test features
y_pred = model_clf.predict(x_test)

In [None]:
##########################################################
# 6. CONFUSION MATRIX
##########################################################
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    A zero denominator means the metric is undefined (e.g. precision when no
    positive was predicted); reporting 0.0 avoids nan values and warnings.
    """
    return numerator / denominator if denominator else 0.0


# Actual and predicted classes
lst_actual_class = y_test
lst_predicted_class = y_pred
# Label M (malignant) = 1 and B (benign) = 0
lst_classes = [0, 1]
# Compute the binary confusion matrix. With labels=[0, 1], ravel() yields the
# counts in the order tn, fp, fn, tp. They are converted to Python ints so
# the products in the MCC denominator cannot overflow a fixed-width integer.
tn, fp, fn, tp = (
    int(count)
    for count in confusion_matrix(
        lst_actual_class, lst_predicted_class, labels=lst_classes
    ).ravel()
)

# Unrounded rates, so derived metrics do not inherit rounding error
recall_raw = _safe_ratio(tp, tp + fn)
specificity_raw = _safe_ratio(tn, tn + fp)
precision_raw = _safe_ratio(tp, tp + fp)

sensitivity = round(recall_raw, 3)
specificity = round(specificity_raw, 3)
precision = round(precision_raw, 3)
accuracy = round(_safe_ratio(tp + tn, tp + fp + tn + fn), 3)
# F1 = 2 * precision * recall / (precision + recall) = 2TP / (2TP + FP + FN)
f1score = round(_safe_ratio(2 * tp, 2 * tp + fp + fn), 3)
balanced_accuracy = round((recall_raw + specificity_raw) / 2, 3)
# Matthews correlation coefficient; 0.0 when any marginal total is zero
mcc = round(
    _safe_ratio(
        tp * tn - fp * fn,
        math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
    ),
    3,
)

print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn))
print("Sensitivity: {0}".format(sensitivity))
print("Specificity: {0}".format(specificity))
print("Accuracy: {0}".format(accuracy))
print("Balanced Accuracy: {0}".format(balanced_accuracy))
print("MCC: {0}".format(mcc))
print("Precision: {0}".format(precision))
print("F1 score: {0}".format(f1score))
