# Random Forest in Python using scikit learn

In [2]:
# Required Python Packages

#Pandas:
#Pandas package is the best choice for tabular data analysis.
#All the data manipulation tasks in this article are going to use the Pandas methods.
import pandas as pd

#train_test_split:
#We imported scikit-learn train_test_split method to split the breast cancer dataset into test and train dataset.
#Train dataset will be used in the training phase and the test dataset will be used in the validation phase.
from sklearn.model_selection import train_test_split

#RandomForestClassifier:
#We imported scikit-learn RandomForestClassifier method to model the training dataset with random forest classifier.
#Later the modeled random forest classifier used to perform the predictions.
from sklearn.ensemble import RandomForestClassifier

#accuracy_score:
#We imported scikit-learn accuracy_score method to calculate the accuracy of the trained classifier.
from sklearn.metrics import accuracy_score

#confusion_matrix:
#We imported scikit-learn confusion_matrix to understand the trained classifier behavior over the test dataset or validate dataset.
from sklearn.metrics import confusion_matrix

In [16]:
#In this dataset Missing value is represented as ?
data = pd.read_csv("breast-cancer-wisconsin.csv",na_values='?')

In [17]:
data.head()

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
0,1002945,5,4,4,5,7,10.0,3,2,1,2
1,1015425,3,1,1,1,2,2.0,3,1,1,2
2,1016277,6,8,8,1,3,4.0,3,7,1,2
3,1017023,4,1,1,3,2,1.0,3,1,1,2
4,1017122,8,10,10,8,7,10.0,9,7,1,4


In [18]:
data.describe()

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
count,698.0,698.0,698.0,698.0,698.0,698.0,682.0,698.0,698.0,698.0,698.0
mean,1071807.0,4.416905,3.137536,3.210602,2.809456,3.217765,3.548387,3.438395,2.869628,1.590258,2.690544
std,617532.3,2.817673,3.052575,2.972867,2.856606,2.215408,3.645226,2.440056,3.055004,1.716162,0.951596
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870258.2,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1238354.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [103]:
#Checking for Missing / NULL Values
data.isnull().values.any()

True

In [11]:
data.shape

(698, 11)

In [20]:
data["BareNuclei"].isnull().sum()

16

In [23]:
data[data.isnull().any(axis=1)]

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
22,1057013,8,4,5,1,2,,7,3,1,4
39,1096800,6,6,6,9,6,,7,8,1,2
138,1183246,1,1,1,1,1,,2,1,1,2
144,1184840,1,1,3,1,2,,2,1,1,2
157,1193683,1,1,2,1,3,,1,1,1,2
163,1197510,5,1,1,1,2,,3,1,1,2
234,1241232,3,1,4,1,2,,3,1,1,2
248,169356,3,1,1,1,2,,3,1,1,2
274,432809,3,1,3,1,2,,2,1,1,2
291,563649,8,8,8,1,2,,6,10,1,4


In [24]:
#Removing the records with Missing values
data1 = data.dropna()

In [25]:
data1.shape

(682, 11)

In [26]:
data1.head()

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
0,1002945,5,4,4,5,7,10.0,3,2,1,2
1,1015425,3,1,1,1,2,2.0,3,1,1,2
2,1016277,6,8,8,1,3,4.0,3,7,1,2
3,1017023,4,1,1,3,2,1.0,3,1,1,2
4,1017122,8,10,10,8,7,10.0,9,7,1,4


In [29]:
data1.dtypes

CodeNumber                    int64
ClumpThickness                int64
UniformityCellSize            int64
UniformityCellShape           int64
MarginalAdhesion              int64
SingleEpithelialCellSize      int64
BareNuclei                  float64
BlandChromatin                int64
NormalNucleoli                int64
Mitoses                       int64
CancerType                    int64
dtype: object

In [97]:
#Converting BareNuclei column values from float to int
data1["BareNuclei"].astype("int64")

0      10
1       2
2       4
3       1
4      10
5      10
6       1
7       1
8       1
9       1
10      1
11      3
12      3
13      9
14      1
15      1
16      1
17     10
18      1
19     10
20      7
21      1
23      1
24      7
25      1
26      1
27      1
28      1
29      1
30      1
       ..
668     5
669     8
670     1
671     1
672     1
673     1
674     1
675     1
676     1
677     1
678     1
679    10
680    10
681     1
682     1
683     1
684     1
685     1
686     1
687     1
688     1
689     1
690     5
691     1
692     1
693     2
694     1
695     3
696     4
697     5
Name: BareNuclei, Length: 682, dtype: int64

In [None]:
data1.dtypes

In [102]:
#setting features
#Excluding the first and the last column
features = list(data1.columns[1:-1])
#setting data
X = data1[features]
#setting target data
y = data1["CancerType"]

In [47]:
X.head()

Unnamed: 0,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses
0,5,4,4,5,7,10,3,2,1
1,3,1,1,1,2,2,3,1,1
2,6,8,8,1,3,4,3,7,1
3,4,1,1,3,2,1,3,1,1
4,8,10,10,8,7,10,9,7,1


In [50]:
y.head()

0    2
1    2
2    2
3    2
4    4
Name: CancerType, dtype: int64

In [61]:
#Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [99]:
X_train.shape

(477, 9)

In [98]:
y_train.shape

(477,)

In [100]:
X_test.shape

(205, 9)

In [101]:
y_test.shape

(205,)

In [64]:
#Create Model using Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [65]:
#Perform predictions using predict method
prediction = clf.predict(X_test)

In [84]:
#Converted the test_y into list object from pandas dataframe. 
#The reason is as we randomly split the train and test dataset the indexes of the test_y won’t be in order. 
#If we convert the dataframe in to list object the indexes will be in order.
#From the above code, we are printing the first 5 values of test_y and the predict results target.

for i in range(0, 5):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(list(y_test)[i], prediction[i]))

Actual outcome :: 4 and Predicted outcome :: 4
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 4 and Predicted outcome :: 4
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 4 and Predicted outcome :: 4


In [80]:
#Calculate Accuracy using accuracy_score method
print("Test Accuracy:",accuracy_score(y_test, prediction))

Test Accuracy: 0.9658536585365853


In [78]:
print("Train Accuracy:",accuracy_score(y_train, clf.predict(X_train)))

Train Accuracy: 0.9958071278825996


In [86]:
 print(" Conusion matrix ", confusion_matrix(y_test, prediction))

 Conusion matrix  [[134   2]
 [  5  64]]


In [95]:
y_test[data1['CancerType']==2].shape[0]

136

In [96]:
y_test[data1['CancerType']==4].shape[0]

69

The accuracy of the model is <b>96.58%</b>.

Based on the confusion matrix, we were able to achieve the following:
<ul><li>Out of 136 records of Bening tumor, the model is able to predict 134 records</li>
<li>Out of 69 records of Malignant tumor, the model is able to predict 64 records</li></ul>