# Loading Libraries and Dependencies

In [1]:
#import the dataset
from __future__ import division, print_function, unicode_literals
import pandas as pd
import numpy as np
Rcancer = pd.read_csv('breast-cancer-wisconsin.csv', sep=",", header=None, 
                     names=["Sample code number", "Clump Thickness", 
                            "Uniformity of Cell Size", "Uniformity of Cell Shape", 
                            "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei" ,
                            "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"])
pd.DataFrame.count(Rcancer)

Sample code number             699
Clump Thickness                699
Uniformity of Cell Size        699
Uniformity of Cell Shape       699
Marginal Adhesion              699
Single Epithelial Cell Size    699
Bare Nuclei                    699
Bland Chromatin                699
Normal Nucleoli                699
Mitoses                        699
Class                          699
dtype: int64

# Preprocessing the dataset

In [12]:
#delete the missing value
Rcancer=Rcancer.replace(to_replace="?", value=np.nan)
cancer=Rcancer.dropna()
pd.DataFrame.count(cancer)

Sample code number             683
Clump Thickness                683
Uniformity of Cell Size        683
Uniformity of Cell Shape       683
Marginal Adhesion              683
Single Epithelial Cell Size    683
Bare Nuclei                    683
Bland Chromatin                683
Normal Nucleoli                683
Mitoses                        683
Class                          683
dtype: int64

In [13]:
cancer.describe()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,1076720.0,4.442167,3.150805,3.215227,2.830161,3.234261,3.445095,2.869693,1.603221,2.699854
std,620644.0,2.820761,3.065145,2.988581,2.864562,2.223085,2.449697,3.052666,1.732674,0.954592
min,63375.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,877617.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171795.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238705.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [14]:
cancer.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Splitting the dataset into training and testing sets

In [15]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
X=pd.DataFrame(cancer, columns= ["Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", 
                            "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei" ,
                            "Bland Chromatin", "Normal Nucleoli", "Mitoses"])
y = pd.DataFrame(cancer, columns = ["Class"])

# Split validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1, stratify=y)


# Building the model

In [16]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz 


In [17]:
# Information gain
clf_infoG = tree.DecisionTreeClassifier(criterion="entropy", max_depth=6)
clf_infoG = clf_infoG.fit(X_train, y_train)
feature_names = list(X.columns)
infoG_tree = tree.export_graphviz(clf_infoG, out_file= None,feature_names=feature_names,class_names='24',filled=True, rounded=True,special_characters=True)


# Evaluating the model

In [18]:
#model evaluation
y_pred_infoG = clf_infoG.predict(X_test)               
infoG_matrix = confusion_matrix(y_test, y_pred_infoG)
print (infoG_matrix) 

print(classification_report(y_test, y_pred_infoG, target_names=["benign", "malignant"]))

[[167  11]
 [  5  91]]
              precision    recall  f1-score   support

      benign       0.97      0.94      0.95       178
   malignant       0.89      0.95      0.92        96

    accuracy                           0.94       274
   macro avg       0.93      0.94      0.94       274
weighted avg       0.94      0.94      0.94       274



In [19]:
print("Accuracy:",accuracy_score(y_test, y_pred_infoG))

Accuracy: 0.9416058394160584


# Printing the accuracy

In [20]:
print("Accuracy:",accuracy_score(y_test, y_pred_infoG)*100)

Accuracy: 94.16058394160584
