In [2]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
#Importing dataset
data = pd.read_csv('BreastCancer_Data.csv')
data.isna().sum()
data = data.dropna(axis=1)
print(len(data[0]))
print(data['diagnosis'].value_counts())
sb.countplot(data['diagnosis'],label="count")
#sb.pairplot(data,hue="diagnosis")
corr = data.corr()
plt.figure(figsize=(20,20))
#sb.heatmap(corr,annot=True,fmt='%')

In [3]:
#Split the data into features and labels
X = data.iloc[:,2:].values
y = data.iloc[:,1].values

#Encoding the categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

#Splitting the data into 70% training set and 30% testing set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

#Feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
#Creating a function of many Machine Learning models
def models(X_train,y_train):
    
    #Using Logistic Resgression
    from sklearn.linear_model import LogisticRegression
    log = LogisticRegression(random_state=0)
    log.fit(X_train,y_train)
    
    #Using Decision Tree algorithm
    from sklearn.tree import DecisionTreeClassifier
    tree =  DecisionTreeClassifier(criterion='entropy',random_state=0)
    tree.fit(X_train,y_train)
    
    #Using Random Forest Classification algorithm
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    forest.fit(X_train,y_train)
    
    #Using K-Nearest Neighbour algorithm
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=13,p=2)
    knn.fit(X_train,y_train)
    
    #using Support Vector Machines algorithm
    from sklearn.svm import SVC
    svm = SVC(kernel='linear',random_state=0)
    svm.fit(X_train,y_train)
    
    #Printing accuracy scores
    print('[0]Logistic Regression training accuracy score : ',log.score(X_train,y_train))
    print('[1]Decision Tree training accuracy score : ',tree.score(X_train,y_train))
    print('[2]Random Forest training accuracy score : ',forest.score(X_train,y_train))
    print('[3]KNN accuracy training score : ',log.score(X_train,y_train))
    print('[4]SVM accuracy training score : ',log.score(X_train,y_train))
    
    return log,tree,forest,knn,svm

In [8]:
#Calling the fuction models
model = models(X_train,y_train)

#Testing accuracy and other metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

for i in range(len(model)):
  print('Model ',i)
  #Check precision, recall, f1-score
  print( classification_report(y_test, model[i].predict(X_test)) )
  #Another way to get the models accuracy on the test data
  print( accuracy_score(y_test, model[i].predict(X_test))*100)
  print()#Print a new line

[0]Logistic Regression training accuracy score :  0.9899497487437185
[1]Decision Tree training accuracy score :  1.0
[2]Random Forest training accuracy score :  0.9949748743718593
[3]KNN accuracy training score :  0.9899497487437185
[4]SVM accuracy training score :  0.9899497487437185
Model  0
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       108
           1       0.98      0.95      0.97        63

    accuracy                           0.98       171
   macro avg       0.98      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171

97.6608187134503

Model  1
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       108
           1       0.88      0.95      0.92        63

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.93       171
weighted avg       0.94      0.94      0.94       171

93.56725146198829

Model



In [9]:
#Since the testing accuracy of Logistic Regression is higher than that of any other trained models we use the Logistic Regression model for classification
#Print Prediction of Random Forest Classifier model
pred = model[0].predict(X_test)
print(pred)

print()

#Print the actual values
print(y_test)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1
 1 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0
 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1
 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 1 1 0 0 0]

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1
 1 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 1 0
 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1
 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 1 1 0 0 0]


In [11]:
#Confusion matrix visualization
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test,pred)
print(matrix)

[[107   1]
 [  3  60]]
