# 4 Different Algorithms on Breast Cancer Dataset

This notebook applies four machine learning algorithms (Logistic Regression, SVM, Decision Tree, and KNN) to the Wisconsin Diagnostic Breast Cancer Dataset. It is a walkthrough of each of the four algorithms for anyone new to ML. Hope you enjoy!

# Importing Libraries

In [None]:
# Importing all necessary Python libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data

In [None]:
# Read the breast cancer CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# Column overview of the raw frame (used below to decide which columns to drop).
df.info()

# Feature Engineering with Standardization and PCA

In [None]:
# Build the working frame without the 'Unnamed: 32' and 'id' columns;
# the raw frame `df` is left untouched.
df2 = df.drop(['Unnamed: 32', 'id'], axis=1)

In [None]:
# Encode the diagnosis label as integers: malignant 'M' -> 1, benign 'B' -> 0.
# The encoded column is assigned back explicitly instead of calling
# replace(..., inplace=True) on df2['diagnosis']: that chained in-place call acts
# on a column selection and is not guaranteed to write through to df2 (it does
# not under pandas Copy-on-Write), which would leave string labels in place.
# The mapping reads from the untouched raw frame `df` (same index as df2), so
# re-running this cell yields the same result.
df2['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
# Column overview of the working frame after dropping columns and encoding the label.
df2.info()

In [None]:
# Target: the encoded diagnosis column (kept as a pandas Series).
y = df2['diagnosis']
# Features: every column after the first one, as a NumPy array.
# This relies on 'diagnosis' being the first column of df2.
X = df2.iloc[:, 1:].to_numpy()

In [None]:
# Standardize every feature column (StandardScaler removes the mean and scales
# to unit variance); the scaled array replaces X.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so test rows contribute to the scaling statistics.
# Consider fitting on the training split only.
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit(X).transform(X)

In [None]:
# Project the standardized features onto their first two principal components.
# NOTE(review): principalComponents1 is only displayed in the next cell; the
# models below are trained on X (all features), not on these components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents1 = pca.fit_transform(X)

In [None]:
# Display the two-component projection computed above.
principalComponents1

# Logistic Regression

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split
# reproducible. The same split is reused by every model below.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Train a logistic regression classifier on the training split.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
lr  # fit() returns the estimator itself, so the cell output is unchanged

In [None]:
# Predict labels for the held-out test rows with the logistic regression model.
# y_pred is reassigned by each model section below.
y_pred = lr.predict(X_test)

In [None]:
# Evaluate logistic regression: print the confusion matrix, then show the
# accuracy as the cell output. These metric imports are reused by later cells.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

# SVM

In [None]:
# Train a support vector classifier with a linear kernel on the training split.
from sklearn.svm import SVC
svc = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
svc  # fit() returns the estimator itself, so the cell output is unchanged

In [None]:
# Predict labels for the held-out test rows with the SVM (overwrites the previous y_pred).
y_pred = svc.predict(X_test)

In [None]:
# Evaluate the SVM: print the confusion matrix, then show the accuracy as the cell output.
cm1 = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm1)
accuracy_score(y_true=y_test, y_pred=y_pred)

# Decision Tree

In [None]:
# Train a decision tree that chooses splits by the entropy criterion.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
dt  # fit() returns the estimator itself, so the cell output is unchanged

In [None]:
# Predict labels for the held-out test rows with the decision tree (overwrites the previous y_pred).
y_pred = dt.predict(X_test)

In [None]:
# Evaluate the decision tree: print the confusion matrix, then show the accuracy as the cell output.
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm2)
accuracy_score(y_true=y_test, y_pred=y_pred)

# KNN

In [None]:
# Train a 5-nearest-neighbours classifier; the Minkowski metric with p=2 is
# the Euclidean distance.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
knn  # fit() returns the estimator itself, so the cell output is unchanged

In [None]:
# Predict labels for the held-out test rows with KNN (overwrites the previous y_pred).
y_pred = knn.predict(X_test)

In [None]:
# Evaluate KNN: print the confusion matrix, then show the accuracy as the cell output.
cm3 = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm3)
accuracy_score(y_true=y_test, y_pred=y_pred)