# Breast Cancer Detection

This project compares different algorithms to diagnose breast cancer using the following dataset: 
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

In [None]:
# Path of the dataset CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
# Load the whole CSV into a DataFrame.
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded table.
data.head()

# Data pre-processing

In [None]:
# Count how many rows carry each distinct diagnosis value (class balance).
data['diagnosis'].value_counts()

In [None]:
# List the dtype pandas inferred for each column.
data.dtypes

In [None]:
# Table dimensions as (rows, columns).
data.shape

# Exploratory Data Analysis

In [None]:
# Work on a new frame without the 'id' column; `data` itself is left untouched.
df = data.drop(columns=['id'])

In [None]:
# Summary statistics, transposed so that each row describes one column of df.
df.describe().T

# Univariate Data Analysis

In [None]:
# Display the diagnosis column, which is used as the prediction target.
df['diagnosis']

Encode diagnosis(target values)

In [None]:
# Encode the diagnosis labels as integers.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Assign by column name instead of `df.iloc[:, 0] = ...`. On recent pandas
# versions the positional form writes into the existing column in place, so
# the column keeps its object dtype even though it now holds integers. An
# object-dtype target is rejected by scikit-learn estimators and is skipped
# or mishandled by numeric summaries. Replacing the column by name gives it
# the integer dtype returned by the encoder.
# LabelEncoder numbers the classes in sorted order of the distinct labels.
df['diagnosis'] = labelencoder.fit_transform(df['diagnosis'])
df

In [None]:
# Show how many samples fall into each diagnosis class.
# `sns.distplot` is deprecated in seaborn, and a histogram with a density
# curve is not meaningful for a class label; a per-class count is.
sns.countplot(x='diagnosis', data=df)

In [None]:
# Grid of histograms for the columns of df, laid out as 8 rows by 4 columns.
df.hist(
    bins=20,
    figsize=(30, 30),
    layout=(8, 4),
)

In [None]:
# Horizontal box plots for the columns of df on one shared axis, to compare
# their ranges and spot outliers.
plt.figure(figsize=(15,10))
sns.boxplot(data=df,orient="h")

In [None]:
# Pairwise correlation matrix of the columns of df (pandas' default method).
df.corr()

In [None]:
# Annotated heatmap of the pairwise column correlations.
corr_matrix = df.corr()
plt.figure(figsize=(35, 15))
sns.heatmap(
    corr_matrix,
    vmax=1,
    square=True,
    annot=True,
    cmap='viridis',
)
plt.title('Correlation between attributes')
plt.show()

# Model Building

In [None]:
# Target vector: the diagnosis column.
y = df['diagnosis']
# Feature matrix: every column except the target, with the final column
# dropped by position, converted to a NumPy array.
# NOTE(review): presumably the last column is an empty trailing column from
# the CSV; confirm, because this slice silently drops a real feature if not.
X = df.drop(columns=['diagnosis']).iloc[:, :-1].values
print(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Fit the scaler on the training split only, then apply the same fitted
# scaling to the test split so no test-set statistics leak into training.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
for scaled_split in (X_train, X_test):
    print(scaled_split)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with a fixed seed, fitted on the scaled training split.
classifier=LogisticRegression(random_state=0)
classifier.fit(X_train,y_train)

# Predicting the test set results

In [None]:
# Logistic regression predictions for the held-out test split.
y_pred=classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true vs. predicted labels for logistic regression.
cm=confusion_matrix(y_test,y_pred)
print(cm)
# Fraction of test samples classified correctly (shown as the cell output).
accuracy_score(y_test,y_pred)

# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier using the 5 closest training samples.
# The Minkowski metric with p = 2 is the Euclidean distance.
KNN=KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNN.fit(X_train, y_train)

In [None]:
# KNN predictions for the held-out test split.
y_pred_knn = KNN.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true vs. predicted labels for KNN; this rebinds `cm`.
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)
# Fraction of test samples classified correctly (shown as the cell output).
accuracy_score(y_test, y_pred_knn)

# Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with explicit gamma and C values; the kernel is
# left at the library default. Fitted on the scaled training split.
svc=SVC(gamma=0.025,C=3)
svc.fit(X_train,y_train)

In [None]:
# SVC predictions for the held-out test split.
predicted_svc=svc.predict(X_test)

In [None]:
# Display the predicted labels.
predicted_svc

In [None]:
# Confusion matrix and accuracy for the SVC predictions. This cell relies on
# confusion_matrix and accuracy_score having been imported by an earlier cell.
cm = confusion_matrix(y_test, predicted_svc)
print(cm)
accuracy_score(y_test, predicted_svc)

Highest level of accuracy achieved by Support Vector Classifier.

# Compare all models

In [None]:
# Wrap each model's test-set predictions in its own single-column frame.
logisticregression = pd.DataFrame(y_pred)
knn = pd.DataFrame(y_pred_knn)
svcpred = pd.DataFrame(predicted_svc)

# Place the three prediction columns side by side, one row per test sample.
prediction_frames = [logisticregression, knn, svcpred]
new_df = pd.concat(prediction_frames, axis='columns')

In [None]:
# Label each prediction column with the model that produced it.
# A flat list is required here: a nested list (`[[...]]`) makes pandas build
# a one-level MultiIndex, so columns could no longer be selected by a plain
# string label in the usual way.
new_df.columns = ['Logisticregression', 'KNN', 'Support Vector Classifier']
new_df

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Print a classification report for each model, in the order the models were
# trained, each preceded by its title line.
reports = (
    ('Logistic Regression report', y_pred),
    ('KNN classification report', y_pred_knn),
    ('SVC classification report', predicted_svc),
)
for report_title, model_predictions in reports:
    print(report_title)
    print(classification_report(y_test, model_predictions))