### This dataset is derived from digital images of a fine needle aspirate (**FNA**, a biopsy procedure) of a breast mass and describes the characteristics of the cell nuclei present in each image.
### The diagnosis, i.e. whether the nuclei are **Benign** **OR** **Malignant**, is given in the 'diagnosis' column of the data frame. Breast cancer is the most common type of cancer affecting women, and **Invasive Ductal Carcinoma** (IDC) is its most common form, representing 80% of all breast cancer diagnoses.
### Automatically classifying cell nuclei as malignant or benign makes diagnosis easier and faster, which enables early detection and helps prevent fatalities.

### The data also contains various other characteristic features of the nuclei. This notebook applies several classification techniques to the dataset and compares their accuracy, aiming to identify a suitable solution.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#importing all necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the CSV into a data frame, then show it as the cell output.
csv_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# List the column labels of the data frame.
df.columns

In [None]:
# Count the missing (NaN) entries in each column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Reload the CSV and discard the 'Unnamed: 32' column, then show the result.
df = (
    pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
    .drop(columns=["Unnamed: 32"])
)
df


In [None]:
# Encode the diagnosis labels numerically: 'M' becomes 1 and 'B' becomes 0.
diagnosis_codes = {'M': 1, 'B': 0}
df["diagnosis"] = df["diagnosis"].map(diagnosis_codes)

In [None]:
# Show the data frame as the cell output (inspect it after the diagnosis relabeling).
df

In [None]:
# Build the model inputs (X) and the target (Y).
# Each feature column is named "<measurement>_<statistic>"; the list below is
# generated statistic by statistic, keeping the measurements in a fixed order.
measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]
statistics = ['mean', 'se', 'worst']
feature_columns = [
    f"{measurement}_{statistic}"
    for statistic in statistics
    for measurement in measurements
]

X = df[feature_columns]

Y = df['diagnosis']

# Data Visualization
### The following count plot shows the number of Benign and Malignant cells in graphical form

In [None]:
# Plot how many samples fall into each diagnosis class.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x="diagnosis", data=df)
plt.show()

# Correlation
### Shows the degree of linear relation between the variables present in the dataset

In [None]:
# Correlation heatmap of the data frame's columns.
# The `id` column (when present) is left out: it is assumed to be a record
# identifier rather than a measurement -- confirm against the CSV.
cor = df.drop(columns=["id"], errors="ignore").corr()
# A larger canvas keeps the many axis labels readable.
plt.figure(figsize=(12, 10))
sns.heatmap(cor, cmap='coolwarm')
# A tight bounding box keeps the rotated tick labels inside the saved image.
plt.savefig('heatmap.png', bbox_inches='tight')
plt.show()

In [None]:
# Show the pairwise correlation of the data frame's columns as a table.
df.corr()

# Standard Scaling

In [None]:
# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
X_train_orig, X_test_orig, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both the training and the test split.
sc = StandardScaler().fit(X_train_orig)
X_train = sc.transform(X_train_orig)
X_test = sc.transform(X_test_orig)

# **Logistic Regression Model**

In [None]:
# Logistic regression: fit on the scaled training data, predict the test set,
# then report the accuracy and the per-class metrics.
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(random_state=0).fit(X_train, Y_train)
Y_pred = lr_classifier.predict(X_test)

lr_score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy of the model:", lr_score)
print(classification_report(y_true=Y_test, y_pred=Y_pred))


# K Nearest Neighbour model

In [None]:
# K-nearest neighbours (5 neighbours, Minkowski metric with p = 2): fit,
# predict the test set, then report the accuracy and the per-class metrics.
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, Y_train)
Y_pred = knn_classifier.predict(X_test)

knn_score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy of the model:", knn_score)
print(classification_report(y_true=Y_test, y_pred=Y_pred))

# Support Vector Machine (SVM)

In [None]:
# Support vector machine with a linear kernel: fit, predict the test set,
# then report the accuracy and the per-class metrics.
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', random_state=0).fit(X_train, Y_train)
Y_pred = svm_classifier.predict(X_test)

svm_score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy of the model:", svm_score)
print(classification_report(y_true=Y_test, y_pred=Y_pred))

# Support Vector Machine Kernel (radial basis function)

In [None]:
# Support vector machine with an RBF kernel: fit, predict the test set,
# then report the accuracy and the per-class metrics.
from sklearn.svm import SVC

ksvm_classifier = SVC(kernel='rbf', random_state=0).fit(X_train, Y_train)
Y_pred = ksvm_classifier.predict(X_test)

ksvm_score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy of the model:", ksvm_score)
print(classification_report(y_true=Y_test, y_pred=Y_pred))

# Gaussian Naive Bayes


In [None]:
# Gaussian naive Bayes: fit, predict the test set, then report the accuracy
# and the per-class metrics.
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB().fit(X_train, Y_train)
Y_pred = gnb_classifier.predict(X_test)

gnb_score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy of the model:", gnb_score)
print(classification_report(y_true=Y_test, y_pred=Y_pred))

# Decision Tree Algorithm

In [None]:
# Decision tree using the entropy split criterion: fit, predict the test set,
# then report the accuracy and the per-class metrics.
from sklearn.tree import DecisionTreeClassifier

dectree_classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, Y_train)
Y_pred = dectree_classifier.predict(X_test)

dectree_score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy of the model:", dectree_score)
print(classification_report(y_true=Y_test, y_pred=Y_pred))


# Random Forest Classifier


In [None]:
# Random forest of 10 entropy-criterion trees: fit, predict the test set,
# then report the accuracy and the per-class metrics.
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, Y_train)
Y_pred = rf_classifier.predict(X_test)

rf_score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy of the model:", rf_score)
print(classification_report(y_true=Y_test, y_pred=Y_pred))

## Comparison Table

In [None]:
    # Pair each model's display name with the test accuracy computed in its cell,
    # then collect the pairs into a two-column table and show it.
    # NOTE(review): this cell is indented one level; IPython appears to tolerate
    # that, but a plain Python script would not -- confirm before exporting.
    accuracy_rows = [
        ('Logistic Regression', lr_score),
        ('Decision Tree', dectree_score),
        ('Support Vector Machine', svm_score),
        ('Kernel SVM', ksvm_score),
        ('Random Forest', rf_score),
        ('K-Nearest Neighbors', knn_score),
        ('Gaussian Naive Bayes', gnb_score),
    ]
    models_initial = pd.DataFrame(accuracy_rows, columns=['Model', 'Accuracy'])

    models_initial

### As we can see in the above table, the model created with the **Random Forest algorithm** gives the highest accuracy compared to the other models.