# Breast Cancer Classification

***Attribute Information:***

* ID number
 
* Diagnosis (M = malignant, B = benign)

***Ten real-valued features are computed for each cell nucleus:***
* radius (mean of distances from center to points on the perimeter)
* texture (standard deviation of gray-scale values)
* perimeter
* area
* smoothness (local variation in radius lengths)
* compactness (perimeter^2 / area - 1.0)
* concavity (severity of concave portions of the contour)
* concave points (number of concave portions of the contour)
* symmetry
* fractal dimension ("coastline approximation" - 1)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

import warnings
# Ignore every warning category for the rest of the session so library
# warnings do not clutter the cell outputs.
warnings.simplefilter('ignore')

# Draw all matplotlib figures with the 'ggplot' style sheet.
plt.style.use('ggplot')

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = '../input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(DATA_PATH)

# Show the loaded frame as the cell output.
df

# Data preprocessing:-

In [None]:
# View raw data:-

# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one pandas option (display.* and styler.*) on recent pandas
# versions and raises OptionError there.
pd.set_option('display.max_columns', 33)
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Dimension of the data:-
# df.shape is a (row count, column count) tuple; as the last expression of the
# cell it is shown as the cell output.

df.shape

> The data has 569 rows and 33 columns.

In [None]:
# Data types:-
# Prints each column's name, non-null count and dtype, plus the frame's memory usage.

df.info()

In [None]:
# Visualise missing values: every cell of the boolean null-mask becomes one heatmap tile.
missing_mask = df.isna()
sns.heatmap(missing_mask)

In [None]:
# Number of missing entries in each column.
df.isna().sum()

> The last column, `Unnamed: 32`, contains only NaN values, so we will drop it.

In [None]:
# Dropping the 'Unnamed: 32' column

# Rebind instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise KeyError.
df = df.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Distinct labels present in the 'diagnosis' column.
df['diagnosis'].unique()

In [None]:
# Replace 'M' with 1 and 'B' with 0 in column 'diagnosis':-

# Treat both the raw label 'M' and an already-encoded 1 as malignant, so that
# re-running this cell keeps the encoding instead of turning every row into 0
# (the previous `val == 'M'` test was False for every encoded value).
# As before, any other value is encoded as 0.
df['diagnosis'] = df['diagnosis'].isin(['M', 1]).astype(int)

In [None]:
# Preview the first five rows again to confirm the changes made so far.
df.head(n=5)

In [None]:
# Describing the data / Statistical Data analysis:-

# 'precision' on its own is ambiguous on recent pandas versions
# (display.precision vs styler.format.precision) and raises OptionError,
# so name the display option explicitly.
pd.set_option('display.precision', 3)
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Data Visualization:-

In [None]:
# Class balance of the target: print the counts, then draw them as a pie chart.
diagnosis_counts = df['diagnosis'].value_counts()
print(diagnosis_counts)
diagnosis_counts.plot.pie();

In [None]:
# Annotated correlation heatmap across every column of the frame.
full_corr = df.corr()
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(full_corr, annot=True, cmap='Blues', ax=ax);

In [None]:
# EDA:- distribution of every feature

# Plot only the measured features. The previous loop walked every column with a
# 30-panel cap, so the identifier and the binary target took up two panels and
# the last two features were never drawn.
feature_cols = [col for col in df.columns if col not in ('id', 'diagnosis')]

n_cols = 6
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
axes = axes.ravel()

for ax, column in zip(axes, feature_cols):
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column)

# Hide panels left over when the feature count does not fill the grid.
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# The ten nucleus measurements; each appears in the data with three suffixes.
base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]

# Build the mean / standard-error / worst column groups, each led by the
# 'diagnosis' target so it can be used for correlation and as a plot hue.
m_col, s_col, w_col = (
    ['diagnosis'] + [f'{base}_{suffix}' for base in base_features]
    for suffix in ('mean', 'se', 'worst')
)

In [None]:
# Annotated correlation heatmap of the "mean" columns (plus the target).
mean_corr = df.loc[:, m_col].corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(mean_corr, annot=True, cmap='Blues', ax=ax);

In [None]:
# Pairwise scatter/density grid of the "mean" columns, coloured by diagnosis.
mean_frame = df.loc[:, m_col]
sns.pairplot(data=mean_frame, hue='diagnosis', palette='Blues');

In [None]:
# Annotated correlation heatmap of the "se" (standard error) columns (plus the target).
se_corr = df.loc[:, s_col].corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(se_corr, annot=True, cmap='Reds', ax=ax);

In [None]:
# Pairplot for se columns

# Plot the standard-error group (s_col); this cell previously re-plotted the
# mean group (m_col) by mistake.
sns.pairplot(df[s_col], hue='diagnosis', palette='Reds');

In [None]:
# Annotated correlation heatmap of the "worst" columns (plus the target).
worst_corr = df.loc[:, w_col].corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(worst_corr, annot=True, cmap='Greens', ax=ax);

In [None]:
# Pairwise scatter/density grid of the "worst" columns, coloured by diagnosis.
worst_frame = df.loc[:, w_col]
sns.pairplot(data=worst_frame, hue='diagnosis', palette='Greens');

In [None]:
# Removing highly correlated features

# Absolute pairwise correlation above which one column of the pair is dropped.
CORR_THRESHOLD = 0.92

# Only real predictors are candidates for removal: the identifier and the
# target must never be dropped by this step.
candidate_cols = [col for col in df.columns if col not in ('id', 'diagnosis')]
corr_matrix = df[candidate_cols].corr().abs()

# Blank out the upper triangle and the diagonal so each pair is examined once
# and a column's correlation with itself (1.0) is ignored.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)

# A column is dropped when it is strongly correlated with any later column.
to_drop = [col for col in tri_df.columns if (tri_df[col] > CORR_THRESHOLD).any()]

df = df.drop(columns=to_drop)

print(f"The reduced dataframe has {df.shape[1]} columns.")

> Many columns are very highly correlated with one another, which causes multicollinearity, so we remove the highly correlated features.

# Data Modelling:-

In [None]:
# Getting Features:-
# 'id' is the record's ID number, not a measurement, so it is removed together
# with the target; previously it was left in the feature matrix.
# errors='ignore' keeps the cell working if 'id' is already absent.
non_feature_cols = ['id', 'diagnosis']
x = df.drop(columns=non_feature_cols, errors='ignore')

# Getting Predicting Value:-

y = df['diagnosis']

In [None]:
# Hold out one ninth of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

TEST_FRACTION = 1/9
SPLIT_SEED = 252
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [None]:
# Scaling data:-

from sklearn.preprocessing import StandardScaler

# Learn the mean and standard deviation from the training split only, then
# apply that same transformation to the test split. Re-fitting on the test data
# (the previous fit_transform call) scaled the two splits differently and let
# test-set statistics influence the evaluation.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

# Creating Classification Models:-

In [None]:
# Defining Models:-

def Classification_Models(x, y, xt, yt, random_state=252):
    """Fit seven baseline classifiers and report their accuracy on held-out data.

    Each model is trained on (x, y) with default hyper-parameters and scored
    with accuracy on (xt, yt). A table of scores, sorted from best to worst, is
    printed and a horizontal bar chart of the scores is drawn.

    Parameters
    ----------
    x, y : training features and training labels.
    xt, yt : test features and test labels.
    random_state : seed passed to the stochastic models (decision tree and
        random forest) so repeated runs give the same scores.

    Returns
    -------
    None
    """
    # Importing all libraries
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import svm
    from sklearn.neighbors import KNeighborsClassifier

    # Initializing models, keyed by the label used in the report.
    models_by_name = {
        'Logistic Regression': LogisticRegression(),
        'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
        'Naive Bayes': GaussianNB(),
        'Decision Tree': DecisionTreeClassifier(random_state=random_state),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'Support Vector Machines': svm.SVC(),
        'K - Nearest Neighbors': KNeighborsClassifier(),
    }

    # Fit every model on the training data and score it on the test data.
    scores = []
    for estimator in models_by_name.values():
        estimator.fit(x, y)
        scores.append(accuracy_score(yt, estimator.predict(xt)))

    # Model selection: tabulate the scores, best first, and plot them.
    models = pd.DataFrame({'Model': list(models_by_name), 'Score': scores})

    print(models.sort_values(by='Score', ascending=False))
    sns.barplot(x=models['Score'], y=models['Model'], palette='viridis')

In [None]:
# Train on the scaled training split and score every model on the held-out test split.
Classification_Models(x=x_train, y=y_train, xt=x_test, yt=y_test)

> Logistic Regression, SVM, Random Forest and KNN performed best here, each reaching an accuracy of 100% on the test set.

# Please leave your feedback in the comment section. Thank you!

# If you like my work, please give it an upvote :)