# 1. Importing Libraries

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
%matplotlib inline

# Seed passed as random_state to the train/test splits so they are repeatable
SEED=42

# Importing Preprocessing Library
# NOTE(review): OrdinalEncoder is not referenced by any cell visible in this notebook
from sklearn.preprocessing import OrdinalEncoder


# Importing Model Selection Library
# NOTE(review): KFold is not referenced by any cell visible in this notebook
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


# Model
# NOTE(review): KNeighborsRegressor is not referenced by any cell visible in this notebook
from sklearn.neighbors import KNeighborsRegressor

# Accuracy Metrics
# NOTE(review): r2_score is not referenced by any cell visible in this notebook
from sklearn.metrics import r2_score

# Tick-label font size applied to matplotlib figures through rcParams
plt.rcParams['xtick.labelsize']=11
plt.rcParams['ytick.labelsize']=11


# missingno draws the missing-value matrix plot
import missingno as miss

# Do not show FutureWarning messages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


# Classification accuracy, used to score the classifiers below
from sklearn.metrics import accuracy_score

In [None]:
# Kaggle notebook boilerplate (image: https://github.com/kaggle/docker-python).
# The environment ships with common analytics packages; two of them are loaded here.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data lives under the read-only "../input/" directory.
# Walk it and print the full path of every file found.

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Up to 20GB can be written to /kaggle/working/ and is preserved as output by "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not kept beyond the current session.

In [None]:
# Read the dataset CSV from the Kaggle input mount and preview the first rows.
DATA_CSV = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(DATA_CSV)
df.head()

In [None]:
# Report (rows, columns) of the raw frame
print(f'The shape of the Wisconsin Breast Cancer Dataset is {df.shape}')

In [None]:
# Missing-value matrix: one column per dataframe column, drawn on a single axes

fig, ax1 = plt.subplots(figsize=(20, 8))
miss.matrix(df, labels=True, fontsize=12, ax=ax1, sparkline=False)
ax1.set_title('Matrix plot for missing values in the Cancer dataset', size=20, color='red')

As the last column, `Unnamed: 32`, is completely empty, it is better to drop it from the dataset.

In [None]:
# Target is the diagnosis column; features are everything except the target,
# the id column and the empty `Unnamed: 32` column.
y = df['diagnosis']
X = df.drop(columns=['diagnosis', 'id', 'Unnamed: 32'])

# 2. Analysis of Numerical Features

In [None]:
# Count plot of the two diagnosis classes
sns.countplot(x='diagnosis', data=df, palette='Set2')

# Look the counts up by label instead of unpacking value_counts() positionally:
# positional unpacking silently swaps the two numbers if the class frequencies
# ever change order. Reading from df['diagnosis'] (not `y`) also keeps this cell
# re-runnable after `y` has been label-encoded into an array further down.
diagnosis_counts = df['diagnosis'].value_counts()
benign = diagnosis_counts['B']
malignant = diagnosis_counts['M']
print('Total Benign counts in the Cancer datatset {}'.format(benign))
print('Total Malignant counts in Cancer datatset {}'.format(malignant))

In [None]:
# Names of every feature column whose dtype is not object
num_var = [name for name, dtype in X.dtypes.items() if dtype != 'O']
print(num_var)

In [None]:
import matplotlib.pyplot as plt

# One histogram (with KDE overlay) per numerical feature on a 6x5 grid;
# each legend shows the skewness of that feature.
plt.figure(figsize=(20,20))
for key, value in enumerate(num_var):
    plt.subplot(6,5,key+1)
    sns.histplot(X[value], color='m', label= 'Skewness: {:.2f}'.format(X[value].skew()), kde=True)
    plt.legend(loc='best')

# The figure title and layout are figure-level, so they are applied once after all
# panels exist instead of being recomputed on every iteration.
plt.suptitle('Distribution plot of Numerical Features', size=14).set_position([.5, 1.02])
plt.tight_layout()

In [None]:
# One KDE plot per numerical feature on a 5x6 grid, split by diagnosis.
plt.figure(figsize=(30,20))
for key, value in enumerate(num_var):
    plt.subplot(5,6,key+1)
    # `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
    sns.kdeplot(data=df, x=value, hue='diagnosis', fill=True)

# Figure-level title and layout are applied once, after every panel has been drawn.
plt.suptitle('KDE (Kernel Density Estimation) Plot of Numerical features', size=15).set_position([.5, 1.02])
plt.tight_layout()

In [None]:
# Preview the first rows of the feature table
X.head()

# 3. Data Preprocessing 

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the feature table, then transform that same table.
# NOTE(review): the scaler is fit on every row of X, including rows that are
# later held out for testing by the train/test split.
standard_scaler = StandardScaler().fit(X)
scaled_X = standard_scaler.transform(X)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode from the untouched `diagnosis` column rather than from `y` itself.
# `y = fit_transform(y)` is not idempotent: running the cell a second time would
# re-encode the integers, and `classes_` would then hold the integer codes instead
# of the original labels used later as confusion-matrix display names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['diagnosis'])

# Original labels, in the order of their integer codes
list(label_encoder.classes_)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split first, so the held-out rows never influence preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED, test_size=0.2)

# Standardise both subsets with statistics learned from the training rows only.
# Previously the raw, unscaled X was split here, so the models below never received
# the standardised features prepared in the preprocessing step.
feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

print('Size of train dataset after the split in subsets {}'.format(X_train.shape))
print('Size of test/validation dataset after the split in subsets {}'.format(X_test.shape))


# 4. Estimators

1. Naive Bayes Classifier

Naive Bayes models are a group of extremely simple classification algorithms that are often suitable for high-dimensional datasets. Because they are very fast and have few tunable parameters, they are very useful as a baseline algorithm for classification problems.

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: train, predict on the held-out subset, score by accuracy.
nb_model = GaussianNB().fit(X_train, y_train)
nb_predict = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_predict)

In [None]:
# Accuracy rounded to three decimals
print(f'Accuracy of the Naive Bayes classifier is {nb_accuracy:.3f}')

2. Support Vector Machines

It is a type of discriminative classifier: rather than modelling each class, we find a line, curve or hyperplane that divides the classes from each other.


The basic intuition behind SVMs is that, rather than simply drawing a zero-width line between two classes, we can draw around each line a margin of some width, up to the nearest point. The line that maximises this margin is the one we choose as the optimal model; SVMs are an example of such maximum-margin estimators.

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with C=10: train, predict on the held-out subset, score by accuracy.
SVM_model = SVC(C=10, kernel='linear').fit(X_train, y_train)
SVM_predict = SVM_model.predict(X_test)
SVM_accuracy_C10 = accuracy_score(y_true=y_test, y_pred=SVM_predict)

In [None]:
# Accuracy of the C=10 model, rounded to three decimals
print(f'Accuracy of the SVM classifier with  Linear Kernel is {SVM_accuracy_C10:.3f}')

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# `plot_confusion_matrix` has been removed from scikit-learn; its replacement is
# `ConfusionMatrixDisplay.from_estimator`, which accepts the arguments used here.
# The old name is kept bound to the replacement so that any cell still calling
# `plot_confusion_matrix` keeps working.
plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Original diagnosis labels, used as tick labels
class_names = label_encoder.classes_

# (figure title, `normalize` argument) for the raw-count and row-normalised plots
title_options = [('Confusion Matrix, without Normalize', None), ('Normalized confusion Matrix', 'true')]

for title, normalize in title_options:
    disp = ConfusionMatrixDisplay.from_estimator(SVM_model, X_test, y_test,
                                                 display_labels=class_names,
                                                 cmap=plt.cm.Blues,
                                                 normalize=normalize)
    disp.ax_.set_title(title)

**Tuning the SVM: Softening the Margin**

The SVM softening parameter allows some points to creep into the margin if that allows a better fit for the model. The hardness of the margin is controlled by a tuning parameter, most often known as C.

In [None]:
# Linear-kernel SVM with C=0.1: train, predict on the held-out subset, score by accuracy.
SVM_model = SVC(C=0.1, kernel='linear').fit(X_train, y_train)
SVM_predict = SVM_model.predict(X_test)
SVM_accuracy_C01 = accuracy_score(y_true=y_test, y_pred=SVM_predict)

In [None]:
# Accuracy of the C=0.1 model, rounded to three decimals
print(f'Accuracy of the SVM classifier with  Linear Kernel is {SVM_accuracy_C01:.3f}')

In [None]:

# `plot_confusion_matrix` has been removed from scikit-learn; use the display class instead.
from sklearn.metrics import ConfusionMatrixDisplay

# (figure title, `normalize` argument) for the raw-count and row-normalised plots
title_options = [('Confusion Matrix, without Normalize', None), ('Normalized confusion Matrix', 'true')]

for title, normalize in title_options:
    disp = ConfusionMatrixDisplay.from_estimator(SVM_model, X_test, y_test,
                                                 display_labels=class_names,
                                                 cmap=plt.cm.Blues,
                                                 normalize=normalize)
    disp.ax_.set_title(title)

3. Decision Tree

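A decision tree classifies a sample by asking a sequence of threshold questions about its features; a single decision tree is what is fitted below. Random forest, a non-parametric algorithm built on decision trees, is an example of an ensemble method: a method that relies on aggregating the results of an ensemble of simpler estimators. The surprising result with ensemble methods is that a majority vote among a number of estimators can end up being better than any of the individual estimators doing the voting.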

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the reported accuracy is reproducible: without a fixed
# random_state the fitted tree, and therefore the score, can differ between runs.
tree_model = DecisionTreeClassifier(random_state=SEED)
tree_model.fit(X_train, y_train)
tree_predict = tree_model.predict(X_test)
tree_accuracy = accuracy_score(y_test,tree_predict)

In [None]:
# Accuracy rounded to three decimals
print(f'Accuracy of the Decision Tree classifier is {tree_accuracy:.3f}')

In [None]:
# Collect the four accuracies computed so far into one table
estimator_names = ['Naive Bayes', 'SVM_C=10', 'SVM_C=0.1', 'Decision Tree']
estimator_scores = [nb_accuracy, SVM_accuracy_C10, SVM_accuracy_C01, tree_accuracy]
accuracy_dataframe = {'Estimator': estimator_names, 'Accuracy': estimator_scores}
accuracy_df = pd.DataFrame(accuracy_dataframe)

print(accuracy_df)

# **PCA: Principal Component Analysis**


PCA is fundamentally an unsupervised learning method used for dimensionality reduction, but it can also be very useful as a tool for visualization, noise filtering, feature extraction and feature engineering.

An unsupervised learning method, rather than predicting the y value from x, attempts to learn about the relationship between the variables themselves (for example, between two features x and y). In the case of PCA, one quantifies this relationship by finding a list of principal axes in the data and using those axes to describe the dataset.

In [None]:
# Remove the id column and the empty `Unnamed: 32` column. `errors='ignore'` keeps
# this cell re-runnable: `df` is reassigned here, so on a second run the two columns
# are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Feature table for PCA: everything except the target column
df_features = df.drop(columns=['diagnosis'])

df_features.head()

1. Standardization of the Data

In [None]:
# Fit a StandardScaler on the PCA feature table, then transform that same table
standard_scaler = StandardScaler().fit(df_features)
scaled_features = standard_scaler.transform(df_features)

scaled_features

In [None]:
from sklearn.decomposition import PCA

# Fit a 3-component PCA on the standardised features and project them onto it
pca_model = PCA(n_components=3).fit(scaled_features)
X_pca = pca_model.transform(scaled_features)

print(f'Shape of the dataset after PCA transformation is {X_pca.shape}')

In [None]:
def encoder(data):
    """Map a diagnosis label to an integer: 1 for 'M', 0 for any other value."""
    return 1 if data == 'M' else 0
    
# Integer target: apply `encoder` to every value of the diagnosis column
df_target  = df['diagnosis'].apply(encoder)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter of the three principal components, coloured by the encoded diagnosis
fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(111, projection='3d')

pc_first, pc_second, pc_third = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
ax.scatter(pc_first, pc_second, pc_third, c=df_target, s=100)

# Shared styling for the three axis labels
label_style = {'color': 'black', 'size': 12}
ax.set_xlabel('First Principal Component', **label_style)
ax.set_ylabel('Second Principal Component', **label_style)
ax.set_zlabel('Third Principal Component', **label_style)

In [None]:
# Principal components as named columns, with the diagnosis label attached
pca_df = (
    pd.DataFrame(X_pca, columns=['pca0', 'pca1', 'pca2'])
    .assign(diagnosis=df['diagnosis'])
)
print(f'Shape of PCA dataset is {pca_df.shape}')
pca_df.head()

In [None]:
# Pairwise plots of the three principal components, coloured and marked by diagnosis
sns.pairplot(pca_df, hue='diagnosis', markers=["o", "s"], corner=False)

In [None]:
# Features are the three principal components; target is the integer-encoded diagnosis.
# NOTE(review): the scaler and the PCA behind `pca_df` were fit on all rows before
# this split, so the held-out rows contributed to the preprocessing.
X = pca_df.drop(columns=['diagnosis'])
y = df_target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

1. Naive Bayes  

In [None]:
# Gaussian Naive Bayes on the principal components
nb_model_pca = GaussianNB().fit(X_train, y_train)
nb_predict_pca = nb_model_pca.predict(X_test)
nb_accuracy_pca = accuracy_score(y_true=y_test, y_pred=nb_predict_pca)

print(f'The accuracy score after PCA using Naive bayes : {nb_accuracy_pca:.3f}')

2. Support Vector Machine

In [None]:
# Linear-kernel SVM with C=0.1 on the principal components
SVM_model_pca = SVC(C=0.1, kernel='linear').fit(X_train, y_train)
SVM_predict_pca = SVM_model_pca.predict(X_test)
SVM_accuracy_pca_C01 = accuracy_score(y_true=y_test, y_pred=SVM_predict_pca)

print(f'SVM accuracy after PCA and with C=0.1 is  {SVM_accuracy_pca_C01:.3f}')

In [None]:
# Linear-kernel SVM with C=10 on the principal components
SVM_model_pca = SVC(C=10, kernel='linear').fit(X_train, y_train)
SVM_predict_pca = SVM_model_pca.predict(X_test)
SVM_accuracy_pca_C10 = accuracy_score(y_true=y_test, y_pred=SVM_predict_pca)

In [None]:
# Accuracy of the C=10 model on the principal components, rounded to three decimals
print(f'SVM accuracy after PCA and with C=10 is {SVM_accuracy_pca_C10:.3f}')

3. Decision Tree

In [None]:
# Decision tree on the principal components. The tree is seeded so the reported
# accuracy is reproducible between runs.
tree_model_pca = DecisionTreeClassifier(random_state=SEED)
tree_model_pca.fit(X_train, y_train)
tree_predict_pca = tree_model_pca.predict(X_test)
tree_accuracy_pca = accuracy_score(y_test,tree_predict_pca)

# The message previously described this score as belonging to the linear-kernel SVM.
print('Accuracy of the Decision Tree classifier after PCA is: {:.3f}'.format(tree_accuracy_pca))

In [None]:
# Collect the four accuracies obtained on the principal components into one table
pca_estimator_names = ['Naive Bayes_PCA', 'SVM_PCA_C=10', 'SVM_PCA_C=0.1', 'Decision Tree_PCA']
pca_estimator_scores = [nb_accuracy_pca, SVM_accuracy_pca_C10, SVM_accuracy_pca_C01, tree_accuracy_pca]
accuracy_dataframe = {'Estimator': pca_estimator_names, 'Accuracy': pca_estimator_scores}
pca_accuracy_df = pd.DataFrame(accuracy_dataframe)

print(pca_accuracy_df)

# Conclusion 

In [None]:
# Side-by-side accuracy of each classifier without and with PCA
classifier_labels = [
    'Naive Bayes Classifier',
    'Support Vector Machine_C=10',
    'Support Vector Machine_C=0.1',
    'Decision Tree',
]
final_accuracy_table = {
    'Classifiers': classifier_labels,
    'Accuracy_without_PCA': [nb_accuracy, SVM_accuracy_C10, SVM_accuracy_C01, tree_accuracy],
    'Accuracy_with_PCA': [nb_accuracy_pca, SVM_accuracy_pca_C10, SVM_accuracy_pca_C01, tree_accuracy_pca],
}
final_accuracy_dataframe = pd.DataFrame(final_accuracy_table)


final_accuracy_dataframe.head()