
<font color ='red'>
Content :
    
    
    
1. [Load and Check Data](#1)
    
2. [ Visualizing Data](#2)  
    
3. [Feature Engineering](#3)    
  
4. [Prediction](#4)                
    4.1 [Support Vector Classifier](#5)          
    4.2 [Logistic Regression](#6)        
    4.3 [Naive Bayes Classifier](#7)           
    4.4 [K-Nearest Neighbour Classifier](#8)          
    4.5 [Random Forest Classifier](#9)         
    4.6 [Gradient Boosting Classifier](#10)  
    
5. [Comparing Results](#11) 

<a id = "1"></a>
<font color ='red'>
## 1. Load and Check Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Read the raw CSV into a DataFrame; later cells copy it before modifying anything.
# NOTE(review): absolute Kaggle path - this cell fails unless the file exists at this location.
original_data=pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Preview the first ten rows of the raw table.
original_data.head(10)

In [None]:
# Print column names, dtypes and non-null counts of the raw table.
original_data.info()

In [None]:
# Summary statistics of the raw table.
original_data.describe()

In [None]:
# Work on a copy so `original_data` is not affected by the in-place column
# drops performed on `data` in the feature-engineering section.
data = original_data.copy()
# Number of missing values per column.
data.isnull().sum()

In [None]:
# List the column names of the working copy.
data.columns

In [None]:
# Count how many rows carry each diagnosis label.
# The counts are looked up by label: value_counts() orders its result by
# frequency, so positional unpacking would silently swap B and M if the class
# balance changed, and would raise unless there were exactly two labels.
diagnosis_counts = data.diagnosis.value_counts()
diagnosis_B = diagnosis_counts.get('B', 0)
diagnosis_M = diagnosis_counts.get('M', 0)
print(f'B : {diagnosis_B}\nM : {diagnosis_M}')

<a id = "2"></a>
<font color ='red'>
## 2. Visualizing Data

In [None]:
# Pie chart with the share of each diagnosis label.
labels = ["B", "M"]
# Slice sizes are computed from the data instead of being hard-coded, so the
# chart stays correct if the dataset changes. `original_data` is used because it
# is never modified, which keeps this cell runnable even after the `diagnosis`
# column has been dropped from `data` further down the notebook.
diagnosis_counts = original_data["diagnosis"].value_counts().reindex(labels, fill_value=0)
sizes = diagnosis_counts.tolist()
explode = (0, 0)
colors = ["purple", "gold"]
fig1, ax1 = plt.subplots(figsize=(10, 10))
ax1.pie(sizes, colors=colors, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
# Equal aspect ratio so the pie is drawn as a circle.
ax1.axis('equal')
ax1.set_title("Diagnosis")
plt.show()

In [None]:
# Scatter plot of concavity_mean against radius_mean for every row.
fig1, ax1 = plt.subplots(figsize=(10, 10))
ax1.scatter(data["radius_mean"], data["concavity_mean"], marker="h", c="brown")
ax1.grid()
ax1.set_xlabel("Radius Mean")
ax1.set_ylabel("Concavity Mean")

In [None]:
# Joint kernel-density plot of the same two features used in the scatter plot.
g = sns.jointplot(x="radius_mean", y="concavity_mean", data=data, kind="kde")
plt.show()


In [None]:
# Violin plot of radius_mean, one violin per diagnosis label.
violin_style = dict(split=True, inner="quart", linewidth=1)
sns.violinplot(x="diagnosis", y="radius_mean", data=data, **violin_style)
sns.despine(left=True)
plt.show()


In [None]:
# Violin plot of concavity_mean, one violin per diagnosis label.
violin_style = dict(split=True, inner="quart", linewidth=1)
sns.violinplot(x="diagnosis", y="concavity_mean", data=data, **violin_style)
sns.despine(left=True)
plt.show()

<a id = "3"></a>
<font color ='gold'>
# 3. Feature Engineering

In [None]:
# Remove the columns that are not used as model features.
# A new frame is assigned instead of dropping in place, and missing columns are
# ignored, so the cell can be re-run without raising KeyError on columns that
# were already removed.
data = data.drop(columns=["Unnamed: 32", "id"], errors="ignore")
data.columns

In [None]:
# Build the binary target (1 for 'M', 0 otherwise) and min-max scale the features.
# The target is read from `original_data`, which is never modified, so this cell
# still works when re-run after `diagnosis` has been removed from `data`.
# This relies on `data` having the same rows, in the same order, as
# `original_data` (only columns are dropped between the copy and this cell).
y = (original_data["diagnosis"] == "M").astype(int).tolist()

# errors="ignore" makes the drop a no-op on a re-run.
features = data.drop(columns=["diagnosis"], errors="ignore")

# Rescale every column to the [0, 1] range.
# NOTE(review): min and max are taken over all rows, including the rows that end
# up in the test split in the next cell - consider fitting the scaling on the
# training rows only.
data = (features - features.min()) / (features.max() - features.min())

# Show a preview rather than the whole frame.
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Test-set score (in percent) of each tuned classifier, keyed by display name;
# filled in by the model cells and read by the comparison section.
all_scores = dict()

# Hold out 20 % of the rows for testing; the seed keeps the split reproducible.
split_parts = train_test_split(data, y, test_size=0.2, random_state=24)
x_train, x_test, y_train, y_test = split_parts

<a id = "4"></a>
<font color ='gold'>
# 4. Prediction

<a id = "5"></a>
<font color ='gold'>
# 4.1 Support Vector Classifier

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the support vector classifier.
parameters_svc = dict(
    kernel=('sigmoid', 'poly', 'rbf'),
    degree=range(2, 5),
    gamma=('scale', 'auto'),
)

# Exhaustive search with GridSearchCV's default cross-validation settings.
svc = svm.SVC()
svc_grid = GridSearchCV(svc, parameters_svc)
svc_grid.fit(x_train, y_train)

# Evaluate the selected model on the held-out test split and record the result.
svc_grid_score = svc_grid.score(x_test, y_test)
print('Score : ', svc_grid_score)
print('Best parameters : ', svc_grid.best_params_)
all_scores['Support Vector'] = round(svc_grid_score * 100, 2)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the tuned SVC on the test split, drawn as a heatmap.
svc_test_pred = svc_grid.predict(x_test)
cm = confusion_matrix(y_test, svc_test_pred)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="blue")
ax.set_xlabel("Predicted")
ax.set_ylabel("Original values")
plt.show()

<a id = "6"></a>
<font color ='gold'>
# 4.2 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# max_iter is raised above the default of 100 so the slower solvers (sag, saga)
# have room to converge at the tighter tolerances searched below.
log_reg = LogisticRegression(max_iter=1000)

# Settings shared by every sub-grid.
_log_reg_common = {
    'tol': (1e-4, 1e-3, 1e-5),
    'C': (1.0, 1.1, 1.2, 1.5, 2.0),
}

# The grid is a list of sub-grids so that only valid penalty/solver pairs are
# tried. A single cross-product grid pairs l1 with solvers that do not support
# it and requests elasticnet without an l1_ratio; those fits fail and are
# scored as NaN by GridSearchCV, wasting most of the search.
#  * 'none' is left out: current scikit-learn rejects the string, and C has no
#    effect on an unpenalised model anyway.
#  * 'multi_class' is left out: the target is binary (0/1) and the option is
#    deprecated in recent scikit-learn releases.
#  * the l1_ratio values are a small sweep added so elasticnet is actually fitted.
# NOTE(review): very recent scikit-learn releases steer towards `l1_ratio`
# instead of `penalty` - confirm against the installed version.
parameters_log_reg = [
    {
        'penalty': ('l2',),
        'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
        **_log_reg_common,
    },
    {
        'penalty': ('l1',),
        'solver': ('liblinear', 'saga'),
        **_log_reg_common,
    },
    {
        'penalty': ('elasticnet',),
        'solver': ('saga',),
        'l1_ratio': (0.25, 0.5, 0.75),
        **_log_reg_common,
    },
]

# Exhaustive search with GridSearchCV's default cross-validation settings,
# followed by evaluation of the selected model on the held-out test split.
log_reg_grid = GridSearchCV(log_reg, parameters_log_reg)
log_reg_grid.fit(x_train, y_train)
log_reg_score = log_reg_grid.score(x_test, y_test)

In [None]:
# Report the test-split score and the selected hyper-parameters, then record
# the score (in percent) for the final comparison.
log_reg_best_params = log_reg_grid.best_params_
print('Score : ', log_reg_score)
print('Best parameters : ', log_reg_best_params)
all_scores['Logistic Regression'] = round(log_reg_score * 100, 2)

In [None]:
# Confusion matrix of the tuned logistic regression on the test split.
log_reg_test_pred = log_reg_grid.predict(x_test)
cm = confusion_matrix(y_test, log_reg_test_pred)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="blue")
ax.set_xlabel("Predicted")
ax.set_ylabel("Original values")
plt.show()

<a id = "7"></a>
<font color ='red'>
# 4.3 Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Only the variance-smoothing term is tuned for Gaussian Naive Bayes.
parameters_nb = dict(var_smoothing=(1e-8, 1e-9, 1e-10))

# Exhaustive search with GridSearchCV's default cross-validation settings.
nb = GaussianNB()
nb_grid = GridSearchCV(nb, parameters_nb)
nb_grid.fit(x_train, y_train)

# Evaluate the selected model on the held-out test split and record the result.
nb_grid_score = nb_grid.score(x_test, y_test)
print('Score : ', nb_grid_score)
print('Best parameters : ', nb_grid.best_params_)
all_scores['Naive Bayes'] = round(nb_grid_score * 100, 2)


In [None]:
# Confusion matrix of the tuned Naive Bayes model on the test split.
nb_test_pred = nb_grid.predict(x_test)
cm = confusion_matrix(y_test, nb_test_pred)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="blue")
ax.set_xlabel("Predicted")
ax.set_ylabel("Original values")
plt.show()

<a id = "8"></a>
<font color ='red'>
# 4.4 K-Nearest Neighbour Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid explored for the k-nearest-neighbour classifier.
parameters_knn = dict(
    n_neighbors=(3, 4, 5, 6),
    weights=('uniform', 'distance'),
    algorithm=('auto', 'ball_tree', 'kd_tree', 'brute'),
    leaf_size=(24, 30, 32, 48),
    p=(1, 2, 3),
)

# Exhaustive search with GridSearchCV's default cross-validation settings.
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, parameters_knn)
knn_grid.fit(x_train, y_train)

# Evaluate the selected model on the held-out test split and record the result.
knn_grid_score = knn_grid.score(x_test, y_test)
print('Score : ', knn_grid_score)
print('Best parameters : ', knn_grid.best_params_)
all_scores['K-Nearest Neighbour'] = round(knn_grid_score * 100, 2)

In [None]:
# Confusion matrix of the tuned k-nearest-neighbour model on the test split.
knn_test_pred = knn_grid.predict(x_test)
cm = confusion_matrix(y_test, knn_test_pred)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="blue")
ax.set_xlabel("Predicted")
ax.set_ylabel("Original values")
plt.show()

<a id = "9"></a>
<font color ='red'>
# 4.5 Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed (the same one used for the train/test split) makes the forest,
# and therefore the selected parameters and the reported score, reproducible.
rf = RandomForestClassifier(random_state=24)

# Hyper-parameter grid explored for the random forest.
# 'auto' is not searched: for classifiers it was only an alias of 'sqrt', and it
# has been removed from scikit-learn, where it now makes every such fit fail.
parameters_rf = {
    'n_estimators': (10, 64, 100, 128),
    'criterion': ('gini', 'entropy'),
    'max_features': ('sqrt', 'log2'),
}

# Exhaustive search with GridSearchCV's default cross-validation settings.
rf_grid = GridSearchCV(rf, parameters_rf)
rf_grid.fit(x_train, y_train)

# Evaluate the selected model on the held-out test split and record the result.
rf_grid_score = rf_grid.score(x_test, y_test)
print('Score : ', rf_grid_score)
print('Best parameters : ', rf_grid.best_params_)
all_scores['Random Forest'] = round((rf_grid_score * 100), 2)

In [None]:
# Confusion matrix of the tuned random forest on the test split.
rf_test_pred = rf_grid.predict(x_test)
cm = confusion_matrix(y_test, rf_test_pred)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="blue")
ax.set_xlabel("Predicted")
ax.set_ylabel("Original values")
plt.show()

<a id = "10"></a>
<font color ='red'>
# 4.6 Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# A fixed seed (the same one used for the train/test split) keeps the boosted
# model, and therefore the reported score, reproducible.
gb = GradientBoostingClassifier(random_state=24)

# Hyper-parameter grid explored for gradient boosting.
# Current scikit-learn names are used: the loss 'deviance' was renamed to
# 'log_loss', the criterion 'mse' was renamed to 'squared_error', and the
# criterion 'mae' was removed. With the old names those candidates fail to fit.
# NOTE(review): 'log_loss' needs scikit-learn 1.1 or newer.
parameters_gb = {
    'loss': ('log_loss', 'exponential'),
    'learning_rate': (1.0, 1.1, 1.2),
    'criterion': ('friedman_mse', 'squared_error'),
}

# Exhaustive search with GridSearchCV's default cross-validation settings.
gb_grid = GridSearchCV(gb, parameters_gb)
gb_grid.fit(x_train, y_train)

# Evaluate the selected model on the held-out test split and record the result.
gb_grid_score = gb_grid.score(x_test, y_test)
print('Score : ', gb_grid_score)
print('Best parameters : ', gb_grid.best_params_)
all_scores['Gradient Boost'] = round((gb_grid_score * 100), 2)

In [None]:
# Confusion matrix of the tuned gradient boosting model on the test split.
gb_test_pred = gb_grid.predict(x_test)
cm = confusion_matrix(y_test, gb_test_pred)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="blue")
ax.set_xlabel("Predicted")
ax.set_ylabel("Original values")
plt.show()

<a id = "11"></a>
<font color ='red'>
# 5. Comparing Results

In [None]:
# Bar chart comparing the test-split score (in percent) of every classifier.
# The figure is bound to a real name: assigning to `_` would overwrite the
# variable IPython uses for the previous cell output.
fig, ax = plt.subplots(figsize=(8, 8))
# Materialise the dict views as lists before handing them to matplotlib.
classifier_names = list(all_scores.keys())
classifier_scores = list(all_scores.values())
ax.bar(classifier_names, classifier_scores, color='dodgerblue', align='center')
plt.xticks(rotation='vertical')
ax.set_ylabel('Accuracy (%)')
ax.set_title('% ACCURACY')
# Render the figure explicitly so the cell output is the chart, not a Text repr.
plt.show()


In [None]:
# Ranking table of all classifiers, best score first.
results = pd.DataFrame(list(all_scores.items()), columns=['Classifier', 'Score'])
results = results.sort_values(by=['Score'], ascending=False)
# Number the rows 1..N from the actual row count: a fixed range(1, 7) raises a
# length-mismatch error as soon as a model cell is skipped or a model is added.
results.index = range(1, len(results) + 1)
results