## 1.1.Importing Libraries

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set(style='whitegrid')
from sklearn.preprocessing import LabelEncoder

## 1.2.Importing Data

In [None]:
# Read the dataset CSV into a DataFrame and preview the first rows.
DATA_CSV = '../input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(DATA_CSV)
df.head()

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.
The linear program used to obtain the separating plane in the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: "Robust Linear Programming Discrimination of Two Linearly Inseparable Sets", Optimization Methods and Software 1, 1992, 23-34].


Attribute Information:
1. ID number
1. Diagnosis (M = malignant, B = benign)

Ten real-valued features are computed for each cell nucleus:

1. radius (mean of distances from center to points on the perimeter)
1. texture (standard deviation of gray-scale values)
1. perimeter
1. area
1. smoothness (local variation in radius lengths)
1. compactness (perimeter^2 / area - 1.0)
1. concavity (severity of concave portions of the contour)
1. concave points (number of concave portions of the contour)
1. symmetry
1. fractal dimension ("coastline approximation" - 1)

The mean, standard error and "worst" or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. All feature values are recoded with four significant digits.

## 1.3.Data Analysis

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
df.info()

Count the number of empty values in each column:


In [None]:
# Number of missing (NaN) entries per column.
df.isna().sum()

We delete the column 'Unnamed: 32' because it contains only empty (NaN) values and therefore carries no information.

In [None]:
# Remove the empty 'Unnamed: 32' column.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone (same pattern as the
# later drop of the "id" column).
df.drop(columns=['Unnamed: 32'], inplace=True, errors='ignore')

In [None]:
# dtype of every remaining column.
df.dtypes

In [None]:
# Encode the diagnosis labels (second column) as integers.
# The column is replaced by name rather than written through df.iloc[:, 1]:
# recent pandas versions perform an iloc assignment in place, which leaves the
# column with object dtype and later makes `y` an object array that
# scikit-learn classifiers reject. Assigning by column label always installs
# the new integer array.
labelencoder_Y = LabelEncoder()
diagnosis_col = df.columns[1]
df[diagnosis_col] = labelencoder_Y.fit_transform(df[diagnosis_col].values)

In [None]:
# Show the second column (the encoded diagnosis) to check the result.
df.iloc[:,1]

We converted the diagnosis values to numerical values. B=0 and M=1

In [None]:
# Preview the frame after dropping the empty column and encoding the labels.
df.head()

# 2.Data Visualization

## 2.1.Diagnosis Values

In [None]:
# Number of samples per diagnosis class.
df['diagnosis'].value_counts()

In [None]:
# Bar chart of the class balance.
# The variable is passed by keyword: newer seaborn releases interpret the
# first positional argument as `data`, so passing the Series positionally no
# longer draws the per-class counts.
ax = sns.countplot(x='diagnosis', data=df)
ax.set(xlabel='diagnosis', ylabel='Count', title='Samples per diagnosis class');

## 2.2.Correlation Map

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
corr_fig, corr_ax = plt.subplots(figsize=(20, 15))
sns.heatmap(df.corr(), annot=True, cmap='viridis', ax=corr_ax)

In [None]:
# Correlation heatmap restricted to the columns at positions 1 through 11.
plt.figure(figsize=(10, 6))
selected_columns = list(df.columns[1:12])
selected_corr = df[selected_columns].corr()
sns.heatmap(selected_corr, linewidths=.1, cmap="YlGnBu", annot=True)
plt.yticks(rotation=0);
plt.suptitle('Correlation Map')

## 2.3.Distribution Charts

In [None]:
# Pairwise scatter/distribution plots of six mean features, coloured by diagnosis.
pairplot_columns = [
    'diagnosis',
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
]
sns.pairplot(df[pairplot_columns], hue='diagnosis')

# 3.Model Building

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from keras.models import Sequential 
from keras.layers import Dense
from sklearn.metrics import confusion_matrix

First, we delete the "id" column, as it will not contribute to model training.

In [None]:
# Drop the "id" column in place; errors="ignore" keeps the cell re-runnable
# once the column is already gone.
df.drop(columns="id", inplace=True, errors="ignore")

We assign the "diagnosis" column to y and the remaining columns to X.

In [None]:
# Positional split of the frame: columns 1..30 become the feature matrix and
# column 0 becomes the target.
# NOTE(review): this assumes "id" has been dropped so that column 0 is
# 'diagnosis' — confirm if the cell order changes.
X = df.iloc[:,1:31].values
y = df.iloc[:,0].values

We will divide our data into 4 variables: X_train and y_train for training, and X_test and y_test to test the model at the end of the training.

The test_size parameter specifies what fraction of the data set should be reserved for testing.

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## 3.1. Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the
# training split.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
lr_prediction = lr_model.predict(X_test)
lr_prediction

In [None]:
# Score the logistic-regression predictions against the test labels.
lr_acc, lr_re, lr_pr, lr_f1 = (
    scorer(y_test, lr_prediction)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
for caption, score in (
    ("Accuracy score:", lr_acc),
    ("Recall score:", lr_re),
    ("Precision score:", lr_pr),
    ("f1 score:", lr_f1),
):
    print(caption, score)

In [None]:
# Confusion matrix of the logistic regression on X_test / y_test.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement — confirm the
# scikit-learn version this notebook targets.
plot_confusion_matrix(lr_model,X_test, y_test,cmap= plt.cm.Blues)

## 3.2.KNeighborsClassifier

In [None]:
# 7-nearest-neighbour classifier; the Minkowski metric with p=2 is the
# Euclidean distance.
KNN = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p = 2)
KNN.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
KNN_predictions = KNN.predict(X_test)
KNN_predictions

In [None]:
# Score the KNN predictions against the test labels.
KNN_acc, KNN_re, KNN_pr, KNN_f1 = (
    scorer(y_test, KNN_predictions)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
for caption, score in (
    ("Accuracy score:", KNN_acc),
    ("Recall score:", KNN_re),
    ("Precision score:", KNN_pr),
    ("f1 score:", KNN_f1),
):
    print(caption, score)

In [None]:
# Confusion matrix of the KNN model on X_test / y_test.
plot_confusion_matrix(KNN,X_test, y_test,cmap= plt.cm.Blues)

## 3.3.Decision Tree

In [None]:
# Decision tree with default settings; random_state makes the fit repeatable.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
dt_predictions = dt_classifier.predict(X_test)
dt_predictions

In [None]:
# Score the decision-tree predictions against the test labels.
dt_acc, dt_re, dt_pr, dt_f1 = (
    scorer(y_test, dt_predictions)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
for caption, score in (
    ("Accuracy score:", dt_acc),
    ("Recall score:", dt_re),
    ("Precision score:", dt_pr),
    ("f1 score:", dt_f1),
):
    print(caption, score)

In [None]:
# Confusion matrix of the decision tree on X_test / y_test.
plot_confusion_matrix(dt_classifier,X_test, y_test,cmap= plt.cm.Blues)

## 3.4.Random Forest Classifier

In [None]:
# Random forest with default settings; random_state makes the fit repeatable.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
rf_predictions = rf_classifier.predict(X_test)
rf_predictions

In [None]:
# Score the random-forest predictions against the test labels.
rf_acc, rf_re, rf_pr, rf_f1 = (
    scorer(y_test, rf_predictions)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
for caption, score in (
    ("Accuracy score:", rf_acc),
    ("Recall score:", rf_re),
    ("Precision score:", rf_pr),
    ("f1 score:", rf_f1),
):
    print(caption, score)

In [None]:
# Confusion matrix of the random forest on X_test / y_test.
plot_confusion_matrix(rf_classifier,X_test, y_test,cmap= plt.cm.Blues)

## 3.5.Gaussian Naive Bayes 

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training split.
Gnb = GaussianNB() 
Gnb.fit(X_train, y_train) 

In [None]:
# Predicted class labels for the test split.
Gnb_predictions = Gnb.predict(X_test)
Gnb_predictions

In [None]:
# Score the naive-Bayes predictions against the test labels.
Gnb_acc, Gnb_re, Gnb_pr, Gnb_f1 = (
    scorer(y_test, Gnb_predictions)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
for caption, score in (
    ("Accuracy score:", Gnb_acc),
    ("Recall score:", Gnb_re),
    ("Precision score:", Gnb_pr),
    ("f1 score:", Gnb_f1),
):
    print(caption, score)

In [None]:
# Confusion matrix of the naive-Bayes model on X_test / y_test.
plot_confusion_matrix(Gnb,X_test, y_test,cmap= plt.cm.Blues)

## 3.6. Support Vector Machine

In [None]:
# Support vector classifier with default kernel and regularisation settings.
svm_model = svm.SVC(random_state=42)
svm_model.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
svm_prediction = svm_model.predict(X_test)
svm_prediction

In [None]:
# Score the SVM predictions against the test labels.
svm_acc, svm_re, svm_pr, svm_f1 = (
    scorer(y_test, svm_prediction)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
for caption, score in (
    ("Accuracy score:", svm_acc),
    ("Recall score:", svm_re),
    ("Precision score:", svm_pr),
    ("f1 score:", svm_f1),
):
    print(caption, score)

In [None]:
# Confusion matrix of the SVM on X_test / y_test.
plot_confusion_matrix(svm_model,X_test, y_test,cmap= plt.cm.Blues)

## 3.7.Artificial Neural Network (ANN)

In [None]:
# Feed-forward network: 30 inputs -> 8 ReLU units -> 14 ReLU units -> 1 sigmoid
# output, trained with Adam on binary cross-entropy and tracking accuracy.
ANN_model = Sequential([
    Dense(units=8, input_dim=30, activation="relu", kernel_initializer="uniform"),
    Dense(units=14, activation="relu", kernel_initializer="uniform"),
    Dense(units=1, activation="sigmoid", kernel_initializer="uniform"),
])
ANN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Layer-by-layer overview of the network and its parameter counts.
ANN_model.summary()

In [None]:
# Train for 400 epochs with mini-batches of 8 samples.
# NOTE(review): no random seed is set for the network, so results may differ
# between runs — confirm whether reproducibility is required.
ANN_model.fit(X_train , y_train , batch_size = 8 ,epochs = 400 )

In [None]:
# Turn the sigmoid outputs into boolean class predictions with a 0.5 cut-off.
ANN_probabilities = ANN_model.predict(X_test)
ANN_prediction = ANN_probabilities > 0.5

In [None]:
# Score the thresholded network predictions against the test labels.
ANN_acc, ANN_re, ANN_pr, ANN_f1 = (
    scorer(y_test, ANN_prediction)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
for caption, score in (
    ("Accuracy score:", ANN_acc),
    ("Recall score:", ANN_re),
    ("Precision score:", ANN_pr),
    ("f1 score:", ANN_f1),
):
    print(caption, score)

In [None]:
# The Keras model is scored via confusion_matrix on its thresholded
# predictions and the matrix is printed as plain text.
cmann = confusion_matrix(y_test,ANN_prediction) 
print(cmann)

# 4.Comparison of Models

In [None]:
# One row per model: (accuracy, precision, recall, F1) measured on the test split.
scores_by_model = {
    'Logistic Regression': (lr_acc, lr_pr, lr_re, lr_f1),
    'K-Nearest Neighbors (KNN)': (KNN_acc, KNN_pr, KNN_re, KNN_f1),
    'Decision Tree Classifier': (dt_acc, dt_pr, dt_re, dt_f1),
    'Random Forest Classifier': (rf_acc, rf_pr, rf_re, rf_f1),
    'Gaussian Naive Bayes': (Gnb_acc, Gnb_pr, Gnb_re, Gnb_f1),
    'Support Vector Machine (SVM)': (svm_acc, svm_pr, svm_re, svm_f1),
    'Artificial Neural Network (ANN)': (ANN_acc, ANN_pr, ANN_re, ANN_f1),
}
models = [(model_name, *scores) for model_name, scores in scores_by_model.items()]

comparison_columns = ['Model', 'Accuracy Score', 'Precision Score', 'Recall Score', 'F1 Score']
Comp_models = pd.DataFrame(data=models, columns=comparison_columns)
Comp_models

In [None]:
# Point plot of the accuracy of each model from the comparison table.
sns.catplot(x='Model',y='Accuracy Score',data=Comp_models,kind='point',height=4,aspect=4.5)

# 5.PCA: Principal Component Analysis

In [None]:
# Standardise the predictor columns before PCA.
# The 'diagnosis' target is excluded: scaling (and then decomposing) the whole
# frame would mix the label into the principal components and leak it into
# every model trained on them.
feature_df = df.drop(columns=['diagnosis'])

standard_scaler = StandardScaler()
standard_scaler.fit(feature_df)

scaled_features = standard_scaler.transform(feature_df)

# Preview only the first rows instead of dumping the whole array.
scaled_features[:5]

In [None]:
from sklearn.decomposition import PCA
# Fit a 3-component PCA on the scaled data and project the data onto it.
pca_model = PCA(n_components=3).fit(scaled_features)
X_pca = pca_model.transform(scaled_features)

pca_shape_message = 'Shape of the dataset after PCA transformation is {}'
print(pca_shape_message.format(X_pca.shape))

In [None]:
# Wrap the three components in a DataFrame and attach the diagnosis labels.
component_names = ['pca0', 'pca1', 'pca2']
pca_df = pd.DataFrame(X_pca, columns=component_names).assign(diagnosis=df['diagnosis'])
print('Shape of PCA dataset is {}'.format(pca_df.shape))
pca_df.head()

In [None]:
df_target  = df['diagnosis']
X = pca_df.drop(['diagnosis'], axis=1)
y = df_target 

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [None]:
nb_model_pca = GaussianNB()
nb_model_pca.fit(X_train, y_train)
nb_predict_pca = nb_model_pca.predict(X_test)
nb_accuracy_pca = accuracy_score(y_test, nb_predict_pca)

print('The accuracy score after PCA using Naive bayes : ',nb_accuracy_pca)

In [None]:
rf_pca = RandomForestClassifier(random_state=42)
rf_pca.fit(X_train, y_train)
rf_pca_predict = rf_pca.predict(X_test)
rf_acc_pca = accuracy_score(y_test,rf_pca_predict) 

print('The accuracy score after PCA using Random Forest : ',rf_acc_pca)

In [None]:
KNN_pca = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p = 2)
KNN_pca.fit(X_train, y_train)
KNN_pca_predict = KNN_pca.predict(X_test)
KNN_acc_pca = accuracy_score(y_test,KNN_pca_predict)

print('The accuracy score after PCA using KNN : ',KNN_acc_pca)

In [None]:
dt_pca = DecisionTreeClassifier(random_state=42)
dt_pca.fit(X_train, y_train)
dt_pca_predict = dt_pca.predict(X_test)
dt_acc_pca = accuracy_score(y_test,dt_pca_predict) 

print('The accuracy score after PCA using Decision Tree : ',dt_acc_pca)

In [None]:
svm_model_pca = svm.SVC(random_state=42)
svm_model_pca.fit(X_train, y_train)
svm_predict_pca = svm_model_pca.predict(X_test)
svm_accuracy_pca = accuracy_score(y_test, svm_predict_pca)

print('The accuracy score after PCA using Suppoer Vector Machine : ',svm_accuracy_pca)

In [None]:
lr_model_pca = LogisticRegression()
lr_model_pca.fit(X_train, y_train)
lr_predict_pca = lr_model_pca.predict(X_test)
lr_accuracy_pca = accuracy_score(y_test, lr_predict_pca)

print('The accuracy score after PCA using Logistic Regression : ',lr_accuracy_pca)

In [None]:
# Accuracy of each model on the original features versus on the PCA components.
# The SVM row label is spelled "Support" (it previously read "Suppoer").
models = [('K-Nearest Neighbors (KNN)',KNN_acc,KNN_acc_pca),
          ('Random Forest Classifier',rf_acc,rf_acc_pca),
          ('Gaussian Naive Bayes',Gnb_acc,nb_accuracy_pca),
          ('Decision Tree Classifier',dt_acc,dt_acc_pca),
          ('Support Vector Machine',svm_acc,svm_accuracy_pca),
          ('Logistic Regression',lr_acc,lr_accuracy_pca)]

Comp_models = pd.DataFrame(data = models, columns=['Model','Accuracy without PCA','Accuracy with PCA'])
Comp_models

# 6.Comparison of Test&Train Percentage

While training the previous models, we reserved 20% of the data for testing. Now, let's compare the accuracy of the models with different test ratios.

## 25% rate for testing

In [None]:
# Hold out 25% of the rows for testing.
# The features and target are sliced from df directly (same slices as in
# section 3) so this split always uses the original feature columns and does
# not depend on whatever X and y were last bound to by an earlier cell.
X25_train, X25_test, y25_train, y25_test = train_test_split(
    df.iloc[:, 1:31].values, df.iloc[:, 0].values, test_size=0.25, random_state=42)

In [None]:
# Train each classifier on the 75% split and record its accuracy on the 25% split.

# Logistic Regression
lr_model_25 = LogisticRegression().fit(X25_train, y25_train)
lr_prediction_25 = lr_model_25.predict(X25_test)
lr_acc_25 = accuracy_score(y_true=y25_test, y_pred=lr_prediction_25)

# KNeighborsClassifier
KNN_25 = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2).fit(X25_train, y25_train)
KNN_predictions_25 = KNN_25.predict(X25_test)
KNN_acc_25 = accuracy_score(y_true=y25_test, y_pred=KNN_predictions_25)

# Decision Tree
dt_classifier_25 = DecisionTreeClassifier(random_state=42).fit(X25_train, y25_train)
dt_predictions_25 = dt_classifier_25.predict(X25_test)
dt_acc_25 = accuracy_score(y_true=y25_test, y_pred=dt_predictions_25)

# Random Forest Classifier
rf_classifier_25 = RandomForestClassifier(random_state=42).fit(X25_train, y25_train)
rf_predictions_25 = rf_classifier_25.predict(X25_test)
rf_acc_25 = accuracy_score(y_true=y25_test, y_pred=rf_predictions_25)

# Gaussian Naive Bayes
Gnb_25 = GaussianNB().fit(X25_train, y25_train)
Gnb_predictions_25 = Gnb_25.predict(X25_test)
Gnb_acc_25 = accuracy_score(y_true=y25_test, y_pred=Gnb_predictions_25)

# Support Vector Machine
svm_model_25 = svm.SVC(random_state=42).fit(X25_train, y25_train)
svm_prediction_25 = svm_model_25.predict(X25_test)
svm_acc_25 = accuracy_score(y_true=y25_test, y_pred=svm_prediction_25)

# The artificial neural network is not retrained for this split.


## 30% rate for testing

In [None]:
# Hold out 30% of the rows for testing.
# The features and target are sliced from df directly (same slices as in
# section 3) so this split always uses the original feature columns and does
# not depend on whatever X and y were last bound to by an earlier cell.
X30_train, X30_test, y30_train, y30_test = train_test_split(
    df.iloc[:, 1:31].values, df.iloc[:, 0].values, test_size=0.30, random_state=42)

In [None]:
# Train each classifier on the 70% split and record its accuracy on the 30% split.

# Logistic Regression
lr_model_30 = LogisticRegression().fit(X30_train, y30_train)
lr_prediction_30 = lr_model_30.predict(X30_test)
lr_acc_30 = accuracy_score(y_true=y30_test, y_pred=lr_prediction_30)

# KNeighborsClassifier
KNN_30 = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2).fit(X30_train, y30_train)
KNN_predictions_30 = KNN_30.predict(X30_test)
KNN_acc_30 = accuracy_score(y_true=y30_test, y_pred=KNN_predictions_30)

# Decision Tree
dt_classifier_30 = DecisionTreeClassifier(random_state=42).fit(X30_train, y30_train)
dt_predictions_30 = dt_classifier_30.predict(X30_test)
dt_acc_30 = accuracy_score(y_true=y30_test, y_pred=dt_predictions_30)

# Random Forest Classifier
rf_classifier_30 = RandomForestClassifier(random_state=42).fit(X30_train, y30_train)
rf_predictions_30 = rf_classifier_30.predict(X30_test)
rf_acc_30 = accuracy_score(y_true=y30_test, y_pred=rf_predictions_30)

# Gaussian Naive Bayes
Gnb_30 = GaussianNB().fit(X30_train, y30_train)
Gnb_predictions_30 = Gnb_30.predict(X30_test)
Gnb_acc_30 = accuracy_score(y_true=y30_test, y_pred=Gnb_predictions_30)

# Support Vector Machine
svm_model_30 = svm.SVC(random_state=42).fit(X30_train, y30_train)
svm_prediction_30 = svm_model_30.predict(X30_test)
svm_acc_30 = accuracy_score(y_true=y30_test, y_pred=svm_prediction_30)


## 35% rate for testing

In [None]:
# Hold out 35% of the rows for testing.
# The features and target are sliced from df directly (same slices as in
# section 3) so this split always uses the original feature columns and does
# not depend on whatever X and y were last bound to by an earlier cell.
X35_train, X35_test, y35_train, y35_test = train_test_split(
    df.iloc[:, 1:31].values, df.iloc[:, 0].values, test_size=0.35, random_state=42)

In [None]:
# Train each classifier on the 65% split and record its accuracy on the 35% split.

# Logistic Regression
lr_model_35 = LogisticRegression().fit(X35_train, y35_train)
lr_prediction_35 = lr_model_35.predict(X35_test)
lr_acc_35 = accuracy_score(y_true=y35_test, y_pred=lr_prediction_35)

# KNeighborsClassifier
KNN_35 = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2).fit(X35_train, y35_train)
KNN_predictions_35 = KNN_35.predict(X35_test)
KNN_acc_35 = accuracy_score(y_true=y35_test, y_pred=KNN_predictions_35)

# Decision Tree
dt_classifier_35 = DecisionTreeClassifier(random_state=42).fit(X35_train, y35_train)
dt_predictions_35 = dt_classifier_35.predict(X35_test)
dt_acc_35 = accuracy_score(y_true=y35_test, y_pred=dt_predictions_35)

# Random Forest Classifier
rf_classifier_35 = RandomForestClassifier(random_state=42).fit(X35_train, y35_train)
rf_predictions_35 = rf_classifier_35.predict(X35_test)
rf_acc_35 = accuracy_score(y_true=y35_test, y_pred=rf_predictions_35)

# Gaussian Naive Bayes
Gnb_35 = GaussianNB().fit(X35_train, y35_train)
Gnb_predictions_35 = Gnb_35.predict(X35_test)
Gnb_acc_35 = accuracy_score(y_true=y35_test, y_pred=Gnb_predictions_35)

# Support Vector Machine
svm_model_35 = svm.SVC(random_state=42).fit(X35_train, y35_train)
svm_prediction_35 = svm_model_35.predict(X35_test)
svm_acc_35 = accuracy_score(y_true=y35_test, y_pred=svm_prediction_35)


## Comparison of Accuracy Scores

In [None]:
# Accuracy of each model for the 20%, 25%, 30% and 35% test fractions.
accuracy_by_split = {
    'Logistic Regression': (lr_acc, lr_acc_25, lr_acc_30, lr_acc_35),
    'K-Nearest Neighbors (KNN)': (KNN_acc, KNN_acc_25, KNN_acc_30, KNN_acc_35),
    'Decision Tree Classifier': (dt_acc, dt_acc_25, dt_acc_30, dt_acc_35),
    'Random Forest Classifier': (rf_acc, rf_acc_25, rf_acc_30, rf_acc_35),
    'Gaussian Naive Bayes': (Gnb_acc, Gnb_acc_25, Gnb_acc_30, Gnb_acc_35),
    'Support Vector Machine (SVM)': (svm_acc, svm_acc_25, svm_acc_30, svm_acc_35),
}
models_total = [(model_name, *scores) for model_name, scores in accuracy_by_split.items()]

split_columns = ['Model', 'Accuracy Score-20%', 'Accuracy Score-25%',
                 'Accuracy Score-30%', 'Accuracy Score-35%']
Comp_models_total = pd.DataFrame(data=models_total, columns=split_columns)
Comp_models_total

# 7.Grid Search Application on Models

## 7.1.Logistic Regression 

In [None]:
# 5-fold grid search over logistic-regression settings, scored by accuracy.
lr_gs = LogisticRegression()
lr_params = {
    'penalty' : ['l2','elasticnet','none'],
    # Valid class_weight values are None, 'balanced' or an actual dict; the
    # strings 'dict' and 'None' that were listed here are not valid options.
    'class_weight' : [None,'balanced'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'multi_class' : ['auto', 'ovr', 'multinomial']
}
# NOTE(review): not every penalty/solver/multi_class combination is supported
# by scikit-learn; unsupported combinations fail to fit and are presumably
# skipped by the search — confirm against the search warnings.
gs_lr = GridSearchCV(estimator = lr_gs, param_grid = lr_params, scoring = 'accuracy', 
                        cv = 5, verbose = 1, n_jobs = -1)
gs_lr.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
gs_lr.best_params_

In [None]:
# Refit with the parameters the search actually selected, instead of values
# copied by hand from an earlier run (which included the invalid
# class_weight='dict' and go stale whenever the search result changes).
Grid_lr = LogisticRegression(**gs_lr.best_params_)
Grid_lr.fit(X_train, y_train)
Grid_lr_predictions = Grid_lr.predict(X_test)
Grid_lr_acc = accuracy_score(y_test,Grid_lr_predictions)
print("Accuracy score with Grid Search:",Grid_lr_acc)
print("Accuracy score without Grid Search:",lr_acc)

In [None]:
# Confusion matrix of the tuned logistic regression on X_test / y_test.
plot_confusion_matrix(Grid_lr,X_test, y_test,cmap= plt.cm.Blues)

## 7.2.KNeighborsClassifier 

In [None]:
# 5-fold grid search over neighbour count, search algorithm and Minkowski power.
KNN_gs = KNeighborsClassifier()
KNN_params = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15, 19, 23],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)
gs_knn = GridSearchCV(KNN_gs, KNN_params, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_knn.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
gs_knn.best_params_

In [None]:
# Refit with the parameters the search actually selected, instead of values
# copied by hand from an earlier run. The metric stays at its default
# ('minkowski'), as in the search itself.
Grid_KNN = KNeighborsClassifier(**gs_knn.best_params_)
Grid_KNN.fit(X_train, y_train)
Grid_KNN_predictions = Grid_KNN.predict(X_test)
Grid_KNN_acc = accuracy_score(y_test,Grid_KNN_predictions)
print("Accuracy score with Grid Search:",Grid_KNN_acc)
print("Accuracy score without Grid Search:",KNN_acc)

In [None]:
# Confusion matrix of the tuned KNN model on X_test / y_test.
plot_confusion_matrix(Grid_KNN,X_test, y_test,cmap= plt.cm.Blues)

## 7.3.Decision Tree

In [None]:
# 5-fold grid search over decision-tree settings, scored by accuracy.
# The estimator is seeded so that candidates (in particular splitter='random')
# are compared on equal footing and the search result is repeatable.
dt_classifier_gs = DecisionTreeClassifier(random_state=42)
dt_classifier_params = {
    'criterion': ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    # 'auto' is omitted: for classifiers it is an alias of 'sqrt' (and was
    # removed in scikit-learn 1.3), so it only duplicated a candidate.
    'max_features' : ['sqrt','log2',None],
    # Valid class_weight values are None, 'balanced' or an actual dict; the
    # string 'dict' is not a valid option.
    'class_weight' : ['balanced', None]

}
gs_dt_classifier = GridSearchCV(estimator = dt_classifier_gs, param_grid = dt_classifier_params, scoring = 'accuracy', 
                        cv = 5, verbose = 1, n_jobs = -1)
gs_dt_classifier.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
gs_dt_classifier.best_params_

In [None]:
# Refit with the parameters the search actually selected, instead of values
# copied by hand from an earlier run; random_state keeps the refit repeatable.
Grid_dt = DecisionTreeClassifier(**gs_dt_classifier.best_params_, random_state=42)
Grid_dt.fit(X_train, y_train)
Grid_dt_predictions = Grid_dt.predict(X_test)
Grid_dt_acc = accuracy_score(y_test,Grid_dt_predictions)
print("Accuracy score with Grid Search:",Grid_dt_acc)
print("Accuracy score without Grid Search:",dt_acc)

In [None]:
# Confusion matrix of the tuned decision tree on X_test / y_test.
plot_confusion_matrix(Grid_dt,X_test, y_test,cmap= plt.cm.Blues)

## 7.4.Random Forest Classifier

In [None]:
# 5-fold grid search over random-forest settings, scored by accuracy.
# The estimator is seeded so that differences between candidates come from
# the parameters rather than from the forests' random draws.
rf_classifier_gs = RandomForestClassifier(random_state=42)
rf_classifier_params = {
    'criterion': ['gini', 'entropy'],
    # 'auto' is omitted: for classifiers it is an alias of 'sqrt' (and was
    # removed in scikit-learn 1.3), so it only duplicated a candidate.
    'max_features' : ['sqrt','log2',None]

}
gs_rf_classifier = GridSearchCV(estimator = rf_classifier_gs, param_grid = rf_classifier_params, scoring = 'accuracy', 
                        cv = 5, verbose = 1, n_jobs = -1)
gs_rf_classifier.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
gs_rf_classifier.best_params_

In [None]:
# Refit with the parameters the search actually selected, instead of values
# copied by hand from an earlier run; random_state keeps the refit repeatable.
Grid_rf = RandomForestClassifier(**gs_rf_classifier.best_params_, random_state=42)
Grid_rf.fit(X_train, y_train)
Grid_rf_predictions = Grid_rf.predict(X_test)
Grid_rf_acc = accuracy_score(y_test,Grid_rf_predictions)
print("Accuracy score with Grid Search:",Grid_rf_acc)
print("Accuracy score without Grid Search:",rf_acc)

In [None]:
# Confusion matrix of the tuned random forest on X_test / y_test.
plot_confusion_matrix(Grid_rf,X_test, y_test,cmap= plt.cm.Blues)

## 7.5.Gaussian Naive Bayes

In [None]:
# 5-fold grid search over 100 log-spaced var_smoothing values from 1 down to 1e-9.
Gnb_gs = GaussianNB()
smoothing_candidates = np.logspace(0, -9, num=100)
Gnb_params = dict(var_smoothing=smoothing_candidates)
gs_Gnb = GridSearchCV(Gnb_gs, Gnb_params, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_Gnb.fit(X_train, y_train)

In [None]:
# var_smoothing value with the best mean cross-validated accuracy.
gs_Gnb.best_params_

In [None]:
# Refit with the var_smoothing the search actually selected, instead of a
# number copied by hand from an earlier run.
Grid_Gnb = GaussianNB(**gs_Gnb.best_params_)
Grid_Gnb.fit(X_train, y_train)
Grid_Gnb_predictions = Grid_Gnb.predict(X_test)
Grid_Gnb_acc = accuracy_score(y_test,Grid_Gnb_predictions)
print("Accuracy score with Grid Search:",Grid_Gnb_acc)
print("Accuracy score without Grid Search:",Gnb_acc)

In [None]:
# Confusion matrix of the tuned naive-Bayes model on X_test / y_test.
plot_confusion_matrix(Grid_Gnb,X_test, y_test,cmap= plt.cm.Blues)

## 7.6.Support Vector Machine

In [None]:
# 5-fold grid search over SVM settings, scored by accuracy.
# The grid only contains parameters that can change the predictions:
#   * 'verbose' and 'probability' are dropped - they control logging and
#     probability calibration, not the predicted labels, and only multiplied
#     the number of fits;
#   * 'degree' is dropped - it is used by the 'poly' kernel only, and the
#     kernel is left at its default here;
#   * 'decision_function_shape' is dropped - it has no effect on a binary
#     problem;
#   * 'class_weight' lists only valid values (None or 'balanced'); the string
#     'dict' is not a valid option;
#   * 'C', the regularisation strength, is added as the main tunable.
svm_model_gs = svm.SVC()
svm_model_params = {
    'C' : [0.1, 1, 10],
    'gamma' : ['scale','auto'],
    'shrinking' : [True,False],
    'class_weight' : ['balanced', None]
}
gs_svm_model = GridSearchCV(estimator = svm_model_gs, param_grid = svm_model_params, scoring = 'accuracy', 
                        cv = 5, verbose = 1, n_jobs = -1)
gs_svm_model.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
gs_svm_model.best_params_

In [None]:
# Refit with the parameters the search actually selected, instead of values
# copied by hand from an earlier run.
Grid_svm = svm.SVC(**gs_svm_model.best_params_, random_state=42)
Grid_svm.fit(X_train, y_train)
Grid_svm_predictions = Grid_svm.predict(X_test)
Grid_svm_acc = accuracy_score(y_test,Grid_svm_predictions)
print("Accuracy score with Grid Search:",Grid_svm_acc)
print("Accuracy score without Grid Search:",svm_acc)

In [None]:
# Confusion matrix of the tuned SVM on X_test / y_test.
plot_confusion_matrix(Grid_svm,X_test, y_test,cmap= plt.cm.Blues)

## Comparison of Accuracy Scores

In [None]:
# Accuracy of each model before and after grid-search tuning.
tuning_effect = {
    'Logistic Regression': (lr_acc, Grid_lr_acc),
    'K-Nearest Neighbors (KNN)': (KNN_acc, Grid_KNN_acc),
    'Decision Tree Classifier': (dt_acc, Grid_dt_acc),
    'Random Forest Classifier': (rf_acc, Grid_rf_acc),
    'Gaussian Naive Bayes': (Gnb_acc, Grid_Gnb_acc),
    'Support Vector Machine (SVM)': (svm_acc, Grid_svm_acc),
}
models_total = [(model_name, *scores) for model_name, scores in tuning_effect.items()]

tuning_columns = ['Model', 'Accuracy Score  without Grid Search', 'Accuracy Score with Grid Search']
Comp_models_total = pd.DataFrame(data=models_total, columns=tuning_columns)
Comp_models_total