In [None]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Load the breast-cancer CSV from the Kaggle input directory and preview it
DATA_PATH = "../input/breast-cancer-data/Breast_cancer_data.csv"
dataset = pd.read_csv(DATA_PATH)
dataset.head()

In [None]:
# Report the (rows, columns) shape of the raw dataset
print(f"Cancer data set dimensions : {dataset.shape}")

In [None]:
# Discard every column that contains at least one missing value
# (dropna on the column axis uses how="any" by default)
train = dataset.dropna(axis="columns")
# Tally the samples per diagnosis class.
# NOTE(review): the notebook treats 1 as malignant and 0 as benign - confirm
# against the dataset documentation.
train["diagnosis"].value_counts()

In [None]:
# Visualize the class balance: one bar per diagnosis value.
# The column is passed through the `x=` keyword because recent seaborn
# releases interpret a positional first argument as `data`, not `x`.
# The y-axis is labelled "count" by countplot itself, so no extra label is
# needed; the trailing ';' hides the Axes repr in the cell output.
sns.countplot(x="diagnosis", data=train);

In [None]:
# Inspect the dtype of every column; any non-numeric column would need to be
# encoded before it can be fed to the models
train.dtypes

In [None]:
# Pairwise scatter plots of the first six columns, coloured by diagnosis
first_six_columns = train.iloc[:, :6]
sns.pairplot(first_six_columns, hue="diagnosis")

In [None]:
# Pairwise correlation between all columns of the cleaned frame
# (DataFrame.corr computes Pearson correlation by default)
train.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap on a large canvas
f, ax = plt.subplots(figsize=(20, 20))
corr_matrix = train.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax)

In [None]:
# Preview the first rows of the cleaned frame before splitting it
train.head()

In [None]:
### Separate the feature matrix (X) from the target vector (Y)
# X takes the first four columns, Y takes the column at position 5.
# NOTE(review): the column at position 4 is used by neither X nor Y. That looks
# like an off-by-one, but the CNN cells below hard-code 4 features, so confirm
# before widening the slice.
X = train.iloc[:, :4].to_numpy()
Y = train.iloc[:, 5].to_numpy()

In [None]:
# Hold out 30% of the samples for testing and train on the remaining 70%
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible
)

In [None]:
#Scale the data (Feature Scaling)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training split only
X_train = sc.fit_transform(X_train)
# Apply the *training* statistics to the test split. Re-fitting the scaler here
# would put the two splits on different scales and leak test-set statistics
# into the evaluation.
X_test = sc.transform(X_test)

In [None]:
#Create a function for models
def models(X_train, Y_train, random_state=0):
    """Fit six baseline classifiers and print each one's training accuracy.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to every estimator's ``fit``.
    Y_train : array-like
        Training labels.
    random_state : int, default 0
        Seed given to the estimators that use randomness (decision tree,
        random forest, gradient boosting) so repeated runs give the same
        models.

    Returns
    -------
    tuple
        The fitted estimators in the order
        ``(log, tree, forest, KNN, SVM, GBR)``.
    """
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    log = LogisticRegression(max_iter=100)
    tree = DecisionTreeClassifier(random_state=random_state)
    forest = RandomForestClassifier(random_state=random_state)
    # Multinomial Naive Bayes was tried in the original notebook but left out
    # because it raised a negative-value error on this data.
    KNN = KNeighborsClassifier()
    SVM = SVC()
    GBR = GradientBoostingClassifier(random_state=random_state)

    # Report label paired with its estimator, in the order they are returned
    labelled = (
        ('[0]Logistic Regression Training Accuracy               : ', log),
        ('[1]Decision Tree Training Accuracy                     : ', tree),
        ('[2]Random Forest Training Accuracy                     : ', forest),
        ('[3]K-nearest neighbors Training Accuracy               : ', KNN),
        ('[4]Support Vector Machines Training Accuracy : ', SVM),
        ('[5]Gradient Boosting Classifier Training Accuracy : ', GBR),
    )

    # Fit everything first, then print, so the accuracy lines stay together
    for _, estimator in labelled:
        estimator.fit(X_train, Y_train)

    #Print the model accuracy of training data
    for label, estimator in labelled:
        print(label, estimator.score(X_train, Y_train))

    return log, tree, forest, KNN, SVM, GBR

In [None]:
# lets Make a function for Grid Search CV
from sklearn.model_selection import GridSearchCV
def Classification_model_gridsearchCV(model,param_grid,data_X,data_y):
    """Run a 10-fold, accuracy-scored grid search and print the best result.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to GridSearchCV.
    param_grid : dict or list of dict
        Candidate hyper-parameter values, forwarded unchanged to GridSearchCV.
    data_X, data_y : array-like
        Features and labels the search is fitted on.

    Returns
    -------
    None
        The best parameters, best estimator and best score are only printed.
    """
    # cv=10 -> 10-fold cross validation; scoring="accuracy" ranks candidates
    clf = GridSearchCV(model,param_grid,cv=10,scoring="accuracy")

    # Fit on the data supplied by the caller rather than on notebook-level
    # globals, so the helper works for whatever dataset is passed in.
    clf.fit(data_X,data_y)
    print("The best parameter found on development set is :")
    # best parameter combination found by the search
    print(clf.best_params_)
    print("the bset estimator is ")
    print(clf.best_estimator_)
    print("The best score is ")
    # mean cross-validated score of the best combination
    print(clf.best_score_)

In [None]:
# Hyper-parameter grid for the decision tree classifier.
# 'auto' is no longer an accepted max_features value in current scikit-learn
# (for classifiers it was an alias of 'sqrt'), so it is replaced by None,
# which lets every split consider all features.
param_grid = {'max_features': [None, 'sqrt', 'log2'],
              'min_samples_split': [2,3,4,5,6,7,8,9,10], 
              'min_samples_leaf':[2,3,4,5,6,7,8,9,10] }
# GridSearchCV evaluates every combination of the values above and reports
# the best-scoring one.
from sklearn.tree import DecisionTreeClassifier
# Seeded because feature sub-sampling ('sqrt'/'log2') makes the tree random
tree = DecisionTreeClassifier(random_state=0)
Classification_model_gridsearchCV(tree,param_grid,X_train,Y_train)

In [None]:
# Hyper-parameter grid for k-nearest neighbours.
# leaf_size is deliberately not searched: it only tunes the speed and memory of
# the neighbour-search tree, so sweeping it multiplied the grid by 29 without
# giving the accuracy-scored search anything meaningful to choose between.
k_range = list(range(1, 30))
weight_options = ['uniform', 'distance']
param_grid = {'n_neighbors': k_range, 'weights': weight_options}
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier()
Classification_model_gridsearchCV(KNN,param_grid,X_train,Y_train)

In [None]:
# Two sub-grids for the SVM: a linear kernel tuned over C only, and an RBF
# kernel tuned over both C and gamma
c_values = [1, 10, 100, 1000]
linear_grid = {'C': list(c_values), 'kernel': ['linear']}
rbf_grid = {'C': list(c_values), 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]
from sklearn.svm import SVC
SVM = SVC()
Classification_model_gridsearchCV(SVM,param_grid,X_train,Y_train)

In [None]:
# param_grid = {'learning_rate': [0.01,0.02,0.03,0.04],
#                   'subsample'    : [0.9, 0.5, 0.2, 0.1],
#                   'n_estimators' : [100,500,1000, 1500],
#                   'max_depth'    : [4,6,8,10]
#                  }
# from sklearn.ensemble import GradientBoostingClassifier
# GBR = GradientBoostingClassifier()
# Classification_model_gridsearchCV(GBR,param_grid,X_train,Y_train)

In [None]:
# Fit every baseline classifier on the training split; `model` is the tuple
# returned by models(): (log, tree, forest, KNN, SVM, GBR)
model = models(X_train, Y_train)

In [None]:
# Evaluate every fitted model on the test split via its confusion matrix
from sklearn.metrics import confusion_matrix
for clf in model:
    print('Model :', clf)
    # labels=[0, 1] pins the matrix to 2x2 even if a class is never predicted
    cm = confusion_matrix(Y_test, clf.predict(X_test), labels=[0, 1])

    # scikit-learn layout: rows are true classes, columns are predicted
    # classes, both in ascending label order, i.e. [[TN, FP], [FN, TP]] with
    # label 1 as the positive class.
    (TN, FP), (FN, TP) = cm

    print(cm)
    print('Testing Accuracy =', (TP + TN) / (TP + FP + FN + TN))
    print('Precision =', TP / (TP + FP))
    print('Recall =', TP / (TP + FN))
    print()

In [None]:
# Alternative summary of the same models: the classification report plus the
# overall accuracy on the test split
from sklearn.metrics import classification_report, accuracy_score
for clf in model:
    print('Model :', clf)
    # predict once and reuse the result for both metrics
    predicted = clf.predict(X_test)
    print(classification_report(Y_test, predicted))
    print(accuracy_score(Y_test, predicted))
    print()

In [None]:
# Names of the first five columns of the raw dataset, used for the scatter matrix
features_mean = dataset.columns[:5].tolist()

In [None]:
# Colour per diagnosis value: 1 -> red, 0 -> blue
# NOTE(review): the original comment reads 1 as malignant (M) and 0 as benign (B) - confirm
color_function = {0: "blue", 1: "red"}
# Translate every diagnosis value into its colour
colors = dataset["diagnosis"].map(color_function.get)
# Scatter-plot matrix of the selected feature columns, coloured by diagnosis
pd.plotting.scatter_matrix(
    dataset[features_mean],
    c=colors,
    alpha=0.5,
    figsize=(5, 5),
);

In [None]:
# Check the dimensions of the training features before reshaping them for the CNN
X_train.shape

In [None]:
# Check how many labels are in the test split
Y_test.shape

In [None]:
# Append a trailing axis of length 1 so each sample becomes (features, 1), the
# layout the CNN below is declared with (input_shape=(4, 1)).
# The sample and feature counts are read from the arrays instead of being
# hard-coded, so the cell keeps working if the split size or the data changes,
# and re-running it on already-reshaped arrays is a no-op.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [None]:
#CNN model

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, MaxPool1D,Flatten,Dense,Dropout,BatchNormalization
from tensorflow.keras.optimizers import Adam

# Layer stack: two Conv1D blocks (each followed by batch normalisation and
# dropout), then a dense head that ends in a single sigmoid unit
cnn_layers = [
    Conv1D(filters=16, kernel_size=2, activation='relu', input_shape=(4, 1)),
    BatchNormalization(),
    Dropout(0.2),

    Conv1D(32, 2, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Flatten(),
    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid'),
]
modelCNN = Sequential(cnn_layers)
modelCNN.summary()
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# for the learning curves
modelCNN.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the CNN for a fixed number of epochs and keep the per-epoch metrics.
# NOTE(review): the test split is also used as validation data here, so the
# validation curves are not an independent estimate.
epoch = 100
history = modelCNN.fit(
    X_train,
    Y_train,
    epochs=epoch,
    verbose=1,
    validation_data=(X_test, Y_test),
)

In [None]:
def plotLearningCurve(history,epochs):
  """Plot train vs. validation accuracy, then train vs. validation loss.

  history -- object whose ``history`` mapping holds the 'accuracy',
             'val_accuracy', 'loss' and 'val_loss' series
  epochs  -- number of epochs; the x axis runs from 1 to ``epochs``

  Two figures are shown one after the other; nothing is returned.
  """
  xs = range(1,epochs+1)
  # (train series key, validation series key, figure title, y-axis label)
  panels = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
  )
  for train_key, val_key, heading, y_label in panels:
    plt.plot(xs, history.history[train_key])
    plt.plot(xs, history.history[val_key])
    plt.title(heading)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train','Validation'],loc='upper left')
    plt.show()

In [None]:
# Plot the accuracy and loss curves recorded during CNN training
plotLearningCurve(history,epoch)

In [None]:
# CNN outputs for the test samples (values from the final sigmoid layer).
# Despite the name, X_pred holds predictions, not features.
X_pred = modelCNN.predict(X_test)

In [None]:
# Display the raw CNN outputs
X_pred

In [None]:
# Display the true test labels for comparison with the predictions
Y_test

In [None]:
# Turn each CNN output into a hard label: 1 when the score is above 0.8, else 0
asd = [1 if row[0] > 0.8 else 0 for row in X_pred.tolist()]
print(asd)

In [None]:
# Evaluate the thresholded CNN predictions on the test split
from sklearn.metrics import confusion_matrix
# labels=[0, 1] pins the matrix to 2x2 even if one class is never predicted
cm = confusion_matrix(Y_test, asd, labels=[0, 1])

# scikit-learn layout: rows are true classes, columns are predicted classes,
# both in ascending label order, i.e. [[TN, FP], [FN, TP]] with label 1 as the
# positive class.
(TN, FP), (FN, TP) = cm

print(cm)
print('Testing Accuracy =', (TP + TN) / (TP + FP + FN + TN))
print('Precision =', TP / (TP + FP))
print('Recall =', TP / (TP + FN))