# <a id='7.5'>Breast Cancer Detection</a>

##  <a id='0'>0. Loading Libraries</a>

In [None]:
#importing libraries
#main ones
import pandas as pd
import numpy as np

#classifiers and metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve, train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

#for plotting
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
%matplotlib inline
import itertools
from itertools import chain


#extra
import warnings
import time

#for neural networks
from keras.models import Sequential
from keras.layers import Dense

warnings.filterwarnings('ignore')


##  <a id='1'>1. Loading Data</a>

In [None]:
# Read the raw dataset from the CSV file.
data = pd.read_csv(filepath_or_buffer='../input/data.csv')


In [None]:
# Print a summary of the raw frame (columns, non-null counts, dtypes).
data.info()

##  <a id='1.1'>1.1 Refining Data</a>

In [None]:
# Drop the columns that are not used as features ('Unnamed: 32' and 'id').
data = data.drop(columns=['Unnamed: 32', 'id'])

# Encode the target as integers: M -> 1, B -> 0.
# The mapped column is assigned back explicitly: a chained
# `data.diagnosis.replace(..., inplace=True)` acts on an intermediate Series
# and is not guaranteed to update `data` itself (it is a no-op under pandas
# Copy-on-Write).
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
# Summary statistics of every numeric column (features and encoded label).

data.describe()

In [None]:
# Split the frame by class: M keeps the rows whose diagnosis is non-zero,
# B keeps the rows whose diagnosis is 0.

M = data.loc[data['diagnosis'].ne(0)]
B = data.loc[data['diagnosis'].eq(0)]

In [None]:
# Initial class distribution of the target.

# value_counts() orders by frequency, so on its own it only lines up with the
# hard-coded labels by coincidence. sort_index() pins the order to the encoded
# label (0 = benign first, then 1 = malignant).
trace = go.Pie(labels = ['benign','malignant'],
               values = data['diagnosis'].value_counts().sort_index(),
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['gray', 'red'],
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Distribution of diagnosis variable')

fig = dict(data = [trace], layout=layout)
py.iplot(fig)

In [None]:
# Divide the feature columns into three groups of ten.
# Assumes the column order after the drop above is: diagnosis, 10 "_mean",
# 10 "_se", 10 "_worst" columns — TODO confirm against data.info().
# The previous slices were off by one: 0:11 included the 'diagnosis' label and
# 11:20 left out the tenth "_se" column (index 20).

features_mean = list(data.columns[1:11])
features_se = list(data.columns[11:21])
features_worst = list(data.columns[21:31])

In [None]:
# Pairwise correlation of every column, as labels plus a plain array for plotly.
correlation = data.corr()
matrix_cols = list(correlation.columns)
corr_array  = correlation.values

# Heat map of the correlation matrix with small gaps between the cells.
trace = go.Heatmap(
    z=corr_array,
    x=matrix_cols,
    y=matrix_cols,
    xgap=2,
    ygap=2,
    colorscale='Reds',
    colorbar={},
)
# Fixed-size layout; the wide left/bottom margins leave room for the tick labels.
layout = go.Layout(
    title='Correlation Matrix for variables',
    autosize=False,
    height=720,
    width=800,
    margin={'r': 0, 'l': 210, 't': 25, 'b': 210},
    yaxis={'tickfont': {'size': 12}},
    xaxis={'tickfont': {'size': 12}},
)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)

In [None]:
# Target vector and feature matrix as NumPy arrays.
# `DataFrame.as_matrix()` and the positional `axis` argument of `drop` no
# longer exist in current pandas; `.values` and `columns=` work on old and new
# releases. `data` is deliberately left unmodified so this cell can be re-run.
y = np.array(data['diagnosis'])
X = data.drop(columns=['diagnosis']).values

In [None]:
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split in the next cell, so test-set statistics leak into training — consider
# fitting on X_train only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Hold out 12% of the samples for testing; `random_state` is reused by later cells.
random_state = 42
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y,
                                     test_size=0.12,
                                     random_state=random_state)
print(X_train.shape, X_test.shape, sep='\n')

##  <a id='1.2'>1.2 Metrics and Plots</a>

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.RdGy) :
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are true labels, columns are predicted labels
        (matching the axis labels set below).
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        When True every row is divided by its sum, so each cell shows the
        fraction of that true class. Rows without samples are shown as 0.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis = 1, keepdims = True)
        # `where` skips empty rows so they stay at 0 instead of dividing by zero.
        cm = np.divide(cm, row_totals,
                       out = np.zeros(cm.shape, dtype = float),
                       where = row_totals != 0)
    cell_format = '{:.2f}' if normalize else '{}'

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the others black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])) :
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Show metrics 
def show_metrics(conf_matrix = None):
    """Print accuracy, precision, recall and F1 (as percentages) of a 2x2 confusion matrix.

    Parameters
    ----------
    conf_matrix : 2x2 array-like, optional
        Laid out as ``[[tn, fp], [fn, tp]]``. When omitted, the notebook-level
        variable ``cm`` is used, which keeps existing ``show_metrics()`` calls
        working.

    A metric whose denominator is zero (for example precision when the
    positive class is never predicted) is reported as 0.000 instead of nan.
    """
    if conf_matrix is None:
        # Backward-compatible default: fall back to the global set by the calling cell.
        conf_matrix = cm
    tn, fp, fn, tp = (float(v) for v in np.asarray(conf_matrix).ravel())

    def _pct(numerator, denominator):
        # Percentage with a guard against empty denominators.
        return 100.0 * numerator / denominator if denominator else 0.0

    print('Accuracy  =     {:.3f}'.format(_pct(tp + tn, tp + tn + fp + fn)))
    print('Precision =     {:.3f}'.format(_pct(tp, tp + fp)))
    print('Recall    =     {:.3f}'.format(_pct(tp, tp + fn)))
    # F1 = 2*P*R / (P + R), which simplifies to 2*tp / (2*tp + fp + fn).
    print('F1_score  =     {:.3f}'.format(_pct(2 * tp, 2 * tp + fp + fn)))

In [None]:
def cross_val_metrics(model, X_data = None, y_data = None, cv = 5) :
    """Print and return cross-validated accuracy, precision and recall.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``.
    X_data, y_data : array-like, optional
        Features and labels to evaluate on. When omitted, the notebook-level
        ``X`` and ``y`` are used, so existing one-argument calls are unchanged.
    cv : int, default 5
        Value forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{metric_name: (mean, std)}`` over the folds, so callers that assign
        the result receive the numbers rather than None.
    """
    if X_data is None:
        X_data = X
    if y_data is None:
        y_data = y

    results = {}
    for metric in ('accuracy', 'precision', 'recall'):
        fold_scores = cross_val_score(model, X_data, y_data, cv = cv, scoring = metric)
        mean, std = float(fold_scores.mean()), float(fold_scores.std())
        results[metric] = (mean, std)
        print('[%s] : %0.5f (+/- %0.5f)'%(metric, mean, std))
    return results

##  <a id='2'>2. Logistic Regression</a>

In [None]:
# Time the fit of the baseline logistic regression.
# perf_counter() is a monotonic, high-resolution clock; time.time() is the wall
# clock and is too coarse/unstable for millisecond-scale intervals.
lgr_clf_start_time = time.perf_counter()

lgr_clf=LogisticRegression(random_state = random_state)
lgr_clf.fit(X_train, y_train)

print("--- %s seconds ---" % (time.perf_counter() - lgr_clf_start_time))


In [None]:
# Evaluate the baseline logistic regression on the held-out test split.
y_pred = lgr_clf.predict(X_test)
y_score = lgr_clf.decision_function(X_test)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Logistic Confusion matrix')
# Model-specific file name so this figure is not overwritten by other plots.
plt.savefig('logistic_confusion_matrix.png')
plt.show()

# show_metrics() reads the global `cm` computed above.
show_metrics()

cross_log = cross_val_metrics(lgr_clf)

### <a id='2.1'>2.1 Tuning Parameters</a>

In [None]:
# Grid-search the regularisation type and strength of the logistic regression
# on the training split, scored by accuracy.
# NOTE(review): penalty='l1' is only accepted by some solvers; with a default
# solver that rejects it those candidates fail to fit — confirm against the
# installed scikit-learn version.
log_clf = LogisticRegression(random_state = random_state)
param_grid = {
            'penalty' : ['l2','l1'],  
            'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
            }

CV_log_clf = GridSearchCV(estimator = log_clf, param_grid = param_grid , scoring = 'accuracy', verbose = 1, n_jobs = -1)
CV_log_clf.fit(X_train, y_train)

# Kept in a global: the next cell rebuilds the model from these values.
best_parameters = CV_log_clf.best_params_
print('The best parameters for using this model is', best_parameters)

In [None]:
# Refit a logistic regression with the best grid-search parameters.
# Note: this rebinds CV_log_clf from the GridSearchCV object to a plain model.
CV_log_clf = LogisticRegression(C = best_parameters['C'],
                                penalty = best_parameters['penalty'],
                                random_state = random_state)

CV_log_clf.fit(X_train, y_train)
y_pred = CV_log_clf.predict(X_test)
y_score = CV_log_clf.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned Logistic Confusion matrix')
# Model-specific file name so this figure is not overwritten by other plots.
plt.savefig('tuned_logistic_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(CV_log_clf)

##  <a id='3'>3. Decision Tree</a>

In [None]:
# Time the fit of the baseline decision tree (monotonic clock for short intervals).
dtree_start_time = time.perf_counter()

# Seeded like the logistic regression so the tree is reproducible between runs.
dtree = DecisionTreeClassifier(random_state = random_state)
dtree.fit(X_train, y_train)

print("--- %s seconds ---" % (time.perf_counter() - dtree_start_time))

y_pred = dtree.predict(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Decision Tree Confusion matrix')
# Model-specific file name so this figure is not overwritten by other plots.
plt.savefig('decision_tree_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(dtree)

###  <a id='3.1'>3.1 Tuning Parameters</a>

In [None]:
def Classification_model_gridsearchCV(model,param_grid,X_data,y_data):
    """Run a 10-fold, accuracy-scored grid search and report the best result.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X_data, y_data : array-like
        Data the search is fitted on.

    Returns
    -------
    The fitted ``GridSearchCV`` object, so callers can read ``best_params_``
    programmatically instead of copying them from the printed output.
    """
    clf = GridSearchCV(model,param_grid,cv=10,scoring="accuracy")

    # Fit on the arguments; the previous body ignored them and always used the
    # global X_train / y_train.
    clf.fit(X_data,y_data)
    print("The best parameter found on development set is :")
    print(clf.best_params_)
    print("the best estimator is ")
    print(clf.best_estimator_)
    print("The best score is ")
    print(clf.best_score_)
    return clf

In [None]:
# Search grid for the decision tree.
# 'auto' is left out of max_features: for classification trees it was an alias
# of 'sqrt' and current scikit-learn releases reject it.
param_grid = {'max_features': ['sqrt', 'log2'],
              'min_samples_split': [2,3,4,5,6,7,8,9,10],
              'min_samples_leaf':[2,3,4,5,6,7,8,9,10] }

# Seeded so the random feature sub-sampling (max_features) is repeatable.
dtree= DecisionTreeClassifier(random_state = random_state)
# The result is assigned so the cell does not echo a return value.
dtree_search = Classification_model_gridsearchCV(dtree,param_grid,X_train,y_train)

In [None]:
# Decision tree with the hyper-parameters chosen for the tuned model.
# Only non-default arguments are spelled out: `min_impurity_split`, `presort`
# and max_features='auto' are not accepted by current scikit-learn, and
# 'sqrt' is what 'auto' meant for classification trees.
dtree = DecisionTreeClassifier(max_features='sqrt',
                               min_samples_leaf=5,
                               min_samples_split=3,
                               random_state=random_state)
dtree.fit(X_train, y_train)

y_pred = dtree.predict(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned Decision Tree Confusion matrix')
# Model-specific file name so this figure is not overwritten by other plots.
plt.savefig('tuned_decision_tree_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(dtree)

##  <a id='4'>4. SVM</a>

In [None]:
# Time the fit of an SVC with default settings (monotonic clock for short intervals).
svm_start_time = time.perf_counter()

clf_svm=svm.SVC()
clf_svm.fit(X_train, y_train)

print("--- %s seconds ---" % (time.perf_counter() - svm_start_time))

y_pred = clf_svm.predict(X_test)
y_score = clf_svm.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='SVM Confusion matrix')
# Model-specific file name so this figure is not overwritten by other plots.
plt.savefig('svm_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(clf_svm)

###  <a id='4.1'>4.1 Tuning Parameters</a>

In [None]:
# Linear kernel with a very large C (weak regularisation).
clf_svm=svm.SVC(kernel='linear', C=1000000)
clf_svm.fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)
y_score = clf_svm.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title and file name carry the hyper-parameters so the tuned-SVM plots can be
# told apart and do not overwrite each other on disk.
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned SVM (linear, C=1e6) Confusion matrix')
plt.savefig('svm_linear_C1e6_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(clf_svm)

In [None]:
# Linear kernel with a small C (strong regularisation).
clf_svm=svm.SVC(kernel='linear', C=0.1)
clf_svm.fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)
y_score = clf_svm.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title and file name carry the hyper-parameters so the tuned-SVM plots can be
# told apart and do not overwrite each other on disk.
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned SVM (linear, C=0.1) Confusion matrix')
plt.savefig('svm_linear_C0.1_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(clf_svm)

In [None]:
# RBF kernel with a very large C (weak regularisation).
clf_svm=svm.SVC(kernel='rbf', C=1000000)
clf_svm.fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)
y_score = clf_svm.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title and file name carry the hyper-parameters so the tuned-SVM plots can be
# told apart and do not overwrite each other on disk.
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned SVM (rbf, C=1e6) Confusion matrix')
plt.savefig('svm_rbf_C1e6_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(clf_svm)

In [None]:
# RBF kernel with a small C (strong regularisation).
clf_svm=svm.SVC(kernel='rbf', C=0.1)
clf_svm.fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)
y_score = clf_svm.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title and file name carry the hyper-parameters so the tuned-SVM plots can be
# told apart and do not overwrite each other on disk.
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned SVM (rbf, C=0.1) Confusion matrix')
plt.savefig('svm_rbf_C0.1_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(clf_svm)

##  <a id='5'>5. KNN</a>

In [None]:
# Time the fit of a 1-nearest-neighbour classifier (monotonic clock for short intervals).
knn_start_time = time.perf_counter()

clf_knn = KNeighborsClassifier(n_neighbors=1)
clf_knn.fit(X_train, y_train)

print("--- %s seconds ---" % (time.perf_counter() - knn_start_time))


y_pred =clf_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='KNN Confusion matrix')
# Model-specific file name so this figure is not overwritten by other plots.
plt.savefig('knn_k1_confusion_matrix.png')
plt.show()

show_metrics()

cross_log = cross_val_metrics(clf_knn)

###  <a id='5.1'>5.1 Tuning Parameters</a>

In [None]:
# KNN with 10 neighbours.
clf_knn = KNeighborsClassifier(n_neighbors=10)
clf_knn.fit(X_train, y_train)
y_pred =clf_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title and file name carry k so the tuned-KNN plots can be told apart and do
# not overwrite each other on disk.
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned KNN (k=10) Confusion matrix')
plt.savefig('knn_k10_confusion_matrix.png')
plt.show()

show_metrics()

cross_log = cross_val_metrics(clf_knn)

In [None]:
# KNN with 50 neighbours.
clf_knn = KNeighborsClassifier(n_neighbors=50)
clf_knn.fit(X_train, y_train)
y_pred =clf_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title and file name carry k so the tuned-KNN plots can be told apart and do
# not overwrite each other on disk.
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Tuned KNN (k=50) Confusion matrix')
plt.savefig('knn_k50_confusion_matrix.png')
plt.show()

show_metrics()

cross_log = cross_val_metrics(clf_knn)

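**Note:** with `n_neighbors=50` each prediction averages over a very large neighbourhood, so the model is over-smoothed — this is under-fitting (high bias) rather than over-fitting.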

##  <a id='6'>6. GaussianNB</a>

In [None]:
# Time the fit of Gaussian naive Bayes (monotonic clock for short intervals).
GNB_clf_start_time = time.perf_counter()

GNB_clf = GaussianNB()
GNB_clf.fit(X_train, y_train)

print("--- %s seconds ---" % (time.perf_counter() - GNB_clf_start_time))

y_pred = GNB_clf.predict(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='GaussianNB Confusion matrix')
# Model-specific file name so this figure is not overwritten by other plots.
plt.savefig('gaussian_nb_confusion_matrix.png')
plt.show()

show_metrics()
cross_log = cross_val_metrics(GNB_clf)

##  <a id='7'>7. Neural Network</a>

In [None]:

# NOTE(review): `from keras import metrics` rebinds the name `metrics`, which
# the first cell bound to sklearn.metrics; neither `metrics` nor
# `to_categorical` is used in this cell.
from keras import metrics
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical

network_start_time = time.perf_counter()

# Two hidden ReLU layers of 60 units and one sigmoid output unit.
# The input width is taken from the data instead of a hard-coded 30.
network = Sequential()
network.add(Dense (60, activation='relu' , input_shape=(X_train.shape[1],)))
network.add(Dense (60, activation='relu'))
network.add(Dense (1, activation='sigmoid'))

network.compile(optimizer='sgd',
             loss='binary_crossentropy',
             metrics=['accuracy',])

# NOTE(review): the test split doubles as validation data here, so the
# evaluate() call below is not an independent estimate.
history = network.fit(X_train, y_train, epochs = 10, batch_size=4, validation_data=(X_test, y_test))


print("--- %s seconds ---" % (time.perf_counter() - network_start_time))

network.evaluate(X_test, y_test)


loss = history.history['loss']
val_loss = history.history ['val_loss']

# Derive the x axis from the recorded history so it always matches the number
# of epochs actually trained.
epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label= 'training loss')
plt.plot(epochs, val_loss, 'b', label= 'Validation loss')
plt.title('Training and validation loss (sgd)')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()

plt.show()



Different optimizers:

In [None]:
# Same architecture as the first network, trained with the RMSprop optimizer.
network2_start_time = time.perf_counter()

network2 = Sequential()
# Input width is taken from the data instead of a hard-coded 30.
network2.add(Dense (60, activation='relu' , input_shape=(X_train.shape[1],)))
network2.add(Dense (60, activation='relu'))
network2.add(Dense (1, activation='sigmoid'))

network2.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['accuracy',])

# NOTE(review): the test split doubles as validation data here, so the
# evaluate() call below is not an independent estimate.
history2 = network2.fit(X_train, y_train, epochs = 10, batch_size=4, validation_data=(X_test, y_test))


print("--- %s seconds ---" % (time.perf_counter() - network2_start_time))

network2.evaluate(X_test, y_test)

loss = history2.history['loss']
val_loss = history2.history ['val_loss']

# Derive the x axis from the recorded history so it always matches the number
# of epochs actually trained.
epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label= 'training loss')
plt.plot(epochs, val_loss, 'b', label= 'Validation loss')
plt.title('Training and validation loss (rmsprop)')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()

plt.show()

In [None]:
# Same architecture as the first network, trained with the Adam optimizer.
network3_start_time = time.perf_counter()

network3 = Sequential()
# Input width is taken from the data instead of a hard-coded 30.
network3.add(Dense (60, activation='relu' , input_shape=(X_train.shape[1],)))
network3.add(Dense (60, activation='relu'))
network3.add(Dense (1, activation='sigmoid'))

network3.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy',])

# NOTE(review): the test split doubles as validation data here, so the
# evaluate() call below is not an independent estimate.
history3 = network3.fit(X_train, y_train, epochs = 10, batch_size=4, validation_data=(X_test, y_test))


print("--- %s seconds ---" % (time.perf_counter() - network3_start_time))

network3.evaluate(X_test, y_test)

loss = history3.history['loss']
val_loss = history3.history ['val_loss']

# Derive the x axis from the recorded history so it always matches the number
# of epochs actually trained.
epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label= 'training loss')
plt.plot(epochs, val_loss, 'b', label= 'Validation loss')
plt.title('Training and validation loss (adam)')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()

plt.show()

In [None]:
# Same architecture as the first network, trained with the Adamax optimizer.
network4_start_time = time.perf_counter()

network4 = Sequential()
# Input width is taken from the data instead of a hard-coded 30.
network4.add(Dense (60, activation='relu' , input_shape=(X_train.shape[1],)))
network4.add(Dense (60, activation='relu'))
network4.add(Dense (1, activation='sigmoid'))

network4.compile(optimizer='adamax',
             loss='binary_crossentropy',
             metrics=['accuracy',])

# NOTE(review): the test split doubles as validation data here, so the
# evaluate() call below is not an independent estimate.
history4 = network4.fit(X_train, y_train, epochs = 10, batch_size=4, validation_data=(X_test, y_test))


print("--- %s seconds ---" % (time.perf_counter() - network4_start_time))

network4.evaluate(X_test, y_test)

loss = history4.history['loss']
val_loss = history4.history ['val_loss']

# Derive the x axis from the recorded history so it always matches the number
# of epochs actually trained.
epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label= 'training loss')
plt.plot(epochs, val_loss, 'b', label= 'Validation loss')
plt.title('Training and validation loss (adamax)')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()

plt.show()

## <a id='8'>8. Results and Comparing</a>

In [None]:
# Per-model scores, listed as [accuracy, precision, recall].
# NOTE(review): the values are hard-coded — presumably copied from earlier
# cross-validation output; confirm after re-running the notebook.
models_metrics = {
    'logistic regression': [0.97891, 0.98106, 0.96224],
    'tuned logistic regression': [0.98242, 0.99036, 0.96235],
    'decision tree': [0.92445, 0.87080, 0.91528],
    'tuned decision tree': [0.91566, 0.93318, 0.91561],
    'svm': [0.97544, 0.97173, 0.96235],
    'tuned svm': [0.97541, 0.98548, 0.94817],
    'knn': [0.95075, 0.93891, 0.92901],
    'tuned knn': [0.96494, 0.98986, 0.91528],
    'gaussiannb': [0.92808, 0.91233, 0.89657],
}
# One row per metric, one column per model.
df = pd.DataFrame(models_metrics, index=['Accuracy', 'Precision', 'Recall'])
ax = df.plot.bar(
    figsize=(15, 10),
    ylim=(0.86, 1),
    color=['#c6e2ff', '#7c96b3', '#fa85af', '#ccb1bd', '#a2b970',
           '#9bae88', '#d2ccda', '#9690a8', '#f08080'],
    rot=0,
    title='Models performance (cross val mean)',
    edgecolor='grey',
    alpha=0.5,
)
# Write each bar's value just above it.
for p in ax.patches:
    ax.annotate(str(p.get_height()),
                (p.get_x() * 1.01, p.get_height() * 1.0005))
plt.show()

In [None]:
# Fit time per model.
# NOTE(review): the values are hard-coded — presumably copied from the
# "--- %s seconds ---" output of the timing cells; confirm after re-running.
models_metrics = {'logistic regression': [0.0044],
                 'decision tree' : [0.0270],
                 'svm' : [0.0078],
                  'knn' : [0.0017],
                  'gaussiannb' : [0.0017]
                }
df = pd.DataFrame(data = models_metrics)
df.rename(index={0:'time'},
                 inplace=True)
# The title previously repeated the score chart's "cross val mean" caption,
# although this chart shows timings.
ax = df.plot(kind='bar', figsize = (15,10), ylim = (0.0015, 0.028),
        color = ['#c6e2ff', '#fa85af', '#a2b970','#d2ccda', '#f08080'],
        rot = 0, title ='Models training time (seconds)',
        edgecolor = 'grey', alpha = 0.5)
# Write each bar's value just above it.
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.01, p.get_height() * 1.0005))
plt.show()