In [None]:
# Mount Google Drive inside the Colab runtime so that files stored there
# become reachable from this notebook. The mount point passed is 'drive'.

from google.colab import drive
drive.mount('drive')

In [None]:
# Print the host's CPU details by dumping /proc/cpuinfo through the shell
# (this file only exists on Linux hosts).
!cat /proc/cpuinfo

In [None]:
# Install Keras with pip; -q keeps the installer output quiet.
# NOTE(review): no version is pinned, so the Keras release that gets installed
# can change between runs - consider pinning one for reproducibility.

!pip install -q keras

In [None]:
# Check whether a GPU is available to TensorFlow. The device name returned by
# the call below is displayed as the cell output.
# NOTE(review): an empty string presumably means no GPU was found - confirm
# against the TensorFlow documentation.

import tensorflow as tf

tf.test.gpu_device_name()

In [None]:
# List the local compute devices TensorFlow can see, to identify the GPU type
# in use. The returned list is displayed as the cell output.

from tensorflow.python.client import device_lib

device_lib.list_local_devices()

## About the Breast Cancer Wisconsin (Diagnostic) Data Set

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. The separating plane in the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: "Robust Linear Programming Discrimination of Two Linearly Inseparable Sets", Optimization Methods and Software 1, 1992, 23-34].

This database is also available through the UW CS FTP server: run `ftp ftp.cs.wisc.edu`, then `cd math-prog/cpo-dataset/machine-learn/WDBC/`.

Also can be found on UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

Attribute Information:

1) ID number

2) Diagnosis (M = malignant, B = benign)

3-32) Ten real-valued features are computed for each cell nucleus:

a) radius (mean of distances from center to points on the perimeter) b) texture (standard deviation of gray-scale values) c) perimeter d) area e) smoothness (local variation in radius lengths) f) compactness (perimeter^2 / area - 1.0) g) concavity (severity of concave portions of the contour) h) concave points (number of concave portions of the contour) i) symmetry j) fractal dimension ("coastline approximation" - 1)

The mean, standard error and "worst" or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. For instance, field 3 is Mean Radius, field 13 is Radius SE, field 23 is Worst Radius.

All feature values are recoded with four significant digits.

Missing attribute values: none

Class distribution: 357 benign, 212 malignant

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from 'data.csv' and display its first few rows
data = pd.read_csv('data.csv')
data.head()

In [None]:
# Drop the 'Unnamed: 32' column since it doesn't provide much insight into the data.
# `drop(..., errors='ignore')` is used instead of `del` so the cell is idempotent:
# re-running it after the column is already gone no longer raises a KeyError.
data = data.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Display the first few rows again to inspect the frame after the column removal
data.head()

In [None]:
""" separating the data into features(X) and targets(y);
    we have one target variable, diagnosis in this case """

# Column 1 is the single target; every column from index 2 onward is a feature.
# Both are pulled out as plain arrays.
y = data.iloc[:, 1].values   # target
X = data.iloc[:, 2:].values  # features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn the categorical target labels into integer codes. The fitted encoder is
# kept in `encoder` so it stays available to later cells.
encoder = LabelEncoder().fit(y)
y = encoder.transform(y)

# Hold out 10% of the samples as the test set; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**Now that we have prepared data, we will import Keras and its packages.**

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
# Initialise the ANN as an empty Keras Sequential model; layers are added to
# `classifier` one by one in the following cells.
classifier = Sequential()

In [None]:
# Adding the input layer and the first hidden layer (16 ReLU units).
# Keras 2 argument names are used: `units` (formerly `output_dim`) and
# `kernel_initializer` (formerly `init`); the Keras 1 names are no longer accepted.
# The input width is read from the training matrix instead of being hard-coded,
# so it always matches the number of feature columns actually present.
classifier.add(Dense(units=16, kernel_initializer='uniform', activation='relu',
                     input_dim=X_train.shape[1]))
# Adding dropout to prevent overfitting; `rate` (formerly `p`) is the fraction
# of units dropped during training.
classifier.add(Dropout(rate=0.1))

input_dim - number of input features, i.e. the number of columns in the feature matrix

output_dim (named `units` in Keras 2 and later) - number of units in the layer, i.e. the number of outputs fed to the next layer, if any

activation - activation function which is ReLU in this case

init (named `kernel_initializer` in Keras 2 and later) - the scheme used to set the initial weights of the layer

In [None]:
# Adding the second hidden layer (16 ReLU units).
# Keras 2 argument names are used: `units` (formerly `output_dim`) and
# `kernel_initializer` (formerly `init`); the Keras 1 names are no longer accepted.
classifier.add(Dense(units=16, kernel_initializer='uniform', activation='relu'))
# Adding dropout to prevent overfitting; `rate` (formerly `p`) is the fraction
# of units dropped during training.
classifier.add(Dropout(rate=0.1))

In [None]:
# Adding the output layer: a single sigmoid unit, matching the binary target.
# Keras 2 argument names are used: `units` (formerly `output_dim`) and
# `kernel_initializer` (formerly `init`); the Keras 1 names are no longer accepted.
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

In [None]:
# Compile the ANN: Adam optimizer, binary cross-entropy loss, and accuracy as
# the metric reported during training.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fitting the ANN to the training set: 100 passes over the data in batches of 100.
# `epochs` is the Keras 2 name of the former `nb_epoch` argument, which is no
# longer accepted.
classifier.fit(X_train, y_train, batch_size=100, epochs=100)

In [None]:
# Score the test set with the trained model and threshold the raw outputs at
# 0.5, leaving boolean class predictions in `y_pred`.
y_pred = classifier.predict(X_test) > 0.5

In [None]:
# Build the confusion matrix from the true test labels and the thresholded
# predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Accuracy = correctly classified samples (the diagonal of the confusion matrix)
# divided by all test samples. The total is taken from the matrix itself rather
# than a hard-coded 57, so the figure stays right if the dataset or the
# test_size used for the split ever changes.
print("Our accuracy is {}%".format((cm.trace() / cm.sum()) * 100))

In [None]:
# Draw the confusion matrix as an annotated heatmap, then write the figure that
# holds it to 'h.png'.
heatmap_ax = sns.heatmap(cm, annot=True)
heatmap_ax.figure.savefig('h.png')

In [None]:
# Serialising the trained model to disk.
import pickle

# The file is opened in a `with` block so the handle is flushed and closed even
# if pickling fails; previously the handle returned by open() was never closed.
# NOTE(review): pickling a Keras model is not supported by every Keras version
# and the result is not portable across versions - consider the library's own
# `classifier.save(...)` instead.
with open('/content/cancer_pred.pickle', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# WARNING: sends SIGKILL (-9) to every process the current user may signal
# (pid -1), which includes this notebook's kernel. All in-memory state is lost.
# NOTE(review): presumably intended as a hard reset of the Colab runtime -
# confirm before running this anywhere else.
!kill -9 -1