# Problem definition

The problem is to predict whether a cancer is benign or malignant using the Breast Cancer Wisconsin dataset. Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.

# Features Definition

<ol>
	<li>ID number</li> 
    <li>Diagnosis (M = malignant, B = benign) </li>
    <li>Columns 3–32: ten real-valued features are computed for each cell nucleus (the mean, standard error, and "worst" value of each are recorded, giving 30 feature columns):
		<ol type="a">
			<li> radius (mean of distances from center to points on the perimeter)</li> 
			<li>texture (standard deviation of gray-scale values)</li> 
            <li> perimeter</li>
            <li> area </li>
            <li> smoothness (local variation in radius lengths) </li>
            <li> compactness (perimeter^2 / area - 1.0) </li>
            <li> concavity (severity of concave portions of the contour) </li>
            <li> concave points (number of concave portions of the contour) </li>
            <li> symmetry</li>
            <li> fractal dimension ("coastline approximation" - 1)</li>
		</ol>
	 </li>
</ol>

# Load Libraries

In [None]:
from pandas import read_csv
from pandas import set_option
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Load dataset

In [None]:
# Read the raw CSV into a DataFrame, taking the column names from the first row.
dataset = read_csv("../input/data.csv", header=0)

# Analyze data

## Descriptive Statistics

In [None]:
# Report the table's dimensions and per-column dtypes, then preview both ends.
n_rows, n_cols = dataset.shape
print('The dataset has {} instances and {} attributes.\n'.format(n_rows, n_cols))
print('The attributes are : \n {} '.format(dataset.dtypes))

# Widen pandas' console output before printing the row previews.
set_option('display.width', 100)
for caption, frame in (('The first 5 rows:\n{}', dataset.head(5)),
                       ('The last 5 rows:\n{}', dataset.tail(5))):
    print(caption.format(frame))

In [None]:
# Print pandas' summary statistics for the raw dataset (before any columns are dropped).
print(dataset.describe())

In [None]:
# Count the rows under each diagnosis label to check class balance.
class_counts = dataset.groupby('diagnosis').size()
print('Class Distribution \n{}'.format(class_counts))

## Data cleaning

In [None]:
# Drop the 'id' and 'Unnamed: 32' columns so they are not picked up as features.
# `columns=` replaces the positional axis argument (`drop('id', 1)`), which
# pandas deprecated in 1.3 and removed in 2.0 (it now raises a TypeError).
dataset = dataset.drop(columns=['id', 'Unnamed: 32'])
print(dataset.head(5))
print(dataset.tail(5))

In [None]:
# Print the summary statistics again, this time for the cleaned dataset.
print(dataset.describe())

In [None]:
# Total number of missing cells across the whole frame (per-column counts, then summed).
dataset.isna().sum().sum()

## Data Visualization

### Histograms

In [None]:
# Histogram grid for the dataset; axes are not shared between panels and the
# tick labels are shrunk to size 1.
hist_options = dict(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
dataset.hist(**hist_options)
pyplot.show()

### Density plots

In [None]:
# Density estimate per attribute, laid out on an 8x8 grid of subplots.
density_options = dict(kind='density', subplots=True, layout=(8, 8),
                       sharex=False, legend=False, fontsize=1)
dataset.plot(**density_options)
pyplot.show()

###  Box and Whisker plots

In [None]:
# Box-and-whisker plot per attribute on an 8x8 grid, each panel with its own axes.
box_options = dict(kind='box', subplots=True, layout=(8, 8),
                   sharex=False, sharey=False, fontsize=1)
dataset.plot(**box_options)
pyplot.show()

### Correlations between Attributes

In [None]:
# Correlation matrix of the numeric attributes, drawn as a heat map on [-1, 1].
# The 'diagnosis' column holds string labels; pandas >= 2.0 raises when corr()
# meets non-numeric data, so restrict to numeric columns explicitly. Older
# pandas silently skipped such columns, so the resulting matrix is unchanged.
correlations = dataset.select_dtypes(include='number').corr()
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
pyplot.show()

# Split the dataset
Let's use 80% of our dataset for modelling and 20% for validation.

In [None]:
# Hold out a fraction of the rows for validation; the fixed seed keeps the
# split repeatable between runs.
validation_size = 0.20
seed = 7

print(dataset['diagnosis'].unique())
data = dataset.values
# Assumes 'diagnosis' is the first column of the cleaned frame and the
# features occupy the following 30 columns - confirm against the CSV layout.
Y = data[:, 0]
X = data[:, 1:31].astype(float)
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
print(Y_train)

In [None]:
# Show the labels of the held-out validation rows.
print(Y_validation)

In [None]:
# Show the feature matrix of the training rows.
print(X_train)

In [None]:
# Show the feature matrix of the held-out validation rows.
print(X_validation)

# Prediction models

We will evaluate six algorithms: 
<ol>
	<li>Logistic Regression : LR</li> 
    <li>Linear Discriminant Analysis : LDA</li>
    <li>Classification and Regression Tree : CART</li>
    <li>Support Vector Machine : SVM </li>
    <li>Gaussian Naive Bayes : NB</li>
    <li>K-Nearest Neighbors: KNN</li>
</ol>

In [None]:
# Candidate classifiers, each paired with the short label used in the reports.
# All of them are created with their default hyper-parameters.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

In [None]:
# Baseline comparison: k-fold cross-validated accuracy of every candidate model
# on the unscaled training data.
num_folds = 10
seed = 7
scoring = 'accuracy'
results = []
names = []
# KFold does not shuffle by default, and scikit-learn >= 0.24 raises a
# ValueError when random_state is given together with shuffle=False. Older
# releases ignored the seed in that case, so leaving it out produces the same
# folds as before. The splitter is the same for every model, so it is built
# once outside the loop.
kfold = KFold(n_splits=num_folds)
print('Mean and Standard Deviation accuracy with 10 folds')
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print('{}: {} ({})'.format(name, cv_results.mean(), cv_results.std()))

In [None]:
# Box plot of the per-fold accuracies, one box per algorithm.
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

The results show good accuracy for LDA, but SVM performs poorly. 
Let's standardize the inputs and see how that affects the results.

# Prediction models on standardized inputs

In [None]:
# Same comparison as the baseline, but every classifier is wrapped in a pipeline
# that standardizes the features first. Because the scaler lives inside the
# pipeline, it is re-fit on the training part of each fold.
base_estimators = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]
# Pipeline label is 'S_<name>'; the steps are ('Scaler', ...) and ('<name>', ...).
pipelines = [
    ('S_' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in base_estimators
]
results = []
names = []
# KFold does not shuffle by default, and scikit-learn >= 0.24 raises a
# ValueError when random_state is given together with shuffle=False. Older
# releases ignored the seed in that case, so leaving it out produces the same
# folds as before. The splitter is built once because it is loop-invariant.
kfold = KFold(n_splits=num_folds)
print("Mean and Standard Deviation Accuracy with 10 folds ")
for name, model in pipelines:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print('{}: {} ,  {}'.format(name, cv_results.mean(), cv_results.std()))

In [None]:
# Box plot of the per-fold accuracies for the standardized pipelines.
fig = pyplot.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

With scaled inputs, SVM and LR perform better. 
Let's tune their parameters and see if we can improve further.

# Algorithm Tuning

## Tuning Support Vector Machine

In [None]:
# Grid-search the SVM's C value and kernel on standardized training data.
# NOTE: the scaler is fit on all of X_train before cross-validation, so the
# held-out part of every fold has contributed to the scaling statistics.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=c_values, kernel=kernel_values)
model = SVC()
# KFold does not shuffle by default, and scikit-learn >= 0.24 raises a
# ValueError when random_state is given together with shuffle=False. Older
# releases ignored the seed in that case, so leaving it out keeps the folds.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: {} using {}".format(grid_result.best_score_, grid_result.best_params_))

## Tuning Logistic Regression 

In [None]:
# Grid-search logistic regression's C value and penalty type on standardized
# training data.
# NOTE: the scaler is fit on all of X_train before cross-validation, so the
# held-out part of every fold has contributed to the scaling statistics.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
penalty = ['l1', 'l2']
param_grid = dict(C=c_values, penalty=penalty)
# The grid includes the 'l1' penalty, which the default 'lbfgs' solver
# (scikit-learn >= 0.22) does not support. 'liblinear' handles both 'l1' and
# 'l2' and was the default solver in earlier releases.
model = LogisticRegression(solver='liblinear')
# KFold does not shuffle by default, and scikit-learn >= 0.24 raises a
# ValueError when random_state is given together with shuffle=False. Older
# releases ignored the seed in that case, so leaving it out keeps the folds.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: {} using {}".format(grid_result.best_score_, grid_result.best_params_))

After tuning, Logistic regression has the highest accuracy score (98.24%), with an L2 penalty and a C value of 0.1.

# Validate the model on the validation data

## Prepare the model

In [None]:
# Fit the final model on the standardized training set using the tuned
# settings (C=0.1, L2 penalty).
scaler = StandardScaler().fit(X_train)
s_X = scaler.transform(X_train)
# Pin the solver explicitly: 'liblinear' was the default when the grid search
# result quoted in this notebook was produced, whereas scikit-learn >= 0.22
# would otherwise silently use 'lbfgs' here.
model = LogisticRegression(C=0.1, penalty='l2', solver='liblinear')
model.fit(s_X, Y_train)

## Estimate Accuracy 

In [None]:
 s_X_validation = scaler.transform(X_validation)
predictions = model.predict(s_X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))