In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Understanding the question**

1. What is breast cancer?

A cancer that forms in the cells of the breasts.
Breast cancer can occur in women and rarely in men.
Symptoms of breast cancer include a lump in the breast, bloody discharge from the nipple and changes in the shape or texture of the nipple or breast.
Its treatment depends on the stage of cancer. It may consist of chemotherapy, radiation, hormone therapy and surgery.

In the dataset breast cancer is diagnosed as 'benign' or 'malignant'

'benign' meaning 'not harmful in effect'

'malignant' meaning 'cancerous' (the cells can invade nearby tissue and spread to other parts of the body)

In [None]:
!pip install thinkx
#a library provided by Allen B. Downey, for statistics

### **1. Exploratory Data Analysis**


In [None]:
#python libraries used for data wrangling and data visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import thinkstats2
import thinkplot

In [None]:
# Load the breast-cancer CSV from the Kaggle input directory into a DataFrame.
csv_path = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to check that the CSV parsed into the expected columns.
data.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
data.shape

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
data.info()

The column 'Unnamed: 32' (dtype float64) contains only null values.

In [None]:
# Remove the all-null 'Unnamed: 32' column.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Per-column count of missing values, to confirm that no nulls remain.
data.isna().sum()

In [None]:
# Remove the 'id' column: it is a record identifier, not a feature for the analysis.
# Rebinding with errors='ignore' keeps the cell idempotent when it is re-run.
data = data.drop(columns=['id'], errors='ignore')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.

data.describe()

### **2. Data Visualization**

In [None]:
# Bar chart of how many records fall into each diagnosis outcome.
plt.figure()
sns.countplot(data=data, x='diagnosis', palette='magma')

In [None]:
# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# 0/1 column through the {'M', 'B'} dictionary would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['diagnosis']):
    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})


In [None]:
# Pairwise correlations between all columns of the frame.
corr = data.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones(corr.shape))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(20, 20))
    ax = sns.heatmap(corr, mask=mask, annot=True, square=True, ax=ax)

In [None]:
# List the column names that remain in the frame.
data.columns

In [None]:


# Split the records by encoded diagnosis: 1 = malignant, 0 = benign.
is_malignant = data['diagnosis'].eq(1)
is_benign = data['diagnosis'].eq(0)

M_data = data[is_malignant]
B_data = data[is_benign]



In [None]:
# Summary statistics of area_mean for the malignant subset (diagnosis == 1).
M_data['area_mean'].describe()

In [None]:
# Summary statistics of area_mean for the benign subset (diagnosis == 0).
B_data['area_mean'].describe()

In [None]:
# Comparing the area distribution of benign and malignant cells
# (both the mean area and the "worst" area of each record).

sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(10, 6))

# Every curve gets an explicit label and its own colour so the legend can tell
# them apart; `fill=True` replaces the deprecated `shade=True` argument.
area_curves = [
    (M_data['area_mean'], 'Malignant: area_mean', 'r'),
    (M_data['area_worst'], 'Malignant: area_worst', 'darkred'),
    (B_data['area_mean'], 'Benign: area_mean', 'b'),
    (B_data['area_worst'], 'Benign: area_worst', 'navy'),
]
for values, label, color in area_curves:
    sns.kdeplot(values, fill=True, color=color, label=label, ax=ax)

ax.set(xlabel='Cell area', ylabel='Density',
       title='Area distribution by diagnosis')
ax.legend()
plt.show()

From this graph we can observe that cells diagnosed as 'benign' have a mean area spread between 143 and 992, with the largest cell having an area of 1210, while cells diagnosed as 'malignant' have a wider spread (between 361 and 2501), with the largest cell having an area of 4254.

Cancer cells are usually larger in area.

In [None]:
# Comparing the radius of a normal (benign) cell and a cancer (malignant) cell
# using the cumulative distribution function (CDF) of radius_mean.
radius_by_label = {
    'Normal Cell': B_data['radius_mean'],
    'Cancer Cell': M_data['radius_mean'],
}
cdf_benign, cdf_malignant = (
    thinkstats2.Cdf(values, label=label)
    for label, values in radius_by_label.items()
)

plt.figure(figsize=(10, 8))
thinkplot.Cdfs([cdf_benign, cdf_malignant])
thinkplot.Show(xlabel='Radius of Cell', ylabel='CDF')

The mean, standard error and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

In [None]:
# Base measurement names; every one appears in the data with the suffixes
# '_mean', '_se' and '_worst'.
_base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]

# Column groups used for the pair plots; each group carries the 'diagnosis'
# target so the plots can be coloured by class.
mean_col = ['diagnosis'] + [f'{name}_mean' for name in _base_features]

standarderror_col = [f'{name}_se' for name in _base_features] + ['diagnosis']

worst_col = [f'{name}_worst' for name in _base_features] + ['diagnosis']

In [None]:
# Pairwise scatter plots of the mean columns, coloured by diagnosis.
marker_style = {'s': 80, 'edgecolor': 'white', 'linewidth': 2.5}
sns.pairplot(
    data[mean_col],
    hue='diagnosis',
    kind='scatter',
    plot_kws=marker_style,
)
plt.show()

In [None]:
# Pairwise scatter plots of the standard-error columns, coloured by diagnosis.
marker_style = {'s': 80, 'edgecolor': 'white', 'linewidth': 2.5}
sns.pairplot(
    data[standarderror_col],
    hue='diagnosis',
    kind='scatter',
    plot_kws=marker_style,
)
plt.show()

In [None]:
# Pairwise scatter plots of the worst columns, coloured by diagnosis.
marker_style = {'s': 80, 'edgecolor': 'white', 'linewidth': 2.5}
sns.pairplot(
    data[worst_col],
    hue='diagnosis',
    kind='scatter',
    plot_kws=marker_style,
)
plt.show()

### **3. Preprocessing**

In [None]:
from sklearn.preprocessing import StandardScaler



In [None]:
# Target vector (encoded diagnosis) and feature matrix (every other column).
y = data['diagnosis']
X = data.drop(columns=['diagnosis'])

In [None]:
# Standardise every feature using statistics computed over all rows of X.
# NOTE(review): `scaled` is not referenced again in the cells visible here.
scaler = StandardScaler()
scaler.fit(X)
scaled = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Splitting data: hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                random_state=42)

# Standardise the features that the models are trained and evaluated on.
# The scaler is fitted on the training rows only, so no statistics of the
# held-out rows leak into training, and the same transform is applied to both
# splits. Wrapping the result in DataFrames keeps the row index and column names.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                       index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                      index=X_test.index, columns=X_test.columns)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

# Cross-validation scheme shared by every grid search below:
# stratified 10-fold, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

### **4. Classification Modeling**

**4.1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Define the model and the hyper-parameter candidates to search over.
# max_iter is raised from the default of 100: the iterative solvers in the grid
# (lbfgs, newton-cg) can stop before converging at the default limit and then
# report a ConvergenceWarning instead of a properly fitted model.
model = LogisticRegression(max_iter=10_000)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers, penalty=penalty, C=c_values)

In [None]:
# Exhaustive search over `grid`, scored by accuracy under the shared `cv`
# scheme and fitted on the training split. error_score=0 records a score of 0
# for any parameter combination whose fit raises.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Predict the held-out test rows with the refitted best estimator of the search.
y_pred = grid_search.best_estimator_.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Accuracy of the predictions on the held-out test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Summarize results: the best mean cross-validated accuracy found by the grid
# search and the parameter combination that achieved it.
print(f'Best: {grid_result.best_score_} using: {grid_result.best_params_}')

**4.2. Ridge Classifier**

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
# Define the model: a ridge classifier with default settings. This rebinds
# `model`, which previously held the logistic-regression estimator.
model = RidgeClassifier()

In [None]:
# Candidate regularisation strengths: 0.1, 0.2, ..., 1.0.
alpha = [step / 10 for step in range(1, 11)]

grid = {'alpha': alpha}

In [None]:
# Exhaustive search over `grid`, scored by accuracy under the shared `cv`
# scheme and fitted on the training split. error_score=0 records a score of 0
# for any parameter combination whose fit raises.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Summarize results: the best mean cross-validated accuracy found by the grid
# search and the parameter combination that achieved it.
print(f'Best: {grid_result.best_score_} using: {grid_result.best_params_}')

**4.3. K-Nearest Neighbors (KNN)**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# K-nearest-neighbours classifier with default settings; n_neighbors, weights
# and metric are tuned by the grid search that follows.
model = KNeighborsClassifier()

In [None]:
# Hyper-parameter candidates for KNN.
# Odd neighbour counts from 1 to 19.
n_neighbors = np.arange(1, 21, 2)
weights = ['uniform', 'distance']
# NOTE(review): with the classifier's default p=2, 'minkowski' is presumably
# the same distance as 'euclidean' -- confirm whether both are needed.
metric = ['euclidean', 'manhattan', 'minkowski']

# Parameter grid handed to the grid search.
grid = {
    'n_neighbors': n_neighbors,
    'weights': weights,
    'metric': metric,
}

In [None]:
# Exhaustive search over `grid`, scored by accuracy under the shared `cv`
# scheme and fitted on the training split. error_score=0 records a score of 0
# for any parameter combination whose fit raises.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Summarize results: the best mean cross-validated accuracy found by the grid
# search and the parameter combination that achieved it.

print(f'Best: {grid_result.best_score_} using: {grid_result.best_params_}')

**4.4. Support Vector Machine**

In [None]:
from sklearn.svm import SVC

In [None]:
# Hyper-parameter candidates for the support vector classifier.
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

# The estimator itself, created with default settings.
model = SVC()

In [None]:
# Parameter grid for the grid search, built from the candidate lists above.
grid = {
    'kernel': kernel,
    'C': C,
    'gamma': gamma,
}

In [None]:
# Exhaustive search over `grid`, scored by accuracy under the shared `cv`
# scheme and fitted on the training split. error_score=0 records a score of 0
# for any parameter combination whose fit raises.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Summarize results: the best mean cross-validated accuracy found by the grid
# search and the parameter combination that achieved it.

print(f'Best: {grid_result.best_score_} using: {grid_result.best_params_}')

**4.5. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Define the model and the hyper-parameter candidates to search over.
# A fixed random_state makes the forest deterministic, so the grid-search
# scores and the selected parameters are reproducible across runs.
model = RandomForestClassifier(random_state=42)
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

In [None]:
# Parameter grid for the random forest.
grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
}

# Exhaustive search over `grid`, scored by accuracy under the shared `cv`
# scheme and fitted on the training split. error_score=0 records a score of 0
# for any parameter combination whose fit raises.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Summarize results: the best mean cross-validated accuracy found by the grid
# search and the parameter combination that achieved it.

print(f'Best: {grid_result.best_score_} using: {grid_result.best_params_}')