In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Install a blanket "ignore" filter so no warning of any category is shown
# from this point on.
warnings.simplefilter('ignore')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw table; `data` is kept untouched because a later cell drops
# columns from it again.
data = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

# 'Unnamed: 32' is removed before any analysis.
# NOTE(review): presumably an empty artefact column produced by a trailing
# delimiter in the CSV — confirm.
df = data.drop(columns='Unnamed: 32')

# Only the last expression of a cell is displayed, so preview the cleaned frame.
df.head()

1) **ID number**

2) **Diagnosis** (M = malignant, B = benign)


- Ten real-valued features are computed for each cell nucleus:

1) **radius** (mean of distances from center to points on the perimeter)

2) **texture** (standard deviation of gray-scale values)

3) **perimeter**

4) **area**

5) **smoothness** (local variation in radius lengths)

6) **compactness** (perimeter^2 / area - 1.0)

7) **concavity** (severity of concave portions of the contour)

8) **concave points** (number of concave portions of the contour)

9) **symmetry**

10) **fractal dimension** ("coastline approximation" - 1)

- The mean, standard error and "worst" or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. For instance, field 3 is Mean Radius, field 13 is Radius SE, field 23 is Worst Radius.

- All feature values are recoded with four significant digits.

In [None]:
# Print the column dtypes and non-null counts of the cleaned frame.
df.info()

- We have only one 'Object' type variable that is the target variable, **diagnosis**

In [None]:
# Count NaN entries per column; the resulting Series is the cell's output.
print('The Number of missing values per feature is:')
missing_per_feature = df.isnull().sum()
missing_per_feature

In [None]:
# Separate out the target feature: the 'diagnosis' column, which the label
# encoding further down compares against 'M'.
target = df['diagnosis']

In [None]:
# Derive the class counts from the data so the printed summary cannot drift
# from the table displayed underneath it.
class_counts = target.value_counts()
print('We have a pretty balanced dataset with {} Benign cases and {} Malignant cases.'.format(
    class_counts.get('B', 0), class_counts.get('M', 0)))
class_counts

In [None]:
# Everything except the label and the row identifier is treated as a predictor.
non_predictors = ['diagnosis', 'id']
df_predictors = df.drop(columns=non_predictors)
df_predictors.head()

- **I will treat a numeric feature as discrete if it has fewer than 25 distinct values:**

In [None]:
# A predictor counts as discrete when it has fewer than 25 distinct values.
distinct_counts = df_predictors.nunique()
n_discrete = int((distinct_counts < 25).sum())
print('Number of discrete numerical features are {}'.format(n_discrete))

- **Therefore, all the predictors are continuous features.**

In [None]:
# Show the predictor column names (the label and id were dropped above).
df_predictors.columns

## Univariate Analysis

In [None]:
# One distribution panel per predictor on a 5x6 grid; the grid has 30 slots,
# so anything past the 30th column is not drawn.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
plt.figure(figsize=(20, 15))
for slot, column in enumerate(df_predictors.columns, start=1):
    if slot > 30:
        break
    plt.subplot(5, 6, slot)
    sns.distplot(
        df_predictors[column],
        hist_kws={'color': 'm', 'alpha': 1},
        kde_kws={'color': 'b'},
    )
    plt.xlabel(column)
plt.tight_layout()
plt.show()

- **We can observe that most predictors follow a near-normal distribution, but a few are right-skewed. Therefore, they need to be dealt with before model creation.**

## Bivariate Analysis

In [None]:
# Label plus measurements: drop the identifier and the 'Unnamed: 32' column.
dfmod = data.drop(columns=['id', 'Unnamed: 32'])
by_diagnosis = dfmod.groupby('diagnosis')

# One bar chart per predictor comparing the per-class median on a 5x6 grid.
plt.figure(figsize=(20, 15))
for slot, column in enumerate(df_predictors.columns, start=1):
    if slot > 30:
        break
    plt.subplot(5, 6, slot)
    by_diagnosis[column].median().plot.bar(color=['g', 'm'])
    plt.xlabel(column)
plt.tight_layout()
plt.show()

- **I have considered 0.85 as the threshold for a strong correlation between features.**

In [None]:
plt.figure(figsize=(20, 15))

# `dfmod` still carries the object-typed 'diagnosis' label. DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0 (older versions dropped them
# silently), so restrict to numeric columns explicitly; the result is the same
# matrix on every pandas version.
cm = dfmod.select_dtypes(include='number').corr()

# Hide the diagonal and the upper triangle with an explicit boolean mask
# (k=0 includes the diagonal) instead of relying on the truthiness of the
# correlation values themselves.
upper_triangle = np.triu(np.ones_like(cm, dtype=bool), k=0)

# Only cells whose absolute correlation exceeds 0.85 keep a value; the rest
# become NaN and are left blank.
sns.heatmap(cm[abs(cm) > 0.85], annot=True, square=True, linewidths=2, linecolor='black',
            cmap='rainbow', robust=True, mask=upper_triangle)

- **The above heatmap showcases the highly correlated predictors. Of these, we can eliminate a few for our model building.**

In [None]:
# Hand-picked subset of predictors, grouped by the suffix of the column name.
selected_features = [
    # *_mean columns
    'radius_mean',
    'texture_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
    # *_se columns
    'radius_se',
    'texture_se',
    'smoothness_se',
    'compactness_se',
    'concavity_se',
    'concave points_se',
    'symmetry_se',
    'fractal_dimension_se',
    # *_worst columns
    'smoothness_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]

In [None]:
# Number of predictors kept after the manual selection above.
len(selected_features)

In [None]:
# Snapshot the predictors before transforming them; the box-plot cell further
# down reads `data` for the untransformed values.
# NOTE: this rebinds `data`, which until now referred to the raw CSV frame.
data = df_predictors.copy()

# Log-transform each selected predictor in place. The logarithm is only
# defined for strictly positive inputs, so any column containing a zero or a
# negative value is left untouched (np.log would otherwise produce -inf / NaN,
# and warnings are suppressed in this notebook).
# NOTE: the transform mutates `df_predictors`, so running this cell twice
# applies the logarithm twice.
for feature in selected_features:
    column = df_predictors[feature]
    if (column <= 0).any():
        continue
    df_predictors[feature] = np.log(column)

In [None]:
# One stand-alone distribution figure per selected predictor, read from
# `df_predictors` (the frame the log transform was applied to).
for column in selected_features:
    values = df_predictors[column]
    sns.distplot(values)
    plt.title(column)
    plt.show()

In [None]:
# Box plot per selected predictor on a 5x6 grid. The values come from `data`,
# the copy of the predictors taken before the log transform was applied.
plt.figure(figsize=(20, 15))
for slot, column in enumerate(selected_features, start=1):
    if slot > 30:
        break
    plt.subplot(5, 6, slot)
    sns.boxplot(y=data[column], color='orange')
    plt.title(column)
plt.tight_layout()
plt.show()

In [None]:
# Re-bind the label column (same column as the earlier `target` assignment).
target = df.diagnosis

In [None]:
# Encode the label as integers: 1 where the diagnosis is 'M', 0 otherwise.
# The result is a fresh Series built from a plain array.
labels = target.copy()
tr = pd.Series(np.where(labels == 'M', 1, 0))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit a tree ensemble on the selected predictors to obtain feature importances.
# The seed is fixed like every other estimator in this notebook: without it
# the importances, and therefore the top-11 feature list derived from them
# below, change from run to run.
mod = ExtraTreesClassifier(random_state=0)
mod.fit(df_predictors[selected_features], tr)

In [None]:
# Label each importance with its feature name, then chart the 15 largest.
importances = mod.feature_importances_
feat_imp = pd.Series(importances, index=selected_features)
top_importances = feat_imp.nlargest(15)
top_importances.plot.bar()
plt.title('Feature Importance')
plt.show()

In [None]:
# Names of the 11 most important features, most important first.
ranked_importances = feat_imp.sort_values(ascending=False)
feature_list = ranked_importances.iloc[:11].index

In [None]:
# Rebuild the label and its 0/1 encoding (1 for 'M', 0 for anything else).
target = df['diagnosis']
is_malignant = target == 'M'
tr = pd.Series(np.where(is_malignant, 1, 0))

In [None]:
# Log-transform the chosen model features directly in `df`. The logarithm is
# only defined for strictly positive inputs, so any column containing a zero
# or a negative value is left untouched (np.log would otherwise produce
# -inf / NaN, and warnings are suppressed in this notebook).
# NOTE: this mutates `df` in place, so running the cell twice applies the
# logarithm twice.
for feature in feature_list:
    column = df[feature]
    if (column <= 0).any():
        continue
    df[feature] = np.log(column)

In [None]:
# Design matrix: only the columns named in `feature_list`, all rows.
X = df.loc[:, feature_list]
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the seed fixes which rows those are.
split_parts = train_test_split(X, tr, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score

## Logistic Regression

In [None]:
# 5-fold grid search over penalty, C and solver, optimising recall.
# NOTE(review): scikit-learn's lbfgs solver does not support the l1 penalty,
# so those grid points are expected to fail and be scored as NaN — confirm.
lr = LogisticRegression(random_state=0)
lr_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.arange(0.1, 10, 0.2),
    'solver': ['liblinear', 'lbfgs'],
}
clflr = GridSearchCV(lr, param_grid=lr_grid, cv=5, scoring='recall')
clflr.fit(X_train, y_train)

# best_score_ is the search's best cross-validated score on the training split.
print('''Recall score on training set is : {}
with parameters as: {}'''.format(clflr.best_score_, clflr.best_params_))

In [None]:
# Fit the final logistic regression and score it once on the held-out rows.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from the
# grid-search output — confirm they still match clflr.best_params_.
lrf = LogisticRegression(C=6.1000000000000005, penalty='l1', solver='liblinear', random_state=0)
lrf.fit(X_train, y_train)
y_predlr = lrf.predict(X_test)
recall_lr = recall_score(y_test, y_predlr)
print('Recall score on test data for Logistic regression  is: {}'.format(recall_lr))

## Decision Tree Classifier

In [None]:
# 5-fold grid search over the split criterion, optimising recall.
dt = DecisionTreeClassifier(random_state=0)
dt_grid = {'criterion': ['gini', 'entropy']}
clfdt = GridSearchCV(dt, param_grid=dt_grid, cv=5, scoring='recall')
clfdt.fit(X_train, y_train)

# best_score_ is the search's best cross-validated score on the training split.
print('''Recall score on training set is : {}
with parameters as: {}'''.format(clfdt.best_score_, clfdt.best_params_))

In [None]:
# Fit the final decision tree and score it once on the held-out rows.
# NOTE(review): the criterion is hard-coded, presumably copied from the
# grid-search output — confirm it still matches clfdt.best_params_.
dtf = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtf.fit(X_train, y_train)
y_preddt = dtf.predict(X_test)
recall_dt = recall_score(y_test, y_preddt)
print('Recall score on test data for Decision Tree Classifier is: {}'.format(recall_dt))

## Random Forest Classifier

In [None]:
# 5-fold grid search over forest size (100..160 step 10) and split criterion,
# optimising recall.
rf = RandomForestClassifier(random_state=0)
rf_grid = {
    'n_estimators': np.arange(100, 170, 10),
    'criterion': ['gini', 'entropy'],
}
clfrf = GridSearchCV(rf, param_grid=rf_grid, cv=5, scoring='recall')
clfrf.fit(X_train, y_train)

# best_score_ is the search's best cross-validated score on the training split.
print('''Recall score on training set is : {}
with parameters as: {}'''.format(clfrf.best_score_, clfrf.best_params_))

In [None]:
# Fit the final random forest and score it once on the held-out rows.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from the
# grid-search output — confirm they still match clfrf.best_params_.
rff = RandomForestClassifier(criterion='gini', n_estimators=120, random_state=0)
rff.fit(X_train, y_train)
y_predrf = rff.predict(X_test)
recall_rf = recall_score(y_test, y_predrf)
print('Recall score on test data for Random Forest Classifier is: {}'.format(recall_rf))

## SVM

In [None]:
# 5-fold grid search over C and the kernel, optimising recall.
svm = SVC(random_state=0)
svm_grid = {
    'C': np.arange(0.1, 10, 0.2),
    'kernel': ['rbf', 'linear', 'poly'],
}
clfsvm = GridSearchCV(svm, param_grid=svm_grid, cv=5, scoring='recall')
clfsvm.fit(X_train, y_train)

# best_score_ is the search's best cross-validated score on the training split.
print('''Recall score on training set is : {}
with parameters as: {}'''.format(clfsvm.best_score_, clfsvm.best_params_))

In [None]:
# Fit the final support-vector classifier and score it once on the held-out rows.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from the
# grid-search output — confirm they still match clfsvm.best_params_.
svmf = SVC(C=6.7, kernel='linear', random_state=0)
svmf.fit(X_train, y_train)
y_predsvm = svmf.predict(X_test)
recall_svm = recall_score(y_test, y_predsvm)
print('Recall score on test data for svm is: {}'.format(recall_svm))

## Adaboost

In [None]:
# 5-fold grid search over ensemble size (50..150 step 10) and learning rate,
# optimising recall.
ada = AdaBoostClassifier(random_state=0)
ada_grid = {
    'n_estimators': np.arange(50, 160, 10),
    'learning_rate': np.arange(0.1, 2, 0.1),
}
clfada = GridSearchCV(ada, param_grid=ada_grid, cv=5, scoring='recall')
clfada.fit(X_train, y_train)

# best_score_ is the search's best cross-validated score on the training split.
print('''Recall score on training set is : {}
with parameters as: {}'''.format(clfada.best_score_, clfada.best_params_))

In [None]:
# Fit the final AdaBoost model and score it once on the held-out rows.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from the
# grid-search output — confirm they still match clfada.best_params_.
adaf = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=0)
adaf.fit(X_train, y_train)
y_predada = adaf.predict(X_test)
recall_ada = recall_score(y_test, y_predada)
print('Recall score on test data for Adaboost is: {}'.format(recall_ada))

## Gradient Boosting

In [None]:
# 5-fold grid search over ensemble size (50..150 step 10) and learning rate,
# optimising recall.
gb = GradientBoostingClassifier(random_state=0)
gb_grid = {
    'n_estimators': np.arange(50, 160, 10),
    'learning_rate': np.arange(0.1, 1.9, 0.2),
}
clfgb = GridSearchCV(gb, param_grid=gb_grid, cv=5, scoring='recall')
clfgb.fit(X_train, y_train)

# best_score_ is the search's best cross-validated score on the training split.
print('''Recall score on training set is : {}
with parameters as: {}'''.format(clfgb.best_score_, clfgb.best_params_))

In [None]:
# Fit the final gradient-boosting model and score it once on the held-out rows.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from the
# grid-search output — confirm they still match clfgb.best_params_.
gbf = GradientBoostingClassifier(learning_rate=1.5, n_estimators=60, random_state=0)
gbf.fit(X_train, y_train)
y_predgb = gbf.predict(X_test)
recall_gb = recall_score(y_test, y_predgb)
print('Recall score on test data for Gradient boost is: {}'.format(recall_gb))

## XgBoost

In [None]:
# 5-fold grid search over ensemble size (50..150 step 10) and learning rate,
# optimising recall.
xgb = XGBClassifier(random_state=0)
xgb_grid = {
    'n_estimators': np.arange(50, 160, 10),
    'learning_rate': np.arange(0.1, 2.1, 0.1),
}
clfxgb = GridSearchCV(xgb, param_grid=xgb_grid, cv=5, scoring='recall')
# fit() returns the search object itself, so no re-assignment is needed.
clfxgb.fit(X_train, y_train)

# best_score_ is the search's best cross-validated score on the training split.
print('''Recall score on training set is : {}
with parameters as: {}'''.format(clfxgb.best_score_, clfxgb.best_params_))

In [None]:
# Fit the final XGBoost model and score it once on the held-out rows.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from the
# grid-search output — confirm they still match clfxgb.best_params_.
xgbf = XGBClassifier(learning_rate=1.7, n_estimators=50, random_state=0)
xgbf.fit(X_train, y_train)
y_predxgb = xgbf.predict(X_test)
recall_xgb = recall_score(y_test, y_predxgb)

# The message previously said "Gradient boost" (copy-paste from the cell
# above), which mislabelled this model's score.
print('Recall score on test data for XgBoost is: {}'.format(recall_xgb))

In [None]:
# Pair each display name with its test-set recall, then split the pairs into
# the two parallel lists used to build the comparison table.
named_recalls = [
    ('Logistic Regression', recall_lr),
    ('Decision Tree', recall_dt),
    ('Random Forest', recall_rf),
    ('SVM', recall_svm),
    ('AdaBoost', recall_ada),
    ('Gradient Boost', recall_gb),
    ('XgBoost', recall_xgb),
]
model = [label for label, _ in named_recalls]
scores = [value for _, value in named_recalls]

In [None]:
# Build the comparison table in one step. set_index() returns a new frame
# rather than modifying in place, so its result has to be kept; previously it
# was discarded and the table was never actually indexed by model name.
score = pd.DataFrame({'Model': model, 'Test Score': scores}).set_index('Model')

# Best-performing model first.
score.sort_values(by='Test Score', ascending=False)

In [None]:
# Draw one confusion matrix per fitted model on the held-out rows, each in its
# own figure, in the order: logistic regression, XGBoost, gradient boosting.
models_to_plot = [
    (lrf, 'Logistic Regression Model'),
    (xgbf, 'XgBoost Model'),
    (gbf, 'Gradient Boost Model'),
]
for fitted_model, heading in models_to_plot:
    plot_confusion_matrix(fitted_model, X_test, y_test, display_labels=['Benign', 'Malignant'])
    plt.title(heading)
    plt.show()

- **Here I have taken Recall as the evaluation metric, as we should minimize our False Negatives in this use case.**
- **This is because it would be fatal for the patients if our model started predicting Malignant as Benign.**