In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings  
warnings.filterwarnings('ignore')

<div class="alert alert-block alert-info"><strong>Content</strong></div>


<div class="list-group">
    <a class="list-group-item list-group-item-action" href="#ds">The Dataset</a>
    <a class="list-group-item list-group-item-action" href="#eda">Exploratory Data Analysis</a>
    <a class="list-group-item list-group-item-action" href="#dpp">Data Pre-Processing</a>
    <a class="list-group-item list-group-item-action" href="#mtt">Model Training & Testing</a>
</div>

<div class="alert alert-block alert-success" id='ds'><strong>The Dataset</strong></div>

The Breast Cancer dataset is available from the UCI Machine Learning Repository, maintained by the University of California, Irvine. The dataset contains 569 samples of malignant and benign tumor cells.

The first two columns in the dataset contain unique ID and the corresponding diagnosis (M=malignant, B=benign), respectively. The columns 3-32 contain 30 features that have been computed from digitized images of the cell nuclei.

* M = Malignant (Cancerous) - Present (M)
* B = Benign (Not Cancerous) - Absent (B)

In [None]:
data = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

<div class="alert alert-block alert-success" id='eda'><strong>Exploratory Data Analysis </strong></div>

In [None]:
data.head()

The dataset contains 2 categorical variables (id, diagnosis) and 30 numerical variables.

**Numerical Data:** *Data have meaning as a measurement, such as a person’s height, weight, IQ, or blood pressure; or they’re a count, such as the number of stock shares a person owns, how many teeth a dog has, or how many pages you can read of your favorite book before you fall asleep.*

**Categorical Data:** *Data represent characteristics such as a person’s gender, marital status, hometown, or the types of movies they like. Categorical data can take on numerical values (such as “1” indicating male and “2” indicating female), but those numbers don’t have mathematical meaning.*

Reference: https://www.dummies.com/education/math/statistics/types-of-statistical-data-numerical-categorical-and-ordinal/

In [None]:
data.info()

The dataset contains 33 columns and 569 rows. There is a mysterious column "Unnamed: 32", probably an error in the dataset, so let's drop that unnecessary column. The `id` column is dropped as well, since it is only a row identifier.

In [None]:
# Remove the two columns that are not used as features: `id` (row identifier)
# and `Unnamed: 32` (the stray column reported by data.info()).
# errors='ignore' keeps this cell re-runnable: the original pair of in-place
# drops raised KeyError on a second execution because the columns were gone.
data.drop(columns=['id', 'Unnamed: 32'], inplace=True, errors='ignore')

In [None]:
data.isnull().sum()

There are no missing values in the dataset. Let's study the target variable.

In [None]:
data.groupby(['diagnosis'])['diagnosis'].count()

In [None]:
# Class balance of the target, shown as a count plot (left) and a pie chart (right).
diagnosisCounts = data.groupby('diagnosis')['diagnosis'].count()

fig, axes = plt.subplots(ncols=2, figsize=(12, 6))

sns.countplot(x='diagnosis', data=data, ax=axes[0])
# Draw the pie on axes[1] explicitly instead of relying on pyplot's implicit
# "current axes", and take the labels from the counts themselves so they can
# never be attached to the wrong slice.
axes[1].pie(diagnosisCounts, labels=diagnosisCounts.index, autopct='%1.1f%%')

fig.suptitle('Class Distribution', fontsize=12)
axes[0].set_xlabel('Diagnosis')
axes[0].set_ylabel('Count')

axes[1].set_xlabel('')
axes[1].set_ylabel('')

plt.show()

In [None]:
data.describe()

In [None]:
def plotDistribution(columns):
    """Plot per-class distributions of the given feature columns.

    One figure row is drawn per column: the left panel shows the rows of the
    global ``data`` frame whose ``diagnosis`` is ``'M'``, the right panel the
    rows whose ``diagnosis`` is ``'B'``.

    Parameters
    ----------
    columns : sequence of str
        Names of columns in the global ``data`` frame to plot.

    Notes
    -----
    The function filters on the string labels ``'M'`` / ``'B'``, so it must be
    called before ``diagnosis`` is re-encoded as integers.
    """
    # Nothing to draw; plt.subplots would reject nrows=0.
    if len(columns) == 0:
        return

    # squeeze=False keeps `axes` two-dimensional even for a single column, so
    # the axes[i][j] indexing below is always valid.
    fig, axes = plt.subplots(ncols=2, nrows=len(columns), figsize=(20, 30), squeeze=False)
    fig.tight_layout(pad=4.0)

    for i, column in enumerate(columns):
        # NOTE(review): sns.distplot is flagged as deprecated in recent seaborn
        # releases; kept here so the figures match the installed environment.
        sns.distplot(data.loc[data.diagnosis == 'M', column], label='Malignant', ax=axes[i][0])
        sns.distplot(data.loc[data.diagnosis == 'B', column], label='Benign', ax=axes[i][1])

        for ax in axes[i]:
            ax.tick_params(axis='x', labelsize=12)
            ax.tick_params(axis='y', labelsize=12)

    # Column headers only belong on the first row, so set them once after the loop.
    axes[0][0].set_title('Malignant', fontsize=13)
    axes[0][1].set_title('Benign', fontsize=13)


In [None]:
plotDistribution(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean'])

In [None]:
plotDistribution(['radius_se', 'texture_se', 'perimeter_se', 'area_se',
       'smoothness_se', 'compactness_se', 'concavity_se',
       'concave points_se', 'symmetry_se', 'fractal_dimension_se'])

In [None]:
plotDistribution(['radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst',
       'fractal_dimension_worst'])

According to the distribution plots, most of the variables are not normally distributed. However, Logistic Regression doesn't require normally distributed IVs. On the other hand, most of the variables contain outliers.

**What are outliers in the data?**

> An outlier is an observation that lies an abnormal distance from other values in a random sample from a population. https://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm


Let's draw box plots to further clarify this.

In [None]:
def plotBoxplot(columns, data):
    """Draw one box plot per column on a three-column grid.

    Parameters
    ----------
    columns : sequence of str
        Names of the columns of ``data`` to plot.
    data : pandas.DataFrame
        Frame that holds the columns.

    Notes
    -----
    The grid grows with ``len(columns)`` (the original fixed 4x3 grid raised
    ``IndexError`` for more than 12 columns), and panels left over in the last
    row are hidden instead of being shown as empty frames.
    """
    # Nothing to draw; plt.subplots would reject nrows=0.
    if len(columns) == 0:
        return

    nCols = 3
    nRows = -(-len(columns) // nCols)  # ceiling division
    colors = ['#bad9e9', '#7ab6d6', '#3c8abd']

    # 5 inches per row reproduces the original 20x20 figure for four rows.
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(ncols=nCols, nrows=nRows, figsize=(20, 5 * nRows), squeeze=False)
    fig.tight_layout(pad=4.0)

    for i, column in enumerate(columns):
        row, col = divmod(i, nCols)
        # One colour per grid column, as in the original layout.
        sns.boxplot(y=column, data=data, ax=axes[row][col], color=colors[col])

    # Hide the panels that did not receive a plot.
    for ax in axes.ravel()[len(columns):]:
        ax.set_visible(False)

In [None]:
plotBoxplot(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean'], data)

In [None]:
plotBoxplot(['radius_se', 'texture_se', 'perimeter_se', 'area_se',
       'smoothness_se', 'compactness_se', 'concavity_se',
       'concave points_se', 'symmetry_se', 'fractal_dimension_se'], data)

In [None]:
plotBoxplot(['radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst',
       'fractal_dimension_worst'], data)

> Outliers should be investigated carefully. Often they contain valuable information about the process under investigation or the data gathering and recording process. Before considering the possible elimination of these points from the data, one should try to understand why they appeared and whether it is likely similar values will continue to appear. Of course, outliers are often bad data points. https://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm

Logistic regression assumes that there is no severe multicollinearity among the explanatory variables. Multicollinearity occurs when two or more explanatory variables are highly correlated to each other, such that they do not provide unique or independent information in the regression model. If the degree of correlation is high enough between variables, it can cause problems when fitting and interpreting the model. Let's check correlation between features.

In [None]:
corr = data.corr()

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(corr, cbar=True, square= True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='GnBu')
plt.show()

<div class="alert alert-block alert-success" id='dpp'><strong>Data Pre-Processing</strong></div>

According to the correlation plot, there are some highly correlated features such as radius_mean, area_worst, perimeter_worst. The below method select highly correlated features and remove them from the dataset: https://www.dezyre.com/recipes/drop-out-highly-correlated-features-in-python

As you see in correlation plot, there are some correlated variables in the dataset.

1. radius_mean, perimeter_mean and area_mean are correlated -> i will use area_mean
2. compactness_mean, concavity_mean and concave points_mean are correlated -> will use concavity_mean
2. texture_mean and texture_worst are correlated -> i will use texture_mean
3. radius_se, perimeter_se and area_se are correlated -> i will use area_se
4. radius_worst, perimeter_worst and area_worst are correlated -> i will use area_worst
5. compactness_se, concavity_se and concave points_se are correlated -> i will use concavity_se
6. compactness_worst, concavity_worst and concave points_worst are correlated -> i will use concavity_worst
7. area_worst and area_mean are correlated -> i will use area_mean
8. concavity_mean and concavity_worst are correlated -> i will use concavity_worst

In [None]:
# Encode the target as 1 = malignant ('M'), 0 = benign.
# Matching both 'M' and 1 makes the cell idempotent: with the original
# `== 'M'` test, running the cell a second time compared the already-encoded
# integers against 'M' and silently turned every label into 0.
data['diagnosis'] = data['diagnosis'].isin(['M', 1]).astype(int)

In [None]:
dataProcessed = data.drop(['diagnosis'], axis=1)

In [None]:
# Features removed because the correlation analysis lists them as redundant
# with a feature that is kept.
dropList = [
    'radius_mean',
    'perimeter_mean',
    'compactness_mean',
    'concave points_mean',
    'radius_worst',
    'perimeter_worst',
    'texture_worst',
    'perimeter_se',
    'radius_se',
    'compactness_se',
    'concave points_se',
    'compactness_worst',
    'concave points_worst',
    'area_worst',
    'concavity_mean',
]
dataProcessed = dataProcessed.drop(columns=dropList)

In [None]:
dataProcessed.info()

In [None]:
corr = dataProcessed.corr()

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(corr, cbar=True, square= True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='GnBu')
plt.show()

After dropping correlated features, we were able to reduce the number of features to 15. As discussed earlier, there are some outliers in the dataset and Logistic Regression is sensitive to outliers. Therefore, let's treat the outliers. There are many different ways to treat outliers; since this dataset is small, I will use the data imputation technique. Here is another great article about outliers: https://www.linkedin.com/pulse/techniques-outlier-detection-treatment-suhas-jk

In [None]:
dataProcessed.head()

In [None]:
def outlierLimit(column):
    """Return the IQR-based outlier fences of ``column``.

    The fences sit 1.5 interquartile ranges above the 75th percentile and
    below the 25th percentile; NaN entries are ignored when the percentiles
    are computed.

    Parameters
    ----------
    column : array-like of numbers
        Values to compute the fences for.

    Returns
    -------
    tuple
        ``(upper_limit, lower_limit)`` in that order.
    """
    lowerQuartile, upperQuartile = np.nanpercentile(column, [25, 75])
    reach = 1.5 * (upperQuartile - lowerQuartile)
    return upperQuartile + reach, lowerQuartile - reach

In [None]:
# Replace every value outside its column's IQR fences with NaN so that it can
# be imputed in a later step. Object-typed columns are left untouched.
for column in dataProcessed.columns:
    values = dataProcessed[column]
    if values.dtype == 'object':
        continue

    upLimit, loLimit = outlierLimit(values)
    isOutlier = (values > upLimit) | (values < loLimit)
    dataProcessed[column] = np.where(isOutlier, np.nan, values)

![](https://www.whatissixsigma.net/wp-content/uploads/2015/07/Box-Plot-Diagram-to-identify-Outliers-figure-1.png)

Using the above function, we select the outliers that lie above the upper limit or below the lower limit, and replace each of those values with a missing value. For more info read this https://www.whatissixsigma.net/box-plot-diagram-to-identify-outliers/

In [None]:
dataProcessed.isnull().sum()

Now you can see there are missing values in our dataset. Like I told you earlier, now we can treat the missing values using k-Nearest Neighbors

In [None]:
# NOTE(review): the experimental flag below is presumably only needed for
# IterativeImputer, not KNNImputer - confirm before removing it.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer

# Fill the NaNs left by the outlier step from the 4 nearest rows.
imputer = KNNImputer(n_neighbors=4)
imputedValues = imputer.fit_transform(dataProcessed)

# Write the values back in place so column names and index are kept.
dataProcessed.iloc[:, :] = imputedValues

In [None]:
dataProcessed.isnull().sum()

In [None]:
dataProcessed.head()

Even though magnitudes are not the same, Logistic regression is not sensitive to the magnitude of features. Therefore, we do not require data normalization or standardization.

* https://digitalcommons.georgiasouthern.edu/information-tech-facpubs/111/
* https://builtin.com/data-science/when-and-why-standardize-your-data
* https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/

<div class="alert alert-block alert-success" id='mtt'><strong>Model Training & Testing</strong></div>

In [None]:
Y = data['diagnosis']
X = dataProcessed

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, test_size=0.3, random_state=50)

### Logistic Regression

![](https://media.geeksforgeeks.org/wp-content/uploads/20200522215734/download35.jpg)

**Advantages of Logistic Regression**

1. Doesn't require normally distributed IVs
2. Supports IVs in any measurement scale (numerical, categorical)
3. DV doesn't need linear relationship with IVs 

**Assumptions of Logistic Regression**

1. Sample Size - Small samples with large number of predictors will reduce power
2. No Multicollinearity - IVs that are correlated with other IVs
3. No Extreme Outliers - Estimates of the logistic regression are sensitive to the unusual observations

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score 
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=50)
lr.fit(x_train, y_train)

In [None]:
yPredict = lr.predict(x_test)

In [None]:
print('Accuracy: {}'.format(accuracy_score(y_test, yPredict)))
print('Recall: {}'.format(recall_score(y_test, yPredict)))

Recall is 94% and it is a pretty good result without any optimizations. However, according to the scope of this task, we need to identify cancerous samples as much as possible(true positive). We can check the number of identified patients using a confusion matrix. Let's understand confusion matrix:

* TP(True Positive) - correctly predicted, samples contain cancerous cells
* TN(True Negative) - correctly predicted, samples do not contain cancerous cells
* FP(False Positive) - incorrectly predicted, samples do not have cancerous cells (0 -> 1)
* FN(False Negative) - incorrectly predicted, samples have cancerous cells (1 -> 0)

In [None]:
def drawConfusionMatrix(confusion):
    """Render a 2x2 confusion matrix as an annotated heatmap.

    Each cell is annotated with its name and its count. The names are applied
    in row-major order as TN, FP, FN, TP.

    Parameters
    ----------
    confusion : numpy.ndarray
        2x2 array of counts.
        NOTE(review): assumes the layout returned by sklearn's
        ``confusion_matrix`` for labels 0/1 - confirm against the caller.
    """
    cellNames = ('TN', 'FP', 'FN', 'TP')
    annotations = [
        f'{cellName}\n{cellCount:0.0f}'
        for cellName, cellCount in zip(cellNames, confusion.flatten())
    ]
    annotationGrid = np.asarray(annotations).reshape(2, 2)

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(confusion, annot=annotationGrid, cmap='Blues', cbar=False, fmt='')

In [None]:
drawConfusionMatrix(confusion_matrix(y_test, yPredict))

According to the result, model has been able to correctly classify 56 samples and incorrectly classify 3 samples as normal. To improve this model we need to improve recall by reducing False Negative(FN). As you know, the default threshold is 0.5 and we can tune this hyperparameter. To do that let's plot the ROC curve first.

In [None]:
lr.predict_proba(x_test)[:,1]

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, lr.predict_proba(x_test)[:,1])

plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='Logistic')

plt.title('ROC Curve')
plt.xlabel('False Positive Rate ==>')
plt.ylabel('True Positive Rate ==>')

plt.legend()
plt.show()

In [None]:
roc_auc_score(y_test, lr.predict_proba(x_test)[:,1])

In [None]:
thresholds

In [None]:
# Recall on the test set for every candidate threshold returned by roc_curve.
# The positive-class probabilities do not depend on the threshold, so they are
# computed once instead of once per loop iteration.
# NOTE(review): the comparison is a strict `>`; verify that this matches the
# convention roc_curve uses for its thresholds before comparing with `tpr`.
positiveProba = lr.predict_proba(x_test)[:, 1]
recalls = [
    recall_score(y_test, np.where(positiveProba > th, 1, 0))
    for th in thresholds
]

recalls

In [None]:
# Pair every threshold with its recall and list the best recall first.
recallDf = pd.DataFrame({
    'threshold': pd.Series(thresholds),
    'recall': pd.Series(recalls),
})
recallDf = recallDf.sort_values(by='recall', ascending=False)

recallDf

In [None]:
# Bar chart of recall per threshold, in the order of recallDf's rows.
plt.figure(figsize=(12, 6))

sns.barplot(x='threshold', y='recall', data=recallDf)

# Title typo fixed ('Treshold' -> 'Threshold').
plt.title('Recall vs Threshold')
# The threshold tick labels are long floats; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.tight_layout()

Which threshold do we need to select? it depends on the trade-off that we want to make. If you are more concerned about high sensitivity pick the threshold that maximizes the true positive rate. In this case, I would like to have higher sensitivity and some level of false-positive rate. Therefore, I would like to select threshold => 0.284187.

In [None]:
# Re-classify the test set with the threshold selected from the recall table
# (reported as 0.284187 in the accompanying text).
# NOTE(review): thresholds[15] is a positional index into roc_curve's output;
# confirm it still maps to the intended value if the data, split or library
# version changes.
yPredictTh = np.where(lr.predict_proba(x_test)[:,1] > thresholds[15], 1, 0) 

In [None]:
drawConfusionMatrix(confusion_matrix(y_test, yPredictTh))

In [None]:
print('Accuracy: {}'.format(accuracy_score(y_test, yPredictTh)))
print('Recall: {}'.format(recall_score(y_test, yPredictTh)))

As you can see above, after adjusting the threshold value our prediction is better than the previous one, and recall has improved by 1.7%.

Hope you've enjoyed my work, if you like my work and need to share something with me leave a comment :)