In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing our data

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory and preview the first rows.
DATA_CSV_PATH = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df_raw = pd.read_csv(DATA_CSV_PATH)
df_raw.head()

## Let's take a quick look at our data

In [None]:
# Split the rows by diagnosis label so each class's distribution can be drawn separately.
df_malignant = df_raw.loc[df_raw['diagnosis'] == 'M']
df_benign = df_raw.loc[df_raw['diagnosis'] == 'B']

# The identifier, the target and the 'Unnamed: 32' column are not plotted.
non_feature_columns = ['id', 'diagnosis', 'Unnamed: 32']

for column in df_raw.columns:
    if column in non_feature_columns:
        continue

    fig, ax = plt.subplots(figsize = (5, 5))

    # sns.distplot has been deprecated since seaborn 0.11 and is scheduled for removal.
    # sns.kdeplot draws the same density-only curve that distplot(hist = False) produced.
    sns.kdeplot(df_malignant[column], color = 'red', label = 'Malignant', ax = ax)
    sns.kdeplot(df_benign[column], color = 'blue', label = 'Benign', ax = ax)

    ax.set_title(column + ' Benign (blue) vs. Malignant (red)')
    ax.set_xlabel(column)
    ax.legend()
    plt.show()

    # One figure is created per feature; close it so open figures do not accumulate.
    plt.close(fig)

## Nothing seems out of the ordinary, so let's proceed with processing our data. We will try to standardize our data, perform our train/test split, then develop a Logistic Regression Model.

## Now let's process our dataframe. To get our data ready, we will do the following:
- Check for missing data (and replace it if there is any)
- Remove unnecessary columns (id, Unnamed: 32)
- Relabel our target column to 0's and 1's (for prediction)
- Standardize our numerical columns (the math behind ML algorithms works better when our numerical values are standardized). For each column, the data will be centered around 0 with a standard deviation of 1, so roughly 68% of the data will fall between -1 and 1 if the column is approximately normally distributed

In [None]:
#Dropping unnecessary columns
# drop() without inplace returns a new frame, so df_raw stays untouched and this cell
# can be re-run safely.
df_processed = df_raw.drop(columns = ['id', 'Unnamed: 32'])

#Renaming Malignant to 1, and Benign to 0
# The column is re-assigned explicitly: `df.col.replace(..., inplace = True)` is a chained
# in-place update, which pandas warns about and which does not modify the frame under
# Copy-on-Write.
diagnosis_codes = {'M': 1, 'B': 0}
unexpected_labels = set(df_processed['diagnosis'].unique()) - set(diagnosis_codes)
# .map() turns any label missing from the mapping into NaN, so fail loudly instead.
assert not unexpected_labels, f'Unexpected diagnosis labels: {unexpected_labels}'
df_processed['diagnosis'] = df_processed['diagnosis'].map(diagnosis_codes)

#Looking at how many NaN's we have (we have none)
print('Number of NaNs (we have none)')
print(df_processed.isnull().sum())

#Standard Scaling our Data
# Every column except the target is treated as an input feature.
input_features = [column for column in df_processed.columns if column != 'diagnosis']

# NOTE(review): the scaler is fit on the full dataset, i.e. before any train/test split,
# so statistics of rows that later become test data influence the scaling.
df_processed[input_features] = StandardScaler().fit_transform(df_processed[input_features])

df_processed.head()

## Performing our train/test split

In [None]:
# Feature matrix and target vector as plain NumPy arrays.
x = df_processed[input_features].values
y = df_processed['diagnosis'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 36)

# The features in df_processed were standardized with a scaler fit on every row (train and
# test alike), which leaks test-set statistics into training. Standardization is an affine
# map, so re-fitting a scaler on x_train alone yields the same values as scaling the raw
# features with train-only statistics. The test rows are then transformed with those
# training statistics.
train_scaler = StandardScaler().fit(x_train)
x_train = train_scaler.transform(x_train)
x_test = train_scaler.transform(x_test)

## Prediction using a simple Logistic Regressor

### Logistic Regression is similar to Linear Regression, except that instead of a line being fit to our data, a probability curve is fit instead. This curve is determined the same way that a Linear Regression line is determined - through minimizing a specific function. The specifics of the curve are also determined by the hyperparameters of the model (of which there are many). As an example, if the input data for a patient corresponds to a greater than 50% chance of having a malignant tumor, then our model would determine that the tumor of that patient is malignant. Similarly, if the input variables correspond to a less than 50% chance of having a malignant tumor, then our model would determine that the tumor is benign.

### There are a lot of hyperparameters we can adjust for our Logistic Regressor. We could do a deep dive into each one of the parameters and the math behind it, but in some cases (like this one) it's easier to use a brute force method. So, we will split our training data using 3-fold cross-validation, and we will use GridSearchCV to try every combination of parameters available to find the one with the highest accuracy.

In [None]:
#Creating our model
# A fixed random_state makes the solvers that shuffle the data (liblinear, sag, saga)
# repeatable from run to run.
model = LogisticRegression(random_state = 36)

#The parameters to loop through
# A single Cartesian grid over penalty x solver contains many pairs scikit-learn does not
# support (e.g. lbfgs with l1, or elasticnet without l1_ratio). Those fits fail and only
# add NaN scores and warnings. Each dictionary below is a sub-grid of valid pairs only.
# The unpenalized option is not searched: its spelling changed between scikit-learn
# releases ('none' was replaced by None), and the largest C values in the range already
# apply almost no regularization.
regularization_strengths = np.logspace(-4, 4, 20)
iteration_limits = [100, 1000, 2500, 5000]

param_grid = [
    # L2 regularization is supported by every solver.
    {'penalty' : ['l2'],
     'C' : regularization_strengths,
     'solver' : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
     'max_iter' : iteration_limits
    },
    # L1 regularization is only supported by liblinear and saga.
    {'penalty' : ['l1'],
     'C' : regularization_strengths,
     'solver' : ['liblinear', 'saga'],
     'max_iter' : iteration_limits
    },
    # Elastic-net is only supported by saga and requires an explicit l1_ratio.
    {'penalty' : ['elasticnet'],
     'l1_ratio' : [0.25, 0.5, 0.75],
     'C' : regularization_strengths,
     'solver' : ['saga'],
     'max_iter' : iteration_limits
    }
]

#Creating our GridSearchCV
clf = GridSearchCV(model, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

#Finding our best model
# GridSearchCV.fit returns the fitted search object itself, so best_model and clf refer to
# the same object.
best_model = clf.fit(x_train, y_train)
print('Best Model Parameters: ')
print(best_model.best_estimator_)

In [None]:
# Hyperparameters for the final classifier. C matches np.logspace(-4, 4, 20)[7], one of the
# values in the grid-search range - presumably copied by hand from an earlier search run
# (TODO confirm it still matches the current best parameters).
final_params = {'C': 0.08858667904100823, 'solver': 'liblinear'}

# Fit on the training split (fit() returns the estimator itself), then predict the
# held-out test split.
final_model = LogisticRegression(**final_params).fit(x_train, y_train)
y_pred = final_model.predict(x_test)

## Let's look at how our model performed

In [None]:
#Calculating our precision, recall, and fscore
# The call returns a 4-tuple (precision, recall, fscore, support); the scores are computed
# for the positive class because average = 'binary'. Support is not used here.
scores = precision_recall_fscore_support(y_test, y_pred, average = 'binary')
precision, recall, f1, _support = scores

# score() on a classifier reports mean accuracy on the given data.
print('Model Accuracy: ', final_model.score(x_test, y_test))
print('Model Precision: ', precision)
print('Model Recall: ', recall)
print('Model F1-score: ', f1)

# The formulas are parenthesized explicitly: 'TP / TP + FP' read literally is 1 + FP.
print('-----------------------------------------------')
print('Model Accuracy is the % of correct predictions')
print('Model Precision is TP / (TP + FP), or the likelihood that our next positive prediction will be accurate')
print('Model Recall is TP / (TP + FN), or the % of actual positives that we correctly predicted')
print('Model F1-score is a useful metric that takes Precision and Recall into account')

In [None]:
print('A Confusion Matrix is also a good way to look at how a model is performing. The boxes in the diagonal from the top left to the bottom right are for the correct predictions. All other boxes are incorrect predictions.')

# The target was encoded as 0 = Benign, 1 = Malignant during preprocessing.
class_names = ['Benign', 'Malignant']

# labels = [0, 1] pins the matrix to 2x2 in a fixed class order, even if one class is
# missing from the test labels or the predictions.
cm = confusion_matrix(y_test, y_pred, labels = [0, 1])

fig, ax = plt.subplots(figsize=(9,9))
# fmt = 'd' prints the integer counts as-is; the default '.2g' format would show counts of
# 100 or more in scientific notation.
sns.heatmap(cm, annot=True, fmt = 'd', linewidths = .5, square = True, cmap = 'Blues_r',
            xticklabels = class_names, yticklabels = class_names, ax = ax);
ax.set_ylabel('Actual');
ax.set_xlabel('Predicted');
ax.set_title('Confusion Matrix');

## It looks like our model is performing well. It has an accuracy, precision, recall, and F1-score of 0.99, 1.0, 0.97, and 0.98, respectively. This is also visualized in our confusion matrix.

# Summary

## Through the course of this problem, we have done the following:
## - Performed an exploratory analysis of our data
## - Performed data pre-processing (data standardization, etc.)
## - Split our data into a training and testing dataset
## - Used GridSearchCV with 3-fold cross-validation on our training dataset to find the best hyperparameters
## - Developed a Logistic Regression model with the best hyperparameters
## - Evaluated the results of the model on our test dataset