# Breast cancer classification

Example taken from https://www.learndatasci.com/glossary/binary-classification/

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## Data

Install sklearn if necessary

In [None]:
# pip install scikit-learn

In [None]:
import sklearn.datasets
# as_frame=True requests pandas objects, so the cells below can call .head() and .value_counts().
dataset = sklearn.datasets.load_breast_cancer(as_frame=True)

In [None]:
# Preview the first rows of the feature table.
dataset['data'].head()

In [None]:
# Class balance of the labels. NOTE: scikit-learn encodes this dataset as
# 0 = malignant, 1 = benign (check dataset['target_names'] to confirm).
dataset['target'].value_counts()

In [None]:
# Separate the explanatory variables (X) from the label to predict (y).
X, y = dataset['data'], dataset['target']
y

## Train and test sets

In [None]:
import sklearn.model_selection

# Hold out half of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [None]:
# Training half of the features (not yet standardized at this point).
X_train

In [None]:
# Test half of the features (not yet standardized at this point).
X_test

Standardize the explanatory variables: $z = (x-\bar{x})/s$

In [None]:
import sklearn.preprocessing

# Learn each column's mean and standard deviation from the training rows,
# then standardize those same rows with the learned statistics.
ss_train = sklearn.preprocessing.StandardScaler()
X_train = ss_train.fit(X_train).transform(X_train)

# First standardized training row, as a quick sanity check.
X_train[0]

In [None]:
# Standardize the test set with the mean and standard deviation learned from
# the TRAINING set. Fitting a second scaler on the test rows would put the two
# sets on different scales and leak test-set statistics into the evaluation.
ss_test = ss_train  # name kept for compatibility; it is the training-fitted scaler
X_test = ss_train.transform(X_test)

# First standardized test row, as a quick sanity check.
X_test[0]

## Logistic Regression

In [None]:
import sklearn.linear_model
# Logistic regression with the library's default settings, fitted on the standardized training data.
model = sklearn.linear_model.LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Raise the row limit first so the comparison table below is shown in full.
pd.set_option('display.max_rows', 500)

# Predicted class for every test patient, side by side with the true label.
predictions = model.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Target": y_test})

## Confusion matrix

In [None]:
import sklearn.metrics

# scikit-learn encodes this dataset as 0 = malignant, 1 = benign. Passing
# labels=[1, 0] orders the classes as (benign, malignant), so that after the
# transpose the rows are the predicted class (Negative = predicted benign,
# Positive = predicted malignant) and the columns are the true class.
# With the default label order (0, 1) the "Benign"/"Malign" and
# "Negative"/"Positive" headings would each be attached to the opposite class.
cm_array = sklearn.metrics.confusion_matrix(y_test, predictions, labels=[1, 0])
confusion_matrix = pd.DataFrame(
    cm_array.T,
    index=("Negative", "Positive"),
    columns=("Benign", "Malign"),
)
confusion_matrix

In [None]:
# Marginal totals of the confusion matrix.
column_totals = confusion_matrix.sum(axis=0)  # one total per column ("Benign", "Malign")
row_totals = confusion_matrix.sum(axis=1)     # one total per row ("Negative", "Positive")

test_patients = column_totals.sum()
benigns, maligns = column_totals["Benign"], column_totals["Malign"]
negatives, positives = row_totals["Negative"], row_totals["Positive"]

print(f"Total test patients: {test_patients}")
print(f"Benigns: {benigns}")
print(f"Maligns: {maligns}")
print(f"Negatives: {negatives}")
print(f"Positives: {positives}")

In [None]:
# Read the four cells of the confusion matrix by (row label, column label).
true_negatives = confusion_matrix.at["Negative", "Benign"]
false_negatives = confusion_matrix.at["Negative", "Malign"]
false_positives = confusion_matrix.at["Positive", "Benign"]
true_positives = confusion_matrix.at["Positive", "Malign"]

print(f"True negatives: {true_negatives}")
print(f"False negatives: {false_negatives}")
print(f"False positives: {false_positives}")
print(f"True positives: {true_positives}")

## Sensitivity and specificity (medical jargon)

In [None]:
# Sensitivity: true positives as a fraction of the "Malign" column total.
# Specificity: true negatives as a fraction of the "Benign" column total.
sensitivity, specificity = true_positives / maligns, true_negatives / benigns

print(f"Sensitivity {sensitivity:.3f}")
print(f"Specificity {specificity:.3f}")

https://en.wikipedia.org/wiki/Sensitivity_and_specificity
- **Sensitivity**: test power
- **Specificity**: test confidence level

## Precision, recall and accuracy (data science)

In [None]:
# Precision: share of positive predictions that are true positives.
# Recall: share of the "Malign" column total that was predicted positive.
# Accuracy: share of all test patients that were classified correctly.
precision, recall = true_positives / positives, true_positives / maligns
accuracy = (true_positives + true_negatives) / test_patients

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"Accuracy: {accuracy:.3f}")

https://en.wikipedia.org/wiki/Precision_and_recall
- **Recall** = test power = sensitivity
- **Precision** ≠ confidence level  

## Efficiency and contamination (High energy physics)

In [None]:
# Efficiency: true positives as a fraction of the "Malign" column total.
efficiency = true_positives / maligns
print(f"Efficiency: {efficiency:.3f}")
# Contamination: false positives as a fraction of all positive predictions.
contamination = false_positives / positives
print(f"Contamination: {contamination:.3f}")

- **Efficiency** = test power = sensitivity = recall
- **Contamination** = 1 - precision  