### Prostate cancer is the most common cancer in men in the UK. It usually develops slowly, so there may be no signs for many years. Symptoms of prostate cancer do not usually appear until the prostate is large enough to affect the tube that carries urine from the bladder out of the penis (urethra).

## Information source: https://www.nhs.uk/conditions/prostate-cancer/

### This dataset contains clinical information about 100 patients and 10 variables. Here we are going to create a simple analysis of the data and test some traditional classifiers.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the prostate-cancer CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv("../input/prostate-cancer/Prostate_Cancer.csv")

In [None]:
# Show the (rows, columns) dimensions of the loaded table.
data.shape

In [None]:
# Preview the last rows of the raw table.
data.tail()

In [None]:
# List the dtype pandas assigned to each column.
data.dtypes

### No missing values found

In [None]:
# Count the null entries in each column.
data.isnull().sum()

### Encoding the categorical variable found (diagnosis_result), which is the target

In [None]:
# Drop the identifier column and replace the target labels with their
# integer category codes.
data_model = data.drop(columns=['id'])
data_model['diagnosis_result'] = (
    data_model['diagnosis_result'].astype('category').cat.codes
)
# Display the dtype of the encoded target column.
data_model['diagnosis_result'].dtype

In [None]:
# Preview the last rows after dropping `id` and encoding the target.
data_model.tail()

### We're going to check the correlation between the features and the target

In [None]:
# Pairwise Pearson correlation between all columns of data_model
# (the encoded target column is included).
correlations = data_model.corr(method='pearson')
correlations

In [None]:
# Annotated heatmap of the correlation matrix; the colour scale is anchored
# at vmin=-0.25 and vmax=0.8.
plt.figure(figsize=(20, 8))
sb.heatmap(
    correlations,
    vmin=-0.25,
    vmax=0.8,
    cmap=plt.cm.RdYlBu_r,
    annot=True,
)

### The 3 most correlated are:
* perimeter
* area
* compactness

### Checking the relationship between them using swarmplot

In [None]:
# Swarm plot of `perimeter`, grouped by the encoded diagnosis.
sb.swarmplot(x='diagnosis_result', y='perimeter', data=data_model)

In [None]:
# Swarm plot of `area`, grouped by the encoded diagnosis.
sb.swarmplot(x='diagnosis_result', y='area', data=data_model)

In [None]:
# Swarm plot of `compactness`, grouped by the encoded diagnosis.
sb.swarmplot(x='diagnosis_result', y='compactness', data=data_model)

In [None]:
# Feature matrix: the three columns used as model inputs.
X = data_model.loc[:, ['perimeter', 'area', 'compactness']]
# Target vector: the integer-encoded diagnosis.
y = data_model['diagnosis_result']

### Classes are slightly imbalanced

In [None]:
# Number of samples in each encoded target class.
data_model['diagnosis_result'].value_counts()

In [None]:
# Bar chart of how many samples fall in each encoded target class.
data_model['diagnosis_result'].value_counts().plot.bar(title='Count (target)')

### Let's use 4 traditional classifiers:
* Logistic Regression
* SVM
* Decision Tree
* Naive Bayes

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB

### * 10-fold cross-validation on the training set
### * 80/20 train/test split

In [None]:
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)

# Hold out 20% of the samples as the final test set. Stratifying on y keeps
# the class ratio the same in the train and test parts, which matters for a
# small, imbalanced dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2, stratify=y
)

# 10-fold splitter shared by every model below. Stratified folds keep both
# classes present in each validation fold (provided each class has at least
# n_splits training samples); with a plain KFold a small fold can contain a
# single class, for which ROC AUC is undefined.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
# Dimensions of the training feature matrix.
X_train.shape

In [None]:
# Dimensions of the held-out test feature matrix.
X_test.shape

### We're going to save the results in a dict

In [None]:
# Maps a model's display name to its mean cross-validated ROC AUC.
results_dict = {}

In [None]:
# Logistic regression (C=0.5): mean ROC AUC over the CV folds of the
# training split, stored under its display name.
lr = LogisticRegression(random_state=1, C=0.5)
mean_auc_lr = np.mean(
    cross_val_score(lr, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)
)
results_dict['Logistic Regression'] = mean_auc_lr
results_dict

In [None]:
# Support vector classifier with default settings: mean ROC AUC over the CV
# folds of the training split, stored under its display name.
# The estimator is bound to `svc` rather than `svm` so that the imported
# sklearn `svm` module is not overwritten; rebinding `svm` made this cell
# fail with AttributeError when executed a second time.
svc = svm.SVC()
mean_auc_svm = np.mean(
    cross_val_score(svc, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)
)
results_dict['SVM'] = mean_auc_svm
results_dict

In [None]:
# The target is a binary class label and the metric is ROC AUC, so the tree
# must be a classifier (it exposes predict_proba for the scorer); the
# regressor used previously is the wrong estimator type for this task.
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the cross-validated score is reproducible.
dt = DecisionTreeClassifier(random_state=1)
mean_auc_dt = np.mean(
    cross_val_score(dt, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)
)
results_dict['Decision Tree'] = mean_auc_dt
results_dict

In [None]:
# Gaussian Naive Bayes: mean ROC AUC over the CV folds of the training
# split, stored under its display name.
nb = GaussianNB()
mean_auc_nb = np.mean(
    cross_val_score(nb, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)
)
results_dict['NB'] = mean_auc_nb
results_dict

In [None]:
# Bar chart comparing the mean cross-validated AUC of the four models.
# Dedicated names are used instead of `x` / `y` so that the target Series
# `y` defined earlier in the notebook is not overwritten by a list of scores.
model_names = ['Logistic Regression', 'SVM', 'Decision Tree', 'NB']
auc_scores = [results_dict[name] for name in model_names]
plt.title("AUC comparison")
plt.ylabel("AUC")
plt.bar(model_names, auc_scores)

### Testing with the method that achieved the highest AUC value

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error

# Fit Gaussian NB on the full training split and evaluate on the held-out
# test split.
nb.fit(X_train, y_train)
# Hard class labels; reused for the error rate here and by later cells.
predicted = nb.predict(X_test)
# ROC AUC ranks samples by score, so it is computed from the predicted
# probability of class 1 (second column of predict_proba) rather than from
# hard labels, which would collapse the curve to a single threshold and make
# the value incomparable with the cross-validated AUCs above.
roc_auc = roc_auc_score(y_test, nb.predict_proba(X_test)[:, 1])
# With 0/1 labels the mean absolute error equals the misclassification rate.
mae = mean_absolute_error(y_test, predicted)

print(f"Mean Absolute Error: {mae} | ROC AUC: {roc_auc}")

### Printing the confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test labels against the predictions
# (rows: true class, columns: predicted class).
confusion = confusion_matrix(y_test, predicted)
confusion

In [None]:
# `plot_confusion_matrix` was removed in scikit-learn 1.2; building a
# ConfusionMatrixDisplay directly works on both older and current releases.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Tick labels are the original category names, in the order of the integer
# codes used for the target. Passing the whole encoded column, as before,
# supplied one label per sample instead of one per class.
class_names = data['diagnosis_result'].astype('category').cat.categories
test_confusion = confusion_matrix(
    y_test,
    nb.predict(X_test),
    labels=list(range(len(class_names))),  # fixes the matrix size and row order
)
disp = ConfusionMatrixDisplay(
    confusion_matrix=test_confusion,
    display_labels=class_names,
).plot(cmap=plt.cm.Blues)

disp.ax_.set_title("Confusion Matrix")
plt.show()

### In the medical domain, two measures are widely used: **Sensitivity** and **Specificity**. To calculate them we need:
* True Positive (TP)
* True Negative (TN)
* False Positive (FP)
* False Negative (FN)

In [None]:
# Read the four cells of the confusion matrix, indexed as
# [true class, predicted class]; class 1 is treated as the positive class.
TP = confusion[1, 1]  # true 1, predicted 1
TN = confusion[0, 0]  # true 0, predicted 0
FP = confusion[0, 1]  # true 0, predicted 1
FN = confusion[1, 0]  # true 1, predicted 0

In [None]:
# Sensitivity (true positive rate): share of actual positives predicted positive.
sensitivity = TP / (TP + FN)
# Specificity (true negative rate): share of actual negatives predicted negative.
specificity = TN / (TN + FP)

# Displayed as the cell result; the label previously misspelled "Specificity".
"Sensitivity: {} | Specificity: {}".format(sensitivity, specificity)