In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split 

import pandas
import numpy
import matplotlib as mpl
import matplotlib.pyplot as plt
import xgboost as xgb

import tensorflow as tf
from tensorflow import keras
from tensorflow import feature_column
from tensorflow.python.keras import layers
from tensorflow.python.keras.layers import Input, Dense, Activation,Dropout
from tensorflow.python.keras.models import Model
from keras.utils import plot_model

import seaborn as sns
import os
import tempfile
from imblearn.over_sampling import SMOTE

# Shared preprocessing objects used by the cells below.
# StandardScaler is fitted on the training features and reused for val/test.
scaler = StandardScaler()
# SMOTE draws synthetic minority samples at random; seed it (same seed as the
# train/test splits) so that a Restart & Run All reproduces the same data.
oversample = SMOTE(random_state=42)

In [None]:
# Read the CSV file located one directory above the notebook into a DataFrame.
df = pandas.read_csv(filepath_or_buffer='../colo_data_mar22_d032222.csv')

In [None]:
# Columns kept for modelling. Each row of the list mirrors one group of the
# original selection; the last entry, 'colo_cancer', is separated later as
# the prediction target.
columns_to_keep = [
    'age', 'sex',
    'race7', 'educat', 'marital', 'occupat',
    'cig_stat', 'cig_years', 'cigpd_f', 'cigar', 'pipe',
    'fh_cancer', 'colo_fh', 'colo_fh_cnt',
    'bmi_curr',
    'asp', 'asppd', 'ibup', 'ibuppd',
    'arthrit_f', 'bronchit_f', 'colon_comorbidity', 'diabetes_f',
    'divertic_f', 'emphys_f', 'gallblad_f', 'hearta_f', 'hyperten_f',
    'liver_comorbidity', 'osteopor_f', 'polyps_f', 'stroke_f',
    'colo_cancer',
]

# Restrict the frame to exactly these columns, in this order.
df = df[columns_to_keep]

In [None]:
# Impute missing values, choosing the fill value per column.
mean_filled_columns = {'age', 'bmi_curr', 'cig_years'}  # raw numeric features
zero_filled_columns = {'colo_fh_cnt'}                   # default siblings to 0 (average is 0.11)

for column in df:
    values = df[column]

    if column in mean_filled_columns:
        fill_value = values.mean()
    elif column in zero_filled_columns:
        fill_value = 0
    else:
        # Everything else is treated as categorical: use the most frequent value.
        # NOTE(review): the loop covers every column, so the target
        # 'colo_cancer' is mode-filled here as well - confirm this is intended.
        fill_value = values.mode()[0]

    df[column] = values.fillna(fill_value)

# Displayed as the cell output: remaining null count per column.
df.isnull().sum()

In [None]:
# Separate the label column from the feature matrix.
target_column = 'colo_cancer'
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Class balance of the full data set, before any splitting or resampling.
neg, pos = numpy.bincount(y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

# Split BEFORE oversampling. Running SMOTE on the whole data set first lets
# synthetic points interpolated from validation/test rows end up in the
# training set (data leakage) and turns the test set into an artificially
# balanced, partly synthetic sample, which inflates every reported metric.
# 60% train / 20% validation / 20% test; stratify so each partition keeps the
# original positive rate of the rare class.
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y)
train_features, val_features, train_labels, val_labels = train_test_split(
    train_features, train_labels, test_size = 0.25, random_state = 42,
    stratify = train_labels)

# Oversample the minority class in the TRAINING partition only; validation
# and test data stay untouched so they reflect the real class distribution.
train_features, train_labels = oversample.fit_resample(train_features, train_labels)

# Class balance of the training set after oversampling.
neg, pos = numpy.bincount(train_labels)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

# Fit the scaler on the training features only, then apply the same
# transformation to the validation and test features.
train_features = scaler.fit_transform(train_features)

val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

In [None]:
# Train an XGBoost classifier with default hyper-parameters; the validation
# split is passed as eval_set so it is evaluated during fitting.
validation_set = [(val_features, val_labels)]
XGB_decisionTree = xgb.XGBClassifier().fit(
    X=train_features,
    y=train_labels,
    eval_set=validation_set,
)

In [None]:
# Hard 0/1 class predictions: used for all threshold-dependent metrics.
prediction = XGB_decisionTree.predict(test_features)
# Predicted probability of the positive class (column 1 of predict_proba):
# used for the ranking metrics, which need continuous scores.
positive_probability = XGB_decisionTree.predict_proba(test_features)[:, 1]

print("Accuracy:",XGB_decisionTree.score(test_features, test_labels))
print()
print(metrics.classification_report(test_labels, prediction))

# metrics for positive class
Precision_pos = metrics.precision_score(test_labels, prediction, pos_label=1)
Recall = metrics.recall_score(test_labels, prediction, pos_label=1)
F1_score_pos = metrics.f1_score(test_labels, prediction, pos_label=1)

# metrics for negative class
Precision_neg = metrics.precision_score(test_labels, prediction, pos_label=0)
Specificity = metrics.recall_score(test_labels, prediction, pos_label=0)
F1_score_neg = metrics.f1_score(test_labels, prediction, pos_label=0)

# Recall (sensitivity) is the true-positive rate; specificity is the
# true-negative rate, i.e. the recall of the negative class. They are
# computed on different rows and are not complements of each other.

# ROC-AUC and average precision summarise performance over all thresholds,
# so they must be given scores. Passing the hard 0/1 predictions would
# reduce both to a single operating point and misreport them.
auc = metrics.roc_auc_score(test_labels, positive_probability)
prc = metrics.average_precision_score(test_labels, positive_probability)

#metrics:
print(
    "Metrics for positive class (most important)\n"+
    "Precision: "+str(Precision_pos)+
    "\nRecall: "+str(Recall)+
    "\nF1_score: "+str(F1_score_pos)
)

print()

print(
    "Metrics for negative class\n"+
    "Precision: "+str(Precision_neg)+
    "\nSpecificity: "+str(Specificity)+
    "\nF1_score: "+str(F1_score_neg)
)

print()

print(
    "AUC-ROC: "+str(auc)+
    "\nPRC: "+str(prc)
)

In [None]:
def plot_cm(labels, predictions, p=0.5):
    """Draw a confusion-matrix heatmap and print its four cells.

    Args:
        labels: true class labels.
        predictions: predicted scores or labels; values strictly greater
            than ``p`` are counted as the positive class.
        p: decision threshold applied to ``predictions`` and shown in the
            plot title.
    """
    predicted_positive = predictions > p
    cm = metrics.confusion_matrix(labels, predicted_positive)

    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # Rows are actual classes, columns are predicted classes.
    print('(Good) No Cancer Detected (True Negatives):', cm[0, 0])
    print('(Okay) No Cancer Thought to Have Cancer (False Positives):', cm[0, 1])
    print('(Bad) Cancer Thought to Have No Cancer (False Negatives):', cm[1, 0])
    print('(Good) Cancer Detected (True Positives):', cm[1, 1])
    # Second row = every sample whose actual label is the positive class.
    print('Total Cancer Cases:', cm[1].sum())

In [None]:
# Confusion matrix of the test-set predictions at the default 0.5 threshold.
plot_cm(labels=test_labels, predictions=prediction)

In [None]:
# A ROC curve is traced by sweeping the decision threshold, so it needs the
# predicted positive-class probabilities. Hard 0/1 predictions would give a
# degenerate curve with a single operating point.
metrics.RocCurveDisplay.from_predictions(
    test_labels, XGB_decisionTree.predict_proba(test_features)[:, 1])
plt.show()

In [None]:
# The precision-recall curve is also threshold-swept, so feed it the predicted
# positive-class probabilities rather than the hard 0/1 predictions.
metrics.PrecisionRecallDisplay.from_predictions(
    test_labels, XGB_decisionTree.predict_proba(test_features)[:, 1])
plt.show()