In [53]:
import pandas as pd
import numpy as np
import warnings
import joblib

import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix, precision_recall_curve

import keras

# Notebook configuration: silence every warning category for the whole session
warnings.simplefilter('ignore')

# Evaluate Best Models

In [54]:
def plot_auc_roc(y_test, y_pred, title):
    """Plot one ROC curve per class (one-vs-rest) in a single Plotly figure.

    Parameters
    ----------
    y_test : 2-D array supporting ``[:, i]`` indexing
        True labels with one column per class (presumably one-hot encoded;
        confirm against the data-preparation step).
    y_pred : 2-D array supporting ``[:, i]`` indexing
        Predicted scores with the same column layout as ``y_test``.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Take the class count from the label matrix instead of hard-coding it,
    # so the helper keeps working if the number of classes changes.
    n_classes = y_test.shape[1]

    fig_roc = go.Figure()
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc = auc(fpr, tpr)
        fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'Class {i} (AUC = {roc_auc:.2f})'))

    fig_roc.update_layout(title=title, xaxis_title='Rate False Positive', yaxis_title='Rate True Positive', width=1200, height=800)
    fig_roc.show()


def plot_precision_recall(y_test, y_pred, title):
    """Plot one precision-recall curve per class in a single Plotly figure.

    Parameters
    ----------
    y_test : 2-D array supporting ``[:, i]`` indexing
        True labels with one column per class (presumably one-hot encoded;
        confirm against the data-preparation step).
    y_pred : 2-D array supporting ``[:, i]`` indexing
        Predicted scores with the same column layout as ``y_test``.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Take the class count from the label matrix instead of hard-coding it.
    n_classes = y_test.shape[1]

    fig_pr = go.Figure()
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_test[:, i], y_pred[:, i])
        # Area under the PR curve, with recall on the x axis.
        pr_auc = auc(recall, precision)
        # Show the per-class area in the legend, as the ROC helper does;
        # the value was previously computed but never displayed.
        fig_pr.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name=f'Class {i} (AUC = {pr_auc:.2f})'))
    fig_pr.update_layout(title=title, xaxis_title='Recall', yaxis_title='Precision', width=1200, height=800)
    fig_pr.show()

# Load split data

In [55]:
# Prefix prepended to the './data_files/...' and './best_models/...' locations used below;
# an empty string leaves those paths relative to the current working directory.
path = ''

In [56]:
# Each data set is stored as four joblib files (X_train, X_test, y_train, y_test)
# under data_files/, distinguished only by a file-name suffix. The files are
# loaded in the same order as the names they are unpacked into.

# Load PCA data
X_train_pca, X_test_pca, y_train_pca, y_test_pca = [
    joblib.load(path + f'./data_files/{split}_PCA.joblib')
    for split in ('X_train', 'X_test', 'y_train', 'y_test')
]

# Load LSTM data
X_train, X_test, y_train, y_test = [
    joblib.load(path + f'./data_files/{split}.joblib')
    for split in ('X_train', 'X_test', 'y_train', 'y_test')
]

# Load DistilBert data
X_train_distil, X_test_distil, y_train_distil, y_test_distil = [
    joblib.load(path + f'./data_files/{split}_DISTIL.joblib')
    for split in ('X_train', 'X_test', 'y_train', 'y_test')
]


# LSTM Model

In [57]:
# Load the saved LSTM model from the best_models directory
lstm_model_path = path + './best_models/kerasTuningLSTM.keras'
LSTM_model = keras.saving.load_model(lstm_model_path)

In [58]:
# prev=True  -> run the LSTM model on the test set;
# prev=False -> reuse the predictions stored under data_files/.
prev = False

# The integer class labels are the same whichever branch supplies the predictions
y_true = np.argmax(y_test, axis=1)

if not prev:
    y_pred = joblib.load(path + './data_files/y_pred.joblib')
    y_pred_class = joblib.load(path + './data_files/y_pred_class.joblib')
else:
    # Predict
    y_pred = LSTM_model.predict(X_test)
    y_pred_class = np.argmax(y_pred, axis=1)

    # Saving the fresh predictions is currently disabled:
    # joblib.dump(y_pred, './data_files/y_pred.joblib')
    # joblib.dump(y_pred_class, './data_files/y_pred_class.joblib')

### Confusion Matrix

In [59]:
# Pin the label set to the number of label columns in y_test so the matrix always
# has one row/column per class, even if a class is absent from both y_true and
# y_pred_class; otherwise its shape could stop matching the axis ticks below.
labels_1 = list(range(y_test.shape[1]))
cm_1 = confusion_matrix(y_true, y_pred_class, labels=labels_1)

fig_cm_1 = px.imshow(cm_1, text_auto=True, labels=dict(x="Predict", y="True", color="Count"),
                     x=labels_1, y=labels_1, color_continuous_scale=px.colors.sequential.Viridis)
fig_cm_1.update_layout(title='Confusion Matrix - Model LSTM', width=1200, height=800)
fig_cm_1.show()

### AUC-ROC

In [60]:
# Per-class ROC curves for the LSTM predictions
plot_auc_roc(y_test=y_test, y_pred=y_pred, title='AUC-ROC - Model LSTM')

### Precision-Recall

In [61]:
# Per-class precision-recall curves for the LSTM predictions
plot_precision_recall(y_test=y_test, y_pred=y_pred, title='Precision-Recall - Model LSTM')

In [62]:
# Header and separator rule first, then the per-class precision/recall/F1 table
lstm_header = 'Classification Report - LSTM'
print(lstm_header, '-' * 53, sep='\n')
print(classification_report(y_true, y_pred_class))

Classification Report - LSTM
-----------------------------------------------------
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       142
           1       0.14      0.99      0.24       113
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00       171
           4       0.00      0.00      0.00        61
           5       0.00      0.00      0.00        68
           6       0.84      0.07      0.12       238
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         9

    accuracy                           0.15       829
   macro avg       0.11      0.12      0.04       829
weighted avg       0.26      0.15      0.07       829



## LSTM with PCA

In [63]:
# Load the saved LSTM-with-PCA model from the best_models directory
pca_model_path = path + './best_models/KerasLSTM_PCA.keras'
PCA_model = keras.saving.load_model(pca_model_path)

In [64]:
# prev=True  -> predict with the PCA model and store the results under data_files/;
# prev=False -> reuse the stored predictions.
prev = False

if prev:
    # Predict
    y_pred_pca = PCA_model.predict(X_test_pca)
    y_pred_class_pca = np.argmax(y_pred_pca, axis=1)

    joblib.dump(y_pred_pca, path + './data_files/y_pred_pca.joblib')
    joblib.dump(y_pred_class_pca, path + './data_files/y_pred_class_pca.joblib')

else:
    # Read from the same `path`-prefixed location the dump calls write to
    y_pred_pca = joblib.load(path + './data_files/y_pred_pca.joblib')
    y_pred_class_pca = joblib.load(path + './data_files/y_pred_class_pca.joblib')

# Ground truth comes from the PCA split's own labels in both branches, matching
# the y_test_pca passed to the ROC and precision-recall plots for this model.
y_true_pca = np.argmax(y_test_pca, axis=1)



### Confusion Matrix

In [65]:
# Pin the label set to the number of label columns in y_test_pca so the matrix always
# has one row/column per class, even if a class is absent from both y_true_pca and
# y_pred_class_pca; otherwise its shape could stop matching the axis ticks below.
labels_2 = list(range(y_test_pca.shape[1]))
cm_2 = confusion_matrix(y_true_pca, y_pred_class_pca, labels=labels_2)

fig_cm_2 = px.imshow(cm_2, text_auto=True, labels=dict(x="Predict", y="True", color="Count"),
                     x=labels_2, y=labels_2, color_continuous_scale=px.colors.sequential.Viridis)
fig_cm_2.update_layout(title='Confusion Matrix - Model LSTM with PCA', width=1200, height=800)
fig_cm_2.show()


### AUC-ROC

In [66]:
# Per-class ROC curves for the LSTM-with-PCA predictions
plot_auc_roc(y_test=y_test_pca, y_pred=y_pred_pca, title='AUC-ROC - Model LSTM with PCA')

### Precision-Recall

In [67]:
# Per-class precision-recall curves for the LSTM-with-PCA predictions
plot_precision_recall(y_test=y_test_pca, y_pred=y_pred_pca, title='Precision-Recall - LSTM with PCA')

In [68]:
# Header and separator rule first, then the per-class precision/recall/F1 table
pca_header = 'Classification Report - LSTM with PCA'
print(pca_header, '-' * 53, sep='\n')
print(classification_report(y_true_pca, y_pred_class_pca))

Classification Report - LSTM with PCA
-----------------------------------------------------
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       142
           1       0.13      0.20      0.16       113
           2       0.00      0.00      0.00        22
           3       1.00      0.01      0.01       171
           4       0.00      0.00      0.00        61
           5       0.00      0.00      0.00        68
           6       0.29      0.79      0.42       238
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         9

    accuracy                           0.25       829
   macro avg       0.16      0.11      0.07       829
weighted avg       0.31      0.25      0.15       829



# DistilBertClassification Model

In [69]:
# Load the saved DistilBert classifier from the best_models directory
distil_model_path = path + './best_models/DistilbertClassifier_32.keras'
distil_model = keras.saving.load_model(distil_model_path)

In [70]:
# prev=True  -> predict with the DistilBert model and store the results under data_files/;
# prev=False -> reuse the stored predictions.
prev = False

if prev:
    # Predict
    y_pred_distil = distil_model.predict(X_test_distil)
    y_pred_class_distil = np.argmax(y_pred_distil, axis=1)

    joblib.dump(y_pred_distil, path + './data_files/y_pred_distil.joblib')
    joblib.dump(y_pred_class_distil, path + './data_files/y_pred_class_distil.joblib')

else:
    # Read from the same `path`-prefixed location the dump calls write to
    y_pred_distil = joblib.load(path + './data_files/y_pred_distil.joblib')
    y_pred_class_distil = joblib.load(path + './data_files/y_pred_class_distil.joblib')

# Ground truth is the same in both branches, so it is computed once
y_true_distil = np.argmax(y_test_distil, axis=1)


### Confusion Matrix

In [71]:
# Pin the label set to the number of label columns in y_test_distil so the matrix always
# has one row/column per class, even if a class is absent from both y_true_distil and
# y_pred_class_distil; otherwise its shape could stop matching the axis ticks below.
labels_3 = list(range(y_test_distil.shape[1]))
cm_3 = confusion_matrix(y_true_distil, y_pred_class_distil, labels=labels_3)

fig_cm_3 = px.imshow(cm_3, text_auto=True, labels=dict(x="Predict", y="True", color="Count"),
                     x=labels_3, y=labels_3, color_continuous_scale=px.colors.sequential.Viridis)
fig_cm_3.update_layout(title='Confusion Matrix - Model DistilBert Classifier', width=1200, height=800)
fig_cm_3.show()


### AUC-ROC

In [72]:
# Per-class ROC curves for the DistilBert classifier predictions
plot_auc_roc(y_test=y_test_distil, y_pred=y_pred_distil, title='AUC-ROC - Model DistilBert Classifier')

### Precision-Recall

In [73]:
# Per-class precision-recall curves for the DistilBert classifier predictions
plot_precision_recall(y_test=y_test_distil, y_pred=y_pred_distil, title='Precision-Recall - DistilBert Classifier')

In [74]:
# Header and separator rule first, then the per-class precision/recall/F1 table
distil_header = 'Classification Report - DistilBert Classifier'
print(distil_header, '-' * 53, sep='\n')
print(classification_report(y_true_distil, y_pred_class_distil))

Classification Report - DistilBert Classifier
-----------------------------------------------------
              precision    recall  f1-score   support

           0       0.62      0.54      0.58       113
           1       0.75      0.46      0.57        91
           2       0.55      0.33      0.41        18
           3       0.65      0.71      0.68       137
           4       0.46      0.38      0.41        48
           5       0.56      0.62      0.59        55
           6       0.70      0.87      0.78       191
           7       0.00      0.00      0.00         4
           8       0.64      1.00      0.78         7

    accuracy                           0.65       664
   macro avg       0.55      0.55      0.53       664
weighted avg       0.64      0.65      0.64       664



# Conclusion

<p>The <b>LSTM models, with and without PCA</b>, did not achieve meaningful performance. Improving them would require increasing the number of neurons and tuning the hyperparameters further, which in turn requires greater computing power.</p>

<p>On the other hand, the <b>DistilBert Classifier model</b> showed reasonable performance but still requires adjustments and, above all, more real data. There is a significant imbalance between the classes, which biases the model. The class imbalance was addressed only as far as the available processing capacity allowed, without using paid infrastructure.</p>

<p>Tuning the hyperparameters and improving the data cleaning could significantly enhance the model. Moreover, more advanced pre-trained NLP models and other ML and neural-network techniques could be employed.</p>

<p>I decided not to continue as my objective for this project was achieved: creating models using Keras TensorFlow.</p>

<h5><b>Note: These models are not reliable for predicting cancer treatment.</b></h5>