### Functions

In [1]:
## Import Libraries
import numpy as np
import pandas as pd
import re as regex
import spacy
from pathlib import Path
import time


import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import plot_confusion_matrix
from simple_colors import *
from sklearn.metrics import f1_score

import string
from collections import Counter
import re as regex
from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

import neattext.functions as nfx
import nltk

import warnings
# Suppress every warning raised after this point for the rest of the session.
# NOTE(review): this also hides library deprecation / metric warnings, which can mask real problems.
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import time

In [3]:
from itertools import cycle
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
def show_summary_report(actual, prediction, predictProba, modelTag, imageTag,
                        classNames=('joy', 'sadness', 'anger', 'fear')):
    """Print multi-class metrics, plot and save a confusion matrix, return the scores.

    Parameters
    ----------
    actual : array-like
        True class labels. Object-dtype input is cast to ``int``.
    prediction : array-like
        Predicted class labels. Object-dtype input is cast to ``int``.
    predictProba : array-like
        Per-class scores forwarded to ``roc_auc_score(..., multi_class='ovr')``.
    modelTag : str
        Text inserted into the plot title.
    imageTag : str
        The figure is written to ``f'{imageTag}.png'``.
    classNames : sequence of str, optional
        Heatmap tick labels, in the order of the sorted class labels. They are
        used only when their count matches the size of the confusion matrix;
        otherwise the raw label values are shown so no class is mislabelled.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1, roc_auc]``; precision, recall, f1
        and ROC AUC are macro-averaged.
    """
    # Normalise both inputs so Series, lists and arrays are handled alike.
    actual = np.asarray(actual)
    prediction = np.asarray(prediction)
    if actual.dtype.name == 'object':
        actual = actual.astype(int)
    if prediction.dtype.name == 'object':
        prediction = prediction.astype(int)

    # pos_label is only meaningful for average='binary'; with macro averaging
    # it is ignored (and triggers a UserWarning), so it is not passed here.
    accuracy_ = accuracy_score(actual, prediction)
    precision_ = precision_score(actual, prediction, average='macro')
    recall_ = recall_score(actual, prediction, average='macro')
    f1_score_ = f1_score(actual, prediction, average='macro')
    roc_auc_ = roc_auc_score(actual, predictProba,
                             multi_class='ovr',
                             average='macro')

    print('Accuracy  : %.4f [TP / N] Proportion of predicted labels that match the true labels. Best: 1, Worst: 0' % accuracy_)
    print('Precision : %.4f [TP / (TP + FP)] Not to label a negative sample as positive.        Best: 1, Worst: 0' % precision_)
    print('Recall    : %.4f [TP / (TP + FN)] Find all the positive samples.                     Best: 1, Worst: 0' % recall_)
    print('f1-score  : %.4f [2 * (Precision * Recall)/ (Precision + Recall)]                    Best: 1, Worst: 0' % f1_score_)
    print('ROC AUC   : %.4f                                                                     Best: 1, Worst: < 0.5' % roc_auc_)
    print('-' * 107)
    print('TP: True Positives, FP: False Positives, TN: True Negatives, FN: False Negatives, N: Number of samples')

    # Confusion matrix: rows are true labels, columns are predicted labels.
    confuseMatrix = confusion_matrix(actual, prediction)
    if len(classNames) == confuseMatrix.shape[0]:
        confuseLabels = classNames
    else:
        # A class is missing (or extra): fall back to the observed label values
        # instead of attaching the wrong name to a row/column.
        confuseLabels = list(np.unique(np.concatenate((actual.ravel(), prediction.ravel()))))

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    fig.subplots_adjust(left=0.02, right=0.98, wspace=0.2)

    # The matrix is transposed so that x = true label and y = predicted label.
    sns.heatmap(confuseMatrix.T, square=True, annot=True, fmt='d', cbar=False, cmap='Blues', ax=ax,
                xticklabels=confuseLabels, yticklabels=confuseLabels)

    ax.set_title(f'Confusion Matrix ({modelTag})')
    ax.set_xlabel('True label')
    ax.set_ylabel('Predicted label')

    plt.savefig(f'{imageTag}.png', facecolor='w', bbox_inches="tight",
                pad_inches=0.3, transparent=True)

    plt.show()

    return [accuracy_, precision_, recall_, f1_score_, roc_auc_]

In [5]:
def svcRocPrecisionRecall(X, y, model, vectorizer, modelTag, imageTag):
    """Fit a one-vs-rest model on vectorized text and plot its PR and ROC curves.

    Labels are binarized against the classes ``[1, 2, 3, 4]``, the data is split
    80/20 with ``random_state=1``, ``vectorizer`` is fitted on the training
    texts and ``model`` is wrapped in ``OneVsRestClassifier``. All curves are
    computed from ``decision_function`` scores on the test split.

    Parameters
    ----------
    X : object exposing ``.values`` after splitting (e.g. a pandas Series)
        Raw texts; they are cast to ``str`` before vectorization.
    y : array-like
        Class labels, binarized against ``[1, 2, 3, 4]``.
    model : estimator
        Base estimator; the wrapped classifier must support ``decision_function``.
    vectorizer : object with ``fit_transform`` / ``transform``
        Fitted in place on the training split (the caller's object is modified).
    modelTag : str
        Text inserted into the subplot titles.
    imageTag : str
        The figure is written to ``f'{imageTag}.png'``.

    Returns
    -------
    list
        ``[n_classes, fpr, tpr, roc_auc, fpr_micro, tpr_micro, roc_auc_micro,
        fpr_macro, tpr_macro, roc_auc_macro, precision_micro, recall_micro,
        average_precision_micro]`` where ``fpr``, ``tpr`` and ``roc_auc`` are
        dicts keyed by class index plus ``"micro"`` and ``"macro"``.
    """
    y_binarized = label_binarize(y, classes=[1, 2, 3, 4])
    n_classes = y_binarized.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binarized, test_size=0.2, random_state=1)

    train_features = vectorizer.fit_transform(X_train.values.astype('str'), y_train)
    test_features = vectorizer.transform(X_test.values.astype('str'))

    classifier = OneVsRestClassifier(model)
    classifier.fit(train_features, y_train)
    scores = classifier.decision_function(test_features)

    # ROC curve and area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], scores[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), scores.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of FPR points, then average
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Precision-recall curve and average precision for each class
    precision, recall, average_precision = {}, {}, {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], scores[:, i])
        average_precision[i] = average_precision_score(y_test[:, i], scores[:, i])

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), scores.ravel())
    average_precision["micro"] = average_precision_score(y_test, scores, average="micro")

    # plot
    fig, ax = plt.subplots(1, 3, figsize=(18, 6))
    fig.subplots_adjust(left=0.02, right=0.98, wspace=0.2)
    # One fixed colour per class, reused identically in every panel
    class_colors = [color for _, color in
                    zip(range(n_classes), cycle(["aqua", "darkorange", "cornflowerblue", "green"]))]
    lw = 1

    # Precision/Recall panel with iso-f1 guide curves
    pr_ax = ax[0]
    recall_grid = np.linspace(0.01, 1)
    iso_line = None
    for f_score in np.linspace(0.2, 0.8, num=4):
        iso_precision = f_score * recall_grid / (2 * recall_grid - f_score)
        keep = iso_precision >= 0
        (iso_line,) = pr_ax.plot(recall_grid[keep], iso_precision[keep], color="gray", alpha=0.2)
        pr_ax.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, iso_precision[45] + 0.02))

    display = PrecisionRecallDisplay(
        recall=recall["micro"],
        precision=precision["micro"],
        average_precision=average_precision["micro"],
    )
    display.plot(ax=pr_ax, name="Micro-average precision-recall", color="deeppink", linewidth=1, linestyle=':')

    for i, color in enumerate(class_colors):
        display = PrecisionRecallDisplay(
            recall=recall[i],
            precision=precision[i],
            average_precision=average_precision[i],
        )
        display.plot(ax=pr_ax, name=f"Precision-recall for class {i+1}", color=color, linewidth=1)

    # add the legend entry for the iso-f1 curves (the last one drawn stands for all)
    handles, labels = pr_ax.get_legend_handles_labels()
    handles.append(iso_line)
    labels.append("iso-f1 curves")
    pr_ax.set_xlim([0.0, 1.0])
    pr_ax.set_ylim([0.0, 1.05])
    pr_ax.legend(handles=handles, labels=labels, loc="best")
    pr_ax.set_title(f'Extension of Precision-Recall curve to multi-class ({modelTag})')

    # ROC panels: the full view and a magnified view of the top-left corner
    roc_panels = (
        (ax[1], [-0.05, 1.0], [0.0, 1.05],
         f'Some extension of Receiver operating characteristic to multiclass ({modelTag})'),
        (ax[2], [-0.05, 0.5], [0.5, 1.02],
         f'Magnified Roc-Curve ({modelTag})'),
    )
    for roc_ax, x_limits, y_limits, panel_title in roc_panels:
        roc_ax.plot(
            fpr["micro"],
            tpr["micro"],
            label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
            color="deeppink",
            linestyle=":",
            linewidth=2,
        )
        roc_ax.plot(
            fpr["macro"],
            tpr["macro"],
            label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
            color="navy",
            linestyle=":",
            linewidth=2,
        )
        for i, color in enumerate(class_colors):
            roc_ax.plot(
                fpr[i],
                tpr[i],
                color=color,
                lw=lw,
                label="ROC curve of class {0} (area = {1:0.2f})".format(i+1, roc_auc[i]),
            )
        roc_ax.plot([0, 1], [0, 1], "k--", lw=lw)
        roc_ax.set_xlim(x_limits)
        roc_ax.set_ylim(y_limits)
        roc_ax.set_xlabel("False Positive Rate")
        roc_ax.set_ylabel("True Positive Rate")
        roc_ax.set_title(panel_title)
        roc_ax.legend(loc="lower right")

    plt.savefig(f'{imageTag}.png', facecolor='w', bbox_inches="tight",
                pad_inches=0.3, transparent=True)

    plt.show()
    return [n_classes, fpr, tpr, roc_auc, fpr['micro'], tpr['micro'], roc_auc['micro'],
            fpr['macro'], tpr['macro'], roc_auc['macro'], precision["micro"], recall["micro"],
            average_precision["micro"]]

In [6]:
def probaRocPrecisionRecall(X, y, model, vectorizer, modelTag, imageTag):
    """Fit a one-vs-rest model on vectorized text and plot PR/ROC curves from probabilities.

    Labels are binarized against the classes ``[1, 2, 3, 4]``, the data is split
    80/20 with ``random_state=1``, ``vectorizer`` is fitted on the training
    texts and ``model`` is wrapped in ``OneVsRestClassifier``. All curves are
    computed from ``predict_proba`` outputs on the test split.

    Parameters
    ----------
    X : object exposing ``.values`` after splitting (e.g. a pandas Series)
        Raw texts; they are cast to ``str`` before vectorization.
    y : array-like
        Class labels, binarized against ``[1, 2, 3, 4]``.
    model : estimator
        Base estimator; the wrapped classifier must support ``predict_proba``.
    vectorizer : object with ``fit_transform`` / ``transform``
        Fitted in place on the training split (the caller's object is modified).
    modelTag : str
        Text inserted into the subplot titles.
    imageTag : str
        The figure is written to ``f'{imageTag}.png'``.

    Returns
    -------
    list
        ``[n_classes, fpr, tpr, roc_auc, fpr_micro, tpr_micro, roc_auc_micro,
        fpr_macro, tpr_macro, roc_auc_macro, precision_micro, recall_micro,
        average_precision_micro]`` where ``fpr``, ``tpr`` and ``roc_auc`` are
        dicts keyed by class index plus ``"micro"`` and ``"macro"``.
    """
    y_binarized = label_binarize(y, classes=[1, 2, 3, 4])
    n_classes = y_binarized.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binarized, test_size=0.2, random_state=1)

    train_features = vectorizer.fit_transform(X_train.values.astype('str'), y_train)
    test_features = vectorizer.transform(X_test.values.astype('str'))

    classifier = OneVsRestClassifier(model)
    classifier.fit(train_features, y_train)
    probabilities = classifier.predict_proba(test_features)

    # ROC curve and area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], probabilities[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), probabilities.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of FPR points, then average
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Precision-recall curve and average precision for each class
    precision, recall, average_precision = {}, {}, {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], probabilities[:, i])
        average_precision[i] = average_precision_score(y_test[:, i], probabilities[:, i])

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), probabilities.ravel())
    average_precision["micro"] = average_precision_score(y_test, probabilities, average="micro")

    # plot
    fig, ax = plt.subplots(1, 3, figsize=(18, 6))
    fig.subplots_adjust(left=0.02, right=0.98, wspace=0.2)
    # One fixed colour per class, reused identically in every panel
    class_colors = [color for _, color in
                    zip(range(n_classes), cycle(["aqua", "darkorange", "cornflowerblue", "green"]))]
    lw = 1

    # Precision/Recall panel with iso-f1 guide curves
    pr_ax = ax[0]
    recall_grid = np.linspace(0.01, 1)
    iso_line = None
    for f_score in np.linspace(0.2, 0.8, num=4):
        iso_precision = f_score * recall_grid / (2 * recall_grid - f_score)
        keep = iso_precision >= 0
        (iso_line,) = pr_ax.plot(recall_grid[keep], iso_precision[keep], color="gray", alpha=0.2)
        pr_ax.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, iso_precision[45] + 0.02))

    display = PrecisionRecallDisplay(
        recall=recall["micro"],
        precision=precision["micro"],
        average_precision=average_precision["micro"],
    )
    display.plot(ax=pr_ax, name="Micro-average precision-recall", color="deeppink", linewidth=1, linestyle=':')

    for i, color in enumerate(class_colors):
        display = PrecisionRecallDisplay(
            recall=recall[i],
            precision=precision[i],
            average_precision=average_precision[i],
        )
        display.plot(ax=pr_ax, name=f"Precision-recall for class {i+1}", color=color, linewidth=1)

    # add the legend entry for the iso-f1 curves (the last one drawn stands for all)
    handles, labels = pr_ax.get_legend_handles_labels()
    handles.append(iso_line)
    labels.append("iso-f1 curves")
    pr_ax.set_xlim([0.0, 1.0])
    pr_ax.set_ylim([0.0, 1.05])
    pr_ax.legend(handles=handles, labels=labels, loc="best")
    pr_ax.set_title(f'Extension of Precision-Recall curve to multi-class ({modelTag})')

    # ROC panels: the full view and a magnified view of the top-left corner
    roc_panels = (
        (ax[1], [-0.05, 1.0], [0.0, 1.05],
         f'Some extension of Receiver operating characteristic to multiclass ({modelTag})'),
        (ax[2], [-0.05, 0.5], [0.5, 1.02],
         f'Magnified Roc-Curve ({modelTag})'),
    )
    for roc_ax, x_limits, y_limits, panel_title in roc_panels:
        roc_ax.plot(
            fpr["micro"],
            tpr["micro"],
            label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
            color="deeppink",
            linestyle=":",
            linewidth=2,
        )
        roc_ax.plot(
            fpr["macro"],
            tpr["macro"],
            label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
            color="navy",
            linestyle=":",
            linewidth=2,
        )
        for i, color in enumerate(class_colors):
            roc_ax.plot(
                fpr[i],
                tpr[i],
                color=color,
                lw=lw,
                label="ROC curve of class {0} (area = {1:0.2f})".format(i+1, roc_auc[i]),
            )
        roc_ax.plot([0, 1], [0, 1], "k--", lw=lw)
        roc_ax.set_xlim(x_limits)
        roc_ax.set_ylim(y_limits)
        roc_ax.set_xlabel("False Positive Rate")
        roc_ax.set_ylabel("True Positive Rate")
        roc_ax.set_title(panel_title)
        roc_ax.legend(loc="lower right")

    plt.savefig(f'{imageTag}.png', facecolor='w', bbox_inches="tight",
                pad_inches=0.3, transparent=True)

    plt.show()
    return [n_classes, fpr, tpr, roc_auc, fpr['micro'], tpr['micro'], roc_auc['micro'],
            fpr['macro'], tpr['macro'], roc_auc['macro'], precision["micro"], recall["micro"],
            average_precision["micro"]]

In [7]:
def testRocPrecisionRecall(X, y, model, vectorizer, modelTag, imageTag):
    """Fit a one-vs-rest model on vectorized text and plot PR/ROC curves on a 2x2 grid.

    Labels are binarized against the classes ``[1, 2, 3, 4]``, the data is split
    80/20 with ``random_state=1``, ``vectorizer`` is fitted on the training
    texts and ``model`` is wrapped in ``OneVsRestClassifier``. All curves are
    computed from ``decision_function`` scores on the test split. The
    precision-recall panel is drawn top-left and the two ROC panels on the
    bottom row; the unused top-right cell is hidden.

    Parameters
    ----------
    X : object exposing ``.values`` after splitting (e.g. a pandas Series)
        Raw texts; they are cast to ``str`` before vectorization.
    y : array-like
        Class labels, binarized against ``[1, 2, 3, 4]``.
    model : estimator
        Base estimator; the wrapped classifier must support ``decision_function``.
    vectorizer : object with ``fit_transform`` / ``transform``
        Fitted in place on the training split (the caller's object is modified).
    modelTag : str
        Text inserted into the subplot titles.
    imageTag : str
        The figure is written to ``f'{imageTag}.png'``.

    Returns
    -------
    list
        ``[n_classes, fpr, tpr, roc_auc, fpr_micro, tpr_micro, roc_auc_micro,
        fpr_macro, tpr_macro, roc_auc_macro, precision_micro, recall_micro,
        average_precision_micro]`` where ``fpr``, ``tpr`` and ``roc_auc`` are
        dicts keyed by class index plus ``"micro"`` and ``"macro"``.
    """
    y_binarized = label_binarize(y, classes=[1, 2, 3, 4])
    n_classes = y_binarized.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binarized, test_size=0.2, random_state=1)

    train_features = vectorizer.fit_transform(X_train.values.astype('str'), y_train)
    test_features = vectorizer.transform(X_test.values.astype('str'))

    classifier = OneVsRestClassifier(model)
    classifier.fit(train_features, y_train)
    scores = classifier.decision_function(test_features)

    # ROC curve and area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], scores[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), scores.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of FPR points, then average
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Precision-recall curve and average precision for each class
    precision, recall, average_precision = {}, {}, {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], scores[:, i])
        average_precision[i] = average_precision_score(y_test[:, i], scores[:, i])

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), scores.ravel())
    average_precision["micro"] = average_precision_score(y_test, scores, average="micro")

    # plot
    fig, ax = plt.subplots(2, 2, figsize=(16, 12))
    fig.subplots_adjust(left=0.02, right=0.98, wspace=0.2)
    # Nothing is drawn in the top-right cell; hide it instead of saving an empty frame.
    ax[0, 1].axis("off")
    # One fixed colour per class, reused identically in every panel
    class_colors = [color for _, color in
                    zip(range(n_classes), cycle(["aqua", "darkorange", "cornflowerblue", "green"]))]
    lw = 1

    # Precision/Recall panel with iso-f1 guide curves
    pr_ax = ax[0, 0]
    recall_grid = np.linspace(0.01, 1)
    iso_line = None
    for f_score in np.linspace(0.2, 0.8, num=4):
        iso_precision = f_score * recall_grid / (2 * recall_grid - f_score)
        keep = iso_precision >= 0
        (iso_line,) = pr_ax.plot(recall_grid[keep], iso_precision[keep], color="gray", alpha=0.2)
        pr_ax.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, iso_precision[45] + 0.02))

    display = PrecisionRecallDisplay(
        recall=recall["micro"],
        precision=precision["micro"],
        average_precision=average_precision["micro"],
    )
    display.plot(ax=pr_ax, name="Micro-average precision-recall", color="deeppink", lw=lw, linestyle=":")

    for i, color in enumerate(class_colors):
        display = PrecisionRecallDisplay(
            recall=recall[i],
            precision=precision[i],
            average_precision=average_precision[i],
        )
        display.plot(ax=pr_ax, name=f"Precision-recall for class {i+1}", color=color, lw=lw)

    # add the legend entry for the iso-f1 curves (the last one drawn stands for all)
    handles, labels = pr_ax.get_legend_handles_labels()
    handles.append(iso_line)
    labels.append("iso-f1 curves")
    pr_ax.set_xlim([0.0, 1.0])
    pr_ax.set_ylim([0.0, 1.05])
    pr_ax.legend(handles=handles, labels=labels, loc="best")
    pr_ax.set_title(f"Extension of Precision-Recall curve to multi-class ({modelTag})")

    # ROC panels: the full view and a magnified view of the top-left corner
    roc_panels = (
        (ax[1, 0], [-0.05, 1.0], [0.0, 1.05],
         f"Some extension of Receiver operating characteristic to multiclass ({modelTag})"),
        (ax[1, 1], [-0.05, 0.5], [0.5, 1.02],
         f"Magnified Roc-Curve ({modelTag})"),
    )
    for roc_ax, x_limits, y_limits, panel_title in roc_panels:
        roc_ax.plot(
            fpr["micro"],
            tpr["micro"],
            label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
            color="deeppink",
            linestyle=":",
            linewidth=2,
        )
        roc_ax.plot(
            fpr["macro"],
            tpr["macro"],
            label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
            color="navy",
            linestyle=":",
            linewidth=2,
        )
        for i, color in enumerate(class_colors):
            roc_ax.plot(
                fpr[i],
                tpr[i],
                color=color,
                lw=lw,
                label="ROC curve of class {0} (area = {1:0.2f})".format(i+1, roc_auc[i]),
            )
        roc_ax.plot([0, 1], [0, 1], "k--", lw=lw)
        roc_ax.set_xlim(x_limits)
        roc_ax.set_ylim(y_limits)
        roc_ax.set_xlabel("False Positive Rate")
        roc_ax.set_ylabel("True Positive Rate")
        roc_ax.set_title(panel_title)
        roc_ax.legend(loc="lower right")

    plt.savefig(f'{imageTag}.png', facecolor='w', bbox_inches="tight",
                pad_inches=0.3, transparent=True)

    plt.show()
    return [n_classes, fpr, tpr, roc_auc, fpr['micro'], tpr['micro'], roc_auc['micro'],
            fpr['macro'], tpr['macro'], roc_auc['macro'], precision["micro"], recall["micro"],
            average_precision["micro"]]

In [8]:
def calcRocAuc(X, y, model, vectorizer, modelTag, imageTag):
    """Fit a one-vs-rest model on vectorized text and plot its multi-class ROC curves.

    Labels are binarized against the classes ``[1, 2, 3, 4]``, the data is split
    80/20 with ``random_state=1``, ``vectorizer`` is fitted on the training
    texts and ``model`` is wrapped in ``OneVsRestClassifier``. ROC curves are
    computed from ``decision_function`` scores on the test split and drawn
    twice: a full view and a magnified view of the top-left corner.

    Parameters
    ----------
    X : object exposing ``.values`` after splitting (e.g. a pandas Series)
        Raw texts; they are cast to ``str`` before vectorization.
    y : array-like
        Class labels, binarized against ``[1, 2, 3, 4]``.
    model : estimator
        Base estimator; the wrapped classifier must support ``decision_function``.
    vectorizer : object with ``fit_transform`` / ``transform``
        Fitted in place on the training split (the caller's object is modified).
    modelTag : str
        Text inserted into the subplot titles.
    imageTag : str
        The figure is written to ``f'{imageTag}.png'``.

    Returns
    -------
    list
        ``[n_classes, fpr, tpr, roc_auc, fpr_micro, tpr_micro, roc_auc_micro,
        fpr_macro, tpr_macro, roc_auc_macro]`` where ``fpr``, ``tpr`` and
        ``roc_auc`` are dicts keyed by class index plus ``"micro"`` and ``"macro"``.
    """
    y_binarized = label_binarize(y, classes=[1, 2, 3, 4])
    n_classes = y_binarized.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binarized, test_size=0.2, random_state=1)

    train_features = vectorizer.fit_transform(X_train.values.astype('str'), y_train)
    test_features = vectorizer.transform(X_test.values.astype('str'))

    classifier = OneVsRestClassifier(model)
    classifier.fit(train_features, y_train)
    scores = classifier.decision_function(test_features)

    # ROC curve and area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], scores[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), scores.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of FPR points, then average
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves on two side-by-side panels
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    # One fixed colour per class, reused identically in both panels
    class_colors = [color for _, color in
                    zip(range(n_classes), cycle(["aqua", "darkorange", "cornflowerblue", "green"]))]
    lw = 1

    roc_panels = (
        (axes[0], [-0.05, 1.0], [0.0, 1.05],
         f"Some extension of Receiver operating characteristic to multiclass ({modelTag})"),
        (axes[1], [-0.05, 0.5], [0.5, 1.02],
         f"Magnified Roc-Curve ({modelTag})"),
    )
    for roc_ax, x_limits, y_limits, panel_title in roc_panels:
        roc_ax.plot(
            fpr["micro"],
            tpr["micro"],
            label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
            color="deeppink",
            linestyle=":",
            linewidth=2,
        )
        roc_ax.plot(
            fpr["macro"],
            tpr["macro"],
            label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
            color="navy",
            linestyle=":",
            linewidth=2,
        )
        for i, color in enumerate(class_colors):
            roc_ax.plot(
                fpr[i],
                tpr[i],
                color=color,
                lw=lw,
                label="ROC curve of class {0} (area = {1:0.2f})".format(i+1, roc_auc[i]),
            )
        roc_ax.plot([0, 1], [0, 1], "k--", lw=lw)
        roc_ax.set_xlim(x_limits)
        roc_ax.set_ylim(y_limits)
        roc_ax.set_xlabel("False Positive Rate")
        roc_ax.set_ylabel("True Positive Rate")
        roc_ax.set_title(panel_title)
        roc_ax.legend(loc="lower right")

    plt.savefig(f'{imageTag}.png', facecolor='w', bbox_inches="tight",
                pad_inches=0.3, transparent=True)

    plt.show()
    return [n_classes, fpr, tpr, roc_auc, fpr['micro'], tpr['micro'], roc_auc['micro'],
            fpr['macro'], tpr['macro'], roc_auc['macro']]