# 1. Import the required Libraries

In [1]:
pip install Unidecode

Collecting Unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 24.8 MB/s eta 0:00:01[K     |██▉                             | 20 kB 10.1 MB/s eta 0:00:01[K     |████▏                           | 30 kB 8.2 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 3.5 MB/s eta 0:00:01[K     |███████                         | 51 kB 3.6 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 4.2 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 4.5 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 4.5 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 5.0 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 4.2 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 4.2 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 4.2 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 4.2 MB/s eta 0:00:01

In [61]:
import re
import pickle
import warnings
import unidecode
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from tensorflow.keras import preprocessing, models, backend
from tensorflow.keras.losses import binary_crossentropy
from custom_utility import emoticon_dictionary, contraction_dictionary

pd.options.display.max_columns = None # To display all the columns of a Dataframe.
pd.options.display.max_colwidth = 120 # Increase the width of columns when display a DataFrame to avoid wrapping of text(s).
warnings.filterwarnings('ignore') # Ignore any warnings and do not show them in the cell output.

In [2]:
from google.colab import drive
# Mount GDrive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


----

# 2. Utility Class for Text Pre-processing

* Here, we define a utility class with various methods to perform text pre-processing.

In [5]:
class TextPreprocess:
    
    '''
    Class containing all the methods for doing text pre-processing.
    By default this uses the constants defined in the custom package's ('custom_utility') modules:
    'emoticon_dictionary' and 'contraction_dictionary'. Alternative dictionaries can be injected
    through the constructor.
    '''
    
    def __init__(self, emoticonsDict=None, contractionMap=None):
        '''
        Parameters:
        ----------
        emoticonsDict: dict or None
            Mapping of emoticon symbol -> emoticon name. When None, 'emoticon_dictionary.emoticonsDict' is used.
        contractionMap: dict or None
            Mapping of contraction pattern -> expanded form. The keys are passed to 're.sub' as patterns.
            When None, 'contraction_dictionary.contractionMap' is used.
        '''
        
        # Fall back to the dictionaries shipped with the custom package when none are injected.
        self.emoticonsDict = emoticon_dictionary.emoticonsDict if emoticonsDict is None else emoticonsDict
        self.contractionMap = contraction_dictionary.contractionMap if contractionMap is None else contractionMap


    @staticmethod
    def _collapseSpaces(text):
        '''
        Helper to replace every run of two or more spaces by a single space.
        '''

        return re.sub(r'[ ][ ]+', ' ', text)
        
        
    def removeHTMLTags(self, text):
        '''
        Function to remove the HTML Tags from a given text.

        Parameter:
        ---------
        text: str
            Text from which the HTML tags has to be removed.

        Returns:
        -------
        str
            The visible text fragments of the document joined by single spaces.
        '''

        # Reference: 'Remove html tags using BeautifulSoup' - https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/

        # Create a BeautifulSoup object to parse the given html text content
        soup = BeautifulSoup(text, 'html.parser')

        # Drop <style> and <script> elements entirely: their content is styling/javascript, not comment text.
        for tag in soup(['style', 'script']):
            tag.decompose()

        # Return the html tag free content
        return ' '.join(soup.stripped_strings)


    def removeAccentedChars(self, text):
        '''
        Function to remove the accented characters from a given text.

        Parameter:
        ---------
        text: str
            Text from which the accented character has to be removed.

        Returns:
        -------
        str
            The text as transliterated by 'unidecode.unidecode'.
        '''
        
        # Reference: "remove accented characters python" - https://www.geeksforgeeks.org/how-to-remove-string-accents-using-python-3/

        return unidecode.unidecode(text)


    def lowercase(self, text):
        '''
        Function to convert a given text to its lowercase.

        Parameter:
        ---------
        text: str
            Text that has to be converted to lowercase.
        '''

        return text.lower()


    def removeIPLinkNum(self, text, ipAddress=True, hyperlink=False, numbers=True):
        '''
        Function to remove IP Addresses, hyperlinks and digits from the given text.

        Parameter:
        ---------
        text: str
            Text from which IP Address, hyperlink(s) and number(s) have to be removed.
        ipAddress: boolean
            Flag to remove IPv4-looking addresses.
        hyperlink: boolean
            Flag to remove hyperlinks. Note that it is off by default.
        numbers: boolean
            Flag to remove every digit character.

        Returns:
        -------
        str
            The cleaned text with runs of spaces collapsed to a single space.
        '''

        # Replace IP Address with empty string.
        # Reference: 'Remove IP Address Python' - https://www.geeksforgeeks.org/extract-ip-address-from-file-using-python/#:~:text=The%20regular%20expression%20for%20valid,%5C.)%7B
        if ipAddress:

            text = re.sub(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', '', text)

        # Remove hyperlinks
        # Reference: 'Regex for hyperlinks Python' - https://www.geeksforgeeks.org/python-check-url-string/
        if hyperlink:

            text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", "", text)

        # Remove numbers (digit by digit).
        if numbers:

            text = re.sub(r'[0-9]', '', text)

        # Remove the extra space if any.
        return self._collapseSpaces(text)


    def replaceEmoticons(self, text):
        '''
        Function to replace emoticons by their corresponding names.

        Every occurrence of a key of 'self.emoticonsDict' is replaced by its name, with the words of the
        name joined by underscores and a space added on each side. Matching is exact (case-sensitive).

        Parameter:
        ---------
        text: str
            Text in which the emoticons have to be replaced.

        Returns:
        -------
        str
            Text with emoticons replaced; runs of spaces are collapsed whenever the dictionary is non-empty.
        '''

        if not self.emoticonsDict:

            return text

        # Collapse spaces once up-front. Afterwards the text only needs re-collapsing when a replacement
        # actually happened, which avoids one regex pass per dictionary entry.
        text = self._collapseSpaces(text)

        for emoticon, name in self.emoticonsDict.items():

            if emoticon not in text:

                continue

            word = "_".join(name.split())

            text = text.replace(emoticon, ' ' + word + ' ')

            # The padding spaces may have created runs of spaces.
            text = self._collapseSpaces(text)

        return text


    def removeSpecialChars(self, text, removeAll=False):
        '''
        Function to remove the special characters from the given text.

        Parameter:
        ---------
        text: str
            Text from which the special characters have to be removed.
        removeAll: boolean
            Flag to check whether to remove all special characters or all except ' . ? !

        Returns:
        -------
        str
            Text containing only ASCII letters and spaces (plus ' . ? ! when removeAll is False).
        '''

        if removeAll:

            text = re.sub(r'[^A-Za-z ]+', '', text) # Remove all special characters.

        else:

            text = re.sub(r'[^A-Za-z\'.?! ]+', '', text) # Remove all special characters except ' . ? !

        # Remove the extra space if any.
        return self._collapseSpaces(text)


    def processSpecialTokens(self, text, isBERTUsed=False):
        '''
        Function to add one space around sentence end markers and remove duplicates.

        Parameter:
        ---------
        text: str
            Text in which space has to be added around sentence end tokens.
        isBERTUsed: boolean
            Boolean flag to indicate if BERT is used in the modelling, then do not apply this pre-processing.

        Returns:
        -------
        str
            Text where each run of '!', '?' or '.' is replaced by a single, space-delimited marker.
        '''

        if isBERTUsed:

            return text

        text = re.sub(r'[!]+[ ]*[!]*', ' ! ', text) # Collapse a run of '!' into one space-delimited '!'.
        text = re.sub(r'[?]+[ ]*[?]*', ' ? ', text) # Collapse a run of '?' into one space-delimited '?'.
        text = re.sub(r'[.]+[ ]*[.]*', ' . ', text) # Collapse a run of '.' into one space-delimited '.'.

        # Remove the extra space if any.
        return self._collapseSpaces(text)


    def decontract(self, text, isBERTUsed=False):
        '''
        Function to decontract a given text.

        The text is lowercased before matching, so the result is lowercase whenever the contraction map
        is non-empty (independent of the 'lower' flag of 'preprocess()').

        Parameter:
        ---------
        text: str
            Text to be decontracted.
        isBERTUsed: boolean
            Boolean flag to indicate if BERT is used in the modelling, then do not apply this pre-processing.

        Returns:
        -------
        str
            Text with every match of a contraction pattern replaced by its expanded form.
        '''

        if isBERTUsed or not self.contractionMap:

            return text

        # Lowercase once, and again only after a substitution actually changed the text (an expansion may
        # contain uppercase characters). This gives the same result as lowercasing before every pattern.
        text = self.lowercase(text)
        isLowercased = True

        for pattern, expansion in self.contractionMap.items():

            if not isLowercased:

                text = self.lowercase(text)

            # Replace the contracted word with its decontracted form.
            text, replacements = re.subn(pattern, expansion, text)
            isLowercased = replacements == 0

        return text


    def preprocess(self, text, html=True, accent=True, lower=True, ipLinkNum=True, emoticon=True, specialChar=True, 
                   specialToken=True, decontraction=True, isBERTUsed=False, removeAllSpecialChar=False, hyperlink=False):
        '''
        Function to perform all the data-preprocessing on a given text.

        The steps run in a fixed order: html -> accent -> lower -> ipLinkNum -> emoticon -> specialChar ->
        specialToken -> decontraction.

        Parameters:
        ----------
        text: str
            Text on which the pre-processing has to be performed.
        html: boolean
            Flag to check whether to remove html tags from the text or not.
        accent: boolean
            Flag to check whether to remove the accented characters from the text or not.
        lower: boolean
            Flag to check whether to perform lowercase on the text or not.
        ipLinkNum: boolean
            Flag to check whether to remove the IP Address, Hyperlink(s) and number(s) from the text or not.
        emoticon: boolean
            Flag to check whether to replace the emoticons with their corresponding words in the text or not.
        specialChar: boolean
            Flag to check whether to remove the special characters from the text or not.
        specialToken: boolean
            Flag to check whether to add spaces around the sentence end tokens in the text or not.
        decontraction: boolean
            Flag to check whether to do decontraction in the given text or not.
        isBERTUsed: boolean
            Boolean flag to indicate if BERT is used in the modelling, then the special-token and
            decontraction steps are skipped.
        removeAllSpecialChar: boolean
            Flag to check whether to remove all special characters or all except ' . ? !
        hyperlink: boolean
            Flag to check whether to remove the hyperlink from the text or not.

        Returns:
        -------
        str
            The pre-processed text.
        '''

        # NOTE(review): the step order has side effects worth confirming against the training pipeline:
        #   * lowercasing and digit removal run before emoticon replacement, so emoticon keys containing
        #     uppercase letters or digits cannot match at that point;
        #   * the '_' joining multi-word emoticon names is removed again by 'removeSpecialChars()';
        #   * with removeAllSpecialChar=True the apostrophes are gone before 'decontract()' runs.
        # The order is intentionally left unchanged here: presumably the tokenizer/model were fitted on
        # text produced by exactly this pipeline - TODO confirm before re-ordering.

        if html:

            # Call the function 'removeHTMLTags()' to remove the html tags from the html content
            text = self.removeHTMLTags(text)

        if accent:

            # Call the function 'removeAccentedChars()' to remove the accented characters from the text.
            text = self.removeAccentedChars(text)

        if lower:

            # Call the function 'lowercase()' to convert the text to its lowercase.
            text = self.lowercase(text)

        if ipLinkNum: 

            # Call the 'removeIPLinkNum()' to remove the IP Address, Hyperlinks and numbers from the text.
            text = self.removeIPLinkNum(text, hyperlink=hyperlink)

        if emoticon:

            # Call the 'replaceEmoticons()' to replace emoticons by their corresponding words.
            text = self.replaceEmoticons(text)

        if specialChar:

            # Call the 'removeSpecialChars()' to remove the special characters from a text.
            text = self.removeSpecialChars(text, removeAllSpecialChar)

        if specialToken:

            # Call the 'processSpecialTokens' to add space around sentence end tokens.
            text = self.processSpecialTokens(text, isBERTUsed)

        if decontraction:

            # Call the 'decontract()' function to decontract a given text. 
            text = self.decontract(text, isBERTUsed)

        return text

#### Example of 'TextPreprocess' Class to pre-process a sample text

In [7]:
%%time
# Create an object of the class 'TextPreprocess' (uses the dictionaries from the 'custom_utility' package).
textPreprocessObj = TextPreprocess()

# Define a Text to be pre-processed. It contains a hyperlink, a contraction, an IP address, an html tag,
# accented characters, a number, an emoticon and special characters, so that every step is exercised.
# The trailing backslashes continue the string literal, so the indentation of the following lines becomes
# part of the text; the space-collapsing done during pre-processing removes those extra spaces again.
text = 'The content is downloaded from http://www.wikipedia.com , which could\'ve also been downloaded from the IP Address\
        192.128.10.10. The content has some html content as well (<span>Some span content</span>). Few accent characters:\
        orčpžsíáýd stävänger hell. The content was downloaded in 2022. Isn\'t that a recent activity? :-) &*!@&'

# Call the 'preprocess()' method of the class to pre-process the given text.
# 'hyperlink=True' is passed explicitly because hyperlink removal is disabled by default.
print(textPreprocessObj.preprocess(text, hyperlink=True))

the content is downloaded from which could have also been downloaded from the ip address . the content has some html content as well some span content . few accent characters orcpzsiayd stavanger hell . the content was downloaded in . is not that a recent activity ? happy ! mention 
CPU times: user 2.16 ms, sys: 1.04 ms, total: 3.2 ms
Wall time: 3.23 ms


----

# 3. Load the pre-trained Object(s) and Model(s)

* Load the pre-trained object(s) and model(s) at once and re-use them wherever required.

## 3.1. Load the Tokenizer Object

* Load the Tokenizer Object to be used while predicting the text(s).

In [48]:
%%time
# Location of the pickled tokenizer object (trained on the training data) on the mounted Google Drive.
# Local alternative: 'Resources/tokenizer.pkl'
tokenizerObjFile = '/content/gdrive/My Drive/Case Study 2/tokenizer.pkl'

# Deserialize the tokenizer.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that come from a trusted source.
with open(tokenizerObjFile, mode='rb') as tokenizerFile:
    tokenizer = pickle.load(tokenizerFile)

CPU times: user 741 ms, sys: 64.3 ms, total: 805 ms
Wall time: 872 ms


## 3.2. Custom Loss Function

* Define the custom loss function used while training the model during the training phase.
* This custom function has to be defined and passed in the custom object configuration while loading the pre-trained model.

In [49]:
# Reference: https://github.com/nidhibansal1902/Jigsaw-Unintended-Bias-in-Toxicity-Classification/blob/master/Jigsaw-LSTM%20with%20Glove%20Embedding%20New.ipynb
def customLoss(yActual, yPred):
    '''
    Function to calculate the binary cross-entropy loss for the toxic class label, scaled per sample.
    
    Parameters:
    -----------
    yActual: array-like with at least two columns
        Column 0 is used as the target passed to the cross-entropy; column 1 multiplies the resulting
        loss (presumably a per-sample weight - verify against the training code).
    yPred: array-like
        Predicted values for the toxic class.
    '''
    
    # Column 0 reshaped to a column vector so that it lines up with the predictions.
    targetColumn = backend.reshape(yActual[:, 0], (-1, 1))
    
    # Column 1 scales the loss of each sample.
    lossMultiplier = yActual[:, 1]
    
    return binary_crossentropy(targetColumn, yPred) * lossMultiplier

## 3.3. Load the Pre-trained Model

* Load the pre-trained model to be used for predicting the tokenized and padded text(s)

In [50]:
%%time
# Load the pre-trained best model (BiLSTM) used for prediction from the mounted Google Drive.
# Local alternative path: 'BestModels/modelBiLSTM.h5'
# The custom loss used during training must be registered, otherwise the saved model cannot be restored.
model = models.load_model(
    '/content/gdrive/My Drive/Case Study 2/BestModels/modelBiLSTM.h5',
    custom_objects=dict(customLoss=customLoss),
)

CPU times: user 4.45 s, sys: 1.74 s, total: 6.19 s
Wall time: 5.13 s


----

# 4. Function 1

* Function to predict the toxicity of the given text(s).

In [8]:
def function1(text, tokenizerObj, model, textFeature='comment_text', maxSeqLen=210, paddingType='post', threshold=0.5):
    '''
    Function to implement the data pipeline for transforming the dataset into the required format as required by the Model
    and predict whether the given text is toxic or not, along with the toxicity score.
    
    Parameters:
    ----------
    text: str, Series, DataFrame or any other iterable of str containing the comment text(s)
        Comment Text(s) to be checked for toxicity
    tokenizerObj: Tokenizer
        Tokenizer object to be used for tokenizing the text(s).
    model: keras.engine.functional.Functional
        Pre-trained Model for doing the predictions.
    textFeature: str
        Name of the feature containing comment texts in case a DataFrame is passed as input.
    maxSeqLen: int
        Maximum sequence length.
    paddingType: str
        Type of padding to be done: post or pre.
    threshold: float
        Probability at or above which a text is labelled as toxic ('Yes').

    Returns:
    -------
    DataFrame
        One row per input text with the columns 'Comment Text' (original text), 'Toxic' ('Yes'/'No') and
        'Probability' (toxicity score). The index of a passed Series/DataFrame is preserved.
    '''
    
    # region - Data Pre-processing -----------------------------------------------------------------------------------
    # ----------------------------------------------------------------------------------------------------------------
    
    # Normalize the input to a Series so that all the following steps can treat it uniformly.
    if isinstance(text, str):
        
        rawText = pd.Series(text)
    
    elif isinstance(text, pd.DataFrame):
        
        # Extract the comment text feature.
        rawText = text[textFeature]
        
    elif isinstance(text, pd.Series):
        
        rawText = text
        
    else:
        
        # Lists, tuples, arrays and other iterables of texts.
        rawText = pd.Series(list(text))
        
    # Create an object of the class 'TextPreprocess'.
    textPreprocessObj = TextPreprocess()
        
    # Pre-process every comment text with the default pre-processing options.
    preprocessedText = rawText.apply(textPreprocessObj.preprocess)
    
    # endregion - Data Pre-processing --------------------------------------------------------------------------------
    # ----------------------------------------------------------------------------------------------------------------
    
    
    
    # region - Tokenization ------------------------------------------------------------------------------------------
    # ----------------------------------------------------------------------------------------------------------------

    # Do integer encoding of the input text(s).
    intEncodedTexts = tokenizerObj.texts_to_sequences(preprocessedText)
    
    # Pad the integer encoded comment texts to a fixed length.
    paddedText = preprocessing.sequence.pad_sequences(intEncodedTexts, maxlen=maxSeqLen, padding=paddingType)

    # endregion - Tokenization ---------------------------------------------------------------------------------------
    # ----------------------------------------------------------------------------------------------------------------
    
    
    
    # region - Prediction --------------------------------------------------------------------------------------------
    # ----------------------------------------------------------------------------------------------------------------
    
    predictions = model.predict(paddedText)
    
    # A multi-output model returns one array per output; the first output is used as the toxicity score.
    # A single-output model returns the array directly, in which case it must not be indexed by sample.
    if isinstance(predictions, (list, tuple)):
        
        predictions = predictions[0]
        
    yPredProb = np.asarray(predictions).flatten()
    yPredToxic = ['Yes' if prob >= threshold else 'No' for prob in yPredProb]
    
    # endregion - Prediction -----------------------------------------------------------------------------------------
    # ----------------------------------------------------------------------------------------------------------------


    
    # Create a DataFrame containing original text, Toxic Class and Toxicity Score (Probability)
    result = pd.DataFrame({'Comment Text': rawText, 'Toxic': yPredToxic, 'Probability': yPredProb})
    
    return result

## 4.1. Prediction

In [11]:
%%time
# Load the test set from the mounted Google Drive (local alternative: 'Data/test.csv').
data = pd.read_csv('/content/gdrive/My Drive/Case Study 2/test.csv')

CPU times: user 450 ms, sys: 182 ms, total: 632 ms
Wall time: 677 ms


In [26]:
%%time
# Call the 'function1()' function to know the toxicity of the given text(s).
# Only a slice of ten test comments is scored; the slice keeps its original index in the result.
predictionResult = function1(data['comment_text'][1195:1205], tokenizerObj=tokenizer, model=model)

CPU times: user 728 ms, sys: 1.61 s, total: 2.34 s
Wall time: 1.8 s


In [28]:
# Display the prediction results.
predictionResult

Unnamed: 0,Comment Text,Toxic,Probability
1195,"Niagara (I will refer to you from now on as Niagara, Golden Horseshoe person is too long to type), the same adoratio...",No,0.000645
1196,"I disagree. Mr. Trudeau weighed the pros and cons of these projects, and made a decision that tried to balance the ...",No,0.000349
1197,I see you are an optimistic person….,No,0.053182
1198,But didn't matter damaging my country's reputation with a fucked up show! Tourists come here and believe that a jung...,Yes,0.852723
1199,"Not exactly, Sarasi1.\n Gutenburg did stand apart in his invention.....he built and improved upon what the Chinese -...",No,0.009072
1200,"""Even the WNBA has the night off."" If you had simply left the word ""even"" out of that sentence, I wouldn't have blin...",No,0.000697
1201,Maybe he started the car again with the garage doors closed?,No,0.001099
1202,Bernstein is just an over-the-hill one-trick-pony looking for a comeback.,No,0.417608
1203,Independent Senator? Doubtful. Canada is under no obligation to take these people and shouldn't rush to offering sol...,No,0.000501
1204,Bernstein calls for doing a different thing. But news journalists are already doing it. Its called lying.,No,0.023026


In [44]:
%%time
# Let's check the prediction result for a single sample text passed as a plain string.
text = 'The new candidate is irresponsible'
# Call the 'function1()' function to know the toxicity of the given text(s).
predictionResult = function1(text, tokenizerObj=tokenizer, model=model)

CPU times: user 153 ms, sys: 3.34 ms, total: 157 ms
Wall time: 93.8 ms


In [45]:
# Display the prediction result
predictionResult

Unnamed: 0,Comment Text,Toxic,Probability
0,The new candidate is irresponsible,No,0.215365


In [46]:
%%time
# Let's check the prediction result for a variant of the previous sample text with the word 'dumb' added.
text = 'The new candidate is dumb and irresponsible'
# Call the 'function1()' function to know the toxicity of the given text(s).
predictionResult = function1(text, tokenizerObj=tokenizer, model=model)

CPU times: user 212 ms, sys: 10.3 ms, total: 222 ms
Wall time: 129 ms


In [47]:
# Display the prediction result
predictionResult

Unnamed: 0,Comment Text,Toxic,Probability
0,The new candidate is dumb and irresponsible,Yes,0.993099


----

# 5. Function 2

* Function to calculate the performance metrics and show the values.

## 5.1. Performance Metrics

* We will define the performance metrics required for the Problem as given in [Kaggle](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/overview/evaluation).
* A newly developed metric that combines several sub metrics to balance overall performance with various aspects of unintended bias, is used.
* **Reference has been taken from [Benchmark Kernel](https://www.kaggle.com/dborkan/benchmark-kernel/notebook#Define-bias-metrics,-then-evaluate-our-new-model-for-bias-using-the-validation-set-predictions)**

In [52]:
class PerformanceMetric:
    '''
    Class containing various methods to calculate the final metric score.

    The final score combines the overall AUC with the generalized mean of three per-subgroup bias AUCs
    (Subgroup AUC, BPSN AUC and BNSP AUC).
    '''

    def __init__(self):

        pass    

    def computeAUC(self, yActual, yPredProb):
        '''
        Function to calculate the AUC.
        
        Parameters:
        ----------
        yActual: array-like
            Ground truth (correct) class labels for 'n' samples on CV/Test Dataset.
        yPredProb: array-like
            Predicted probabilities, as returned by a model's predict_proba method on CV/Test Dataset.

        Returns:
        -------
        float
            The ROC AUC score, or NaN when 'roc_auc_score' rejects the input with a ValueError
            (e.g. when only one class is present in 'yActual').
        '''
        
        try:
            
            return roc_auc_score(yActual, yPredProb)

        except ValueError:
            
            # The AUC is undefined for this subset; report it as missing instead of failing the evaluation.
            return np.nan


    def computeSubgroupAUC(self, data, subgroup, actualClassLabel, predClassLabel):
        '''
        Function to compute the AUC restricted to the examples that belong to the given identity subgroup.
        
        Parameters:
        ----------
        data: DataFrame
            Dataset containing the CV or Test dataset to be evaluated.
        subgroup: str
            Name of the Identity Subgroup. The column must contain boolean values.
        actualClassLabel: str
            Name of the Class Label. Note that the Class Label should have binary values i.e., 0 and 1 and not toxicity scores.
        predClassLabel: str
            Name of the Class Label containing the Predicted probabilities, as returned by a model's predict_proba method.

        Returns:
        -------
        float
            Subgroup AUC (NaN when undefined).
        '''
        
        # Keep only the rows flagged as belonging to the subgroup.
        subgroupSet = data[data[subgroup]]
        
        # Compute and return the AUC Score of the subgroup
        return self.computeAUC(subgroupSet[actualClassLabel], subgroupSet[predClassLabel])


    def computeBPSNAUC(self, data, subgroup, actualClassLabel, predClassLabel):
        '''
        Function to compute the AUC of the within-subgroup negative examples and the background positive examples.
        
        Parameters:
        ----------
        data: DataFrame
            Dataset containing the CV or Test dataset to be evaluated.
        subgroup: str
            Name of the Identity Subgroup. The column must contain boolean values.
        actualClassLabel: str
            Name of the Class Label. Note that the Class Label should have binary values i.e., 0 and 1 and not toxicity scores.
        predClassLabel: str
            Name of the Class Label containing the Predicted probabilities, as returned by a model's predict_proba method.

        Returns:
        -------
        float
            BPSN (Background Positive, Subgroup Negative) AUC (NaN when undefined).
        '''
        
        # Cast the label to boolean so that '~' is a logical NOT; on an integer 0/1 column it would be a bitwise NOT.
        isPositive = data[actualClassLabel].astype(bool)
        
        # Create a subset of Subgroup Negative from the given dataset.
        subgroupNegativeSet = data[data[subgroup] & ~isPositive]
        
        # Create a subset of Background Positive from the given dataset.
        backgroundPositiveSet = data[~data[subgroup] & isPositive]
        
        # Combine both the Subgroup Negative and Background Positive subsets.
        # ('DataFrame.append' was removed in pandas 2.0; 'pd.concat' works on all versions.)
        finalSet = pd.concat([subgroupNegativeSet, backgroundPositiveSet])
        
        # Compute and return the BPSN AUC Score.
        return self.computeAUC(finalSet[actualClassLabel], finalSet[predClassLabel])


    def computeBNSPAUC(self, data, subgroup, actualClassLabel, predClassLabel):
        '''
        Function to compute the AUC of the within-subgroup positive examples and the background negative examples.
        
        Parameters:
        ----------
        data: DataFrame
            Dataset containing the CV or Test dataset to be evaluated.
        subgroup: str
            Name of the Identity Subgroup. The column must contain boolean values.
        actualClassLabel: str
            Name of the Class Label. Note that the Class Label should have binary values i.e., 0 and 1 and not toxicity scores.
        predClassLabel: str
            Name of the Class Label containing the Predicted probabilities, as returned by a model's predict_proba method.

        Returns:
        -------
        float
            BNSP (Background Negative, Subgroup Positive) AUC (NaN when undefined).
        '''
        
        # Cast the label to boolean so that '~' is a logical NOT; on an integer 0/1 column it would be a bitwise NOT.
        isPositive = data[actualClassLabel].astype(bool)
        
        # Create a subset of Subgroup Positive from the given dataset.
        subgroupPositiveSet = data[data[subgroup] & isPositive]
        
        # Create a subset of Background Negative from the given dataset.
        backgroundNegativeSet = data[~data[subgroup] & ~isPositive]
        
        # Combine both the Subgroup Positive and Background Negative subsets.
        # ('DataFrame.append' was removed in pandas 2.0; 'pd.concat' works on all versions.)
        finalSet = pd.concat([subgroupPositiveSet, backgroundNegativeSet])
        
        # Compute and return the BNSP AUC Score.
        return self.computeAUC(finalSet[actualClassLabel], finalSet[predClassLabel])


    def computeBiasMetricsForModel(self, data, subgroups, predClassLabel, actualClassLabel):
        '''
        Function to compute per-subgroup metrics for all subgroups and one model.
        
        Parameters:
        ----------
        data: DataFrame
            Dataset containing the CV or Test dataset to be evaluated.
        subgroups: list
            List of the names of Identity Subgroups. Each of these columns must contain boolean values.
        predClassLabel: str
            Name of the Class Label containing the Predicted probabilities, as returned by a model's predict_proba method.
        actualClassLabel: str
            Name of the Class Label. Note that the Class Label should have binary values i.e., 0 and 1 and not toxicity scores.

        Returns:
        -------
        DataFrame
            One row per subgroup with the columns 'Subgroup', 'Subgroup Size', 'Subgroup AUC', 'BPSN AUC' and
            'BNSP AUC', sorted by 'Subgroup AUC' in ascending order.
        '''
        
        metricScores = list() # List to store the performance metric scores for each identity subgroup.
        
        # Define few constants to be used for dictionary keys.
        subgroupName, subgroupSize, subgroupAUC, bpsnAUC, bnspAUC = 'Subgroup', 'Subgroup Size', 'Subgroup AUC', 'BPSN AUC', \
                                                                'BNSP AUC'
        
        # Iterate through each of the identity subgroup and compute different AUC scores defined by the functions above.
        for subgroup in subgroups:
            
            # Define a dictionary to store the performance metric scores for the current subgroup in the iteration.
            metricScore = {
                subgroupName: subgroup,
                subgroupSize: len(data[data[subgroup]])
            }
            
            # Subgroup AUC
            metricScore[subgroupAUC] = self.computeSubgroupAUC(data=data, subgroup=subgroup, actualClassLabel=actualClassLabel,
                                                            predClassLabel=predClassLabel)
            
            # BPSN AUC
            metricScore[bpsnAUC] = self.computeBPSNAUC(data=data, subgroup=subgroup, actualClassLabel=actualClassLabel,
                                                            predClassLabel=predClassLabel)
            
            # BNSP AUC
            metricScore[bnspAUC] = self.computeBNSPAUC(data=data, subgroup=subgroup, actualClassLabel=actualClassLabel,
                                                            predClassLabel=predClassLabel)
            
            # Append the metric scores for the subgroup to the list 'metricScores'
            metricScores.append(metricScore)
            
        # Return the DataFrame containing the final performance metric scores for all identity subgroup.
        return pd.DataFrame(metricScores).sort_values(subgroupAUC)


    def computePowerMean(self, biasMetric, p=-5):
        '''
        Function to compute the generalized mean of Bias AUCs.
        
        Parameters:
        -----------
        biasMetric: Series
            Series containing the bias metric for all the identity subgroups.
        p: float
            Value to be used for the power on bias metrics.

        Returns:
        -------
        float
            The p-th power mean: (sum(x ** p) / n) ** (1 / p). A NaN entry makes the result NaN.
        '''
        
        # Calculate the sum of the pth power of the bias metrics.
        # The builtin 'sum' is used on purpose: unlike 'Series.sum()' it does not skip NaN values.
        total = sum(np.power(biasMetric, p))
        
        # Return the generalized mean of the Bias AUCs
        return np.power(total / len(biasMetric), 1/p)


    def computeOverallAUC(self, data, actualClassLabel, predClassLabel):
        '''
        Function to compute the overall AUC.
        
        Parameters:
        ----------
        data: DataFrame
            Dataset containing the CV or Test dataset to be evaluated.
        actualClassLabel: str
            Name of the Class Label. Note that the Class Label should have binary values i.e., 0 and 1 and not toxicity scores.
        predClassLabel: str
            Name of the Class Label containing the Predicted probabilities, as returned by a model's predict_proba method.

        Returns:
        -------
        float
            AUC over the complete dataset (NaN when undefined).
        '''
        
        # Compute the AUC from the actual class labels and the predicted class probabilities.
        return self.computeAUC(yActual=data[actualClassLabel], yPredProb=data[predClassLabel])


    def computeFinalMetric(self, biasDF, overallAUC, p=-5, overallModelWeight=0.25):
        '''
        Function to compute the final metric score.
        
        Parameters:
        ----------
        biasDF: DataFrame
            Dataset containing the final performance metric scores for all identity subgroup
            (columns 'Subgroup AUC', 'BPSN AUC' and 'BNSP AUC').
        overallAUC: float    
            Overall AUC computed from the computeOverallAUC() function.
        p: float
            Value to be used for the power on bias metrics.
        overallModelWeight: float
            Weight of the overall AUC; the remaining weight (1 - overallModelWeight) is given to the average
            of the three bias power means, i.e. 0.25 for each of the four sub-metrics by default.

        Returns:
        -------
        float
            overallModelWeight * overallAUC + (1 - overallModelWeight) * mean(bias power means).
        '''
        
        # Define few constants to be used for dictionary keys.
        subgroupAUC, bpsnAUC, bnspAUC = 'Subgroup AUC', 'BPSN AUC', 'BNSP AUC'
        
        # Get the average of the generalized mean of each metrics.
        biasScore = np.average([
            self.computePowerMean(biasDF[subgroupAUC], p),
            self.computePowerMean(biasDF[bpsnAUC], p),
            self.computePowerMean(biasDF[bnspAUC], p)
        ])
        
        # Return the final metric score.
        return (overallModelWeight * overallAUC) + ((1 - overallModelWeight) * biasScore)

In [53]:
%%time
# Create an object of the class 'PerformanceMetric'.
# It is created once at notebook level and re-used by 'function2()'.
performanceMetricObj = PerformanceMetric()

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 14.1 µs


In [60]:
# Notebook-level constants.

# Identity subgroup columns for which the bias metrics are computed.
identitySubgroups = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

# Toxicity subtype (auxiliary) columns.
auxColumns = [
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
    'sexual_explicit',
]

## 5.2. Function 2

In [62]:
def function2(X, y):
    '''
    Function to implement the data pipeline for transforming the dataset into the format required by the model,
    predict the toxicity of the given comment text(s) and score the predictions with the final performance metric.

    The function relies on the following notebook-level names being defined before it is called:
    'function1', 'tokenizer', 'model', 'identitySubgroups' and 'performanceMetricObj'.

    Parameters:
    ----------
    X: DataFrame
        Dataset containing the features. It must contain every column listed in 'identitySubgroups'; the
        'comment_text' column name is handed to function1() as the text feature. X itself is not modified,
        a copy is used internally.
    y: Series
        Dataset containing the Class Labels (toxicity scores). It is assigned to the copy of X as a column,
        so it is matched to the rows of X by index.

    Returns:
    -------
    The final metric score returned by computeFinalMetric().

    Raises:
    ------
    KeyError
        If one or more identity subgroup columns are missing from X.
    ValueError
        If one or more rows of X did not receive a predicted probability.
    '''

    # Fail fast: the prediction step below is expensive, so report missing identity subgroup columns
    # before running it instead of after.
    missingSubgroups = [subgroup for subgroup in identitySubgroups if subgroup not in X.columns]
    if missingSubgroups:
        raise KeyError(f'Identity subgroup column(s) missing from X: {missingSubgroups}')

    # Create a copy of the dataset so that the DataFrame of the caller is left untouched.
    xData = X.copy()

    # Call the 'function1()' function to implement the data pipeline and get the predicted probabilities.
    predictedData = function1(xData, tokenizerObj=tokenizer, model=model, textFeature='comment_text')

    # NOTE(review): when 'Probability' is a Series this assignment aligns on the index, so it assumes that
    # function1() labels its rows with the same index as X - confirm. Rows without a match become NaN,
    # which is reported explicitly below instead of flowing into the metric computation.
    xData['yPredProb'] = predictedData['Probability']
    if xData['yPredProb'].isna().any():
        raise ValueError('Some rows of X have no predicted probability; check that the output of function1() '
                         'is aligned with the index of X.')

    # Convert the target class to binary values: 1 when the toxicity score >= 0.5 and 0 otherwise.
    # A vectorised comparison is used instead of a per-row lambda; NaN compares as False, i.e. 0.
    xData['target'] = (y >= 0.5).astype(int)

    # Convert the value of the identity subgroups to True when their value >= 0.5 and False otherwise
    # (NaN compares as False). This will be required while calculating the performance metrics.
    for subgroup in tqdm(identitySubgroups):

        xData[subgroup] = xData[subgroup] >= 0.5

    # Call the 'computeBiasMetricsForModel()' function to get the Subgroup, BPSN and BNSP AUCs for each
    # identity subgroup.
    biasMetrics = performanceMetricObj.computeBiasMetricsForModel(data=xData, subgroups=identitySubgroups,
                                                                  predClassLabel='yPredProb',
                                                                  actualClassLabel='target')

    # Compute the overall AUC on the complete dataset.
    overallAUC = performanceMetricObj.computeOverallAUC(data=xData, actualClassLabel='target',
                                                        predClassLabel='yPredProb')

    # Get the final metric score.
    finalMetricScore = performanceMetricObj.computeFinalMetric(biasMetrics, overallAUC)

    return finalMetricScore

## 5.3. Evaluate

In [63]:
%%time
# Read the required data from the pickle file.
# NOTE: pickle.load() can execute arbitrary code while loading, so only load files produced by this project.
# For a local run, point the path to 'Resources/finalVariables.pkl' instead of the Google Drive location.
variablesPicklePath = '/content/gdrive/My Drive/Case Study 2/finalVariables.pkl' # Google Drive location
with open(variablesPicklePath, 'rb') as f:

    # Read all the processed variables.
    # Note that 'identitySubgroups' and 'auxColumns' are re-assigned here with the values stored in the file.
    (xTrainText, xTrainNum, yTrainW, yTrainAux,
     xCVText, xCVNum, yCVW, yCVAux,
     xTestText, xTestNum, yTestW, yTestAux,
     testData, embeddingMatrix, lossWeight, vocabSize, maxSeqLength, gloveVectorDim,
     identitySubgroups, auxColumns) = pickle.load(f)

CPU times: user 2.19 s, sys: 10.8 s, total: 13 s
Wall time: 35 s


In [64]:
# Display the top 5 records from the test data (loaded from the pickle file) as a quick check
# of the available columns.
testData.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,preprocessed_text1,original_text_wc,preprocessed_text_wc1
755014,5044839,1,"oh left wing stupidity never ends, you know its OUR money and we need it not government bud",0.018868,0.018868,0.056604,0.660377,0.018868,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-03-23 18:10:01.104733+00,54,5044572.0,321633,approved,0,0,0,1,1,0.0,0,53,oh left wing stupidity never ends you know its our money and we need it not government bud,18,18
829374,5135836,0,"They have no interest in those of lesser means, unless they can gin up sympathy for mommy government to step in and ...",0.0,0.0,0.0,0.0,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-04-15 12:11:27.202073+00,54,5133988.0,326995,approved,1,0,0,0,4,0.0,0,4,they have no interest in those of lesser means unless they can gin up sympathy for mommy government to step in and g...,26,27
663613,1053903,0,"hurts, don't it?",0.0,0.0,0.0,0.0,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-02-28 00:41:40.586616+00,21,1052155.0,315248,rejected,0,0,0,0,0,0.0,0,4,hurts do not it ?,3,5
1507984,5966577,0,""" Why do the NeoComs want illegal aliens protected more than they want American citizens protected?""\nBecause in gen...",0.0,0.0,0.0,0.1,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-09-16 22:02:23.972802+00,102,5963226.0,378326,approved,0,0,0,0,2,0.0,0,10,why do the neocoms want illegal aliens protected more than they want american citizens protected ? because in gener...,26,27
888383,5207232,0,"fair enough, but then EVERY outrage outbreak on Twitter is completely overblown and manufactured. It's just the natu...",0.0,0.0,0.0,0.0,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-05-02 23:40:51.890084+00,54,,331453,approved,0,0,1,12,3,0.0,0,4,fair enough but then every outrage outbreak on twitter is completely overblown and manufactured . it is just the nat...,23,26


In [65]:
# Report the number of rows and columns of the test data.
print('Shape of the Test Data: ', testData.shape)

Shape of the Test Data:  (230122, 48)


In [66]:
# Separate the test data into the features (every column except 'target') and the class labels
# (the 'target' column).
xData = testData.drop(columns=['target'])
yData = testData['target']

In [67]:
%%time
# Call 'function2()' on the test features and class labels to get the final metric score.
# This runs the prediction pipeline on the complete test data, so it is slow
# (about 30 minutes of wall time in the recorded run).
finalMetric = function2(X=xData, y=yData)

  0%|          | 0/9 [00:00<?, ?it/s]

CPU times: user 1h 30min 59s, sys: 6min 55s, total: 1h 37min 55s
Wall time: 30min 23s


In [68]:
# Show the final metric score obtained on the test dataset.
print('Final Metric Score on the Test Dataset: ', finalMetric)

Final Metric Score on the Test Dataset:  0.9274952724060705


----