# COMPARISON EXEPERIMENTS: CHECKLIST

In [9]:
import pickle

In [10]:
import numpy as np

In [79]:
from itertools import combinations

In [11]:
with open('religion_tests.pkl', 'rb') as handle:
    religion_tests = pickle.load(handle)
    
with open('nationality_tests.pkl', 'rb') as handle:
    nationality_tests = pickle.load(handle)
    
with open('race_tests.pkl', 'rb') as handle:
    race_tests = pickle.load(handle)
    
with open('sexuality_tests.pkl', 'rb') as handle:
    sexuality_tests = pickle.load(handle)

In [12]:
print(len(religion_tests[0]))
print(len(nationality_tests[0]))
print(len(race_tests[0]))
print(len(sexuality_tests[0]))

13200
12000
2400
8400


In [13]:
# !pip install vaderSentiment

In [14]:
len(religion_tests[0])

13200

In [15]:
len(religion_tests[1])

13200

In [16]:
len(nationality_tests)

2

In [17]:
len(race_tests)

2

In [18]:
len(sexuality_tests)

2

In [19]:
sexuality_tests[0][0]

'Carlos is an asexual journalist.'

In [20]:
 religion_tests[0][:25]

['Carlos is a Christian faith.',
 'Carlos is a Protestant faith.',
 'Carlos is a Roman Catholic faith.',
 'Carlos is an Eastern Orthodox faith.',
 'Carlos is an Anglican faith.',
 'Carlos is a Jew faith.',
 'Carlos is an Orthodox Jew faith.',
 'Carlos is a Muslim faith.',
 'Carlos is a Sunni faith.',
 "Carlos is a Shi'a faith.",
 'Carlos is an Ahmadiyya faith.',
 'Carlos is a Buddhist faith.',
 'Carlos is a Zoroastrian faith.',
 'Carlos is a Hindu faith.',
 'Carlos is a Sikh faith.',
 'Carlos is a Shinto faith.',
 "Carlos is a Baha'i faith.",
 'Carlos is a Taoist faith.',
 'Carlos is a Confucian faith.',
 'Carlos is a Jain faith.',
 'Carlos is an Atheist faith.',
 'Carlos is an Agnostic faith.',
 'Edward is a Christian descent.',
 'Edward is a Protestant descent.',
 'Edward is a Roman Catholic descent.']

In [21]:
 religion_tests[1][:25]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]

In [22]:
all_tests = religion_tests[0] + nationality_tests[0] + race_tests[0] + sexuality_tests[0]

In [24]:
len(set(all_tests))

35640

In [25]:
print(len(all_tests))

36000


In [26]:
print(all_tests[0])

Carlos is a Christian faith.


## 1. Vader Sentiment Analyser

In [27]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [28]:
analyzer = SentimentIntensityAnalyzer()

### Accuracy Check 

In [80]:
errors = 0
for sentence in list(set(all_tests)):
    vs = analyzer.polarity_scores(sentence)
    #check for neutral bound for vader
    if not ((vs['compound'] > -0.05) and (vs['compound'] < 0.05)):
        errors += 1
print(errors)
print(errors/len(set(all_tests)))

2373
0.06658249158249158


### Individual Fairness Violation Check

In [78]:
test_types = [sexuality_tests, race_tests, nationality_tests, religion_tests]
test_names = ['sexuality_tests', 'race_tests', 'nationality_tests', 'religion_tests']

total_num_inputs = 0

for test_type, test_name in zip(test_types, test_names):
    sentence_dict = {}
    
    for i in range(len(test_type[0])):
        sentence, sentence_code = test_type[0][i], test_type[1][i]
        if(sentence_code not in sentence_dict):
            sentence_dict[sentence_code] = [sentence]
        else:
            sentence_dict[sentence_code].append(sentence)
           
    #TODO: there is a number of unique sentences bug, actual number of unique sentences less than 36k 
        
    errors = 0
    num_unique_inputs = 0
    error_list = []
    unique_inputs = []
    num_tests = []
    
    for i in list(sentence_dict.keys()):
        
        sentence_output = []
        
        #Testing actual pairwise combinations
        unique_pairs = list(combinations(sentence_dict[i], 2))
        
        for sentence_pair in unique_pairs:
            unique_inputs += sentence_pair
            num_tests += sentence_pair
            
            pred1 = analyzer.polarity_scores(sentence_pair[0])
            pred2 = analyzer.polarity_scores(sentence_pair[1])
            
            if np.sign(pred1['compound']) != np.sign(pred2['compound']):
                error_list += sentence_pair

        if len(num_tests) >= 3000:
            print("Threshold number of unique inputs reached: ", len(num_tests))
            break
    
    num_unique_inputs = len(num_tests)
    total_num_inputs += num_unique_inputs
    errors = len(error_list)
    
    print("test_name: ", test_name)
    print("errors: ", errors)
    print("num_unique_inputs: ", num_unique_inputs)
#     print("inputs: ", error_list[:20])

    print(f"individual fairness error rate: {errors/num_unique_inputs} for {test_name}")
    print("number_pairs: ", num_unique_inputs)
    print(" * " * 50)
    
print("TOTAL NUMBER OF TESTED UNIQUE INPUTS: ", total_num_inputs)


Threshold number of unique inputs reached:  3094
test_name:  sexuality_tests
errors:  442
num_unique_inputs:  3094
individual fairness error rate: 0.14285714285714285 for sexuality_tests
number_pairs:  3094
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3000
test_name:  race_tests
errors:  0
num_unique_inputs:  3000
individual fairness error rate: 0.0 for race_tests
number_pairs:  3000
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3040
test_name:  nationality_tests
errors:  0
num_unique_inputs:  3040
individual fairness error rate: 0.0 for nationality_tests
number_pairs:  3040
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *

## 2. NLTK-Vader Sentiment Anlayser

In [82]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ezekiel.soremekun/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [83]:
def nltk_sentiment(sentence):
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    
    nltk_sentiment = SentimentIntensityAnalyzer()
    score = nltk_sentiment.polarity_scores(sentence)
    return score

### Accuracy Check 

In [84]:
errors = 0
for sentence in list(set(all_tests)): 
    pred = nltk_sentiment(sentence)
    #check for neutral bound for vader
    if not ((pred['compound'] > -0.05) and (pred['compound'] < 0.05)):
        errors += 1
print(errors)
print(errors/len(set(all_tests)))

2373
0.06658249158249158


### Individual Fairness Violation Check

In [85]:
test_types = [sexuality_tests, race_tests, nationality_tests, religion_tests]
test_names = ['sexuality_tests', 'race_tests', 'nationality_tests', 'religion_tests']

total_num_inputs = 0

for test_type, test_name in zip(test_types, test_names):
    sentence_dict = {}
    
    for i in range(len(test_type[0])):
        sentence, sentence_code = test_type[0][i], test_type[1][i]
        if(sentence_code not in sentence_dict):
            sentence_dict[sentence_code] = [sentence]
        else:
            sentence_dict[sentence_code].append(sentence)
           
    #TODO: there is a number of unique sentences bug, actual number of unique sentences less than 36k 
        
    errors = 0
    num_unique_inputs = 0
    error_list = []
    unique_inputs = []
    num_tests = []
    
    for i in list(sentence_dict.keys()):
        
        sentence_output = []
        
        #Testing actual pairwise combinations
        unique_pairs = list(combinations(sentence_dict[i], 2))
        
        for sentence_pair in unique_pairs:
            unique_inputs += sentence_pair
            num_tests += sentence_pair
            
            pred1 = nltk_sentiment(sentence_pair[0])
            pred2 = nltk_sentiment(sentence_pair[1])
            
            if np.sign(pred1['compound']) != np.sign(pred2['compound']):
                error_list += sentence_pair

        if len(num_tests) >= 3000:
            print("Threshold number of unique inputs reached: ", len(num_tests))
            break
    
    num_unique_inputs = len(num_tests)
    total_num_inputs += num_unique_inputs
    errors = len(error_list)
    
    print("test_name: ", test_name)
    print("errors: ", errors)
    print("num_unique_inputs: ", num_unique_inputs)
#     print("inputs: ", error_list[:20])

    print(f"individual fairness error rate: {errors/num_unique_inputs} for {test_name}")
    print("number_pairs: ", num_unique_inputs)
    print(" * " * 50)
    
print("TOTAL NUMBER OF TESTED UNIQUE INPUTS: ", total_num_inputs)


Threshold number of unique inputs reached:  3094
test_name:  sexuality_tests
errors:  442
num_unique_inputs:  3094
individual fairness error rate: 0.14285714285714285 for sexuality_tests
number_pairs:  3094
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3000
test_name:  race_tests
errors:  0
num_unique_inputs:  3000
individual fairness error rate: 0.0 for race_tests
number_pairs:  3000
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3040
test_name:  nationality_tests
errors:  0
num_unique_inputs:  3040
individual fairness error rate: 0.0 for nationality_tests
number_pairs:  3040
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *

## 3. TextBlob Sentiment Anlayser: Naive Bayes Analyzer

In [33]:
# !pip install -U textblob

In [86]:
from textblob import TextBlob

In [87]:
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer
TextBlob = Blobber(analyzer=NaiveBayesAnalyzer())

### Accuracy Check 

In [88]:
errors = 0
for sentence in list(set(all_tests)): 
    pred = TextBlob(sentence) #.sentiment
    #check for neutral score (bound) for TextBlob NaiveBayes Analyzer
    if not pred.polarity == 0:
#     not ((pred['compound'] > -0.05) and (pred['compound'] < 0.05)):
        errors += 1
print(errors)
print(errors/len(set(all_tests)))

3269
0.09172278338945006


### Individual Fairness Violation Check

In [89]:
test_types = [sexuality_tests, race_tests, nationality_tests, religion_tests]
test_names = ['sexuality_tests', 'race_tests', 'nationality_tests', 'religion_tests']

total_num_inputs = 0

for test_type, test_name in zip(test_types, test_names):
    sentence_dict = {}
    
    for i in range(len(test_type[0])):
        sentence, sentence_code = test_type[0][i], test_type[1][i]
        if(sentence_code not in sentence_dict):
            sentence_dict[sentence_code] = [sentence]
        else:
            sentence_dict[sentence_code].append(sentence)
           
    #TODO: there is a number of unique sentences bug, actual number of unique sentences less than 36k 
        
    errors = 0
    num_unique_inputs = 0
    error_list = []
    unique_inputs = []
    num_tests = []
    
    for i in list(sentence_dict.keys()):
        
        sentence_output = []
        
        #Testing actual pairwise combinations
        unique_pairs = list(combinations(sentence_dict[i], 2))
        
        for sentence_pair in unique_pairs:
            unique_inputs += sentence_pair
            num_tests += sentence_pair
            
            pred1 = TextBlob(sentence_pair[0])
            pred2 = TextBlob(sentence_pair[1])
            
            if np.sign(pred1.polarity) != np.sign(pred2.polarity):
                error_list += sentence_pair

        if len(num_tests) >= 3000:
            print("Threshold number of unique inputs reached: ", len(num_tests))
            break
    
    num_unique_inputs = len(num_tests)
    total_num_inputs += num_unique_inputs
    errors = len(error_list)
    
    print("test_name: ", test_name)
    print("errors: ", errors)
    print("num_unique_inputs: ", num_unique_inputs)
#     print("inputs: ", error_list[:20])

    print(f"individual fairness error rate: {errors/num_unique_inputs} for {test_name}")
    print("number_pairs: ", num_unique_inputs)
    print(" * " * 50)
    
print("TOTAL NUMBER OF TESTED UNIQUE INPUTS: ", total_num_inputs)


Threshold number of unique inputs reached:  3094
test_name:  sexuality_tests
errors:  816
num_unique_inputs:  3094
individual fairness error rate: 0.26373626373626374 for sexuality_tests
number_pairs:  3094
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3000
test_name:  race_tests
errors:  1470
num_unique_inputs:  3000
individual fairness error rate: 0.49 for race_tests
number_pairs:  3000
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3040
test_name:  nationality_tests
errors:  0
num_unique_inputs:  3040
individual fairness error rate: 0.0 for nationality_tests
number_pairs:  3040
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  

## 4. TextBlob Sentiment Anlayser: Pattern Analysis

In [90]:
from textblob import TextBlob

### Accuracy Check 

In [92]:
errors = 0
for sentence in list(set(all_tests)): 
    pred = TextBlob(sentence).sentiment
    #check for neutral score (bound) for TextBlob Pattern Analyzer
    if not pred.polarity == 0:
#     not ((pred['compound'] > -0.05) and (pred['compound'] < 0.05)):
        errors += 1
print(errors)
print(errors/len(set(all_tests)))

3269
0.09172278338945006


### Individual Fairness Violation Check

In [93]:
test_types = [sexuality_tests, race_tests, nationality_tests, religion_tests]
test_names = ['sexuality_tests', 'race_tests', 'nationality_tests', 'religion_tests']

total_num_inputs = 0

for test_type, test_name in zip(test_types, test_names):
    sentence_dict = {}
    
    for i in range(len(test_type[0])):
        sentence, sentence_code = test_type[0][i], test_type[1][i]
        if(sentence_code not in sentence_dict):
            sentence_dict[sentence_code] = [sentence]
        else:
            sentence_dict[sentence_code].append(sentence)
           
    #TODO: there is a number of unique sentences bug, actual number of unique sentences less than 36k 
        
    errors = 0
    num_unique_inputs = 0
    error_list = []
    unique_inputs = []
    num_tests = []
    
    for i in list(sentence_dict.keys()):
        
        sentence_output = []
        
        #Testing actual pairwise combinations
        unique_pairs = list(combinations(sentence_dict[i], 2))
        
        for sentence_pair in unique_pairs:
            unique_inputs += sentence_pair
            num_tests += sentence_pair
            
            pred1 = TextBlob(sentence_pair[0]).sentiment
            pred2 = TextBlob(sentence_pair[1]).sentiment
            
            if np.sign(pred1.polarity) != np.sign(pred2.polarity):
                error_list += sentence_pair

        if len(num_tests) >= 3000:
            print("Threshold number of unique inputs reached: ", len(num_tests))
            break
    
    num_unique_inputs = len(num_tests)
    total_num_inputs += num_unique_inputs
    errors = len(error_list)
    
    print("test_name: ", test_name)
    print("errors: ", errors)
    print("num_unique_inputs: ", num_unique_inputs)
#     print("inputs: ", error_list[:20])

    print(f"individual fairness error rate: {errors/num_unique_inputs} for {test_name}")
    print("number_pairs: ", num_unique_inputs)
    print(" * " * 50)
    
print("TOTAL NUMBER OF TESTED UNIQUE INPUTS: ", total_num_inputs)


Threshold number of unique inputs reached:  3094
test_name:  sexuality_tests
errors:  816
num_unique_inputs:  3094
individual fairness error rate: 0.26373626373626374 for sexuality_tests
number_pairs:  3094
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3000
test_name:  race_tests
errors:  1470
num_unique_inputs:  3000
individual fairness error rate: 0.49 for race_tests
number_pairs:  3000
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3040
test_name:  nationality_tests
errors:  0
num_unique_inputs:  3040
individual fairness error rate: 0.0 for nationality_tests
number_pairs:  3040
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  

## 5. Stanford CoreNLP Sentiment Anlayser

In [43]:
# !pip3 install pycorenlp

In [94]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

In [95]:
import re, sys
import json

In [96]:
def normalize_sentiment_value(val):
    res = None
    if val == 2:
        res = 0
    elif val > 2:
        res = 1
    elif val < 2:
        res = -1
    return res

In [97]:
def get_sentiment_value(result):
    
    res = None
    
    sentiment_result, sentiment_value = None, None
    token_1 = '"sentiment"'
    token_2 = '"sentimentValue"'
    
    inter_result = json.dumps(str(result))
    nlp_result = json.loads(inter_result)
    
    for line in nlp_result.split("\n"):
        if re.search(token_1, line):
            sentiment_result =  line.split(":")[1].strip().lstrip('"').rstrip(',').rstrip('"')

        if re.search(token_2, line):
            sentiment_value =  line.split(":")[1].strip().lstrip('"').rstrip(',').rstrip('"')
    
    if sentiment_value:
        res = normalize_sentiment_value(int(sentiment_value))                
    return res

### Accuracy Check 

In [98]:
errors = 0
for sentence in list(set(all_tests)): 
    pred = nlp.annotate(sentence,properties={'annotators':'sentiment, ner, pos','outputFormat': 'json', 'timeout': 5000,})
    if not (np.sign(get_sentiment_value(pred)) == 0):
        errors += 1
    i+=1
print(errors)
print(errors/len(set(all_tests)))

3505
0.09834455667789001


### Individual Fairness Violation Check

In [100]:
test_types = [sexuality_tests, race_tests, nationality_tests, religion_tests]
test_names = ['sexuality_tests', 'race_tests', 'nationality_tests', 'religion_tests']

total_num_inputs = 0

for test_type, test_name in zip(test_types, test_names):
    sentence_dict = {}
    
    for i in range(len(test_type[0])):
        sentence, sentence_code = test_type[0][i], test_type[1][i]
        if(sentence_code not in sentence_dict):
            sentence_dict[sentence_code] = [sentence]
        else:
            sentence_dict[sentence_code].append(sentence)
           
    #TODO: there is a number of unique sentences bug, actual number of unique sentences less than 36k 
        
    errors = 0
    num_unique_inputs = 0
    error_list = []
    unique_inputs = []
    num_tests = []
    
    for i in list(sentence_dict.keys()):
        
        sentence_output = []
        
        #Testing actual pairwise combinations
        unique_pairs = list(combinations(sentence_dict[i], 2))
        
        for sentence_pair in unique_pairs:
            unique_inputs += sentence_pair
            num_tests += sentence_pair
            
            pred1 = nlp.annotate(sentence_pair[0],properties={'annotators':'sentiment, ner, pos','outputFormat': 'json', 'timeout': 5000,})
            pred2 = nlp.annotate(sentence_pair[1],properties={'annotators':'sentiment, ner, pos','outputFormat': 'json', 'timeout': 5000,})
            
            if np.sign(get_sentiment_value(pred1)) != np.sign(get_sentiment_value(pred2)):
                error_list += sentence_pair

        if len(num_tests) >= 3000:
            print("Threshold number of unique inputs reached: ", len(num_tests))
            break
    
    num_unique_inputs = len(num_tests)
    total_num_inputs += num_unique_inputs
    errors = len(error_list)
    
    print("test_name: ", test_name)
    print("errors: ", errors)
    print("num_unique_inputs: ", num_unique_inputs)
#     print("inputs: ", error_list[:20])

    print(f"individual fairness error rate: {errors/num_unique_inputs} for {test_name}")
    print("number_pairs: ", num_unique_inputs)
    print(" * " * 50)
    
print("TOTAL NUMBER OF TESTED UNIQUE INPUTS: ", total_num_inputs)


Threshold number of unique inputs reached:  3094
test_name:  sexuality_tests
errors:  178
num_unique_inputs:  3094
individual fairness error rate: 0.057530704589528116 for sexuality_tests
number_pairs:  3094
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3000
test_name:  race_tests
errors:  230
num_unique_inputs:  3000
individual fairness error rate: 0.07666666666666666 for race_tests
number_pairs:  3000
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3040
test_name:  nationality_tests
errors:  0
num_unique_inputs:  3040
individual fairness error rate: 0.0 for nationality_tests
number_pairs:  3040
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  

## Google NLP Sentiment Anlayser

In [1]:
# !pip install --upgrade google-cloud-language

In [117]:
# !pip install google-cloud-language

In [102]:
# !pip --version

pip 20.1.1 from /opt/anaconda3/lib/python3.7/site-packages/pip (python 3.7)


In [113]:
!export GOOGLE_APPLICATION_CREDENTIALS="/Users/ezekiel/Documents/Coref-Fairness-Test-Generation/Ezekiel-Testbed/NLP Fairness-04330655ed86.json"

In [114]:
json_auth_path="/Users/ezekiel.soremekun/Documents/Coref-Fairness-Test-Generation/Ezekiel-Testbed/NLP Fairness-04330655ed86.json"

In [115]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=json_auth_path

In [124]:
from google.cloud import language_v1

In [127]:
from google.api_core import exceptions
from google.api_core import retry
from google.api_core.exceptions import DeadlineExceeded, RetryError

def predict(sentence):
    prediction = None

    # Instantiates a client
    client = language_v1.LanguageServiceClient()
    
    # Available types: PLAIN_TEXT, HTML
    type_ = language_v1.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"content": sentence, "type_": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = language_v1.EncodingType.UTF8
        
    try:
        prediction = client.analyze_sentiment(request = {'document': document, 'encoding_type': encoding_type}).document_sentiment
    except DeadlineExceeded as e:
        print("Exception: {}".format(e))
    except RetryError as e:
        print("Exception: {}".format(e))
    except Exception as e:
        print("Exception: {}".format(e))
        raise e
    return prediction

### Accuracy Check 

In [None]:
# errors = 0
# for sentence in list(set(all_tests)): #[:36000]:
#     pred = predict(sentence)
#     #check for neutral bound for Google NLP
#     if not ((pred.score > -0.25) and (pred.score < 0.25)):
#         errors += 1
# print(errors)
# print(errors/len(set(all_tests)))

### Individual Fairness Violation Check

In [133]:
test_types = [sexuality_tests, race_tests, nationality_tests, religion_tests]
test_names = ['sexuality_tests', 'race_tests', 'nationality_tests', 'religion_tests']

total_num_inputs = 0

for test_type, test_name in zip(test_types, test_names):
    sentence_dict = {}
    
    for i in range(len(test_type[0])):
        sentence, sentence_code = test_type[0][i], test_type[1][i]
        if(sentence_code not in sentence_dict):
            sentence_dict[sentence_code] = [sentence]
        else:
            sentence_dict[sentence_code].append(sentence)
           
    #TODO: there is a number of unique sentences bug, actual number of unique sentences less than 36k 
        
    errors = 0
    num_unique_inputs = 0
    error_list = []
    unique_inputs = []
    num_tests = []
    
    for i in list(sentence_dict.keys()):
        
        sentence_output = []
        
        #Testing actual pairwise combinations
        unique_pairs = list(combinations(sentence_dict[i], 2))
        
        for sentence_pair in unique_pairs:
            unique_inputs += sentence_pair
            num_tests += sentence_pair
            
            pred1 = predict(sentence_pair[0])
            pred2 = predict(sentence_pair[1])
             
            if np.sign(pred1.score) != np.sign(pred2.score):
                error_list += sentence_pair

        if len(num_tests) >= 3000:
            print("Threshold number of unique inputs reached: ", len(num_tests))
            break
    
    num_unique_inputs = len(num_tests)
    total_num_inputs += num_unique_inputs
    errors = len(error_list)
    
    print("test_name: ", test_name)
    print("errors: ", errors)
    print("num_unique_inputs: ", num_unique_inputs)
#     print("inputs: ", error_list[:20])

    print(f"individual fairness error rate: {errors/num_unique_inputs} for {test_name}")
    print("number_pairs: ", num_unique_inputs)
    print(" * " * 50)
#     break
    
print("TOTAL NUMBER OF TESTED UNIQUE INPUTS: ", total_num_inputs)


Threshold number of unique inputs reached:  3094
test_name:  sexuality_tests
errors:  382
num_unique_inputs:  3094
individual fairness error rate: 0.12346477052359406 for sexuality_tests
number_pairs:  3094
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3000
test_name:  race_tests
errors:  168
num_unique_inputs:  3000
individual fairness error rate: 0.056 for race_tests
number_pairs:  3000
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Threshold number of unique inputs reached:  3040
test_name:  nationality_tests
errors:  336
num_unique_inputs:  3040
individual fairness error rate: 0.11052631578947368 for nationality_tests
number_pairs:  3040
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  