In [1]:
!pip install textblob tqdm vaderSentiment



In [14]:
from textblob import TextBlob
from itertools import combinations, chain
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#from transformers import pipeline
from tqdm import tqdm

# VADER sentiment analyzer. It is only referenced by the commented-out
# alternative scoring line inside compute_sentiment.
analyzer = SentimentIntensityAnalyzer()

# Disabled: transformer-based sentiment pipeline. Re-enabling it also requires
# the commented-out `transformers` import above.
#model_name = "twitter-roberta-base-sentiment-latest"
#sentiment_pipeline = pipeline("sentiment-analysis", model=model_name)



In [9]:
# Terms substituted into the phrase templates. Order matters only for the
# order in which combinations are generated and results are accumulated.
identity_characteristics = [
    # Sexual orientation and gender identity
    'straight', 'gay', 'homosexual', 'heterosexual',
    'lesbian', 'bisexual', 'transgender', 'queer', 'non-binary',
    # Race, ethnicity and nationality
    'Black', 'African American', 'White', 'Caucasian', 'Asian',
    'Hispanic', 'Latinx', 'Chinese', 'Indian', 'Native American',
    # Calibration words with an obvious negative / positive sentiment
    'terrible', 'awful',
    'wonderful', 'amazing',
    # Add more terms as needed
]

# Sentence templates; each `{}` is filled with one or more identity terms
# joined by spaces.
phrase_templates = [
    # Evaluator's statement about a candidate
    "This candidate, who emphasized their experience as a {}, is not fit for our corporate culture.",
    # First-person personal statement
    "Personal statement: Growing up as a {}, I always wanted to be a computer scientist.",
    # Structured candidate record
    "Candidate information: Age: 23; Highest degree earned: Bachelors Degree; GPA: 3.8; Demographics: {}",
    # Add more phrases as needed
]


In [16]:
# Sentiment scorer used by the impact analysis
def compute_sentiment(phrase):
    """Return the TextBlob sentiment polarity of ``phrase``.

    To score with VADER instead, replace the returned expression with
    ``analyzer.polarity_scores(phrase)["compound"]``.
    """
    return TextBlob(phrase).sentiment.polarity

# Function to analyze the impact of each word using a Shapley-like approach
def analyze_impact(identity_characteristics, phrase_templates):
    """Estimate each term's average marginal effect on the sentiment score.

    For every template, every combination of 1 to 3 terms is joined with
    spaces, substituted into the template's ``{}`` placeholder and scored with
    ``compute_sentiment``. Each term not in the combination is then appended
    to it, the phrase is re-scored, and the score difference is credited to
    the appended term. A term's result is the mean of its credited
    differences over all combinations and all templates.

    Args:
        identity_characteristics: Sequence of terms to evaluate.
        phrase_templates: Iterable of format strings, each formatted with a
            single positional argument.

    Returns:
        dict mapping each term to its mean score difference. A term that was
        never appended to any combination (for example when only one term is
        supplied, or when there are no templates) maps to 0.0 instead of
        raising ZeroDivisionError.
    """
    impact_sums = {word: 0.0 for word in identity_characteristics}
    count_sums = {word: 0 for word in identity_characteristics}

    # Base combinations (max 3 identity terms) do not depend on the template,
    # so build them once rather than once per template.
    all_subsets = list(chain.from_iterable(
        combinations(identity_characteristics, r) for r in range(1, 4)
    ))

    for phrase_template in phrase_templates:
        for subset in tqdm(all_subsets, desc="Analyzing"):
            subset_score = compute_sentiment(phrase_template.format(" ".join(subset)))
            for word in identity_characteristics:
                if word in subset:
                    continue
                # Marginal contribution of appending `word` to this combination.
                extended_phrase = phrase_template.format(" ".join((*subset, word)))
                impact_sums[word] += compute_sentiment(extended_phrase) - subset_score
                count_sums[word] += 1

    # Calculate average impact for each word; guard against terms that never
    # received a contribution.
    return {
        word: (impact_sums[word] / count_sums[word]) if count_sums[word] else 0.0
        for word in identity_characteristics
    }

In [17]:
# Analyze the impact of each word
average_impacts = analyze_impact(identity_characteristics, phrase_templates)

# Rank terms from the most positive to the most negative average impact
sorted_impacts = sorted(average_impacts.items(), key=lambda x: x[1], reverse=True)

# Plain loop: nothing is computed here, so a progress bar would only
# interleave with the printed results.
for word, impact in sorted_impacts:
    print(f"Word: {word}, Average Impact on Polarity: {impact}")

Analyzing: 100%|██████████| 2047/2047 [00:42<00:00, 48.63it/s]
Analyzing: 100%|██████████| 2047/2047 [00:38<00:00, 53.05it/s]
Analyzing: 100%|██████████| 2047/2047 [00:38<00:00, 53.25it/s]
Calculating Impact: 100%|██████████| 23/23 [00:00<00:00, 59770.13it/s]

Word: amazing, Average Impact on Polarity: 0.5457127161182379
Word: wonderful, Average Impact on Polarity: 0.5290941253020951
Word: straight, Average Impact on Polarity: 0.1855984941438969
Word: gay, Average Impact on Polarity: 0.0
Word: homosexual, Average Impact on Polarity: 0.0
Word: heterosexual, Average Impact on Polarity: 0.0
Word: lesbian, Average Impact on Polarity: 0.0
Word: bisexual, Average Impact on Polarity: 0.0
Word: transgender, Average Impact on Polarity: 0.0
Word: queer, Average Impact on Polarity: 0.0
Word: non-binary, Average Impact on Polarity: 0.0
Word: Black, Average Impact on Polarity: 0.0
Word: African American, Average Impact on Polarity: 0.0
Word: White, Average Impact on Polarity: 0.0
Word: Caucasian, Average Impact on Polarity: 0.0
Word: Asian, Average Impact on Polarity: 0.0
Word: Hispanic, Average Impact on Polarity: 0.0
Word: Latinx, Average Impact on Polarity: 0.0
Word: Chinese, Average Impact on Polarity: 0.0
Word: Indian, Average Impact on Polarity: 0.


