In [None]:
# Import libraries
import pandas as pd
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import numpy as np
import joblib
from textblob import TextBlob  # For sentiment analysis
from thop import profile

## **Understanding the Dataset**

Load and Inspect the Dataset

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/HerWill/train.csv')

# Display the first few rows to understand the data structure
print("First 5 rows of the training dataset:")
display(train_df.head())

# Display basic information about the dataset
print("\nDataset Information:")
train_df.info()

# Check for missing values in the dataset
print("\nMissing Values in Each Column:")
print(train_df.isnull().sum())

First 5 rows of the training dataset:


Unnamed: 0,id,passage,y
0,0,Women are always the ones struggling in math c...,1
1,1,Men in education often just boss around and ne...,0
2,2,Non-binary students are just confused about th...,2
3,3,Science classrooms are designed to be neutral ...,3
4,4,Male teachers often believe they are superior ...,0



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       7000 non-null   int64 
 1   passage  7000 non-null   object
 2   y        7000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 164.2+ KB

Missing Values in Each Column:
id         0
passage    0
y          0
dtype: int64


Text Cleaning and Preprocessing

In [None]:
# Clean the text data to ensure consistency and remove unwanted characters
def clean_text(text):

    # Clean the input text by converting to lowercase and removing non-alphanumeric characters.

    text = text.lower()  # Convert text to lowercase to avoid case issues
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetic characters
    return text

# Apply the cleaning function to the 'passage' column in the training set
train_df['cleaned_passage'] = train_df['passage'].apply(clean_text)

# Display the cleaned passages
print("Sample cleaned passages:")
display(train_df[['passage', 'cleaned_passage']].head())

Sample cleaned passages:


Unnamed: 0,passage,cleaned_passage
0,Women are always the ones struggling in math c...,women are always the ones struggling in math c...
1,Men in education often just boss around and ne...,men in education often just boss around and ne...
2,Non-binary students are just confused about th...,nonbinary students are just confused about the...
3,Science classrooms are designed to be neutral ...,science classrooms are designed to be neutral ...
4,Male teachers often believe they are superior ...,male teachers often believe they are superior ...


Define Keyword Categories for Each Class

In [None]:
# Function to check the presence of any keywords in a passage
def contains_keywords(text, keywords):
    # Check if any of the provided keywords are present in the text
    return any(keyword in text for keyword in keywords)

# Define keywords for each class
class_0_keywords = ['man', 'men', 'male', 'boy', 'boys']
class_1_keywords = ['woman', 'women', 'female', 'girl', 'girls']
class_2_keywords = ['non-binary', 'non binary']
class_3_keywords = class_0_keywords + class_1_keywords + class_2_keywords  # Combined keywords for class 3


Basic Dataset Analysis

In [None]:
# 1. Count the number of rows for each class
class_counts = train_df['y'].value_counts()
print("Class Counts:\n", class_counts)

Class Counts:
 y
1    2087
2    1891
0    1811
3    1211
Name: count, dtype: int64


In [None]:
# 2. Analyze Class 0: Passages containing and not containing class_0_keywords
class_0_passages = train_df[train_df['y'] == 0]
class_0_with_keywords = class_0_passages['cleaned_passage'].apply(lambda x: contains_keywords(x, class_0_keywords)).sum()
class_0_without_keywords = len(class_0_passages) - class_0_with_keywords
print(f"\nClass 0 passages with keywords {class_0_keywords}: {class_0_with_keywords}")
print(f"Class 0 passages without keywords {class_0_keywords}: {class_0_without_keywords}")

# 3. Analyze Class 1: Passages containing and not containing class_1_keywords
class_1_passages = train_df[train_df['y'] == 1]
class_1_with_keywords = class_1_passages['cleaned_passage'].apply(lambda x: contains_keywords(x, class_1_keywords)).sum()
class_1_without_keywords = len(class_1_passages) - class_1_with_keywords
print(f"\nClass 1 passages with keywords {class_1_keywords}: {class_1_with_keywords}")
print(f"Class 1 passages without keywords {class_1_keywords}: {class_1_without_keywords}")

# 4. Analyze Class 2: Passages containing and not containing class_2_keywords
class_2_passages = train_df[train_df['y'] == 2]
class_2_with_keywords = class_2_passages['cleaned_passage'].apply(lambda x: contains_keywords(x, class_2_keywords)).sum()
class_2_without_keywords = len(class_2_passages) - class_2_with_keywords
print(f"\nClass 2 passages with keywords {class_2_keywords}: {class_2_with_keywords}")
print(f"Class 2 passages without keywords {class_2_keywords}: {class_2_without_keywords}")

# 5. Analyze Class 3: Passages containing any class_3_keywords
class_3_passages = train_df[train_df['y'] == 3]
class_3_with_keywords = class_3_passages['cleaned_passage'].apply(lambda x: contains_keywords(x, class_3_keywords)).sum()
print(f"\nClass 3 passages with any keywords {class_3_keywords}: {class_3_with_keywords}")


Class 0 passages with keywords ['man', 'men', 'male', 'boy', 'boys']: 1806
Class 0 passages without keywords ['man', 'men', 'male', 'boy', 'boys']: 5

Class 1 passages with keywords ['woman', 'women', 'female', 'girl', 'girls']: 2068
Class 1 passages without keywords ['woman', 'women', 'female', 'girl', 'girls']: 19

Class 2 passages with keywords ['non-binary', 'non binary']: 0
Class 2 passages without keywords ['non-binary', 'non binary']: 1891

Class 3 passages with any keywords ['man', 'men', 'male', 'boy', 'boys', 'woman', 'women', 'female', 'girl', 'girls', 'non-binary', 'non binary']: 526


Additional Analysis

In [None]:
# Overall presence of keywords across all classes
overall_keyword_counts = {
    'class_0_keywords': train_df['cleaned_passage'].apply(lambda x: contains_keywords(x, class_0_keywords)).sum(),
    'class_1_keywords': train_df['cleaned_passage'].apply(lambda x: contains_keywords(x, class_1_keywords)).sum(),
    'class_2_keywords': train_df['cleaned_passage'].apply(lambda x: contains_keywords(x, class_2_keywords)).sum(),
    'class_3_keywords': train_df['cleaned_passage'].apply(lambda x: contains_keywords(x, class_3_keywords)).sum(),
}

print("\nOverall Keyword Counts in the Dataset:")
for keyword_class, count in overall_keyword_counts.items():
    print(f"{keyword_class}: {count}")


Overall Keyword Counts in the Dataset:
class_0_keywords: 4934
class_1_keywords: 2443
class_2_keywords: 0
class_3_keywords: 4937


In [None]:
# Identify the Most Frequent Words in Each Class
def most_frequent_words(passages, n=10):
    """
    Find the most frequent words in a list of passages.

    Parameters:
        passages (Series): Pandas Series containing text data.
        n (int): Number of top frequent words to return.

    Returns:
        list: List of tuples with words and their corresponding counts.
    """
    words = ' '.join(passages).split()
    return Counter(words).most_common(n)

# Analyze and display the most frequent words for each class
for class_label in [0, 1, 2, 3]:
    class_passages = train_df[train_df['y'] == class_label]['cleaned_passage']
    frequent_words = most_frequent_words(class_passages)
    print(f"\nMost frequent words in Class {class_label} (Top 10): {frequent_words}")


Most frequent words in Class 0 (Top 10): [('to', 1259), ('men', 1064), ('the', 1053), ('and', 1014), ('in', 991), ('their', 849), ('are', 779), ('often', 753), ('they', 670), ('of', 626)]

Most frequent words in Class 1 (Top 10): [('to', 1459), ('women', 1408), ('in', 1162), ('the', 1140), ('and', 1109), ('their', 1079), ('are', 989), ('often', 800), ('they', 634), ('female', 571)]

Most frequent words in Class 2 (Top 10): [('nonbinary', 1821), ('their', 1378), ('to', 1276), ('and', 851), ('the', 812), ('often', 765), ('in', 757), ('individuals', 680), ('are', 627), ('for', 494)]

Most frequent words in Class 3 (Top 10): [('and', 1007), ('of', 770), ('to', 726), ('the', 719), ('gender', 649), ('a', 501), ('their', 451), ('in', 441), ('that', 405), ('all', 388)]


In [None]:
# Identify the Most Common Bigrams in Each Class
def most_common_bigrams(passages, n=10):
    """
    Find the most common bigrams (pairs of words) in a list of passages.

    Parameters:
        passages (Series): Pandas Series containing text data.
        n (int): Number of top bigrams to return.

    Returns:
        list: List of tuples with bigrams and their corresponding counts.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    X_bigrams = vectorizer.fit_transform(passages)
    bigram_counts = X_bigrams.sum(axis=0).A1
    bigrams = vectorizer.get_feature_names_out()
    bigram_freq = list(zip(bigrams, bigram_counts))
    bigram_freq.sort(key=lambda x: x[1], reverse=True)
    return bigram_freq[:n]

# Analyze and display the most common bigrams for each class
for class_label in [0, 1, 2, 3]:
    class_passages = train_df[train_df['y'] == class_label]['cleaned_passage']
    common_bigrams = most_common_bigrams(class_passages)
    print(f"\nMost common bigrams in Class {class_label} (Top 10): {common_bigrams}")


Most common bigrams in Class 0 (Top 10): [('men think', 74), ('male nurses', 66), ('patient care', 57), ('male pharmacists', 41), ('think know', 40), ('mental health', 39), ('think theyre', 37), ('female colleagues', 36), ('lack empathy', 34), ('male dentists', 34)]

Most common bigrams in Class 1 (Top 10): [('male counterparts', 104), ('women lack', 73), ('leadership roles', 69), ('women handle', 64), ('women tend', 60), ('women generally', 50), ('compared male', 49), ('women just', 48), ('male colleagues', 46), ('female patients', 43)]

Most common bigrams in Class 2 (Top 10): [('nonbinary individuals', 634), ('nonbinary people', 382), ('gender identity', 202), ('people just', 78), ('identity issues', 65), ('just trying', 63), ('individuals face', 50), ('individuals just', 46), ('nonbinary healthcare', 45), ('nonbinary photographers', 42)]

Most common bigrams in Class 3 (Top 10): [('regardless gender', 196), ('gender identity', 173), ('nonbinary individuals', 70), ('irrespective ge

In [None]:
### Analyze Sentence Length Distribution Across Classes

# Calculate the number of words in each passage
train_df['sentence_length'] = train_df['cleaned_passage'].apply(lambda x: len(x.split()))

# For each class, compute average, minimum, and maximum sentence lengths
for class_label in [0, 1, 2, 3]:
    class_sentence_lengths = train_df[train_df['y'] == class_label]['sentence_length']
    avg_length = class_sentence_lengths.mean()
    min_length = class_sentence_lengths.min()
    max_length = class_sentence_lengths.max()
    print(f"\nClass {class_label} Sentence Lengths:")
    print(f"Average: {avg_length:.2f}, Min: {min_length}, Max: {max_length}")


Class 0 Sentence Lengths:
Average: 20.72, Min: 5, Max: 56

Class 1 Sentence Lengths:
Average: 21.11, Min: 5, Max: 164

Class 2 Sentence Lengths:
Average: 19.88, Min: 6, Max: 60

Class 3 Sentence Lengths:
Average: 21.41, Min: 9, Max: 62


In [None]:
# Analyze Positional Keyword Bias (e.g., first 5 words of passage)
def starts_with_keywords(text, keywords, num_words=5):
    """
    Check if any of the keywords appear within the first 'num_words' of the text.

    Parameters:
        text (str): The text to search within.
        keywords (list): A list of keywords to search for.
        num_words (int): Number of words from the start to consider.

    Returns:
        bool: True if any keyword is found in the specified position, False otherwise.
    """
    words = text.split()[:num_words]  # Extract the first 'num_words' words
    return contains_keywords(' '.join(words), keywords)

# For each class, count how many passages have keywords in the first 5 words
for class_label, keywords in [(0, class_0_keywords), (1, class_1_keywords), (2, class_2_keywords)]:
    class_passages = train_df[train_df['y'] == class_label]
    positional_bias = class_passages['cleaned_passage'].apply(lambda x: starts_with_keywords(x, keywords)).sum()
    print(f"\nClass {class_label} passages where first 5 words contain keywords {keywords}: {positional_bias}")


Class 0 passages where first 5 words contain keywords ['man', 'men', 'male', 'boy', 'boys']: 1743

Class 1 passages where first 5 words contain keywords ['woman', 'women', 'female', 'girl', 'girls']: 1789

Class 2 passages where first 5 words contain keywords ['non-binary', 'non binary']: 7


In [None]:
# Analyze Unique Words in Each Class
def get_unique_words(passages):
    """
    Extract unique words from a list of passages.

    Parameters:
        passages (Series): Pandas Series containing text data.

    Returns:
        set: A set of unique words.
    """
    words = ' '.join(passages).split()
    return set(words)

# Extract unique words for each class
unique_words_per_class = {}
for class_label in [0, 1, 2, 3]:
    class_passages = train_df[train_df['y'] == class_label]['cleaned_passage']
    unique_words_per_class[class_label] = get_unique_words(class_passages)

# Display the number of unique words in each class
for class_label in [0, 1, 2, 3]:
    print(f"\nNumber of unique words in Class {class_label}: {len(unique_words_per_class[class_label])}")

# Identify and display words that are unique to each class
for class_label in [0, 1, 2, 3]:
    other_classes = [label for label in [0, 1, 2, 3] if label != class_label]
    unique_words = unique_words_per_class[class_label] - set().union(*[unique_words_per_class[label] for label in other_classes])
    print(f"\nWords unique to Class {class_label}: {list(unique_words)[:10]}")  # Display first 10 unique words


Number of unique words in Class 0: 3387

Number of unique words in Class 1: 3601

Number of unique words in Class 2: 2772

Number of unique words in Class 3: 2698

Words unique to Class 0: ['miracle', 'maledominant', 'nonmale', 'progressively', 'anymore', 'procedural', 'inconsiderate', 'groomed', 'knowitall', 'midwifery']

Words unique to Class 1: ['predictions', 'submitted', 'rappers', 'got', 'detrimentally', 'troubleshooting', 'olympics', 'temptresses', 'logically', 'hockey']

Words unique to Class 2: ['norm', 'eccentric', 'egypt', 'bandwagon', 'survive', 'showings', 'rejecting', 'nonstandard', 'feasible', 'tick']

Words unique to Class 3: ['movie', 'volunteering', 'furniture', 'wellconnected', 'enable', 'bravery', 'zero', 'activity', 'determining', 'honorably']


In [None]:
### Analyze the Impact of Stopword Removal

# Define a set of common stopwords
stopwords = set(['is', 'are', 'the', 'a', 'an', 'and', 'but', 'or', 'if', 'then', 'there'])

# Remove stopwords from the cleaned passages
train_df['cleaned_passage_no_stopwords'] = train_df['cleaned_passage'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stopwords])
)

# Compare passages before and after stopword removal for Class 0
print(f"\nOriginal passage (Class 0):\n{train_df[train_df['y'] == 0]['cleaned_passage'].iloc[0]}")
print(f"\nPassage without stopwords (Class 0):\n{train_df[train_df['y'] == 0]['cleaned_passage_no_stopwords'].iloc[0]}")


Original passage (Class 0):
men in education often just boss around and never listen to others ideas they think they know everything but their arrogance gets in the way of true learning

Passage without stopwords (Class 0):
men in education often just boss around never listen to others ideas they think they know everything their arrogance gets in way of true learning


In [None]:
# Find common co-occurring words in Class 3 with biased keywords

def co_occurring_words_with_keywords(text, keywords):
    """
    Extract words from the text that are not in the specified keywords.

    Parameters:
        text (str): The text to process.
        keywords (list): List of keywords to exclude.

    Returns:
        list: List of co-occurring words not in keywords.
    """
    words = text.split()
    return [word for word in words if word not in keywords]

# Identify Class 3 passages that contain any class_3_keywords
class_3_passages_with_keywords = class_3_passages['cleaned_passage'].apply(
    lambda x: contains_keywords(x, class_3_keywords)
)

# Filter passages that contain class_3_keywords
class_3_passages_with_keywords = class_3_passages[class_3_passages['cleaned_passage'].apply(
    lambda x: contains_keywords(x, class_3_keywords)
)]

# Initialize a Counter to hold co-occurring words
co_occurring_words = Counter()

# Collect co-occurring words from Class 3 passages with keywords
for passage in class_3_passages_with_keywords['cleaned_passage']:
    co_occurring_words.update(co_occurring_words_with_keywords(passage, class_3_keywords))

# Display the top 20 co-occurring words
print("\nTop co-occurring words with biased keywords in Class 3:")
print(co_occurring_words.most_common(20))


Top co-occurring words with biased keywords in Class 3:
[('and', 429), ('to', 333), ('the', 325), ('of', 315), ('gender', 226), ('that', 224), ('in', 219), ('their', 218), ('a', 215), ('for', 166), ('is', 157), ('are', 156), ('all', 156), ('often', 102), ('this', 100), ('more', 91), ('environment', 82), ('nonbinary', 75), ('should', 73), ('can', 73)]


In [None]:
# Sentiment Analysis using TextBlob

# Apply sentiment analysis to all Class 3 passages
class_3_passages = train_df[train_df['y'] == 3].copy()  # Create a copy to avoid SettingWithCopyWarning
class_3_passages['sentiment'] = class_3_passages['cleaned_passage'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Apply sentiment analysis to Class 3 passages with keywords
class_3_passages_with_keywords['sentiment'] = class_3_passages_with_keywords['cleaned_passage'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Calculate average sentiment scores
avg_sentiment_class_3 = class_3_passages['sentiment'].mean()
avg_sentiment_class_3_with_keywords = class_3_passages_with_keywords['sentiment'].mean()

print(f"\nAverage sentiment for all Class 3 passages: {avg_sentiment_class_3:.2f}")
print(f"Average sentiment for Class 3 passages with biased keywords: {avg_sentiment_class_3_with_keywords:.2f}")


Average sentiment for all Class 3 passages: 0.17
Average sentiment for Class 3 passages with biased keywords: 0.17


In [None]:
# Define a function to extract top N-grams (bigrams or trigrams)
def get_top_ngrams(corpus, n=None, ngram_range=(2, 3)):
    """
    Extract the top N n-grams from the corpus.

    Parameters:
        corpus (list): List of text documents.
        n (int): Number of top n-grams to return.
        ngram_range (tuple): The range of n-values for different n-grams to be extracted.

    Returns:
        list: List of tuples with n-grams and their corresponding counts.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english').fit(corpus)
    bag_of_words = vectorizer.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

In [None]:
# Get top 20 bigrams and trigrams for Class 3
corpus_class_3 = class_3_passages['cleaned_passage'].tolist()
top_20_bigrams_class_3 = get_top_ngrams(corpus_class_3, n=20, ngram_range=(2, 2))
top_20_trigrams_class_3 = get_top_ngrams(corpus_class_3, n=20, ngram_range=(3, 3))

print("\nTop 20 Bigrams in Class 3:")
print(top_20_bigrams_class_3)

print("\nTop 20 Trigrams in Class 3:")
print(top_20_trigrams_class_3)


Top 20 Bigrams in Class 3:
[('regardless gender', 196), ('gender identity', 173), ('nonbinary individuals', 70), ('irrespective gender', 41), ('gender identities', 37), ('equal opportunities', 29), ('occupational therapy', 26), ('male nurses', 26), ('behavioral neuroscience', 25), ('identity expression', 23), ('gender equality', 23), ('based gender', 22), ('diverse perspectives', 22), ('people genders', 22), ('inclusive environment', 21), ('patient care', 21), ('mental health', 21), ('rehabilitation counseling', 19), ('transcends gender', 18), ('gender diversity', 17)]

Top 20 Trigrams in Class 3:
[('regardless gender identity', 72), ('gender identity expression', 23), ('irrespective gender identity', 17), ('based gender identity', 14), ('nonbinary individuals face', 13), ('individuals regardless gender', 12), ('unconventional gender identity', 12), ('nonbinary individuals nursing', 11), ('nonprofits face misconception', 11), ('male female nonbinary', 9), ('create inclusive environmen

In [None]:
# Example of Phrase Matching for Class 3
phrases_class_3 = ['regardless of gender', 'should be equal', 'everyone is treated equally']

def phrase_matching(corpus, phrases):
    """
    Count the number of passages that contain specific phrases.

    Parameters:
        corpus (list): List of text documents.
        phrases (list): List of phrases to search for.

    Returns:
        dict: Dictionary with phrase counts.
    """
    phrase_counts = {phrase: 0 for phrase in phrases}
    for text in corpus:
        for phrase in phrases:
            if phrase in text:
                phrase_counts[phrase] += 1
    return phrase_counts

# Perform phrase matching for Class 3
phrase_counts_class_3 = phrase_matching(corpus_class_3, phrases_class_3)
print("\nPhrase Counts for Class 3:")
print(phrase_counts_class_3)


Phrase Counts for Class 3:
{'regardless of gender': 137, 'should be equal': 0, 'everyone is treated equally': 0}


In [None]:
# Expanded phrase matching for Class 3 (Neutral)
phrases_class_3_expanded = [
    'regardless of gender', 'equal opportunities', 'inclusive environment', 'gender equality',
    'treat everyone equally', 'inclusive society', 'everyone should have the same rights',
    'no gender bias', 'regardless of gender identity', 'fair treatment'
]

# Perform phrase matching for Class 3 with expanded phrases
phrase_counts_class_3_expanded = phrase_matching(corpus_class_3, phrases_class_3_expanded)
print("\nExpanded Phrase Counts for Class 3 (Neutral):")
print(phrase_counts_class_3_expanded)


Expanded Phrase Counts for Class 3 (Neutral):
{'regardless of gender': 137, 'equal opportunities': 29, 'inclusive environment': 24, 'gender equality': 23, 'treat everyone equally': 1, 'inclusive society': 3, 'everyone should have the same rights': 0, 'no gender bias': 0, 'regardless of gender identity': 32, 'fair treatment': 2}


In [None]:
# Find common co-occurring words in Class 2 with biased keywords

def co_occurring_words_with_keywords(text, keywords):
    """
    Extract words from the text that are not in the specified keywords.

    Parameters:
        text (str): The text to process.
        keywords (list): List of keywords to exclude.

    Returns:
        list: List of co-occurring words not in keywords.
    """
    words = text.split()
    return [word for word in words if word not in keywords]

# Identify Class 2 passages that contain any class_2_keywords
class_2_passages_with_keywords = class_2_passages[class_2_passages['cleaned_passage'].apply(
    lambda x: contains_keywords(x, class_2_keywords)
)].copy()  # Create a copy to avoid SettingWithCopyWarning

# Initialize a Counter to hold co-occurring words
co_occurring_words_class_2 = Counter()

# Collect co-occurring words from Class 2 passages with keywords
for passage in class_2_passages_with_keywords['cleaned_passage']:
    co_occurring_words_class_2.update(co_occurring_words_with_keywords(passage, class_2_keywords))

# Display the top 20 co-occurring words
print("\nTop co-occurring words with biased keywords in Class 2:")
print(co_occurring_words_class_2.most_common(20))


Top co-occurring words with biased keywords in Class 2:
[]


In [None]:
# Sentiment Analysis for Class 2 using TextBlob

# Apply sentiment analysis to all Class 2 passages
class_2_passages_copy = class_2_passages.copy()
class_2_passages_copy['sentiment'] = class_2_passages_copy['cleaned_passage'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Apply sentiment analysis to Class 2 passages with keywords
class_2_passages_with_keywords_copy = class_2_passages_with_keywords.copy()
class_2_passages_with_keywords_copy['sentiment'] = class_2_passages_with_keywords_copy['cleaned_passage'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Calculate average sentiment scores
avg_sentiment_class_2 = class_2_passages_copy['sentiment'].mean()
avg_sentiment_class_2_with_keywords = class_2_passages_with_keywords_copy['sentiment'].mean()

print(f"\nAverage sentiment for all Class 2 passages: {avg_sentiment_class_2:.2f}")
print(f"Average sentiment for Class 2 passages with biased keywords: {avg_sentiment_class_2_with_keywords:.2f}")


Average sentiment for all Class 2 passages: 0.03
Average sentiment for Class 2 passages with biased keywords: nan


In [None]:
# Get top 20 bigrams and trigrams for Class 2
corpus_class_2 = class_2_passages['cleaned_passage'].tolist()
top_20_bigrams_class_2 = get_top_ngrams(corpus_class_2, n=20, ngram_range=(2, 2))
top_20_trigrams_class_2 = get_top_ngrams(corpus_class_2, n=20, ngram_range=(3, 3))

print("\nTop 20 Bigrams in Class 2:")
print(top_20_bigrams_class_2)

print("\nTop 20 Trigrams in Class 2:")
print(top_20_trigrams_class_2)


Top 20 Bigrams in Class 2:
[('nonbinary individuals', 634), ('nonbinary people', 382), ('gender identity', 202), ('people just', 78), ('identity issues', 65), ('just trying', 63), ('individuals face', 50), ('individuals just', 46), ('nonbinary healthcare', 45), ('nonbinary photographers', 42), ('nonbinary employees', 41), ('face challenges', 41), ('nonbinary staff', 41), ('traditional gender', 40), ('making difficult', 37), ('face skepticism', 37), ('special treatment', 37), ('individuals struggle', 37), ('seeking attention', 35), ('nonbinary patients', 35)]

Top 20 Trigrams in Class 2:
[('nonbinary people just', 63), ('nonbinary individuals face', 44), ('nonbinary individuals struggle', 35), ('nonbinary people face', 34), ('people just trying', 33), ('nonbinary individuals lack', 32), ('nonbinary individuals just', 31), ('nonbinary staff members', 29), ('nonbinary healthcare professionals', 24), ('nonbinary people frequently', 22), ('biases gender identity', 20), ('nonbinary individu

In [None]:
# Example of Phrase Matching for Class 2
phrases_class_2 = ['non-binary identity', 'gender identity issues', 'should conform']

# Perform phrase matching for Class 2
phrase_counts_class_2 = phrase_matching(corpus_class_2, phrases_class_2)
print("\nPhrase Counts for Class 2:")
print(phrase_counts_class_2)

# Define qualifiers to check in Class 2
qualifiers_class_2 = ['just', 'only', 'simply', 'merely']

# Count occurrences of qualifiers in Class 2
def count_qualifiers(corpus, qualifiers):
    """
    Count the number of passages containing specific qualifiers.

    Parameters:
        corpus (list): List of text documents.
        qualifiers (list): List of qualifiers to search for.

    Returns:
        dict: Dictionary with qualifier counts.
    """
    qualifier_counts = {qualifier: 0 for qualifier in qualifiers}
    for text in corpus:
        for qualifier in qualifiers:
            if qualifier in text:
                qualifier_counts[qualifier] += 1
    return qualifier_counts

qualifier_counts_class_2 = count_qualifiers(corpus_class_2, qualifiers_class_2)
print("\nQualifying Phrase Counts for Class 2:")
print(qualifier_counts_class_2)


Phrase Counts for Class 2:
{'non-binary identity': 0, 'gender identity issues': 4, 'should conform': 0}

Qualifying Phrase Counts for Class 2:
{'just': 290, 'only': 15, 'simply': 17, 'merely': 0}


In [None]:
# Common conjunctions that could signal complex sentence structures and hidden biases
conjunctions = ['but', 'although', 'however', 'even though', 'despite', 'still']

# Function to count conjunctions in the corpus
def count_conjunctions(corpus, conjunctions):
    """
    Count the number of passages containing specific conjunctions.

    Parameters:
        corpus (list): List of text documents.
        conjunctions (list): List of conjunctions to search for.

    Returns:
        dict: Dictionary with conjunction counts.
    """
    conjunction_counts = {conjunction: 0 for conjunction in conjunctions}
    for text in corpus:
        for conjunction in conjunctions:
            if conjunction in text:
                conjunction_counts[conjunction] += 1
    return conjunction_counts

# Count conjunctions in Class 2 and Class 3
conjunction_counts_class_2 = count_conjunctions(corpus_class_2, conjunctions)
conjunction_counts_class_3 = count_conjunctions(corpus_class_3, conjunctions)

print("\nConjunction Counts for Class 2 (Complex Sentences):")
print(conjunction_counts_class_2)

print("\nConjunction Counts for Class 3 (Complex Sentences):")
print(conjunction_counts_class_3)

# Pronoun usage analysis
pronouns = ['he', 'she', 'they', 'them']

# Function to count pronouns in the corpus
def count_pronouns(corpus, pronouns):
    """
    Count the number of passages containing specific pronouns.

    Parameters:
        corpus (list): List of text documents.
        pronouns (list): List of pronouns to search for.

    Returns:
        dict: Dictionary with pronoun counts.
    """
    pronoun_counts = {pronoun: 0 for pronoun in pronouns}
    for text in corpus:
        for pronoun in pronouns:
            if pronoun in text:
                pronoun_counts[pronoun] += 1
    return pronoun_counts

# Count pronouns in Class 2 and Class 3
pronoun_counts_class_2 = count_pronouns(corpus_class_2, pronouns)
pronoun_counts_class_3 = count_pronouns(corpus_class_3, pronouns)

print("\nPronoun Counts for Class 2 (Non-Binary Bias):")
print(pronoun_counts_class_2)

print("\nPronoun Counts for Class 3 (Neutral):")
print(pronoun_counts_class_3)


Conjunction Counts for Class 2 (Complex Sentences):
{'but': 130, 'although': 2, 'however': 2, 'even though': 1, 'despite': 7, 'still': 3}

Conjunction Counts for Class 3 (Complex Sentences):
{'but': 112, 'although': 0, 'however': 0, 'even though': 0, 'despite': 13, 'still': 5}

Pronoun Counts for Class 2 (Non-Binary Bias):
{'he': 1722, 'she': 20, 'they': 397, 'them': 244}

Pronoun Counts for Class 3 (Neutral):
{'he': 1015, 'she': 8, 'they': 76, 'them': 76}


In [None]:
# Pronoun and Gender Term Co-occurrence

def count_co_occurrences(corpus, phrases, keywords):
    """
    Count the number of passages where specific phrases co-occur with any of the keywords.

    Parameters:
        corpus (list): List of text documents.
        phrases (list): List of phrases to search for.
        keywords (list): List of keywords to check for co-occurrence.

    Returns:
        dict: Dictionary with co-occurrence counts.
    """
    co_occurrence_counts = {phrase: 0 for phrase in phrases}
    for text in corpus:
        for phrase in phrases:
            if phrase in text:
                for keyword in keywords:
                    if keyword in text:
                        co_occurrence_counts[phrase] += 1
                        break  # Only count once if any keyword is present
    return co_occurrence_counts

# Define gender terms for co-occurrence analysis
gender_terms = ['man', 'woman', 'non-binary', 'male', 'female', 'boy', 'girl']

# Analyze co-occurrence in Class 2 and Class 3
pronoun_and_gender_co_occurrence_class_2 = count_co_occurrences(corpus_class_2, pronouns, gender_terms)
pronoun_and_gender_co_occurrence_class_3 = count_co_occurrences(corpus_class_3, pronouns, gender_terms)

print("\nPronoun and Gender Term Co-occurrences in Class 2:")
print(pronoun_and_gender_co_occurrence_class_2)

print("\nPronoun and Gender Term Co-occurrences in Class 3:")
print(pronoun_and_gender_co_occurrence_class_3)


Pronoun and Gender Term Co-occurrences in Class 2:
{'he': 150, 'she': 1, 'they': 38, 'them': 18}

Pronoun and Gender Term Co-occurrences in Class 3:
{'he': 180, 'she': 1, 'they': 25, 'them': 9}


> Dataset Inspection: Initially inspecting data structure, types, and missing values is essential to identify potential issues and understand the dataset layout, which informs preprocessing needs.

> Text Cleaning: Converting text to lowercase and removing non-alphabetic characters standardizes entries, enabling consistent keyword and phrase detection during analysis.

> Keyword Definition by Class: Defining specific keywords related to each class (e.g., male, female, non-binary) helps assess how well keywords alone can distinguish classes, guiding feature engineering decisions.

> Class Distribution Check: Analyzing class counts helps identify class imbalance, a critical factor influencing model training and performance across classes.

> Keyword Presence Analysis: Checking for the presence or absence of class-specific keywords in passages highlights the relevance of these keywords in differentiating classes.

> Overall Keyword Distribution: Reviewing keyword frequencies across all classes reveals patterns and biases in keyword usage, helping adjust for any skew that could impact model training.

> Frequent Words per Class: Extracting top words within each class captures unique vocabulary patterns that define each category, providing insights for more effective feature selection.

> Common Bigrams: Identifying frequent bigrams adds context by showing commonly associated word pairs, which enhances understanding of class-specific language nuances.

> Sentence Length Distribution: Analyzing sentence length across classes reveals structural differences, which can be informative for creating features that capture verbosity or conciseness.

> Unique Words per Class: Identifying words unique to each class helps differentiate categories and supports feature engineering by highlighting distinct vocabulary.

> Stopword Impact: Removing common stopwords reduces noise, focusing on relevant vocabulary, which clarifies data and improves feature quality.

> Co-occurring Words with Biased Keywords: Studying words frequently appearing with biased keywords in certain classes (e.g., Class 3) reveals potential associations and context around biases, guiding targeted feature adjustments.


## **Model Training**

Data Preprocessing and Feature Engineering

In [None]:
# Load the training dataset
train_df = pd.read_csv('/content/drive/MyDrive/HerWill/train.csv')

# Text Preprocessing
def clean_text(text):
    """Clean the text by lowercasing and removing non-alphanumeric characters."""
    text = text.lower()  # Lowercase all words
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove all non-alphabetic characters
    return text

# Apply the cleaning function to the 'passage' column in training set
train_df['cleaned_passage'] = train_df['passage'].apply(clean_text)

# Feature Engineering based on analysis

# Define keywords and phrases for feature creation
keywords = ['women', 'men', 'nonbinary', 'male', 'female', 'neutral']
phrases = ['regardless of gender', 'equal opportunities', 'inclusive environment',
           'just', 'but', 'despite', 'he', 'they', 'she']

# Add binary features for each keyword in the training data
for keyword in keywords:
    train_df[keyword] = train_df['cleaned_passage'].apply(lambda x: x.count(keyword))

# Add binary features for each phrase in the training data
for phrase in phrases:
    train_df[phrase] = train_df['cleaned_passage'].apply(lambda x: 1 if phrase in x else 0)

# Feature for keyword proximity
def keyword_proximity(text):
    """
    Check if any keywords appear within 5 words of each other.

    Parameters:
        text (str): The text to analyze.

    Returns:
        int: 1 if proximity condition is met, else 0.
    """
    words = text.split()
    for i, word in enumerate(words):
        if word in keywords:
            # Check within the next 5 words
            for j in range(i+1, min(i+6, len(words))):
                if words[j] in keywords and words[j] != word:
                    return 1
    return 0

train_df['keyword_proximity'] = train_df['cleaned_passage'].apply(keyword_proximity)

# Add polarity feature using TextBlob
def text_polarity(text):
    """Get the polarity of the text."""
    return TextBlob(text).sentiment.polarity

train_df['polarity'] = train_df['cleaned_passage'].apply(text_polarity)

TF-IDF Vectorization and Feature Scaling

In [None]:
# TF-IDF Vectorization
# Initialize TF-IDF vectorizer with max_features set to 13 and ngram_range=(1,3)
vectorizer = TfidfVectorizer(max_features=13, ngram_range=(1, 3), stop_words='english')

# Fit and transform the training data passages
X_tfidf = vectorizer.fit_transform(train_df['cleaned_passage'])

# Save the trained vectorizer for future use on test data
joblib.dump(vectorizer, '/content/drive/MyDrive/HerWill/tfidf_vectorizer.pkl')

# Convert the TF-IDF features into a DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Combine TF-IDF features and binary features from keywords/phrases in the training set
additional_features = keywords + phrases + ['keyword_proximity', 'polarity']
X_combined = pd.concat([tfidf_df, train_df[additional_features].reset_index(drop=True)], axis=1)

# Feature Scaling
# Initialize StandardScaler
scaler = StandardScaler()

# Fit the scaler on the combined features and transform
X_scaled = scaler.fit_transform(X_combined)

# Save the scaler for future use on test data
joblib.dump(scaler, '/content/drive/MyDrive/HerWill/scaler.pkl')

['/content/drive/MyDrive/HerWill/scaler.pkl']

Principal Component Analysis (PCA)

In [None]:
# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)  # Reducing features to 3 principal components
X_pca = pca.fit_transform(X_scaled)

# Convert reduced features to a DataFrame
X_pca_df = pd.DataFrame(X_pca, columns=[f'pc_{i}' for i in range(1, 3)])

# Save PCA model for consistent test data transformation
joblib.dump(pca, '/content/drive/MyDrive/HerWill/pca_model.pkl')

# Print the number of input features after PCA
print(f"Number of input features after PCA: {X_pca_df.shape[1]}")

Number of input features after PCA: 2


Train-Test Split and Tensor Conversion

In [None]:
# Train-Test Split
y = train_df['y']  # Target variable
X_train, X_val, y_train, y_val = train_test_split(X_pca_df, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors for model training
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

Define the Neural Network Model

In [None]:
# Define the Neural Network Model with very few parameters
class MinimalNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MinimalNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1)  # Single neuron layer
        self.fc2 = nn.Linear(1, output_dim)  # Output layer

    def forward(self, x):
        out = self.fc1(x)
        out = torch.sigmoid(out)  # Sigmoid activation for the hidden layer
        out = self.fc2(out)
        return out

# Hyperparameters
input_dim = X_pca_df.shape[1]  # Number of input features after PCA (3)
output_dim = 4  # Number of output classes (0, 1, 2, 3)

# Initialize the model
model = MinimalNN(input_dim, output_dim)

Define Loss Function and Optimizer

In [None]:
# Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Training Loop with Validation and Metrics

In [None]:
# Training Loop with Validation and Metrics

batch_size = 32
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

num_epochs = 500
best_f1_score = 0
best_model_weights = None

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0
    all_train_preds = []
    all_train_labels = []

    # Training Phase
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        epoch_loss += loss.item()

        # Collect predictions for training metrics
        _, train_preds = torch.max(outputs, 1)
        all_train_preds.extend(train_preds.numpy())
        all_train_labels.extend(labels.numpy())

    # Calculate Training Metrics
    train_acc = accuracy_score(all_train_labels, all_train_preds)
    train_f1 = f1_score(all_train_labels, all_train_preds, average='weighted')

    # Validation Phase
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        val_outputs = model(X_val_tensor)  # Forward pass on validation data
        _, y_val_pred = torch.max(val_outputs, 1)  # Get predictions

        val_acc = accuracy_score(y_val_tensor, y_val_pred)
        val_f1 = f1_score(y_val_tensor, y_val_pred, average='weighted')

        # Track the best model based on validation F1 score
        if val_f1 > best_f1_score:
            best_f1_score = val_f1
            best_model_weights = model.state_dict().copy()  # Save the best model weights

    # Print progress every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}")
        print(f"Train Accuracy: {train_acc:.4f}, Train F1: {train_f1:.4f}")
        print(f"Val Accuracy: {val_acc:.4f}, Val F1: {val_f1:.4f}")

Epoch [10/500], Loss: 1.0390
Train Accuracy: 0.5677, Train F1: 0.4341
Val Accuracy: 0.5607, Val F1: 0.4251
Epoch [20/500], Loss: 0.8909
Train Accuracy: 0.5771, Train F1: 0.4522
Val Accuracy: 0.5721, Val F1: 0.4464
Epoch [30/500], Loss: 0.8270
Train Accuracy: 0.6018, Train F1: 0.4993
Val Accuracy: 0.6086, Val F1: 0.5162
Epoch [40/500], Loss: 0.7898
Train Accuracy: 0.6302, Train F1: 0.5500
Val Accuracy: 0.6400, Val F1: 0.5680
Epoch [50/500], Loss: 0.7622
Train Accuracy: 0.6552, Train F1: 0.5897
Val Accuracy: 0.6714, Val F1: 0.6154
Epoch [60/500], Loss: 0.7401
Train Accuracy: 0.6727, Train F1: 0.6159
Val Accuracy: 0.6929, Val F1: 0.6498
Epoch [70/500], Loss: 0.7218
Train Accuracy: 0.7182, Train F1: 0.6869
Val Accuracy: 0.7164, Val F1: 0.6847
Epoch [80/500], Loss: 0.7062
Train Accuracy: 0.7457, Train F1: 0.7228
Val Accuracy: 0.7493, Val F1: 0.7292
Epoch [90/500], Loss: 0.6929
Train Accuracy: 0.7766, Train F1: 0.7604
Val Accuracy: 0.7714, Val F1: 0.7562
Epoch [100/500], Loss: 0.6816
Train A

Load Best Model Weights


In [None]:
# Load the best model weights based on validation F1 score
if best_model_weights is not None:
    model.load_state_dict(best_model_weights)

Calculate Number of Parameters (NOP) Using Thop

In [None]:
from thop import profile

# Function to measure NOP using thop
def measure_nop(model, inputs):
    flops, params = profile(model, inputs=(inputs,))
    return flops, params

# Test a single sample for NOP calculation
sample_input = X_train_tensor[0].unsqueeze(0)
flops, params = measure_nop(model, sample_input)
NOP = params

print(f"Total trainable parameters after PCA: {NOP}")

[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
Total trainable parameters after PCA: 11.0


Load and Preprocess Test Data

In [None]:
# Load test data
test_df = pd.read_csv('/content/drive/MyDrive/HerWill/test.csv')

# Preprocess the test data
test_df['cleaned_passage'] = test_df['passage'].apply(clean_text)
for keyword in keywords:
    test_df[keyword] = test_df['cleaned_passage'].apply(lambda x: x.count(keyword))
for phrase in phrases:
    test_df[phrase] = test_df['cleaned_passage'].apply(lambda x: 1 if phrase in x else 0)
test_df['keyword_proximity'] = test_df['cleaned_passage'].apply(keyword_proximity)
test_df['polarity'] = test_df['cleaned_passage'].apply(text_polarity)

Transform Test Data Using Trained Vectorizer and Scaler

In [None]:
# Load saved vectorizer and scaler
vectorizer = joblib.load('/content/drive/MyDrive/HerWill/tfidf_vectorizer.pkl')
scaler = joblib.load('/content/drive/MyDrive/HerWill/scaler.pkl')

# Transform test data
X_tfidf_test = vectorizer.transform(test_df['cleaned_passage'])
tfidf_test_df = pd.DataFrame(X_tfidf_test.toarray(), columns=vectorizer.get_feature_names_out())
X_test_combined = pd.concat([tfidf_test_df, test_df[additional_features].reset_index(drop=True)], axis=1)
X_scaled_test = scaler.transform(X_test_combined)

Apply PCA to Test Data

In [None]:
# Load saved PCA model
pca = joblib.load('/content/drive/MyDrive/HerWill/pca_model.pkl')
X_pca_test = pca.transform(X_scaled_test)
X_pca_test_df = pd.DataFrame(X_pca_test, columns=[f'pc_{i}' for i in range(1, 3)])

Convert Test Data to PyTorch Tensor

In [None]:
# Convert test data to tensor
X_test_tensor = torch.tensor(X_pca_test_df.values, dtype=torch.float32)

Make Predictions on Test Data

In [None]:
# Make predictions on the test data
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    _, y_test_pred = torch.max(test_outputs, 1)

Create Output DataFrame with Predictions and NOP

In [None]:
# Create output DataFrame with predictions and NOP
output_df = pd.DataFrame({
    'id': test_df['id'],
    'y_pred': y_test_pred.numpy(),
    'parameters': NOP
})

# Save the predictions
output_df.to_csv('/content/drive/MyDrive/HerWill/test_predictions.csv', index=False)
print("Predictions saved to 'test_predictions.csv'")

Predictions saved to 'test_predictions.csv'


 Calculate F1NOP Score

In [None]:
# Calculate F1NOP score
def calculate_f1nop(f1, nop):
    epsilon = 5 * 10**-16
    return (0.4 * f1) + (0.6 / (torch.log10(torch.tensor(max(1, nop))) + epsilon))

f1nop_score = calculate_f1nop(torch.tensor(best_f1_score), NOP)
print(f"\nFinal F1NOP Score: {f1nop_score.item():.4f}")


Final F1NOP Score: 0.9042


In [None]:
# Save the best model
torch.save(best_model_weights, '/content/drive/MyDrive/HerWill/best_model.pth')
print("Best model saved as 'best_model.pth'")

Best model saved as 'best_model.pth'


> Text Cleaning: Preprocessing text by converting to lowercase and removing non-alphanumeric characters standardizes inputs, ensuring consistency in feature extraction and reducing noise in model training.

> Feature Engineering with Keywords and Phrases: Creating binary features based on specific keywords and phrases related to each class provides additional signals relevant to the classification task, improving model interpretability and potentially enhancing prediction accuracy.

> Keyword Proximity Feature: Including proximity-based keyword features captures contextual relationships, which may provide meaningful distinctions among classes by considering adjacent word relationships.

> Polarity Feature with Sentiment Analysis: Adding sentiment polarity captures the text’s emotional tone, which could aid in identifying nuanced biases or sentiment-driven class distinctions.

> TF-IDF Vectorization: Using TF-IDF captures term importance while preserving word frequencies, providing dense numerical representations for words, which benefits the model's ability to capture and distinguish textual patterns.

> PCA for Dimensionality Reduction: Reducing dimensionality with PCA condenses relevant information into fewer components, mitigating overfitting and reducing computational load while retaining essential information.

> Minimal Neural Network Architecture: Using a compact neural network with few parameters helps prevent overfitting and reduces model complexity, aligning with the aim to create a lightweight, interpretable model.
