In [None]:
!pip install -U gensim
!pip install -U scipy~=1.13.0
!pip install -U Cython
from sklearn.decomposition import PCA
import gensim
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [None]:
# Locations of the pretrained embeddings and the analogy dataset on the mounted Google Drive.
# The full-size GoogleNews vectors are kept commented out as an alternative to the SLIM file:
#word2vec_path='/content/drive/My Drive/Datasets/GoogleNews-vectors-negative300.bin'
word2vec_path='/content/drive/My Drive/Datasets/GoogleNews-vectors-negative300-SLIM.bin.gz'
# Analogy quadruples (word_a word_b word_c word_d), read with pandas in a later cell.
analogies_path='/content/drive/My Drive/Datasets/questions-words.txt'

In [None]:
# Mount Google Drive so the paths under /content/drive resolve.
drive.mount('/content/drive')

# Load the Word2Vec model directly from the Google Drive file path
# (binary=True: the file is read as binary word2vec format, not text).
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Example: Check similarity between two words (quick sanity check that the load worked)
similarity = word2vec_model.similarity('man', 'woman')
print(f"Similarity between 'man' and 'woman': {similarity}")

Mounted at /content/drive
Similarity between 'man' and 'woman': 0.7664012312889099


In [None]:
# Read the file, skipping lines starting with ':'
df = pd.read_csv(analogies_path, delimiter=" ", header=None, comment=":")

# Assign column names
df.columns = ["word_a", "word_b", "word_c", "word_d"]

# Display the first few rows
print(df.head())

   word_a  word_b   word_c       word_d
0  Athens  Greece  Baghdad         Iraq
1  Athens  Greece  Bangkok     Thailand
2  Athens  Greece  Beijing        China
3  Athens  Greece   Berlin      Germany
4  Athens  Greece     Bern  Switzerland


In [None]:
import logging

# Ask the root logger to emit INFO-level messages (used by find_gender_direction below).
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers configured; if the INFO lines do not show up in this environment,
# consider basicConfig(..., force=True) — confirm before changing.
logging.basicConfig(level=logging.INFO)

def find_gender_direction():
    """Estimate a unit-length 'gender direction' from the global ``word2vec_model``.

    For every (female, male) word pair whose two words are both in the model's
    vocabulary, the difference vector and its negation are normalised to unit
    length. A one-component PCA is fitted on all of those vectors and its first
    component, re-normalised, is returned.

    Returns:
        1-D numpy array: the first principal component, scaled to unit norm.
    """
    gender_biased_word_pairs = [
        ("she", "he"),
        ("her", "his"),
        ("woman", "man"),
        ("Mary", "John"),
        ("herself", "himself"),
        ("daughter", "son"),
        ("mother", "father"),
        ("gal", "guy"),
        ("girl", "boy"),
        ("vagina", "penis"),
        ("feminine", "masculine")
    ]

    # Keep only the pairs where both words exist in the vocabulary.
    valid_pairs = [
        (first, second)
        for first, second in gender_biased_word_pairs
        if first in word2vec_model and second in word2vec_model
    ]

    logging.info(f"Valid pairs used: {len(valid_pairs)} / {len(gender_biased_word_pairs)}")

    # Build the unit-length difference vectors in both orientations.
    biases = []
    reversed_biases = []
    for first, second in valid_pairs:
        forward = word2vec_model[first] - word2vec_model[second]
        backward = word2vec_model[second] - word2vec_model[first]
        biases.append(forward / np.linalg.norm(forward))
        reversed_biases.append(backward / np.linalg.norm(backward))

    # Including each vector together with its negation makes the sample
    # symmetric about the origin, so PCA's mean-centering leaves it unchanged.
    samples = np.array(biases + reversed_biases)
    pca = PCA(n_components=1)
    pca.fit(samples)

    # Rescale the leading component to unit length.
    principal_axis = pca.components_[0]
    gender_direction = principal_axis / np.linalg.norm(principal_axis)

    logging.info(f"Gender direction: {gender_direction}")

    return gender_direction


In [None]:
# Compute the gender direction once and keep it as a notebook-level variable;
# the shape print is a quick check that it matches the embedding dimension.
gender_direction=find_gender_direction()
print(gender_direction.shape)

(300,)


In [None]:
class AdversarialBiasMitigation(tf.keras.Model):
    """Predictor/adversary pair used for adversarial debiasing of embeddings.

    Attributes:
        predictor: one linear Dense layer producing ``input_dim`` outputs (X → Y).
        adversary: one linear Dense layer producing a single scalar from the
            predictor's output (Y → Z).
    """

    def __init__(self, input_dim):
        """Create both sub-networks; ``input_dim`` is the predictor's output width."""
        super().__init__()

        # Predictor network (X → Y): continuous output of width input_dim.
        predictor_layers = [tf.keras.layers.Dense(input_dim, activation='linear')]
        self.predictor = tf.keras.Sequential(predictor_layers)

        # Adversary network (Y → Z): one scalar, e.g. the bias attribute.
        adversary_layers = [tf.keras.layers.Dense(1, activation='linear')]
        self.adversary = tf.keras.Sequential(adversary_layers)

    def call(self, x):
        """Run the predictor on ``x`` and the adversary on the predictor's output.

        Returns:
            Tuple ``(y_pred, z_pred)``.
        """
        y_pred = self.predictor(x)
        return y_pred, self.adversary(y_pred)

    def debias(self, biased_word_or_embedding):
        """Map a single word or embedding through the predictor network.

        Args:
            biased_word_or_embedding: a ``str`` (looked up in the global
                ``word2vec_model``) or an embedding vector used as given.

        Returns:
            numpy array: predictor output for that one input, with the batch
            dimension removed.
        """
        is_word = isinstance(biased_word_or_embedding, str)
        vector = word2vec_model[biased_word_or_embedding] if is_word else biased_word_or_embedding

        # Convert to a float32 tensor and prepend a batch axis: (1, input_dim).
        batch = tf.expand_dims(tf.convert_to_tensor(vector, dtype=tf.float32), axis=0)

        # Only the predictor is applied here; the adversary is not involved.
        return self.predictor(batch).numpy()[0]



def project(grad_W_L1, grad_W_L2):
    """Project one list of gradient tensors onto another.

    Both lists are treated as a single flattened vector each: the result is
    ``(<g1, g2> / max(||g2||^2, eps)) * g2``, returned with the same list
    structure as ``grad_W_L2``.

    Args:
        grad_W_L1: list of tensors to be projected.
        grad_W_L2: list of tensors defining the direction; same length as
            ``grad_W_L1``.

    Returns:
        List of tensors, one per entry of ``grad_W_L2``.
    """
    assert len(grad_W_L1) == len(grad_W_L2), "Gradient lists must have the same length"

    # Inner product <g1, g2>, accumulated over every tensor pair.
    pairwise_dots = []
    for g1, g2 in zip(grad_W_L1, grad_W_L2):
        pairwise_dots.append(tf.reduce_sum(tf.multiply(g1, g2)))
    dot_product = tf.add_n(pairwise_dots)

    # Squared norm of g2, clamped from below so the division is always defined.
    squared_norms = [tf.reduce_sum(tf.square(g2)) for g2 in grad_W_L2]
    norm_W_L2 = tf.maximum(tf.add_n(squared_norms), tf.keras.backend.epsilon())

    scale = dot_product / norm_W_L2

    projection = []
    for g2 in grad_W_L2:
        projection.append(scale * g2)
    return projection




# Training step
#@tf.function
def train_step(model, x, y, z, optimizer_W, optimizer_U, loss_fn_Y, loss_fn_Z, alpha=1.0):
    """Run one adversarial-debiasing update on a single batch.

    The predictor is updated with
    ``grad_W(L_Y) - proj_{grad_W(L_Z)} grad_W(L_Y) - alpha * grad_W(L_Z)``
    and the adversary with the plain gradient of its own loss.

    Args:
        model: object exposing ``predictor`` and ``adversary`` sub-models and
            returning ``(y_pred, z_pred)`` when called on ``x``.
        x: batch of predictor inputs.
        y: batch of predictor targets.
        z: batch of adversary targets.
        optimizer_W: optimizer applied to the predictor's variables.
        optimizer_U: optimizer applied to the adversary's variables.
        loss_fn_Y: loss callable ``(y, y_pred)`` for the predictor.
        loss_fn_Z: loss callable ``(z, z_pred)`` for the adversary.
        alpha: weight of the adversary-gradient term subtracted from the
            predictor update. The default 1.0 reproduces the previously
            hard-coded behaviour.

    Returns:
        Tuple ``(loss_WL1, loss_UL2)``: predictor and adversary losses computed
        before the update.
    """
    # persistent=True because three separate gradients are taken from one forward pass.
    with tf.GradientTape(persistent=True) as tape:
        y_pred, z_pred = model(x)
        loss_WL1 = loss_fn_Y(y, y_pred)  # Predictor loss (L1)
        loss_UL2 = loss_fn_Z(z, z_pred)  # Adversary loss (L2)

    predictor_vars = model.predictor.trainable_variables
    adversary_vars = model.adversary.trainable_variables

    # Gradients w.r.t. predictor (W) for both losses, and w.r.t. adversary (U).
    grads_WL1 = tape.gradient(loss_WL1, predictor_vars)
    grads_WL2 = tape.gradient(loss_UL2, predictor_vars)
    grads_UL2 = tape.gradient(loss_UL2, adversary_vars)

    # A persistent tape keeps its resources until it is dropped; release it
    # as soon as the last gradient has been taken.
    del tape

    # Component of the predictor gradient that lies along the adversary gradient.
    proj_WL1_WL2 = project(grads_WL1, grads_WL2)

    # Remove that component, then additionally step against the adversary's
    # gradient, keeping the per-variable list structure.
    modified_grads_WL1 = [
        g1 - p - alpha * g2
        for g1, p, g2 in zip(grads_WL1, proj_WL1_WL2, grads_WL2)
    ]

    # Apply gradients to predictor (using modified gradients)
    optimizer_W.apply_gradients(zip(modified_grads_WL1, predictor_vars))

    # Apply gradients to adversary
    optimizer_U.apply_gradients(zip(grads_UL2, adversary_vars))

    return loss_WL1, loss_UL2





In [None]:
# INSTANTIATE THE MODEL

input_dim = 300  # Dimension of word embeddings
# NOTE(review): hidden_dim is not passed to the model below (it is built from
# input_dim only) and is not referenced in the visible cells.
hidden_dim = 128

# Instantiate model
model = AdversarialBiasMitigation(input_dim=input_dim)

# Optimizers: separate Adam instances for predictor (W) and adversary (U).
# The numbers below look like learning rates that were tried (0.001, 0.005);
# the value in use is 2**-16.
# Optimizers 0.001 0.005 2**-16
optimizer_W = tf.keras.optimizers.Adam(learning_rate=2**-16)
optimizer_U = tf.keras.optimizers.Adam(learning_rate=2**-16)

# Loss functions
loss_fn_Y = tf.keras.losses.MeanSquaredError()  # For predictor
loss_fn_Z = tf.keras.losses.MeanSquaredError()  # For adversary

In [None]:
# Compute X, Y, and Z for every analogy row (word_a : word_b :: word_c : word_d).
# Indexing the KeyedVectors with a list of keys returns one stacked 2-D array,
# so each column is looked up in a single call instead of row by row with
# df.iterrows(). A word missing from the vocabulary still raises KeyError.
word_a_vecs = word2vec_model[df["word_a"].tolist()]
word_b_vecs = word2vec_model[df["word_b"].tolist()]
word_c_vecs = word2vec_model[df["word_c"].tolist()]

# Feature: X = -word_a + word_b + word_c
X = -word_a_vecs + word_b_vecs + word_c_vecs

# Label: Y = word_d vector
Y = word2vec_model[df["word_d"].tolist()]

# Bias attribute: Z = scalar projection of each word_d vector onto gender_direction,
# kept as a column so its shape is (n_rows, 1). This matches the per-row
# np.dot up to floating-point rounding.
Z = (Y @ gender_direction).reshape(-1, 1)

print(X.shape)
print(Y.shape)
print(Z.shape)


(19544, 300)
(19544, 300)
(19544, 1)


In [None]:
def create_batches(X, Y, Z, batch_size, rng=None):
    """Shuffle three aligned arrays with one permutation and split them into batches.

    Args:
        X, Y, Z: arrays with the same length along axis 0 that support
            integer-array indexing (e.g. numpy arrays). Row ``i`` of each
            belongs to the same sample.
        batch_size: positive integer; the last batch may be smaller.
        rng: optional ``numpy.random.Generator``. When given, it draws the
            permutation, which makes the shuffle reproducible. When ``None``,
            numpy's global random state is used, as before.

    Returns:
        List of ``(X_batch, Y_batch, Z_batch)`` tuples covering every sample
        exactly once. Empty inputs give an empty list.

    Raises:
        ValueError: if the inputs differ in length or ``batch_size`` is not positive.
    """
    data_size = len(X)
    # Mismatched lengths would otherwise fail late or silently mis-pair samples.
    if len(Y) != data_size or len(Z) != data_size:
        raise ValueError(
            f"X, Y and Z must have the same length, got {data_size}, {len(Y)}, {len(Z)}"
        )
    # A negative step would silently yield no batches; zero is rejected by range().
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if rng is None:
        indices = np.random.permutation(data_size)
    else:
        indices = rng.permutation(data_size)

    # The same permutation is applied to all three arrays to keep rows aligned.
    X_shuffled, Y_shuffled, Z_shuffled = X[indices], Y[indices], Z[indices]

    return [
        (
            X_shuffled[start:start + batch_size],
            Y_shuffled[start:start + batch_size],
            Z_shuffled[start:start + batch_size],
        )
        for start in range(0, data_size, batch_size)
    ]

In [None]:
# Split the training arrays into shuffled mini-batches once, up front.
batch_size = 64
batches = create_batches(X, Y, Z, batch_size)
# Size of the first batch's X part, as a sanity check.
print(len(batches[0][0]))

64


In [None]:
# Training loop
for epoch in range(100):  # Number of epochs
    total_loss_Y = 0.0
    total_loss_Z = 0.0

    for X_batch, Y_batch, Z_batch in batches:
        # Perform a single training step
        batch_loss_Y, batch_loss_Z = train_step(
            model,
            X_batch,
            Y_batch,
            Z_batch,
            optimizer_W,
            optimizer_U,
            loss_fn_Y,
            loss_fn_Z,
        )

        total_loss_Y += batch_loss_Y.numpy()
        total_loss_Z += batch_loss_Z.numpy()

    if epoch==0 or (epoch+1)%10==0:
      print(f"Epoch {epoch+1}: Loss_Y={total_loss_Y / len(batches)}, Loss_Z={total_loss_Z / len(batches)}")


Epoch 1: Loss_Y=0.009228212521502785, Loss_Z=0.04634236788462385
Epoch 10: Loss_Y=0.009193668068916189, Loss_Z=0.04633211679878383
Epoch 20: Loss_Y=0.00920256596131652, Loss_Z=0.04633671402005978
Epoch 30: Loss_Y=0.009211303620580948, Loss_Z=0.04635128578828538
Epoch 40: Loss_Y=0.009180933352730243, Loss_Z=0.04632775460862938
Epoch 50: Loss_Y=0.009175163535462097, Loss_Z=0.04632600080343633
Epoch 60: Loss_Y=0.00921398829486148, Loss_Z=0.046357476745969524
Epoch 70: Loss_Y=0.009147297310965512, Loss_Z=0.046314526702356494
Epoch 80: Loss_Y=0.009206138176880046, Loss_Z=0.0463563273447791
Epoch 90: Loss_Y=0.009177102552105984, Loss_Z=0.04633769608754056
Epoch 100: Loss_Y=0.009163358501383877, Loss_Z=0.04632705370926


In [None]:
# Qualitative check: for each probe word, print its 5 nearest neighbours in the
# original embedding space, then the 5 nearest neighbours of its debiased vector.

# "pilot" is handled separately because it is not followed by a blank line.
print(word2vec_model.similar_by_word("pilot",topn=5))
debiased=model.debias("pilot")
print(word2vec_model.similar_by_vector(debiased,topn=5))

# One loop instead of a copy-pasted snippet per word; output order is unchanged.
for probe_word in ("penis", "wife", "macho", "muscular", "strongman"):
    print(word2vec_model.similar_by_word(probe_word,topn=5))
    debiased=model.debias(probe_word)
    print(word2vec_model.similar_by_vector(debiased,topn=5))
    print()

# Original similarity of "midwife" and "woman", followed by a cosmul query
# that uses the debiased vectors of both words.
print(word2vec_model.similarity("midwife","woman"))
print(word2vec_model.most_similar_cosmul(positive=[model.debias("midwife")],negative=[model.debias("woman")]))

[('Pilot', 0.6522234678268433), ('piloting', 0.6444098353385925), ('pilots', 0.6399162411689758), ('relaxed_el_Amruni', 0.6388845443725586), ('Samoshin', 0.576824426651001)]
[('pilot', 0.5282049775123596), ('relaxed_el_Amruni', 0.48049792647361755), ('eject_safely', 0.47894588112831116), ('HH_1N_Huey', 0.4657941162586212), ('aborts_landing', 0.45653775334358215)]
[('penises', 0.7238615155220032), ('genitals', 0.7193513512611389), ('vagina', 0.6902350783348083), ('testicles', 0.689781665802002), ('penile', 0.6632111668586731)]
[('penis', 0.5951092839241028), ('penises', 0.5219300389289856), ('erect_penis', 0.5028186440467834), ('genitalia', 0.4980599880218506), ('genitals', 0.4946903586387634)]

[('husband', 0.8294167518615723), ('daughter', 0.7662219405174255), ('fiancée', 0.7583051919937134), ('mother', 0.7550682425498962), ('fiancee', 0.7449482679367065)]
[('wife', 0.7480432391166687), ('husband', 0.647176206111908), ('daughter', 0.6396773457527161), ('son', 0.6233857870101929), ('da

In [None]:
#Save model weights
# Persist the trained weights (predictor and adversary) to Google Drive.
# NOTE(review): this path uses 'MyDrive' while the dataset paths use 'My Drive';
# confirm both resolve on the mounted drive and that the MyModels folder exists.
model.save_weights('/content/drive/MyDrive/MyModels/debias_trained_2weights.weights.h5')

In [None]:
# Reconstruct the architecture
loaded_model = AdversarialBiasMitigation(input_dim=300)

# Pass dummy data through the model to implicitly build it
# (the layers' variables must exist before weights can be loaded into them).
dummy_input = tf.random.normal([1, 300])  # Batch size = 1, input_dim = 300
_ = loaded_model(dummy_input)  # Forward pass to initialize and build the model


# Load weights into the new model instance
loaded_model.load_weights('/content/drive/MyDrive/MyModels/debias_trained_2weights.weights.h5')


In [None]:
# Qualitative check of the reloaded model: for each probe word, print its 5
# nearest neighbours in the original space, then the nearest neighbours of its
# debiased vector. The second number is the topn used for the debiased query.
probes = [
    ("penis", 5),
    ("wife", 5),
    ("macho", 5),
    ("muscular", 5),
    ("manly", 5),
    ("leader", 10),
    ("nurse", 10),
]

# One loop instead of a copy-pasted snippet per word; output order is unchanged.
for probe_word, debiased_topn in probes:
    print(word2vec_model.similar_by_word(probe_word,topn=5))
    debiased=loaded_model.debias(probe_word)
    print(word2vec_model.similar_by_vector(debiased,topn=debiased_topn))
    print()

print(word2vec_model.similarity("midwife","woman"))
# NOTE(review): this query uses the ORIGINAL vectors of "nurse" and "woman",
# not loaded_model.debias(...) — confirm this baseline is intended.
print(word2vec_model.most_similar_cosmul(positive=[word2vec_model["nurse"]],negative=[word2vec_model["woman"]]))

[('penises', 0.7238615155220032), ('genitals', 0.7193513512611389), ('vagina', 0.6902350783348083), ('testicles', 0.689781665802002), ('penile', 0.6632111668586731)]
[('penis', 0.5951092839241028), ('penises', 0.5219300389289856), ('erect_penis', 0.5028186440467834), ('genitalia', 0.4980599880218506), ('genitals', 0.4946903586387634)]

[('husband', 0.8294167518615723), ('daughter', 0.7662219405174255), ('fiancée', 0.7583051919937134), ('mother', 0.7550682425498962), ('fiancee', 0.7449482679367065)]
[('wife', 0.7480432391166687), ('husband', 0.647176206111908), ('daughter', 0.6396773457527161), ('son', 0.6233857870101929), ('daughters', 0.6203823685646057)]

[('machismo', 0.7543787360191345), ('manly', 0.724465548992157), ('masculine', 0.6791226267814636), ('hyper_masculine', 0.6728920936584473), ('Macho_macho', 0.6065714359283447)]
[('macho', 0.5847579836845398), ('playfight', 0.47934985160827637), ('dry_humped', 0.4739474654197693), ('slack_jawed_yokels', 0.4693150520324707), ('sooooo

In [None]:
# Compare cosine similarity to a gendered reference word before and after debiasing.
# Only the first word of each pair is debiased; the reference word ("woman" / "man")
# always uses its original embedding.
from sklearn.metrics.pairwise import cosine_similarity
# cosine_similarity works on 2-D inputs, hence reshape(1,-1) on every vector.
print(cosine_similarity(word2vec_model["midwife"].reshape(1,-1),word2vec_model["woman"].reshape(1,-1)))
print(cosine_similarity(loaded_model.debias("midwife").reshape(1,-1),word2vec_model["woman"].reshape(1,-1)))
print(cosine_similarity(word2vec_model["husband"].reshape(1,-1),word2vec_model["man"].reshape(1,-1)))
print(cosine_similarity(loaded_model.debias("husband").reshape(1,-1),word2vec_model["man"].reshape(1,-1)))


[[0.38243774]]
[[0.10887501]]
[[0.3449975]]
[[0.28097668]]


In [None]:
def analogy(a, b, c, trained_model):
    """Print analogy completions for ``a : b :: c : ?`` before and after debiasing.

    The query vector is ``-vec(a) + vec(b) + vec(c)``, with vectors taken from
    the global ``word2vec_model``. Its 5 nearest neighbours are printed first,
    then the 5 nearest neighbours of ``trained_model.debias(query)``.

    Args:
        a, b, c: words present in ``word2vec_model``.
        trained_model: object with a ``debias(vector)`` method.

    Returns:
        None; results are printed.
    """
    vectors = word2vec_model
    biased = vectors[b] - vectors[a] if False else -vectors[a] + vectors[b] + vectors[c]

    print("Original 4th word completing analogy")
    original_neighbours = vectors.similar_by_vector(biased, topn=5)
    print(original_neighbours)

    debiased = trained_model.debias(biased)

    print("Debiased 4th word completing analogy")
    debiased_neighbours = vectors.similar_by_vector(debiased, topn=5)
    print(debiased_neighbours)


In [None]:
# Query man : woman :: dentist : ? with the reloaded model.
analogy('man','woman','dentist',loaded_model)

Original 4th word completing analogy
[('husband', 0.8645042181015015), ('mother', 0.712462842464447), ('wife', 0.7117722034454346), ('daughter', 0.7000213265419006), ('fiancé', 0.6188782453536987)]
Debiased 4th word completing analogy
[('wife', 0.5400162935256958), ('husband', 0.5280689597129822), ('father', 0.4422704577445984), ('mom', 0.4396694004535675), ('daughter', 0.43689027428627014)]


In [None]:
# Original-space analogy woman + doctor - man; printed second, below.
result = word2vec_model.most_similar(positive=['woman', 'doctor'], negative=['man'])
# Printed first: nearest neighbours of the debiased "gynecologist" vector.
print(word2vec_model.most_similar(loaded_model.debias("gynecologist")))
print(result)


[('ANo', 0.42786943912506104), ('PICKAWAY', 0.40764257311820984), ('Sudan', 0.3930776119232178), ('PittsburgMo', 0.3920689821243286), ('Journalistically', 0.3900778889656067), ('gynecologist', 0.38513755798339844), ('Sudanese', 0.3810306787490845), ('Israelis', 0.37683916091918945), ('ConfiDent_®', 0.3766477406024933), ('women', 0.3741796612739563)]
[('gynecologist', 0.7093892097473145), ('nurse', 0.6477286219596863), ('doctors', 0.6471461653709412), ('physician', 0.64389967918396), ('pediatrician', 0.6249487996101379), ('obstetrician', 0.6072014570236206), ('midwife', 0.5927063822746277), ('dermatologist', 0.5739567279815674), ('pharmacist', 0.5698872804641724), ('oncologist', 0.5691169500350952)]
