In [1020]:
import numpy as np
import pandas as pd
import string
from tqdm import tqdm
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

In [1021]:
with open('stopwords.txt') as f:
    stopwords = f.read().replace('\n',' ').split()

In [1022]:
words = 0
stopwords = []
#remove double quotation marks and citations/references if any
with open('training_text.txt') as f1, open('stopwords.txt') as f2:
    for line2 in f2:
        for word in line2.split():
            stopwords.append(word)
    for line1 in f1:
        for word in line1.split():
            if (word) not in L:
                words += 1

words - 7

113

In [1023]:
with open('training_text.txt', encoding='utf-8') as f:
    word = 0
    text = f.read().replace('\n','')
    print(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join([t for t in text if t not in list('0123456789')])
    text = text.replace('”', '').replace('“', '').replace('’', '').lower().split()

text = [w for w in text if w not in stopwords][:2000]

Analyze the following text for words pertaining to masculinity and femininity : Selection of appropriate software plays a vital role in facilitatingchildren’s desire to use computers. The discrepancybetween girls’ and boys’ use of computers is enhanced by thescarcity of gender bias-free software programs. Analysis ofthe extant literature and rating of children’s software indicatesthat gender bias is reflected in terms of characters, contentand reward systems in the software program. A checklistis developed and pretested in this article. This checklist canbe used to generate awareness of the subtle but pervasivegender bias in children’s software programs and can be usedby teachers in the selection and use of appropriate softwareprograms in their classrooms.


In [1024]:
WINDOW_SIZE = 3
NUM_NEGATIVE_SAMPLES = 3

data = []

#iterate over all words
for idx,center_word in enumerate(text[WINDOW_SIZE-1:-WINDOW_SIZE]):
    
    #iterate over the context words around the center word
    context_words = [context_word for context_word in text[idx:idx+2*WINDOW_SIZE-1] if context_word != center_word]
    for context_word in context_words:
        
        #get words NOT in the current context as negative examples
        data.append([center_word, context_word, 1])
        negative_samples = np.random.choice([w for w in text[WINDOW_SIZE-1:-WINDOW_SIZE] if w != center_word and w not in context_words], NUM_NEGATIVE_SAMPLES)
        
        for negative_samp in negative_samples:
            
            #add a training row
            data.append([center_word, negative_samp, 0])

In [1025]:
df = pd.DataFrame(columns=['center_word', 'context_word', 'label'], data=data)
words = np.intersect1d(df.context_word, df.center_word)
df = df[(df.center_word.isin(words)) & (df.context_word.isin(words))].reset_index(drop=True)

In [1026]:
def sigmoid(v, scale=1):
    return 1 / (1 + np.exp(-scale*v))

In [1027]:
def update_embeddings(df, main_embeddings, context_embeddings, learning_rate, debug=False):
    
    #get differences between main embeddings and corresponding context embeddings
    main_embeddings_center = main_embeddings.loc[df.center_word].values
    context_embeddings_context = context_embeddings.loc[df.context_word].values
    diffs = context_embeddings_context - main_embeddings_center
    
    #get similarities, scores, and errors between main embeddings and corresponding context embeddings
    dot_prods = np.sum(main_embeddings_center * context_embeddings_context, axis=1)
    scores = sigmoid(dot_prods)
    errors = (df.label - scores).values.reshape(-1,1)
    
    #calculate updates
    updates = diffs*errors*learning_rate
    updates_df = pd.DataFrame(data=updates)
    updates_df['center_word'] = df.center_word
    updates_df['context_word'] = df.context_word
    updates_df_center = updates_df.groupby('center_word').sum()
    updates_df_context = updates_df.groupby('context_word').sum()
    
    if debug:
        plot_words(debug)
    
    #apply updates
    main_embeddings += updates_df_center.loc[main_embeddings.index]
    context_embeddings -= updates_df_context.loc[context_embeddings.index]
    
    #normalize embeddings
    main_embeddings = normalize_data(main_embeddings)
    context_embeddings = normalize_data(context_embeddings)
    
    #return the updated embeddings
    return main_embeddings, context_embeddings

In [1028]:
def normalize_data(data):
    row_norms = np.sqrt((data.values**2).sum(axis=1)).reshape(-1,1)
    return data.divide(row_norms, axis='index')

In [1029]:
def plot_words(debug):
    plt.figure(figsize=(8,4))
    
    plt.subplot(1,2,1)
    lim_main_first = main_embeddings.loc[[debug[0]]]
    lim_main_second = main_embeddings.loc[[debug[1]]]
    p1 = plt.scatter(lim_main_first[0], lim_main_first[1], color='r')
    plt.arrow(0,0,float(lim_main_first[0]), float(lim_main_first[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_main_first.iterrows():
        plt.text(row[0], row[1], str(idx))
    p2 = plt.scatter(lim_main_second[0], lim_main_second[1], color='r')
    plt.arrow(0,0,float(lim_main_second[0]), float(lim_main_second[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_main_second.iterrows():
        plt.text(row[0], row[1], str(idx))
    sim = 1 - cosine(main_embeddings.loc[debug[0]], main_embeddings.loc[debug[1]])
    plt.title('Sim = %s'%round(sim,4), fontsize=20)
    plt.axvline(0, color='k', linestyle='--', alpha=0.5)
    plt.axhline(0, color='k', linestyle='--', alpha=0.5)
    
    t = np.arange(0, 3.14*2+0.1, 0.1)
    plt.plot(np.cos(t), np.sin(t), linewidth=1, color='k', alpha=0.5, linestyle='--')
    
    ###################################
    plt.subplot(1,2,2)
    lim_main = main_embeddings.loc[[debug[0]]]
    lim_context = context_embeddings.loc[[debug[1]]]
    p1 = plt.scatter(lim_main[0], lim_main[1], color='r')
    plt.arrow(0,0,float(lim_main[0]), float(lim_main[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_main.iterrows():
        plt.text(row[0], row[1], str(idx))
    p2 = plt.scatter(lim_context[0], lim_context[1], color='b')
    plt.arrow(0,0,float(lim_context[0]), float(lim_context[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_context.iterrows():
        plt.text(row[0], row[1], str(idx))
    sim = 1 - cosine(main_embeddings.loc[debug[0]], context_embeddings.loc[debug[1]])
    plt.title('Sim = %s'%round(sim,4), fontsize=20)
    plt.axvline(0, color='k', linestyle='--', alpha=0.5)
    plt.axhline(0, color='k', linestyle='--', alpha=0.5)
    
    plt.plot(np.cos(t), np.sin(t), linewidth=1, color='k', alpha=0.5, linestyle='--')
    
    plt.show()

In [1030]:
# run until all pairs shown have similarity scores of 1.0 or stop when one gender has over x words
EMBEDDING_SIZE = 2

main_embeddings = np.random.normal(0,0.1,(len(words), EMBEDDING_SIZE))
row_norms = np.sqrt((main_embeddings**2).sum(axis=1)).reshape(-1,1)
main_embeddings = main_embeddings / row_norms

context_embeddings = np.random.normal(0,0.1,(len(words), EMBEDDING_SIZE))
row_norms = np.sqrt((context_embeddings**2).sum(axis=1)).reshape(-1,1)
context_embeddings = context_embeddings / row_norms

main_embeddings = pd.DataFrame(data=main_embeddings, index=words)
context_embeddings = pd.DataFrame(data=context_embeddings, index=words)

In [1031]:
# run until all pairs shown have similarity scores of 1.0, stop after running x times or after 2x amount of words reach similarity score for one gender
L = []
restraints = ['he', 'son', 'his', 'him', 'father', 'man', 'boy', 'himself', 'male', 'brother', 'sons', 'fathers', 'men', 'boys', 'males', 'brothers', 
'uncle', 'uncles',  'nephew', 'nephews', 'gentleman', 'grandfather', 'sir', 'lord', 'mr.', 'mister', 'husband', 'fiance', 'groom', 'patriarch',
'lad', 'chap', 'bloke', 'dude', 'guy', 'fellow', 'stag', 'bull', 'stallion', 'monk', 'masculine', 'boyfriend', 'handsome', 'king', 'prince', 'god', 
'chief', 'manly', 'duke', 'emperor', 'sire', 'huntsman', 'wizard', 'bishop', 'manager', 'buisnessman', 'hero', 'masculinity']
for w1 in words:
    for w2 in words:
        if w1 != w2:
            sim = 1 - cosine(main_embeddings.loc[w1], main_embeddings.loc[w2])
            if w2 not in restraints:
                L.append((w1,w2,round(sim, 2)))
            #top [:x] similar words
            #word must be in text enough times
            #omit spaces/hyphens
sorted([item for item in L if item[0] == 'femininity'], key=lambda t: -t[2])[:6]

[('femininity', 'bias', 0.99),
 ('femininity', 'characters', 0.99),
 ('femininity', 'indicatesthat', 0.98),
 ('femininity', 'article', 0.97),
 ('femininity', 'desire', 0.97),
 ('femininity', 'reflected', 0.97)]

In [1032]:
# stop after running x times or after 2x amount of words reach similarity score for one gender
L = []
restraints = ['femme', 'she', 'daughter', 'hers', 'her', 'mother', 'woman', 'girl', 'herself', 'female', 'sister', 'daughters', 'mothers', 'women',
'girls', 'females', 'sisters', 'aunt', 'aunts', 'niece', 'nieces', 'grandmother', 'miss', 'ms.', 'madam', 'fiancee', 'bride', 'matriarch', 'gentleness', 
'sisterhood', 'sorority', 'feminine', 'girlfriend', 'quenn', 'princess', 'goddess', 'diva', 'maiden', 'dame', 'matron', 'heroine', 'muse', 'womanhood',
'femininity', 'buisnesswoman', 'actress', 'manageress', 'ladylike', 'girlish', 'beautiful', 'femininity']
for w1 in words:
    for w2 in words:
        if w1 != w2:
            sim = 1 - cosine(main_embeddings.loc[w1], main_embeddings.loc[w2])
            if w2 not in restraints:
                L.append((w1,w2,round(sim, 2)))
            #top [:x] similar words
            #word must be in text enough times
            #omit spaces/hyphens
sorted([item for item in L if item[0] == 'masculinity'], key=lambda t: -t[2])[:6]

[('masculinity', 'used', 1.0),
 ('masculinity', 'usedby', 1.0),
 ('masculinity', 'role', 0.96),
 ('masculinity', 'literature', 0.95),
 ('masculinity', 'thescarcity', 0.88),
 ('masculinity', 'vital', 0.87)]