In [35]:
import numpy as np
import pandas as pd
import string
from tqdm import tqdm
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
import os
import openai

In [36]:
with open('stopwords.txt') as f:
    stopwords = f.read().replace('\n',' ').split()

In [37]:
with open('training_text.txt', encoding='utf-8') as f:
    text = f.read().replace('\n','')
    print(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join([t for t in text if t not in list('0123456789')])
    text = text.replace('”', '').replace('“', '').replace('’', '').lower().split()

text = [w for w in text if w not in stopwords][:2000]

history of ice hockey, notable events and people in the development of ice hockey since its creation during the 19th century. Known simply as hockey throughout North America (despite the confusion this creates with the less-prominent field hockey), it is unique among popular team sports in that it is played on top of a sheet of ice. As such, it first gained popularity in the colder climates of Canada and the northern United States, particularly in the northeast and around the Great Lakes, where iced-over lakes could serve as playing surfaces during the winter. Canada, in particular, has taken hockey to heart and made it the country’s official winter sport, becoming home to the vast majority of the game’s best players until the late 20th century. The country also provided ice hockey its most prestigious honour, the Stanley Cup, which was first awarded in 1892–93 and has, since 1926, gone to the champion of the National Hockey League (NHL), the sport’s premier level of play.


In [38]:
WINDOW_SIZE = 3
NUM_NEGATIVE_SAMPLES = 3

data = []

#iterate over all words
for idx,center_word in enumerate(text[WINDOW_SIZE-1:-WINDOW_SIZE]):
    
    #iterate over the context words around the center word
    context_words = [context_word for context_word in text[idx:idx+2*WINDOW_SIZE-1] if context_word != center_word]
    for context_word in context_words:
        
        #get words NOT in the current context as negative examples
        data.append([center_word, context_word, 1])
        negative_samples = np.random.choice([w for w in text[WINDOW_SIZE-1:-WINDOW_SIZE] if w != center_word and w not in context_words], NUM_NEGATIVE_SAMPLES)
        
        for negative_samp in negative_samples:
            
            #add a training row
            data.append([center_word, negative_samp, 0])

In [39]:
df = pd.DataFrame(columns=['center_word', 'context_word', 'label'], data=data)
words = np.intersect1d(df.context_word, df.center_word)
df = df[(df.center_word.isin(words)) & (df.context_word.isin(words))].reset_index(drop=True)

In [40]:
def sigmoid(v, scale=1):
    return 1 / (1 + np.exp(-scale*v))

In [41]:
def update_embeddings(df, main_embeddings, context_embeddings, learning_rate, debug=False):
    
    #get differences between main embeddings and corresponding context embeddings
    main_embeddings_center = main_embeddings.loc[df.center_word].values
    context_embeddings_context = context_embeddings.loc[df.context_word].values
    diffs = context_embeddings_context - main_embeddings_center
    
    #get similarities, scores, and errors between main embeddings and corresponding context embeddings
    dot_prods = np.sum(main_embeddings_center * context_embeddings_context, axis=1)
    scores = sigmoid(dot_prods)
    errors = (df.label - scores).values.reshape(-1,1)
    
    #calculate updates
    updates = diffs*errors*learning_rate
    updates_df = pd.DataFrame(data=updates)
    updates_df['center_word'] = df.center_word
    updates_df['context_word'] = df.context_word
    updates_df_center = updates_df.groupby('center_word').sum()
    updates_df_context = updates_df.groupby('context_word').sum()
    
    if debug:
        plot_words(debug)
    
    #apply updates
    main_embeddings += updates_df_center.loc[main_embeddings.index]
    context_embeddings -= updates_df_context.loc[context_embeddings.index]
    
    #normalize embeddings
    main_embeddings = normalize_data(main_embeddings)
    context_embeddings = normalize_data(context_embeddings)
    
    #return the updated embeddings
    return main_embeddings, context_embeddings

In [42]:
def normalize_data(data):
    row_norms = np.sqrt((data.values**2).sum(axis=1)).reshape(-1,1)
    return data.divide(row_norms, axis='index')

In [43]:
def plot_words(debug):
    plt.figure(figsize=(8,4))
    
    plt.subplot(1,2,1)
    lim_main_first = main_embeddings.loc[[debug[0]]]
    lim_main_second = main_embeddings.loc[[debug[1]]]
    p1 = plt.scatter(lim_main_first[0], lim_main_first[1], color='r')
    plt.arrow(0,0,float(lim_main_first[0]), float(lim_main_first[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_main_first.iterrows():
        plt.text(row[0], row[1], str(idx))
    p2 = plt.scatter(lim_main_second[0], lim_main_second[1], color='r')
    plt.arrow(0,0,float(lim_main_second[0]), float(lim_main_second[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_main_second.iterrows():
        plt.text(row[0], row[1], str(idx))
    sim = 1 - cosine(main_embeddings.loc[debug[0]], main_embeddings.loc[debug[1]])
    plt.title('Sim = %s'%round(sim,4), fontsize=20)
    plt.axvline(0, color='k', linestyle='--', alpha=0.5)
    plt.axhline(0, color='k', linestyle='--', alpha=0.5)
    
    t = np.arange(0, 3.14*2+0.1, 0.1)
    plt.plot(np.cos(t), np.sin(t), linewidth=1, color='k', alpha=0.5, linestyle='--')
    
    ###################################
    plt.subplot(1,2,2)
    lim_main = main_embeddings.loc[[debug[0]]]
    lim_context = context_embeddings.loc[[debug[1]]]
    p1 = plt.scatter(lim_main[0], lim_main[1], color='r')
    plt.arrow(0,0,float(lim_main[0]), float(lim_main[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_main.iterrows():
        plt.text(row[0], row[1], str(idx))
    p2 = plt.scatter(lim_context[0], lim_context[1], color='b')
    plt.arrow(0,0,float(lim_context[0]), float(lim_context[1]), head_width=0.01, length_includes_head=True)
    for idx,row in lim_context.iterrows():
        plt.text(row[0], row[1], str(idx))
    sim = 1 - cosine(main_embeddings.loc[debug[0]], context_embeddings.loc[debug[1]])
    plt.title('Sim = %s'%round(sim,4), fontsize=20)
    plt.axvline(0, color='k', linestyle='--', alpha=0.5)
    plt.axhline(0, color='k', linestyle='--', alpha=0.5)
    
    plt.plot(np.cos(t), np.sin(t), linewidth=1, color='k', alpha=0.5, linestyle='--')
    
    plt.show()

In [44]:
EMBEDDING_SIZE = 2

main_embeddings = np.random.normal(0,0.1,(len(words), EMBEDDING_SIZE))
row_norms = np.sqrt((main_embeddings**2).sum(axis=1)).reshape(-1,1)
main_embeddings = main_embeddings / row_norms

context_embeddings = np.random.normal(0,0.1,(len(words), EMBEDDING_SIZE))
row_norms = np.sqrt((context_embeddings**2).sum(axis=1)).reshape(-1,1)
context_embeddings = context_embeddings / row_norms

main_embeddings = pd.DataFrame(data=main_embeddings, index=words)
context_embeddings = pd.DataFrame(data=context_embeddings, index=words)

In [46]:
import math

L = []
vFirst = True
check = []
extra = []
wordCount = 0
cutOff = 0

with open("training_text.txt", "r") as text:
    stopWords = open('stopwords.txt', 'r')
    for line in text:
        for word in line.split():
            if word not in stopWords:
                wordCount = wordCount + 1;

cutOff = math.ceil(wordCount / 20)

with open("masculine.txt", "r") as masculine:
    for line in masculine:
        for word in line.split():
            L.clear()
            for w1 in words:
                for w2 in words:
                    if w1 != w2:
                        sim = 1 - cosine(main_embeddings.loc[w1], main_embeddings.loc[w2])
                        L.append((w1,w2,sim))
            for entry in sorted([item for item in L if item[0] == word], key=lambda t: -t[2])[:cutOff]:
                with open("masculine.txt", "a") as men:
                    men.write("\n" + str(entry))

with open("feminine.txt", "r") as feminine:
    for line in feminine:
        for word in line.split():
            L.clear()
            women = open("feminine.txt", "a")
            for w1 in words:
                for w2 in words:
                    if w1 != w2:
                        sim = 1 - cosine(main_embeddings.loc[w1], main_embeddings.loc[w2])
                        L.append((w1,w2,sim))
            for entry in sorted([item for item in L if item[0] == word], key=lambda t: -t[2])[:cutOff]:
                women.write("\n" + str(entry))

with open("neutral.txt", "r") as neutral:
    extra.clear()
    for line in neutral:
        for word in line.split():
            L.clear()
            neutral = open("neutral.txt", "a")
            for compare in words:
                if compare != word:
                        sim = 1 - cosine(main_embeddings.loc[word], main_embeddings.loc[compare])
                        L.append((word,compare,sim))
                        extra.append(compare)
            for entry in sorted([item for item in L if item[0] == word], key=lambda t: -t[2])[:cutOff]:
                check.append(str(entry)[len(word) + 2:])

with open('verify.txt', 'a') as verify:
    for w1 in extra:
        for w2 in check:
            if w1 in w2:
                if vFirst:
                    verify.write(w1)
                    vFirst = False
                else:
                    verify.write("\n" + w1)

KeyError: 'rules'