# dependencies

In [1]:
import heapq

import fasttext as fastText
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from wordfreq import word_frequency, zipf_frequency

# functions

In [2]:
def load_fasttext(path='/sandata/lexica_creation/crawl-300d-2M.vec'):
    """Load text-format word vectors into a dict mapping word -> float32 array.

    Parameters
    ----------
    path : str
        Location of the ``.vec`` file. Defaults to the crawl-300d-2M vectors
        this notebook was written against.

    Returns
    -------
    dict
        ``{word: numpy.ndarray(dtype=float32)}`` with one entry per data line.

    The first line of the file is skipped (in the ``.vec`` text format it is a
    header, not a vector). Blank lines are ignored so that a trailing newline
    cannot create an empty-string key with an empty vector.
    """
    print('loading word embeddings...')
    embeddings_index = {}
    # `with` guarantees the handle is released even if a line fails to parse.
    with open(path, encoding='utf-8') as vec_file:
        next(vec_file, None)  # skip the header line; tolerate an empty file
        for line in vec_file:
            stripped = line.strip()
            if not stripped:
                continue
            values = stripped.split(' ')
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('found %s word vectors' % len(embeddings_index))

    return embeddings_index

In [3]:
def get_emb_ls(ls):
    """Embed each seed phrase as the mean of its in-vocabulary token vectors.

    Parameters
    ----------
    ls : list of str
        Seed words or space-separated phrases.

    Returns
    -------
    list of numpy.ndarray
        One vector per seed that has at least one token in the global
        ``fasttext`` table, in input order.

    Seeds with no known token are skipped (and reported) instead of being
    embedded as ``np.mean([])``, which is NaN and would corrupt any centroid
    computed from the returned list.
    """
    ls_seed_emb = []
    for seed in ls:
        token_vectors = [fasttext[tok] for tok in seed.split(' ') if tok in fasttext]
        if not token_vectors:
            print('skipping seed with no known tokens: %r' % seed)
            continue
        ls_seed_emb.append(np.mean(token_vectors, axis=0))
    return ls_seed_emb

In [4]:
def get_center_emb(ls_emb):
    """Return the centroid of a list of embeddings.

    The embeddings are averaged element-wise along the first axis, so the
    result has the shape of a single embedding.
    """
    stacked = np.asarray(ls_emb)
    return stacked.mean(axis=0)

In [5]:
def sort_words(seed_center_emb,output_size):
    """Return the `output_size` vocabulary words most similar to a centre vector.

    Parameters
    ----------
    seed_center_emb : array-like
        Centre embedding to compare every vocabulary vector against.
    output_size : int
        Maximum number of (word, similarity) pairs to return.

    Returns
    -------
    list of (str, float)
        Pairs sorted by cosine similarity, highest first. Only words with a
        strictly positive similarity are considered, so fewer than
        `output_size` pairs come back when the vocabulary has fewer such words.

    A bounded top-k selection is used instead of raising the acceptance
    threshold after every insert: the threshold may only rise once
    `output_size` candidates are held, otherwise words scoring below the
    first accepted word are lost even though there is still room for them.
    """
    query = [seed_center_emb]  # loop-invariant 2-D wrapper expected by cosine_similarity

    def scored_words():
        # Iterating items() avoids a second dict lookup per word and still
        # gives tqdm a length for the progress bar.
        for word, vector in tqdm(fasttext.items()):
            cos_sim = cosine_similarity(query, [vector])[0][0]
            if cos_sim > 0:
                yield (word, cos_sim)

    # nlargest keeps at most output_size candidates in memory; ties keep
    # first-seen order.
    return heapq.nlargest(output_size, scored_words(), key=lambda pair: pair[1])

In [6]:
def lexica_creation(seed_words, output_size):
    """Expand a list of seed words into a ranked lexicon.

    The seeds are embedded, their centroid is taken, and the vocabulary is
    ranked against that centroid; the `output_size` best matches are returned
    in the form produced by `sort_words`.
    """
    centroid = get_center_emb(get_emb_ls(seed_words))
    return sort_words(centroid, output_size)

In [7]:
def load_seed_words(file):
    """Read seed words from a text file, one word or phrase per line.

    Parameters
    ----------
    file : str
        Path of the seed file.

    Returns
    -------
    list of str
        Lower-cased, whitespace-stripped entries in file order.

    Blank lines are skipped: an empty seed has no tokens to embed and would
    otherwise end up in the final lexicon as an empty word.
    """
    # A separate handle name keeps the `file` argument (the path) intact.
    with open(file, 'r') as handle:
        cleaned = (line.lower().strip() for line in handle)
        return [term for term in cleaned if term]

# main

### input

In [8]:
# Read both seed vocabularies from the working directory; load_seed_words
# lower-cases and strips every line.
seed_words_indv, seed_words_coll = (
    load_seed_words(seed_file)
    for seed_file in ("individualist_seed.txt", "collectivist_seed.txt")
)

### initialization

In [9]:
# Module-level embedding table (word -> vector). get_emb_ls and sort_words read
# this global by name, so it must be populated before either of them is called.
# The fastText library itself is imported under the alias `fastText`, so the
# two names do not clash.
fasttext = load_fasttext()

loading word embeddings...
found 1999995 word vectors


In [89]:
# Load the wiki.en fastText model only for its vocabulary: every word together
# with the frequency value the model reports for it.
vectors = fastText.load_model('/sandata/lexica_creation/wiki.en.bin')
wiki_en_words, freqs = vectors.get_words(include_freq=True)
# Word -> frequency lookup used by parse_list.
wiki_en_freqs = {token: count for token, count in zip(wiki_en_words, freqs)}



### functions

In [11]:
def get_nearest_words(word, threshold_wiki, threshold_wordfreq=None):
    """Return the neighbours of one seed word whose similarity exceeds a cut-off.

    The 100 words closest to `word` are fetched with `lexica_creation`, and
    only the pairs whose cosine similarity is strictly greater than the
    cut-off are kept, in ranked order.

    Parameters
    ----------
    word : str
        Seed word to expand.
    threshold_wiki : float
        Minimum cosine similarity. The name is historical; this is the value
        the notebook's callers pass as their similarity threshold.
    threshold_wordfreq : optional
        Unused. It is optional so that the two-argument calls made in this
        notebook no longer fail with a missing-argument TypeError.

    Returns
    -------
    list of (str, float)
    """
    # Use the caller-supplied cut-off instead of reaching for a global
    # `threshold` that may not exist yet when this function is called.
    min_similarity = threshold_wiki
    nearest_words = lexica_creation([word], 100)
    return [pair for pair in nearest_words if pair[1] > min_similarity]

def save_dict_to_file(dic, filename):
    """Write `str(dic)` to `filename`, replacing any existing file.

    The text is the plain string form of the object, which is what
    `load_dict_from_file` reads back.
    """
    # `with` closes the handle even when the write raises.
    with open(filename, 'w') as out_file:
        out_file.write(str(dic))

def load_dict_from_file(filename):
    """Read a file written by `save_dict_to_file` and rebuild the object.

    Parameters
    ----------
    filename : str
        Path of the cached text file.

    Returns
    -------
    object
        Whatever Python expression the file contains, evaluated.
    """
    # `with` closes the handle even when reading raises.
    with open(filename, 'r') as in_file:
        data = in_file.read()
    # NOTE(security): eval executes any expression found in the file, so only
    # load files produced by save_dict_to_file. ast.literal_eval would be
    # safer, but it only works if every stored value is a plain literal.
    return eval(data)

def parse_list(lexica, threshold_wiki, threshold_wordfreq):
    """Keep only lexicon entries whose word is frequent enough in a reference corpus.

    Parameters
    ----------
    lexica : list of (str, float)
        (word, score) pairs; only the word is inspected.
    threshold_wiki : number
        Minimum value (exclusive) in the global `wiki_en_freqs` table.
    threshold_wordfreq : float
        Minimum `wordfreq` English word frequency (exclusive).

    Returns
    -------
    list
        The entries, unchanged and in order, whose word exceeds at least one
        of the two thresholds.
    """
    final_lexica = []
    for entry in lexica:
        word = entry[0]
        # Words missing from the wiki table count as frequency 0. dict.get
        # covers exactly that case, without hiding unrelated errors the way
        # a bare `except:` does.
        freq_wiki = wiki_en_freqs.get(word, 0)
        freq_wordfreq = word_frequency(word, 'en', wordlist='best', minimum=0.0)
        if freq_wiki > threshold_wiki or freq_wordfreq > threshold_wordfreq:
            final_lexica.append(entry)
    return final_lexica

def print_excel(lexica):
    """Print each lexicon entry as ``word, score`` on its own line."""
    for entry in lexica:
        print(", ".join((entry[0], str(entry[1]))))

### lexica creation

In [12]:
threshold = 0.6
SAVED = True

if SAVED:
    # Reuse the results written by an earlier run instead of rescanning the vocabulary.
    nearest_words_indv = load_dict_from_file("nearest_words_indv.txt")
    nearest_words_coll = load_dict_from_file("nearest_words_coll.txt")
    lexica_indv = load_dict_from_file("lexica_indv.txt")
    lexica_coll = load_dict_from_file("lexica_coll.txt")
else:
    # Expand every seed word on its own: seed -> neighbours above `threshold`.
    nearest_words_indv = {
        seed: get_nearest_words(seed, threshold) for seed in seed_words_indv
    }
    nearest_words_coll = {
        seed: get_nearest_words(seed, threshold) for seed in seed_words_coll
    }

    # Expand each complete seed set around its centroid.
    lexica_indv = lexica_creation(seed_words_indv, 1000)
    lexica_coll = lexica_creation(seed_words_coll, 1000)

    # Persist all four results so the next run can take the SAVED branch.
    save_dict_to_file(nearest_words_indv, "nearest_words_indv.txt")
    save_dict_to_file(nearest_words_coll, "nearest_words_coll.txt")
    save_dict_to_file(lexica_indv, "lexica_indv.txt")
    save_dict_to_file(lexica_coll, "lexica_coll.txt")

### Overlap large cluster lexicon with individual cluster lexicons

In [84]:
# Flatten the per-seed neighbour lists into one list per lexicon,
# keeping seed order and, within a seed, neighbour order.
total_indv = [
    pair for neighbours in nearest_words_indv.values() for pair in neighbours
]

total_coll = [
    pair for neighbours in nearest_words_coll.values() for pair in neighbours
]


In [189]:
THRESHOLD_SINGLE = 0.75
THRESHOLD_CLUSTER = 0.45


def _build_final_lexicon(seed_words, single_expansions, cluster_lexicon,
                         threshold_single, threshold_cluster):
    """Merge seeds, per-seed neighbours and centroid neighbours into one lexicon.

    Entries are added in three passes, and a word is never added twice:

    1. every seed word, with similarity 1;
    2. per-seed neighbours that also occur in the centroid lexicon and whose
       similarity to their seed is above `threshold_single`;
    3. centroid-lexicon words whose similarity to the centroid is above
       `threshold_cluster`.

    Returns a list of (word, similarity) entries in insertion order.
    """
    final = [(seed, 1) for seed in seed_words]
    # Updated as entries are added, so a word that is a neighbour of several
    # seeds is appended only once (its first qualifying occurrence).
    seen = set(seed_words)
    cluster_words = {entry[0] for entry in cluster_lexicon}

    for entry in single_expansions:
        word = entry[0]
        if word in cluster_words and entry[1] > threshold_single and word not in seen:
            final.append(entry)
            seen.add(word)

    for entry in cluster_lexicon:
        word = entry[0]
        if entry[1] > threshold_cluster and word not in seen:
            final.append(entry)
            seen.add(word)

    return final


final_indv = _build_final_lexicon(
    seed_words_indv, total_indv, lexica_indv, THRESHOLD_SINGLE, THRESHOLD_CLUSTER
)
final_coll = _build_final_lexicon(
    seed_words_coll, total_coll, lexica_coll, THRESHOLD_SINGLE, THRESHOLD_CLUSTER
)

In [190]:
#Remove words with tokenizing errors
# parse_list keeps an entry when its wiki.en frequency value exceeds
# THRESHOLD_WIKI or its wordfreq frequency exceeds THRESHOLD_WORDFREQ;
# entries that pass neither test are dropped.
THRESHOLD_WIKI = 50
THRESHOLD_WORDFREQ = 1e-4

final_indv_parsed = parse_list(final_indv, THRESHOLD_WIKI, THRESHOLD_WORDFREQ)
final_coll_parsed = parse_list(final_coll, THRESHOLD_WIKI, THRESHOLD_WORDFREQ)

# Last expression of the cell, shown as its output: the number of words in the
# wiki.en vocabulary behind the frequency lookup.
len(wiki_en_words)

2519370