### Dependencies

In [None]:
import heapq

import fasttext as fastText
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from wordfreq import word_frequency, zipf_frequency

### Initialization

In [None]:
def load_fasttext(path='/sandata/lexica_creation/crawl-300d-2M.vec'):
    """Load pretrained fastText vectors from a text ``.vec`` file.

    The first line of the file is skipped (in the ``.vec`` format it is the
    header line, not a word vector). Every following line is stripped and
    split on single spaces: the first token is used as the word and the
    remaining tokens are parsed as a float32 vector.

    Args:
        path: location of the ``.vec`` file. Defaults to the crawl-300d-2M
            vectors used by this notebook.

    Returns:
        dict mapping each word to its ``numpy.ndarray`` (dtype float32).
    """
    print('loading FT word embeddings...')
    embeddings_index = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, encoding='utf-8') as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            values = line.strip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('found %s word vectors' % len(embeddings_index))
    return embeddings_index

# Text-format vectors, kept in memory as a {word: vector} dict.
fasttext = load_fasttext()

# Binary fastText model; also the source of the per-word frequencies below.
vectors = fastText.load_model('/sandata/lexica_creation/wiki.en.bin')
wiki_en_words, freqs = vectors.get_words(include_freq=True)
# Pair every vocabulary word with the frequency reported by the model.
wiki_en_freqs = {token: count for token, count in zip(wiki_en_words, freqs)}

### Input seed words

In [None]:
#Each category should be a list of lowercase, stripped words
def load_seed_words(file):
    """Read seed words from a text file, one word (or phrase) per line.

    Each line is lowercased and stripped of surrounding whitespace. Lines
    that are empty after stripping are skipped so the result never contains
    an empty seed.

    Args:
        file: path of the seed-word file (read as UTF-8).

    Returns:
        list of lowercase, stripped seed words in file order.
    """
    seed_words = []
    # Use a distinct name for the handle so the `file` argument is not shadowed.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            word = line.lower().strip()
            if word:
                seed_words.append(word)
    return seed_words

# Seed terms come from one CSV; the "category" column selects the lexicon.
seed_df = pd.read_csv("TL_NHB.csv")
seed_words_indv = seed_df.loc[seed_df["category"] == "indv", "term"].tolist()
seed_words_coll = seed_df.loc[seed_df["category"] == "coll", "term"].tolist()

### Helper Functions

In [None]:
def get_seed_embeddings(ls):
    """Return one embedding per seed, in input order.

    input: list of seed words (a seed may contain several space-separated tokens)
    output: list of corresponding embeddings

    Each seed is split on single spaces; the vectors of the tokens that are
    keys of the global ``fasttext`` dict are averaged element-wise. A seed
    with no known token yields the result of ``np.mean`` over an empty list.
    """
    def _embed(seed):
        known_vectors = [fasttext[token] for token in seed.split(' ') if token in fasttext]
        return np.mean(known_vectors, axis=0)

    return [_embed(seed) for seed in ls]

def get_center_embedding(ls_emb):
    """Return the element-wise mean of the usable embeddings.

    input: list of embeddings
    output: center embedding

    Any entry containing a NaN value is left out of the average.
    """
    usable = []
    for emb in ls_emb:
        if np.isnan(emb).any():
            continue  # skip embeddings with nan values
        usable.append(emb)
    return np.mean(usable, axis=0)


def concept_expansion(seed_center_emb, output_size):
    """Return the words most similar to a centre embedding.

    input: center embedding, maximum number of words to return
    output: list of (word, cosine similarity) tuples, descending by similarity

    Every key of the global ``fasttext`` dict is scored against
    ``seed_center_emb`` with ``cosine_similarity``. Only words with a
    strictly positive similarity are candidates, and at most ``output_size``
    of them (the highest scoring ones) are returned.

    Words whose similarity cannot be computed (for example a vector whose
    length differs from the centre embedding) are reported with an
    "error: " line and skipped.
    """
    all_words = list(fasttext.keys())

    def _scored_words():
        # Lazily yield (word, similarity) so only the current best
        # `output_size` pairs are ever held in memory.
        for word in tqdm(all_words):
            try:
                cos_sim = cosine_similarity([seed_center_emb], [fasttext[word]])[0][0]
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C can still
                # interrupt this long loop.
                print("error: ", word)
                continue
            if cos_sim > 0:
                yield (word, cos_sim)

    # A bounded heap gives the true top-k. The earlier running-threshold
    # approach raised its cut-off to the first accepted score while the list
    # was still short, which could drop words that belonged in the result.
    return heapq.nlargest(output_size, _scored_words(), key=lambda pair: pair[1])


def get_concept_expansion(seed_words, output_size):
    """Expand a list of seed words into a lexicon around their centre.

    input: list of seed words, output size
    output: concept expanded lexica

    The seeds are embedded, the embeddings are averaged into a single
    centre vector, and that centre is passed to ``concept_expansion``.
    """
    center = get_center_embedding(get_seed_embeddings(seed_words))
    return concept_expansion(center, output_size)

def synonym_expansion(word, model, k=50):
    """Return the nearest neighbours of ``word`` according to ``model``.

    input: seed word, fasttext model, number of synonyms
    output: list of synonyms

    ``model.get_nearest_neighbors(word, k)`` is queried and the two
    elements of every returned pair are swapped, so each result tuple is
    (second element, first element) of the model's pair.
    """
    return [(neighbor[1], neighbor[0]) for neighbor in model.get_nearest_neighbors(word, k)]

def save_dict_to_file(dic, filename):
    """Write ``str(dic)`` to ``filename``, replacing any existing content.

    Args:
        dic: object to persist; its ``str()`` form is what gets written.
        filename: path of the output file.
    """
    # `with` closes the handle even if the write raises.
    with open(filename, 'w') as f:
        f.write(str(dic))

def load_dict_from_file(filename):
    """Read ``filename`` and return the Python object its text evaluates to.

    Args:
        filename: path of a file containing a Python expression, such as
            one written by ``save_dict_to_file``.

    Returns:
        The result of evaluating the file's content.
    """
    # `with` closes the handle even if the read raises.
    with open(filename, 'r') as f:
        data = f.read()
    # SECURITY NOTE(review): eval() executes arbitrary code found in the file.
    # Only load files produced by this notebook. ast.literal_eval would be
    # safer, but it may reject saved values that are not plain literals
    # (e.g. NumPy scalar reprs) -- confirm before switching.
    return eval(data)

def parse_list(lexica, threshold_wiki, threshold_wordfreq):
    """Keep only the lexicon entries whose word is frequent enough.

    An entry is kept when its word's value in the global ``wiki_en_freqs``
    dict exceeds ``threshold_wiki`` (words missing from the dict count as
    0), or when ``word_frequency(word, 'en')`` exceeds
    ``threshold_wordfreq``.

    Args:
        lexica: iterable of entries whose first element is the word.
        threshold_wiki: exclusive lower bound on the ``wiki_en_freqs`` value.
        threshold_wordfreq: exclusive lower bound on the wordfreq frequency.

    Returns:
        list of the entries that passed, unchanged and in input order.
    """
    final_lexica = []
    for w in lexica:
        word = w[0]
        # dict.get replaces a bare try/except: only "word not present" is
        # treated as frequency 0, other errors are no longer hidden.
        freq_wiki = wiki_en_freqs.get(word, 0)
        freq_wordfreq = word_frequency(word, 'en', wordlist='best', minimum=0.0)
        if freq_wiki > threshold_wiki or freq_wordfreq > threshold_wordfreq:
            final_lexica.append(w)
    return final_lexica

def print_excel(lexica):
    """Print every lexicon entry as a ``word, weight`` line."""
    for entry in lexica:
        print(", ".join((entry[0], str(entry[1]))))

## Lexica Creation

In [None]:
# Set to True to reuse the expansion files written by an earlier run
# instead of recomputing them.
SAVED = False

if SAVED:
    synonym_expansion_indv = load_dict_from_file("nearest_words_indv.txt")
    synonym_expansion_coll = load_dict_from_file("nearest_words_coll.txt")
    concept_expansion_indv = load_dict_from_file("lexica_indv.txt")
    concept_expansion_coll = load_dict_from_file("lexica_coll.txt")
else:
    # Synonym expansion: 100 nearest neighbours for every "indv" seed.
    synonym_expansion_indv = {}
    for w in seed_words_indv:
        print(w)
        synonym_expansion_indv[w] = synonym_expansion(w, vectors, 100)
    save_dict_to_file(synonym_expansion_indv, "nearest_words_indv.txt")

    # Synonym expansion: 100 nearest neighbours for every "coll" seed.
    synonym_expansion_coll = {}
    for w in seed_words_coll:
        print(w)
        synonym_expansion_coll[w] = synonym_expansion(w, vectors, 100)
    save_dict_to_file(synonym_expansion_coll, "nearest_words_coll.txt")

    # Concept expansion: 1000 words around the centre of the "indv" seeds.
    concept_expansion_indv = get_concept_expansion(seed_words_indv, 1000)
    save_dict_to_file(concept_expansion_indv, "lexica_indv.txt")

    # Concept expansion: 1000 words around the centre of the "coll" seeds.
    concept_expansion_coll = get_concept_expansion(seed_words_coll, 1000)
    save_dict_to_file(concept_expansion_coll, "lexica_coll.txt")
    
    
    

### Overlap synonym expansion and concept expansion

In [None]:
#Aggregate results: flatten the per-seed neighbour lists into one list per category
total_indv = [
    word
    for seed in synonym_expansion_indv
    for word in synonym_expansion_indv[seed]
]

total_coll = [
    word
    for seed in synonym_expansion_coll
    for word in synonym_expansion_coll[seed]
]


In [None]:
THRESHOLD_SINGLE = 0.75
THRESHOLD_CLUSTER = 0.45


def build_lexicon(seed_words, synonym_pairs, concept_pairs,
                  threshold_single, threshold_cluster):
    """Merge seeds, per-seed neighbours and centroid neighbours into one lexicon.

    Entries are added in three passes, and a word is only ever added once
    (its first occurrence wins):

    1. every seed word, with weight 1;
    2. every (word, similarity) pair from ``synonym_pairs`` whose word also
       appears in ``concept_pairs`` and whose similarity to its seed is
       > ``threshold_single``;
    3. every (word, similarity) pair from ``concept_pairs`` whose similarity
       to the seed centroid is > ``threshold_cluster``.

    Args:
        seed_words: list of seed words.
        synonym_pairs: list of (word, similarity) from the single-word expansion.
        concept_pairs: list of (word, similarity) from the centroid expansion.
        threshold_single: exclusive lower bound used in pass 2.
        threshold_cluster: exclusive lower bound used in pass 3.

    Returns:
        list of (word, weight) tuples in insertion order.
    """
    lexicon = []
    # Live set of words already in `lexicon`. Checking against a snapshot
    # taken before each pass let the same word be appended several times
    # when it was a neighbour of more than one seed.
    seen = set()

    def _add(word, weight):
        if word not in seen:
            seen.add(word)
            lexicon.append((word, weight))

    #Add seed words to final lexica with distance 1
    for seed in seed_words:
        _add(seed, 1)

    #add words from single word cluster expansion that also overlap with joint cluster expansion
    #where distance from expanded word --> original word is > threshold_single
    concept_words = {pair[0] for pair in concept_pairs}  # set: O(1) membership
    for pair in synonym_pairs:
        if pair[0] in concept_words and pair[1] > threshold_single:
            _add(pair[0], pair[1])

    #add words from joint cluster expansion
    #where distance from expanded word --> original centroid is > threshold_cluster
    for pair in concept_pairs:
        if pair[1] > threshold_cluster:
            _add(pair[0], pair[1])

    return lexicon


final_indv = build_lexicon(seed_words_indv, total_indv, concept_expansion_indv,
                           THRESHOLD_SINGLE, THRESHOLD_CLUSTER)
final_coll = build_lexicon(seed_words_coll, total_coll, concept_expansion_coll,
                           THRESHOLD_SINGLE, THRESHOLD_CLUSTER)

In [None]:
#Remove words with tokenizing errors
# (the same two frequency thresholds are applied to both lexica)
THRESHOLD_WIKI = 50
THRESHOLD_WORDFREQ = 1e-4

final_indv_parsed, final_coll_parsed = [
    parse_list(lexicon, THRESHOLD_WIKI, THRESHOLD_WORDFREQ)
    for lexicon in (final_indv, final_coll)
]

In [None]:

#save file to csv
FILENAME = "individualism_collectivism.csv"

# Collect all rows first and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
records = [(w[0], 'indv', w[1]) for w in final_indv_parsed]
records += [(w[0], 'coll', w[1]) for w in final_coll_parsed]

df = pd.DataFrame(records, columns=['WORD', 'CATEGORY', 'WEIGHT'])
df.to_csv(FILENAME, index=False)