# Generate Polar Embeddings

This code script was created to generate the POLAR embeddings for a given pre-trained embedding model and antonym list. We created it on the basis of the original POLAR code provided here (https://github.com/Sandipan99/POLAR).

## 1 Data Import 

### 1.1 Import Packages

In [1]:
#Import packages
import gensim
from numpy import linalg
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm
import time
from random import shuffle
import sys
import nltk 
from nltk.corpus import wordnet 
import gc
from collections import defaultdict
import random
import json
import os
import pandas as pd
import random
import scipy
import torch
import subprocess

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

from gensim.test.utils import datapath

### 1.2 Import Model

Here a pre-trained word embedding model can be imported to be used as a basis for the POLAR embeddings.

In [2]:
#only execute if model is not imported yet
#model_glove = glove2word2vec('../data/raw/glove.twitter.27B.200d.txt','gensim_glove_twitter_200d.txt')

In [3]:
#only execute if model is not imported yet
#model_glove = gensim.models.KeyedVectors.load_word2vec_format("../data/raw/reddit_word2vec.txt", binary=False)

In [4]:
#only execute if model is not imported yet
def generate_norm_embedding(model, output_path):
    temp_file = open(output_path,'wb')
    temp_file.write(str.encode(str(len(model.vocab))+' '+str(model.vector_size)+'\n'))
    
    for each_word in tqdm(model.vocab):
        temp_file.write(str.encode(each_word+' '))
        temp_file.write(model[each_word]/linalg.norm(model[each_word]))
        temp_file.write(str.encode('\n'))
    
    temp_file.close()

In [5]:
#only execute if model is not imported yet
#generate_norm_embedding(model_glove,'reddit_word2vec.mod')

In [6]:
#Import word embedding model
model_gn = gensim.models.KeyedVectors.load_word2vec_format('../data/raw/reddit_word2vec.mod',binary=True)
current_model = model_gn

### 1.3 Import POLAR Dimension List

In this part the list of word pairs for the POLAR dimensions is chosen.

In [7]:
#load original antonyms 
#only execute if you wna to use this list
list_antonym = pd.read_pickle(r'../data/interim/final_antonym_list')

In [8]:
#load business antonym list
#only execute if you wna to use this list
list_new= [('product', 'service'), ('essential', 'luxury'), ('technical', 'natural'), #('renewable', 'nonrenewable'),
           ('advertising', 'secretive'), ('lease', 'sell'), ('tangible', 'intangible'), ('demand', 'supply'), #('wfh', 'wfo'),
           ('child', 'childless'), ('remote', 'physical'), ('salary', 'goodies'), ('store', 'online'), 
           ('details', 'outlines'), ('stakeholders', 'spectators'), ('isolating', 'social'), ('goal', 'task'),
           ('employees', 'consultant'), ('cost', 'revenue'), ('seasonal', 'temporary'), ('alliance', 'proprietorship'),
           ('loss', 'profit'), ('integrity', 'corruption'), ('international', 'local'), ('corporate', 'individual'),
           ('order', 'disorder'), ('solution', 'problem'), ('manager', 'worker'), ('diversity', 'uniformity'),
           ('public', 'private'), ('strategic', 'impulsive'), ('innovator', 'follower'), ('bankruptcy', 'prosperity'),
           ('growth', 'decline'), ('sustainable', 'unsustainable'), ('family', 'work'), ('criminal', 'rightful'),
           ('financial', 'artisanal'), ('supplier', 'purchaser'), ('commitment', 'rejection'), ('professional', 'amateur'),
           ('independent', 'dependent'), ('digital', 'analogue'), ('marketing', 'secret'), ('secure', 'risky'), #('longterm', 'shortterm'), 
           ('responsible', 'neglect'), ('ethical', 'unethical'), ('beneficial', 'harmful'),
           ('diversity', 'uniformity'), ('trust', 'mistrust'), ('teamwork', 'individualism'), ('opportunity', 'threat'),
           ('innovative', 'traditional'), ('flexible', 'rigid'), ('ambiguity', 'clarity'), ('feminine', 'masculine'),
           ('globally', 'locally'), ('insiders', 'outsiders'), ('foreigners', 'natives'), ('minorities', 'majority'),
           ('transparency', 'obscurity'), ('discrimination', 'impartial'), ('credible', 'deceptive'), ('environment', 'pollution'),
           ('pressure', 'relax'), ('growth', 'decline'), ('satisfied', 'unsatisfied'), #('diplomatic', 'undiplomatic'), ('motivate', 'demotivate'), ('communicative', 'uncommunicative'), 
           ('connected', 'disconnected'), #('autonomous', 'micromanagement'), 
           ('nurture', 'neglect'), ('progressive', 'conservative'),#('rewarding', 'unrewarding'), ('bias', 'unbias'), 
           ('challenge', 'obscurity'), ('collaboration', 'silo'),
           ('outdated', 'modern'), ('effortless', 'demanding'), ('economic', 'overpriced'), ('widespread', 'local'),
           ('freedom', 'captive'), ('consistent', 'inconsistent')]

list_new= list(dict.fromkeys(list_new).keys())

similarity_matrix = defaultdict(list)
for each_pair in tqdm(list_new):
    word1 = each_pair[0]
    word2 = each_pair[1]
    if word1 < word2:
        similarity_matrix[word1].append(word2)
    else:
        similarity_matrix[word2].append(word1)

all_similarity = defaultdict(dict)
for each_key in tqdm(similarity_matrix):
    for each_value in similarity_matrix[each_key]:
#         cosine_similarity([current_model[each_key]]
        all_similarity[each_key][each_value] = abs(cosine_similarity([current_model[each_key]],[current_model[each_value]])[0][0])

final_list = []
for index_counter, each_key in enumerate(tqdm(all_similarity)):
#     print(each_key,all_similarity[each_key])
    listofTuples = sorted(all_similarity[each_key].items() ,  key=lambda x: x[1])
#     print(listofTuples)
    final_list.append((each_key, listofTuples[0][0]))
print(len(final_list))

list_antonym = final_list

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for each_pair in tqdm(list_new):


  0%|          | 0/74 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for each_key in tqdm(similarity_matrix):


  0%|          | 0/73 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index_counter, each_key in enumerate(tqdm(all_similarity)):


  0%|          | 0/73 [00:00<?, ?it/s]

73


### 1.4 Import Entities to be embedded

In [9]:
#import company names
company = pd.read_csv('../data/raw/International_Fortune_GloVe.csv')
name_list = company['0']

In [10]:
#Company names for reddit embeddings
#only execute when using reddit model
name_list = ['walmart','homedepot','amazon','apple','cvs','toyota','volkswagen','berkshire','mckesson','samsung',
             'ping','royal','industrial','alphabet','hon','exxon','daimler','costco','cigna','cardinal','microsoft',
             'walgreens','allianz','kroger','jpmorgan','huawei','verizon','axa','ford','honda','general','anthem',
             'mitsubishi','deutsche','bmw','nippon','saic','fannie','alibaba','comcast','amer','shandong','chevron',
             'dell','bank','target','marathon','citigroup','hyundai','gazprom','facebook','royal','sony','johnson',
             'hitachi','carrefour','bnp','bosch','tesco','aeon','hsbc','wells','general','state','intel','humana',
             'nippon','deutsche','nissan','munich','enel','banco','procter','sk','pepsico','tencent','albertsons',
             'basf','fedex','metlife','bank','aviation','freddie','greenland','phillips','lockheed','walt','archer',
             'roche','xiamen','pacific','siemens','engie','legal','panasonic','reliance','brookfield','aviva','lenovo',
             'valero','toyota','zurich','xiamen','aegon','boeing','unilever','guangzhou','prudential','airbus','mitsubishi',
             'petrobras','hp','raytheon','softbank','prudential','tokyo','seven','alimentation','lg','goldman','industrial','aluminum',
             'sysco','jbs','morgan','state','ptt','hca','tokio','vodafone','christian','aia','vinci','kia','eni',
             'novartis','renault','shaanxi','cisco','korea','bayer','power','charter','merck','elo','shaanxi','zhejiang',
             'denso','deutsche','publix','allstate','zhejiang','pemex','accenture','edeka','liberty','groupe','lloyds',
             'tyson','bhp','woolworths','progressive','petronas','nationwide','pfizer','shandong','caterpillar','george',
             'vale','acs','maersk','mitsubishi','ubs','oracle','energy','daiwa','jiangsu','zhejiang','dow','meiji',
             'nike','zf','quanta','northrop','volvo','metro','usaa','chubb','banco','xiaomi','deere','barclays','cathay',
             'mitsubishi','abbott','ck','poste','sncf','tata','fujitsu','cedar','northwestern','dollar','louis',
             'jardine','magna','honeywell','bank','phoenix','credit','sun','thermo','repsol','tjx','shandong','travelers',
             'capital','new','ing','tesla','cma','bharat','sap','shenzhen','coop','hyundai','anglo','mitsubishi','siemens',
             'shanxi','jfe','haier','takeda','abb','suzuki','canon','new','samsung','kansai','enbridge','medtronic','toshiba',
             'philip','arrow','schneider','banco','phoenix','chs','beijing','nec','zhejiang','bridgestone','guangxi',
             'crh','xinjiang','linde','enterprise','mazda','hewlett','subaru','guangzhou','lg','kraft','guangzhou','olam',
             'yunnan','samsung','wh','dollar','amgen','compass','coles','ericsson','banco','performance','netflix',
             'nokia','bae','gree','gilead','eli','commonwealth','flex','rite']
name_list = set(name_list)
name_list = list(name_list)

In [11]:
name_word_embedding = dict()
for name in name_list:
    if name in current_model.vocab:
        name_word_embedding[name] = current_model[name]
len(name_word_embedding)

249

Other than business entities we also need generic terms embedded for some applications.

In [12]:
#import Osgoods common words
#only execute when you want to create embeddings for these words
common_list = ['star', 'san', 'fish', 'policeman', 'luck', 'chair', 'woman', 'love', 'trust', 'cloud', 'cup', 
               'punishment', 'doctor', 'wealth', 'hand', 'sleep', 'success', 'money', 'horse', 'knowledge',
               'rope', 'thief', 'laughter', 'snake', 'sun', 'map', 'meat', 'bread', 'respect', 'danger', 'poison',
               'cat', 'bird', 'lake', 'heat', 'head', 'egg', 'tongue', 'smoke', 'story', 'dog', 'fruit', 'anger', 
               'music', 'death', 'heart', 'battle', 'freedom', 'crime', 'pain', 'sympathy', 'color', 'rain', 'ear',
               'choice', 'husband', 'wind', 'wednesday', 'river', 'need', 'hunger', 'marriage', 'hair', 'author', 
               'fire', 'power', 'moon', 'pleasure', 'water', 'tree', 'life', 'peace', 'truth', 'girl', 'tooth',
               'guilt', 'future', 'window', 'seed', 'picture', 'stone', 'courage', 'defeat', 'hope', 'book', 'knot',
               'food', 'purpose', 'progress', 'root', 'work', 'friend', 'noise', 'game', 'belief', 'mother', 
               'father', 'house', 'fear', 'thunder']

In [13]:
#import 1500 common words
#only execute when you want to create embeddings for these words
with open('../data/raw/Common-eng-nouns2.txt') as f:
    lines = f.readlines()
lines=[line.rstrip('\n') for line in lines]

common_list=[]
for word in lines:
    if word in current_model.vocab:
        common_list.append(word)

In [14]:
common_word_embedding = dict()
for name in common_list:
    common_word_embedding[name] = current_model[name]

## 2 Model Creation

### 2.1 Select POLAR Dimension Size

Here we select how many POLAR dimension we want to have in the end.

In [15]:
num_antonym = 500

## Find the antonym difference vectors
antonymy_vector = []
for each_word_pair in list_antonym:
    if each_word_pair[0] in current_model.vocab:
        if each_word_pair[1] in current_model.vocab:
            antonymy_vector.append(current_model[each_word_pair[0]]- current_model[each_word_pair[1]])
antonymy_vector = np.array(antonymy_vector)
print(antonymy_vector.shape)

(73, 100)


### 2.2 Implement Dimesion Selection Method

Now we want to specify how to select the POLAR dimensions. Therefore, we use some functions defined in the code from the original POLAR paper and adapt them to our purpose.

In [16]:
random.seed(42)

t1 = np.array(antonymy_vector)
dimension_similarity_matrix = scipy.spatial.distance.cdist(np.array(antonymy_vector),np.array(antonymy_vector),'cosine')
dimension_similarity_matrix = abs(1-dimension_similarity_matrix)

def get_set_score(final_list, each_dim):
    final_output = 0.0
    for each_vec in final_list:
        final_output += dimension_similarity_matrix[each_vec][each_dim]
    return final_output/(len(final_list))

def select_subset_dimension(dim_vector, num_dim):
    working_list = np.array(dim_vector)

    working_position_index = [i for i in range(working_list.shape[0])]
    final_position_index = []

    print('working list is ready, shape', working_list.shape)
    sel_dim = random.randrange(0, working_list.shape[0])

    final_position_index.append(sel_dim)

    working_position_index.remove(sel_dim)

    for test_count in tqdm(range(num_dim-1)):
        min_dim = None
        min_score = 1000
        for temp_index, each_dim in enumerate(working_position_index):
            temp_score = get_set_score(final_position_index, each_dim)
            if temp_score< min_score:
                min_score= temp_score
                min_dim = each_dim
        final_position_index.append(min_dim)
        working_position_index.remove(min_dim)
    return final_position_index

In [17]:
embedding_size = antonymy_vector.shape[0]
print('The embedding size is', embedding_size)

variance_antonymy_vector_inverse = np.linalg.pinv(np.transpose(antonymy_vector))
variance_antonymy_vector_inverse = torch.tensor(variance_antonymy_vector_inverse)

embedding_matrix = []

current_model_tensor = torch.t(torch.tensor(current_model.wv.vectors))

  current_model_tensor = torch.t(torch.tensor(current_model.wv.vectors))


The embedding size is 73


In [18]:
#use a batch approach to save work spacee for the variance calculation
#assumes independence of batches and take average over all batches as variance
var_list = [None for x in range(20)] # variance for each antonym in each batch

for i in range(19):  # the first 19 batches, each of size 100k
  temp = torch.matmul(variance_antonymy_vector_inverse, current_model_tensor[:,100000*i:100000*i+100000])
  temp_var_mean = torch.var(temp, axis = 1)
  var_list[i] = temp_var_mean.numpy()
  del temp

temp = torch.matmul(variance_antonymy_vector_inverse, current_model_tensor[:,1900000:])
temp_var_mean = torch.var(temp, axis = 1)
var_list[19] = temp_var_mean.numpy()
del temp

In [19]:
variance_list = np.mean(np.array(var_list),axis = 0)
variance_antonymy_vector = [each for each in sorted(range(len(variance_list)), key=lambda i: variance_list[i], reverse=True)]

### 2.3 Create POLAR Embedding

In this part we create the embeddings for the chosen entities and dimensions.

In [20]:
def transform_to_antonym_space(current_model, output_file_path, binary, current_antonymy_vector_inverse):
    temp_dict = dict()

    embedding_size = current_antonymy_vector_inverse.shape[0]   
    print('New model size is',len(current_model), embedding_size)

    temp_file = None

    if binary:
        temp_file = open(output_file_path,'wb')
        temp_file.write(str.encode(str(len(current_model))+' '+str(embedding_size)+'\n'))
    else:
        temp_file = open(output_file_path,'w')
        temp_file.write(str(len(current_model))+' '+str(embedding_size)+'\n')

    total_words = 0
    for each_word in tqdm(current_model):
        total_words += 1
        if binary:
            temp_file.write(str.encode(each_word+' '))
        else:
            temp_file.write(each_word+' ')

        new_vector = np.matmul(current_antonymy_vector_inverse,current_model[each_word])

        new_vector = new_vector/linalg.norm(new_vector)
        temp_dict[each_word] = new_vector

        if binary:
            temp_file.write(new_vector)
            temp_file.write(str.encode('\n'))
        else:
            temp_file.write(str(new_vector))
            temp_file.write('\n')

    temp_file.close()
    return temp_dict

In [21]:
def generate_embedding_path(current_model, embedding_path, binary, antonym_vector, curr_dim):
    curr_antonym_vector = antonymy_vector[antonym_vector[:curr_dim]]
    curr_antonymy_vector_inverse = np.linalg.pinv(np.transpose(curr_antonym_vector))
    new_embedding_dict = transform_to_antonym_space(current_model, embedding_path, binary,curr_antonymy_vector_inverse)

    return new_embedding_dict

In [22]:
antonym_vector_method = variance_antonymy_vector
#use the variance method for dimension selection
antonym_500 = [list_antonym[x] for x in antonym_vector_method[:num_antonym]]

In [23]:
#create the POLAR embeddings
#change according to the entity you want to embedd
#name_new_embedding = generate_embedding_path(name_word_embedding,'name_embeddings',True,antonym_vector_method,num_antonym)

In [24]:
def make_polar_dict(company_name, antonym, embedding, top_n = False, n = 10):
  temp_dict = dict()
  temp_polar = embedding[company_name]

  if top_n:
    idx = np.argsort([abs(x) for x in temp_polar])[-n:]
    for i in idx:
      print(antonym[i],temp_polar[i],'\n')


  if len(antonym) == len(temp_polar):
    for a in range(len(antonym)):
      temp_dict[antonym[a]] = temp_polar[a]
    return temp_dict

## 3 Save Model

Here we save the POLAR model generated for further usage.

In [26]:
#create csv file for the embedding matrix
#df = dict()
#for t in name_list:
    #if t in current_model.vocab:
        #df[t] = make_polar_dict(t, antonym_500, name_new_embedding)

#new_df = pd.DataFrame(df).transpose()

# change columns to better read names
#new_columns = []

#for pair in antonym_500:
  #temp = pair[0]+''+pair[1]
  #new_columns.append(temp)

#new_df.columns = new_columns

In [27]:
#save the file locally
#new_df.to_csv('POLAR-Reddit-org-antonyms-inter.csv')