In [None]:
!pip install chainer
!pip install fasttext

In [9]:
from chainer import cuda
from context2vec.common.context_models import Toks
from context2vec.common.model_reader import ModelReader
import re

# Read the usim dataset into a list of raw lines. Later cells split every line
# on tabs into four fields and treat consecutive lines as a (left, right) pair.
# The `with` block makes sure the file handle is closed after reading.
with open('./usim_en.txt') as usim_file:
    usim = usim_file.readlines()

# Load the pretrained context2vec model and expose the pieces used below:
# the target-word embedding matrix, the vocabulary maps and the model itself.
model_reader = ModelReader('./context2vec/model/MODEL-1B-300dim.params.10')
W = model_reader.w
word2index = model_reader.word2index
index2word = model_reader.index2word
model = model_reader.model

Reading config file: ./context2vec/model/MODEL-1B-300dim.params.10
Config:  {'config_path': './context2vec/model/', 'model_file': 'MODEL-1B-300dim.10', 'words_file': 'WORDS-1B-300dim.targets.10', 'unit': '300', 'deep': 'yes', 'drop_ratio': '0.0'}


In [2]:
import numpy 

# Pattern for the target slot in a query line: a token written as "[word]" or
# "[]". Raw string so that "\[" and "\]" are regex escapes, not (invalid)
# Python string escapes.
target_exp = re.compile(r'\[.*\]')
n_result = 10  # number of candidates returned by produce_candidates
gpu = -1 # todo: make this work with gpu

# A negative `gpu` selects NumPy as the array module; otherwise CuPy on that device.
if gpu >= 0:
    cuda.check_cuda_available()
    cuda.get_device(gpu).use()
xp = cuda.cupy if gpu >= 0 else numpy

def mult_sim(w, target_v, context_v):
    """Score every row of ``w`` by target similarity times context similarity.

    Each similarity is the dot product of the rows of ``w`` with the query
    vector; negative similarities are replaced by 0 before the two score
    vectors are multiplied element-wise.
    """
    def _non_negative_scores(query):
        # `dot` returns a fresh array, so masking it never touches `w`.
        scores = w.dot(query)
        scores[scores < 0] = 0.0
        return scores

    return _non_negative_scores(target_v) * _non_negative_scores(context_v)

def parse_input(line):
    """Split a query line into tokens and locate its bracketed target.

    Every whitespace-separated token matched by ``target_exp`` is unwrapped
    in the returned list: ``"[word]"`` becomes ``"word"`` and the empty slot
    ``"[]"`` becomes ``None``.

    Returns:
        ``(tokens, target_pos)`` where ``target_pos`` is the index of the
        last matching token, or ``None`` when no token matched.
    """
    tokens = line.strip().split()
    target_pos = None
    for position, token in enumerate(tokens):
        if target_exp.match(token) is None:
            continue
        target_pos = position
        tokens[position] = token[1:-1] if token != '[]' else None
    return tokens, target_pos

try:
    ParseException
except NameError:
    # The notebook raises ParseException but never defines or imports it, so
    # every error path would otherwise surface as a NameError.
    class ParseException(Exception):
        """Raised when a query line cannot be turned into a target/context query."""


def produce_candidates(line,index2word,w):
    """Rank vocabulary words as fillers for the bracketed slot in ``line``.

    The line is parsed with ``parse_input``. Depending on what is available,
    candidates are scored by

    * target and context together (``mult_sim``) when the slot holds a word
      and the sentence has more than one token,
    * the target word alone, or the context alone, otherwise. In that case
      the dot product is mapped from [-1, 1] to [0, 1].

    Args:
        line: whitespace-tokenised sentence with the target written as
            ``[word]`` or ``[]``.
        index2word: maps a row index of ``w`` to its word.
        w: target-word embedding matrix, one row per vocabulary word.
            NOTE(review): the [0, 1] mapping assumes unit-norm rows - confirm.

    Returns:
        Up to ``n_result`` ``(word, score)`` tuples, best first; entries whose
        score is NaN are skipped.

    Raises:
        ParseException: no target slot was found, the target word is not in
            ``word2index``, or neither a target nor a context is available.

    Relies on the notebook-level names ``word2index``, ``model``, ``xp`` and
    ``n_result``.
    """
    sent, target_pos = parse_input(line)
    if target_pos is None:
        raise ParseException("Can't find the target position.")

    target_word = sent[target_pos]
    if target_word is None:
        target_v = None
    elif target_word not in word2index:
        raise ParseException("Target word is out of vocabulary.")
    else:
        target_v = w[word2index[target_word]]

    if len(sent) > 1:
        context_v = model.context2vec(sent, target_pos)
        # Scale the context vector to unit length.
        context_v = context_v / xp.sqrt((context_v * context_v).sum())
    else:
        context_v = None

    if target_v is not None and context_v is not None:
        similarity = mult_sim(w, target_v, context_v)
    else:
        if target_v is not None:
            v = target_v
        elif context_v is not None:
            v = context_v
        else:
            raise ParseException("Can't find a target nor context.")
        similarity = (w.dot(v)+1.0)/2 # Cosine similarity can be negative, mapping similarity to [0,1]

    results=[]
    # Walk the vocabulary from highest to lowest score.
    for i in (-similarity).argsort():
        if numpy.isnan(similarity[i]):
            continue
        results.append((index2word[i],similarity[i]))
        if len(results) == n_result:
            break
    return results


In [3]:
import fasttext
# Load the fastText model from a local binary file; later cells look up word
# vectors with ft_model[word].
ft_model = fasttext.load_model("wiki.en.300.bin")



In [4]:
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus (a no-op when it is already up to date) and
# keep the English list as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Scratch check of NumPy broadcasting: multiplying a (3, 2) array by a (3, 1)
# column scales each row by its own weight. The embedding loop below relies on
# the same pattern to weight word vectors.
a=numpy.array([[1,2],[4,5],[6,7]])
d=numpy.array([3,5,8]).reshape(3,1)
a*d



In [16]:
from tqdm.auto import tqdm
from collections import defaultdict
import math

# Build two sentence representations for every usim line:
#   * first order:  IDF-weighted mean of the fastText vectors of the
#                   sentence's non-stop-words;
#   * second order: mean of the fastText vectors of the context2vec
#                   substitutes for the target slot, each weighted by its
#                   normalised similarity score.
# Even-numbered lines go to the "left" lists and odd-numbered lines to the
# "right" lists, so index k of left/right belongs to the same sentence pair.
left_embeds_firstorder=[]
left_embeds_secondorder=[]
right_embeds_firstorder=[]
right_embeds_secondorder=[]

# Document frequency of every whitespace token, treating each line as a document.
w2docfreqs=defaultdict(int)
doc_len=len(usim)
for line in usim:
    for word in set(line.strip().split()):
        w2docfreqs[word]+=1
w2idf={word:math.log(doc_len/freq) for word,freq in w2docfreqs.items()}

for linei in tqdm(range(len(usim))):
    # Tab-separated fields: sentence, target word index, then two fields unused here.
    origsentence,wi,_,_=usim[linei].split('\t')
    sentence=origsentence.split()

    # first order
    content_words=[w for w in sentence if w not in stop_words]
    contexts=[ft_model[w] for w in content_words]
    idfs=numpy.array([w2idf[w] for w in content_words])
    idfs=idfs.reshape(len(idfs),1)  # column vector so each row gets its own weight
    first_order_embeds=(numpy.vstack(contexts)*idfs).mean(0)

    # second order: blank out the target and ask context2vec for substitutes
    sentence[int(wi)]='[]'
    sentence=' '.join(sentence)
    candidates=produce_candidates(sentence,index2word,W)
    words,simscores=list(zip(*candidates))
    total_sim=sum(simscores)  # hoisted: previously recomputed for every candidate
    simscores=numpy.array([simscore/total_sim for simscore in simscores]).reshape(len(simscores),1)
    ft_embeds=numpy.vstack([ft_model[w] for w in words])
    second_order_embeds=(ft_embeds*simscores).mean(0)

    if linei%2==0:
        left_embeds_secondorder.append(second_order_embeds)
        left_embeds_firstorder.append(first_order_embeds)
    else:
        right_embeds_secondorder.append(second_order_embeds)
        right_embeds_firstorder.append(first_order_embeds)

  
        

two


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2266.0), HTML(value='')))




In [17]:
# Sanity check: all four embedding lists must have the same length, i.e. the
# dataset splits evenly into left/right lines.
assert len(right_embeds_firstorder)==len(left_embeds_firstorder)==len(right_embeds_secondorder)==len(left_embeds_secondorder)


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity returns the full right-vs-left matrix; only entry (k, k),
# the similarity of the k-th right sentence to the k-th left sentence, is
# wanted. Reading the diagonal directly replaces the former n*n Python loop
# that filtered on i == j.
firstorder_scores=list(cosine_similarity(right_embeds_firstorder,left_embeds_firstorder).diagonal())
secondorder_scores=list(cosine_similarity(right_embeds_secondorder,left_embeds_secondorder).diagonal())
# The score is the second-to-last tab-separated field, read from the first line of every pair.
scores=[float(line.strip().split('\t')[-2]) for line in usim[::2]]

assert len(firstorder_scores)==len(secondorder_scores)==len(scores)


In [19]:
from scipy.stats import spearmanr
# Rank correlation between the dataset scores and the first-order cosine similarities.
spearmanr(scores,firstorder_scores)

SpearmanrResult(correlation=0.16028445440323638, pvalue=5.817021845429074e-08)

In [20]:
from transformers import BertModel, BertTokenizer
import torch

In [21]:
# Load the pretrained bert-base-uncased encoder and its matching tokenizer.
bertmodel=BertModel.from_pretrained('bert-base-uncased')
berttokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Move BERT to the GPU when CUDA is available; otherwise keep it on the CPU
# instead of failing in this cell.
bertmodel=bertmodel.to('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:

def delete_tokenmark_input(input_ids,tokenizer):
    """Remove the ``[`` / ``]`` marker ids from ``input_ids``, keeping its length.

    Every removed marker is compensated by one ``tokenizer.pad_token_id``
    appended at the end, so the returned list is as long as the input.
    """
    marker_ids = {
        tokenizer.encode(text, add_special_tokens=False)[0]
        for text in ('[', ' [', ']', ' ]')
    }
    kept = [token for token in input_ids if token not in marker_ids]
    padding = [tokenizer.pad_token_id] * (len(input_ids) - len(kept))
    return kept + padding

def delete_tokenmarker_am(input_ids,tokenizer):
    """Build an attention mask for ``input_ids``: 0 at pad positions, 1 elsewhere."""
    pad_id = tokenizer.pad_token_id
    return [0 if token == pad_id else 1 for token in input_ids]

def find_token_id(input_id,tokenizer):
    """Locate the sub-token span of the word wrapped in ``[`` ... ``]`` markers.

    Args:
        input_id: token ids of one encoded sentence that still contains the
            two marker tokens.
        tokenizer: provides ``encode``, ``convert_ids_to_tokens`` and the
            ``mask_token_id`` / ``cls_token_id`` / ``pad_token_id`` attributes.

    Returns:
        ``[start, end]``: start-inclusive, end-exclusive positions of the
        marked word. Both are shifted left by one so that they index the
        sequence obtained after the opening marker has been removed (as done
        by ``delete_tokenmark_input``).

    Raises:
        ValueError: the sequence does not contain exactly one opening and one
            closing marker, or the marked span is empty or inverted. (The
            previous version printed diagnostics and called ``sys.exit`` on a
            ``sys`` module that was never imported.)
    """
    token_pos_start_id={tokenizer.encode(text,add_special_tokens=False)[0] for text in ('[',' [')}
    token_pos_end_id={tokenizer.encode(text,add_special_tokens=False)[0] for text in (']',' ]')}
    skipped_ids={tokenizer.mask_token_id,tokenizer.cls_token_id,tokenizer.pad_token_id}

    starts=[]
    ends=[]
    last_index=len(input_id)-1
    for i,input_i in enumerate(input_id):
        input_i=int(input_i)
        if i==last_index: # the last token is never inspected
            continue
        if input_i in skipped_ids:
            continue
        if input_i in token_pos_start_id:
            starts.append(i+1)  # first sub-token after "["
        elif input_i in token_pos_end_id:
            ends.append(i)      # position of "]" == exclusive end of the word

    if len(starts)!=1 or len(ends)!=1:
        raise ValueError(
            'Expected exactly one "[" and one "]" marker, found {} and {}; tokens: {}'.format(
                len(starts), len(ends), tokenizer.convert_ids_to_tokens(input_id)))

    start,end=starts[0],ends[0]
    if start>=end:
        raise ValueError(
            'Marked span is empty or inverted (start={}, end={}); tokens: {}'.format(
                start, end, tokenizer.convert_ids_to_tokens(input_id)))

    # Shift by one to account for the removed opening marker.
    return [start-1,end-1]
    
def delete_tokenmaker_tokentypeids(input_ids,tokenizer):
    """Derive token-type (segment) ids from a sequence of token ids.

    Pad positions always get 0. Other positions get the current segment,
    which starts at 0 and switches to 1 right after the first separator
    token (the separator itself still carries the segment it closes).
    """
    segment = 0
    tokentype_ids = []
    for token in input_ids:
        if token == tokenizer.pad_token_id:
            tokentype_ids.append(0)
            continue
        tokentype_ids.append(segment)
        if token == tokenizer.sep_token_id:
            segment = 1
    return tokentype_ids

def get_embed(sentences,tokenizer,model,flag='cls',layer_start=None,layer_end=None,maxlen=64):
    """Embed ``sentences`` with a BERT-style ``model`` and return a NumPy array.

    Sentences are expected to mark the target word as ``... [ word ] ...``.

    Args:
        sentences: list of whitespace-tokenised sentences.
        tokenizer: tokenizer providing ``batch_encode_plus``.
        model: encoder called with ``input_ids``, ``attention_mask`` and
            ``output_hidden_states=True``. Inputs are moved to the device of
            the model's parameters (previously hard-coded to CUDA).
        flag: pooling strategy
            * ``'cls'``            - markers stripped, first-position vector of
                                     the last hidden state;
            * ``'cls_with_token'`` - same, but markers are kept in the input;
            * ``'mean'``           - markers stripped, hidden states averaged
                                     over the selected layers and over all
                                     positions;
            * ``'preappend'``      - the word after ``[`` followed by `` $ `` is
                                     prepended to the sentence, then first
                                     position of the last hidden state;
            * ``'token...'``       - any flag starting with ``token``: markers
                                     are removed after tokenisation and the
                                     layer-averaged vectors of the marked
                                     word's sub-tokens are averaged;
                                     ``'token+cls'`` additionally concatenates
                                     the first-position vector of the last
                                     hidden state.
        layer_start, layer_end: slice bounds into ``hidden_states`` for the
            layer average (used by ``'mean'`` and ``'token...'``). ``None``
            bounds behave like ordinary slice bounds.
        maxlen: sequences are truncated / padded to this length.

    Returns:
        ``numpy.ndarray`` with one row per sentence.

    Raises:
        NotImplementedError: ``flag == 'alltoken'``; that branch was never
            finished (it read ``output`` before assigning it).
        ValueError: unknown ``flag`` or an empty layer selection.
    """
    if flag=='alltoken':
        raise NotImplementedError("flag 'alltoken' is not implemented")

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    def _strip_markers(texts):
        # Drop the bracket characters so the model sees the plain sentence.
        return [text.replace('[','').replace(']','') for text in texts]

    def _encode(texts, as_tensors=True):
        # Tokenise to fixed-length sequences; plain lists when as_tensors is False.
        extra = {'return_tensors': 'pt'} if as_tensors else {}
        return tokenizer.batch_encode_plus(texts, max_length = maxlen,truncation = True, padding="max_length", **extra)

    def _forward(input_ids, attention_mask):
        # Single no-grad forward pass on the model's device.
        with torch.no_grad():
            return model(input_ids=input_ids.to(device),attention_mask=attention_mask.to(device), output_hidden_states=True)

    def _average_layers(hidden_states):
        # Mean over the selected layers. Dividing by the number of layers
        # actually selected stays correct when the slice is clipped.
        layers = hidden_states[layer_start:layer_end]
        if len(layers)==0:
            raise ValueError('layer_start=%r, layer_end=%r selects no hidden states' % (layer_start, layer_end))
        return sum(layers) / len(layers)

    if flag in ('cls','cls_with_token','preappend'):
        if flag=='cls':
            texts = _strip_markers(sentences)
        elif flag=='preappend':
            texts = []
            for sentence in sentences:
                words = sentence.split()
                texts.append(words[words.index('[')+1]+' $ '+ sentence)
        else:
            texts = sentences
        toks = _encode(texts)
        outputs_ = _forward(toks['input_ids'], toks['attention_mask'])
        return outputs_.last_hidden_state.detach().cpu().numpy()[:,0]

    if flag=='mean':
        toks = _encode(_strip_markers(sentences))
        outputs_ = _forward(toks['input_ids'], toks['attention_mask'])
        average_layer_batch = _average_layers(outputs_.hidden_states)
        # NOTE(review): this averages over every position, padding included - confirm that is intended.
        return average_layer_batch.detach().cpu().numpy().mean(1)

    if flag.startswith('token'):
        toks = _encode(sentences, as_tensors=False)
        # Spans are computed on the ids that still contain the markers ...
        token_spans = [find_token_id(ids,tokenizer) for ids in toks['input_ids']]
        # ... while the model is fed the ids with the markers removed.
        cleaned_ids = [delete_tokenmark_input(ids,tokenizer) for ids in toks['input_ids']]
        cleaned_masks = [delete_tokenmarker_am(ids,tokenizer) for ids in cleaned_ids]
        outputs_ = _forward(torch.tensor(cleaned_ids, dtype=torch.long),
                            torch.tensor(cleaned_masks, dtype=torch.long))
        average_layer_batch = _average_layers(outputs_.hidden_states)

        token_embeds = []
        for embeds_per_sent, (start, end) in zip(average_layer_batch, token_spans):
            embed_token = torch.mean(embeds_per_sent[int(start):int(end)],dim=0,keepdim=True)
            assert not torch.isnan(embed_token).any(), 'empty or invalid target span'
            token_embeds.append(embed_token)
        output = torch.cat(token_embeds, 0)
        if flag=='token+cls':
            output = torch.cat([output, outputs_.last_hidden_state[:,0]], dim=1)
        return output.detach().cpu().numpy()

    raise ValueError('Unknown flag: %r' % (flag,))

In [26]:
# Re-write every usim sentence so that only the target word is wrapped in
# stand-alone "[" and "]" tokens; bracket characters elsewhere in the sentence
# are removed first so they cannot be mistaken for the markers.
# Even-numbered lines feed the "left" list, odd-numbered lines the "right" list.
bert_input_left=[]
bert_input_right=[]

for linei,line in enumerate(usim):
    sentence,wi,_,_=line.strip().split('\t')
    wi=int(wi)
    sentence=sentence.split()
    target_word=sentence[wi]
    prev_sentence=' '.join(sentence[:wi]).replace('[','').replace(']','').split()
    after_sentence=' '.join(sentence[wi+1:]).replace('[','').replace(']','').split()
    marked_sentence=' '.join(prev_sentence+['[',target_word,']']+after_sentence)
    destination=bert_input_left if linei%2==0 else bert_input_right
    destination.append(marked_sentence)

    

In [None]:
# Every left sentence needs a right partner.
assert len(bert_input_left)==len(bert_input_right)

In [None]:
# Eyeball one marked-up sentence to confirm the "[ word ]" format.
bert_input_left[0]

In [25]:
# Target-word embeddings for both sides of every pair: the 'token' strategy
# averages hidden_states[9:13] and then averages over the marked word's sub-tokens.
left_embeds=get_embed(bert_input_left,berttokenizer,bertmodel,'token',9,13)
right_embeds=get_embed(bert_input_right,berttokenizer,bertmodel,'token',9,13)

In [27]:
NORMALIZE='normalize'
CENTER='center'
def normalize_embeddings(emb, types, mean=None):
    """
    Apply a comma-separated list of in-place normalization steps to ``emb``.

    Supported steps are ``CENTER`` (subtract ``mean``, computed over rows when
    not supplied) and ``NORMALIZE`` (scale via ``matrix_norm``). Empty entries
    are ignored; any other entry raises. Returns the mean that was used, or
    ``None`` when no mean was given or computed.
    """
    for step in filter(None, types.split(',')):
        if step == NORMALIZE:
            matrix_norm(emb)
        elif step == CENTER:
            mean = emb.mean(0, keepdim=True) if mean is None else mean
            emb.sub_(mean.expand_as(emb))
        else:
            raise Exception('Unknown normalization type: "%s"' % step)
    return mean

def produce_cosine_list(test_src,test_tgt):
    """Return the cosine similarity of row k of ``test_src`` with row k of ``test_tgt``.

    The similarities are read off the diagonal of the matrix built by
    ``produce_cos_matrix`` and returned as a list of Python floats.
    """
    cos_matrix = produce_cos_matrix(test_src, test_tgt)
    return [float(row[k]) for k, row in enumerate(cos_matrix)]

def produce_cos_matrix(test_src,test_tgt):
    """Normalize both embedding matrices in place and return ``test_src @ test_tgt^T``."""
    for embeddings in (test_src, test_tgt):
        normalize_embeddings(embeddings, NORMALIZE, None)
    return torch.mm(test_src, test_tgt.t())

def matrix_norm(emb):
    """Divide every row of ``emb`` by its L2 norm, modifying ``emb`` in place."""
    row_norms = emb.norm(p=2, dim=1, keepdim=True)
    emb.div_(row_norms.expand_as(emb))

In [28]:
# Cosine similarity between the BERT target embeddings of each pair.
# produce_cosine_list normalizes its inputs in place and torch.from_numpy
# shares memory with the NumPy arrays, so left_embeds / right_embeds are
# row-normalized after this cell has run.
scores_bert=produce_cosine_list(torch.from_numpy(left_embeds),torch.from_numpy(right_embeds))

In [31]:
# Rank correlation between the two systems' predictions (first order vs second order).
# NOTE(review): neither secondorder_scores nor scores_bert is correlated with the
# dataset `scores` anywhere in this notebook - confirm whether that evaluation is missing.
spearmanr(firstorder_scores,secondorder_scores)

SpearmanrResult(correlation=0.08867805493761785, pvalue=0.002812678748437777)