In [1]:
import faiss
import pandas as pd
import numpy as np
import transformers
import json
import torch
import time
from tqdm.notebook import tqdm
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Checkpoint name shared by the encoder and its tokenizer.
MODEL = 'bert-base-uncased'

# NOTE(review): the variable is named `sciBert`, but MODEL selects the generic
# 'bert-base-uncased' checkpoint -- confirm which model was intended.
sciBert = transformers.BertModel.from_pretrained(MODEL)
tokenizer = transformers.BertTokenizer.from_pretrained(MODEL, do_lower_case=True)

# Prefer the first GPU, but fall back to the CPU so this cell does not crash
# on a machine without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
sciBert = sciBert.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# File locations used by the rest of the notebook:
#   EMBEDDING_FILENAME      - archive passed to np.load(); its array 'a' holds the embeddings
#   OUTPUT_AUTHORS_FILENAME - CSV opened for writing (author-pair rows)
#   OUTPUT_TEXT_FILENAME    - CSV opened for writing (text-pair rows)
#   INDEX_MAP_FILENAME      - JSON file loaded into `index_map`
#   TWEET_FILENAME          - presumably the JSON-lines tweet dump -- TODO confirm
# NOTE(review): the right-hand sides are blank in this copy of the notebook
# (presumably redacted), so the cell does not parse as written; supply real
# paths before running.
# NOTE(review): TWEET_FILENAME appears as a bare name with no assignment and
# needs a value as well.
EMBEDDING_FILENAME = 
OUTPUT_AUTHORS_FILENAME = 
OUTPUT_TEXT_FILENAME = 
INDEX_MAP_FILENAME = 
TWEET_FILENAME

In [33]:
# Output CSV handles. They are intentionally left open here because
# index_to_title_t() writes one row per match to each of them.
# The encoding is set explicitly: the rows contain tweet text and screen names,
# and the platform default encoding may not be able to represent them.
text_coord_authors_fh = open(OUTPUT_AUTHORS_FILENAME, 'w', encoding='utf-8')
text_coord_text_fh = open(OUTPUT_TEXT_FILENAME, 'w', encoding='utf-8')

# Header rows; the column order must match what index_to_title_t() writes.
text_coord_authors_fh.write('author1,author2,author1id,author2id,distance\n')
# Trailing ';' keeps write()'s return value out of the cell output.
text_coord_text_fh.write('text1,text2,author1,author2,author1id,author2id,distance\n');


57

In [5]:
def embedding_fn(model, text):
    """Tokenize ``text`` and run it through ``model`` without tracking gradients.

    Uses the notebook-level ``tokenizer`` and ``device``.

    Parameters
    ----------
    model : callable
        Called with a ``(1, n_tokens)`` tensor of token ids; element ``0`` of
        its return value is used.
    text : str
        Text to encode. Inputs longer than 512 tokens are truncated.

    Returns
    -------
    torch.Tensor
        ``model(batch)[0]`` moved to the CPU. For a ``BertModel`` this is
        presumably the last hidden state -- TODO confirm.
    """
    # Ask for truncation explicitly instead of relying on the tokenizer's
    # implicit fallback when only max_length is given.
    tokens = tokenizer.encode(text, max_length=512, truncation=True)
    with torch.no_grad():
        # Build the single-item batch directly on the configured device; the
        # previous hard-coded .cuda() ignored `device` and required a GPU.
        batch_tokens = torch.tensor([tokens], device=device)
        return model(batch_tokens)[0].cpu()

def compute_mean(embedding):
    """Average ``embedding`` over dimension 1.

    Returns the reduced tensor when ``embedding`` is a ``torch.Tensor``.
    For any other input a message is printed and ``None`` is returned.
    """
    if isinstance(embedding, torch.Tensor):
        return embedding.mean(dim=1)

    print('Embedding must be a torch.Tensor')
    return None

def compute_cosine_measure(x1, x2):
    """Return ``cosine_similarity(x1, x2)`` for the two inputs, unchanged."""
    similarity = cosine_similarity(x1, x2)
    return similarity

def compute_distance(x1, x2):
    """Convert two tensors to NumPy and pass them to ``compute_cosine_measure``.

    Both inputs are detached from the autograd graph first. Despite the name,
    the value returned is whatever ``compute_cosine_measure`` returns.
    """
    first = x1.detach().numpy()
    second = x2.detach().numpy()
    return compute_cosine_measure(first, second)

In [39]:
def index_to_title_t(indexes, distances, curr_author):
    """Write one CSV row per neighbour of ``curr_author`` to both output files.

    Relies on the notebook-level ``index_map``, ``text_coord_authors_fh`` and
    ``text_coord_text_fh``.

    Parameters
    ----------
    indexes : sequence of int
        Neighbour row ids; ``str(idx)`` is the key into ``index_map``.
    distances : sequence
        Score for each entry of ``indexes``, in the same order. It is written
        to the ``distance`` column as ``str(score)``.
    curr_author : dict
        Query record with keys ``'idx'``, ``'text'``, ``'author_user_name'``
        and ``'author_userid'``.

    Neighbours whose ``'cord_uid'`` equals ``curr_author['idx']`` (the query
    itself) are skipped, as are negative ids.
    """
    import csv  # local import keeps this cell independent of the import cell

    # csv.writer quotes fields that contain commas, quotes or newlines; plain
    # string concatenation produced unparseable rows for such tweet text.
    # lineterminator='\n' matches the header lines already in the files.
    authors_writer = csv.writer(text_coord_authors_fh, lineterminator='\n')
    text_writer = csv.writer(text_coord_text_fh, lineterminator='\n')

    for idx, distance in zip(indexes, distances):
        # FAISS pads the result with label -1 when it finds fewer than k
        # neighbours; there is no index_map entry for those.
        if idx < 0:
            continue

        matched = index_map[str(idx)]
        if matched['cord_uid'] == curr_author['idx']:
            continue

        # Columns shared by both files: author1, author2, author1id, author2id, distance.
        author_columns = [
            curr_author['author_user_name'],
            matched['author_user_name'],
            str(curr_author['author_userid']),
            str(matched['author_userid']),
            str(distance),
        ]

        authors_writer.writerow(author_columns)
        text_writer.writerow([curr_author['text'], matched['text']] + author_columns)


In [8]:
# The file is indexed by key, i.e. treated as an .npz archive whose array 'a'
# holds the embeddings. Loading inside a context manager closes the archive
# handle once the array has been read.
with np.load(EMBEDDING_FILENAME) as embedding_archive:
    embeddings = embedding_archive['a']

# astype() returns a copy, so in-place operations on embeddings_32 leave
# `embeddings` untouched.
embeddings_32 = embeddings.astype('float32')
# Second axis is used as the vector dimensionality for the index.
n_dimensions = embeddings_32.shape[1]

In [10]:
# Scale every row of embeddings_32 to unit L2 norm (done in place), so that
# the inner-product metric used below behaves as cosine similarity.
faiss.normalize_L2(embeddings_32)
n_embeddings = len(embeddings)

# `res` is kept as a notebook-level name alongside the index that uses it.
res = faiss.StandardGpuResources()

# Build a flat inner-product index and move it to GPU 0 in one expression.
fastIndex_gpu = faiss.index_cpu_to_gpu(
    res,
    0,
    faiss.index_factory(n_dimensions, "Flat", faiss.METRIC_INNER_PRODUCT),
)
fastIndex_gpu.add(embeddings_32)

In [12]:
# Read the tweet dump, one JSON object per line (lines=True).
# Uses TWEET_FILENAME, the name listed with the other path constants; the
# earlier spelling with a doubled underscore is not defined anywhere in the
# visible notebook and raises NameError on a fresh kernel.
df = pd.read_json(TWEET_FILENAME, lines=True)

In [14]:
# index_map is keyed by the string form of a FAISS row id; each value is a
# dict read by index_to_title_t() ('cord_uid', 'text', 'author_user_name',
# 'author_userid').
# Decode as UTF-8 explicitly rather than with the platform default encoding,
# since the entries carry tweet text.
with open(INDEX_MAP_FILENAME, 'r', encoding='utf-8') as f:
    index_map = json.load(f)

In [42]:
# Neighbours requested per tweet. Every tweet is itself in the index, so one
# of the k hits is normally the query; index_to_title_t() drops that row.
k=3

# Row i of `df` is paired with embedding i below, so fail before writing any
# output if there are not enough tweet rows.
if len(df) < n_embeddings:
    raise ValueError(
        'df has %d rows but there are %d embeddings' % (len(df), n_embeddings)
    )

try:
    for i in range(n_embeddings):
        # search() takes a 2-D float32 batch; astype() copies, so the in-place
        # normalisation does not modify `embeddings`.
        emb = embeddings[i].astype('float32').reshape(1, -1)
        faiss.normalize_L2(emb)

        row = df.iloc[i]
        user = row['user']
        curr_author = {
            'text': row['text'],
            'author_user_name': user['screen_name'],
            'author_userid': user['id'],
            'idx': row['id'],
        }

        distances, neighbors = fastIndex_gpu.search(emb, k)
        index_to_title_t(neighbors[0], distances[0], curr_author)
finally:
    # Push buffered rows to disk even if the loop stops early. The handles are
    # left open in case later cells still write to them.
    text_coord_authors_fh.flush()
    text_coord_text_fh.flush()