In [1]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

from fiz_lernmodule.preprocessing import PreProcessor

from multiprocessing import Pool

import pickle
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/schikanski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Available seed data sets; each name is a sub-directory of "<src_dir>/data/".
seed_list = ['hair_dryer', 'video_codec', 'diesel', "contact_lens", "contact_lens_us_c", "3d_printer"]
src_dir = '.'
# Only one seed is processed per run: index 1 selects 'video_codec'.
seeds = [seed_list[1]]
seed_name = seeds[0]

# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only run this against pickle files produced by a trusted source.
with open(src_dir + "/data/" + seed_name + "/terms_attributes.pkl", 'rb') as infile:
    df = pickle.load(infile)
# Candidate Hugging Face checkpoints; `idx1` below picks the one that is loaded.
architectures = ["bert-base-uncased",
                'allenai/scibert_scivocab_uncased',
                'google/pegasus-big_patent', # faulty
                'google/bigbird-pegasus-large-bigpatent',
                'AI-Growth/PatentSBERTa',
                'distilbert-base-uncased']

# Index 1 -> 'allenai/scibert_scivocab_uncased'.
idx1 = 1
checkpoint = architectures[idx1]

# `tokenizer` and `model` are notebook-level globals read by the embedding functions below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Number of rows loaded from the pickle.
print(len(df))

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


235


In [3]:
# Keep a single row per 'family_id': for every id that occurs more than once,
# the rows are ordered by 'pub_num' and all but the first are dropped.
family_values, family_counts = np.unique(df['family_id'], return_counts=True)
family_ids_to_filter = family_values[family_counts > 1]

rows_to_drop = np.empty((0, 1), int)
for family_id in family_ids_to_filter:
    family_rows = df[df['family_id'] == family_id].sort_values('pub_num')
    # Index labels of every row after the first one (ascending 'pub_num').
    rows_to_drop = np.append(rows_to_drop, family_rows.index[1:])

df = df.drop(rows_to_drop)
print(len(df))

229


In [6]:
def create_embedding(text, chunk_size=512):
    """Embed ``text`` token by token and merge WordPiece fragments into words.

    The text is tokenized without truncation. When the token sequence is
    longer than ``chunk_size`` it is cut into consecutive windows of at most
    ``chunk_size`` tokens; every window is passed through ``model`` on its own
    (with only ``input_ids`` and ``attention_mask``) and the hidden states are
    concatenated along the sequence axis. Afterwards the first and the last
    position of the whole sequence are dropped (assumed to be the special
    start/end tokens added by the tokenizer), so windows after the first one
    are encoded without special tokens of their own.

    Tokens whose decoded form contains ``'##'`` are treated as continuation
    pieces: each run of them is merged into the token that precedes the run.
    The merged word is the concatenation of the pieces with every ``'#'``
    removed and its vector is the mean of the piece vectors.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The text to embed. Only the first row of the tokenizer output
            is used.
        chunk_size: Maximum number of tokens per forward pass. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        tuple: ``(lhs, decoded_list)``. ``lhs`` is a NumPy array whose axis 1
        has one entry per merged word; ``decoded_list`` is the list of the
        corresponding word strings.
    """

    def fuse_wordpieces(hidden, pieces):
        """Merge every run of '##' pieces into the piece in front of it."""
        keep = np.ones(hidden.shape[1], dtype=bool)
        run_end = None
        # Walk right-to-left so that deleting from `pieces` never shifts a
        # position that still has to be visited.
        for pos in range(len(pieces) - 1, -1, -1):
            is_continuation = '##' in pieces[pos]
            if is_continuation and run_end is None:
                run_end = pos
            # A run is closed by the first non-continuation piece to its left.
            # A run that reaches position 0 has no such piece and is closed
            # there instead of reaching back to index -1.
            if run_end is not None and (not is_continuation or pos == 0):
                pieces[pos] = ''.join(p.replace('#', '') for p in pieces[pos:run_end + 1])
                hidden[:, pos, :] = np.mean(hidden[0, pos:run_end + 1, :], axis=0)
                del pieces[pos + 1:run_end + 1]
                keep[pos + 1:run_end + 1] = False
                run_end = None
        # One boolean selection instead of one np.delete (full copy) per token.
        return hidden[:, keep, :], pieces

    inputs = tokenizer(text, padding=True, truncation=False, return_tensors='pt')
    inputs_ids = inputs['input_ids'][0]
    size = inputs['input_ids'].shape[1]

    # Inference only: no_grad avoids building and retaining the autograd graph.
    with torch.no_grad():
        if size > chunk_size:
            chunk_states = []
            for start in range(0, size, chunk_size):
                stop = start + chunk_size
                chunk = {
                    'input_ids': inputs['input_ids'][0, start:stop].reshape(1, -1),
                    'attention_mask': inputs['attention_mask'][0, start:stop].reshape(1, -1),
                }
                chunk_states.append(model(**chunk)['last_hidden_state'])
            lhs = torch.cat(chunk_states, dim=1)
        else:
            lhs = model(**inputs)['last_hidden_state']

    # Drop the first and last position so vectors line up with `decoded_list`.
    lhs = lhs[:, 1:-1, :].numpy()
    decoded_list = [tokenizer.decode(x) for x in inputs_ids][1:-1]

    lhs, decoded_list = fuse_wordpieces(lhs, decoded_list)

    return (lhs, decoded_list)

In [8]:
def create_embedding_sentence(text):
    """Embed ``text`` sentence by sentence, merging WordPiece fragments into words.

    The text is split at every ``.`` and ``;``. Fragments of 30 characters or
    fewer are discarded. Each remaining fragment is tokenized (truncated to at
    most 512 tokens), passed through ``model``, and the first and last
    position of its hidden states are dropped (assumed to be the special
    start/end tokens added by the tokenizer).

    Tokens whose decoded form contains ``'##'`` are treated as continuation
    pieces: each run of them is merged into the token that precedes the run.
    The merged word is the concatenation of the pieces with every ``'#'``
    removed and its vector is the mean of the piece vectors.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The document text to split and embed.

    Returns:
        tuple: ``(lhs, decoded_sentences)``. ``lhs`` is a tuple with one NumPy
        array per kept sentence (axis 1 has one entry per merged word);
        ``decoded_sentences`` is a tuple with the matching lists of word
        strings. Both tuples are empty when no fragment is longer than 30
        characters (this case used to raise ``IndexError``).
    """

    def fuse_wordpieces(hidden, pieces):
        """Merge every run of '##' pieces into the piece in front of it."""
        keep = np.ones(hidden.shape[1], dtype=bool)
        run_end = None
        # Walk right-to-left so that deleting from `pieces` never shifts a
        # position that still has to be visited.
        for pos in range(len(pieces) - 1, -1, -1):
            is_continuation = '##' in pieces[pos]
            if is_continuation and run_end is None:
                run_end = pos
            # A run is closed by the first non-continuation piece to its left.
            # A run that reaches position 0 has no such piece and is closed
            # there instead of reaching back to index -1.
            if run_end is not None and (not is_continuation or pos == 0):
                pieces[pos] = ''.join(p.replace('#', '') for p in pieces[pos:run_end + 1])
                hidden[:, pos, :] = np.mean(hidden[0, pos:run_end + 1, :], axis=0)
                del pieces[pos + 1:run_end + 1]
                keep[pos + 1:run_end + 1] = False
                run_end = None
        # One boolean selection instead of one np.delete (full copy) per token.
        return hidden[:, keep, :], pieces

    # Raw-string character class: same split points as the old '\.|\;' pattern
    # without relying on invalid string escape sequences.
    sentences = [sentence for sentence in re.split(r'[.;]', text) if len(sentence) > 30]

    lhs = []
    decoded_sentences = []
    # Inference only: no_grad avoids building and retaining the autograd graph.
    with torch.no_grad():
        for sentence in sentences:
            tokenized = tokenizer(sentence, padding=True, truncation=True, max_length=512, return_tensors='pt')
            hidden = model(**tokenized)['last_hidden_state'][:, 1:-1, :].numpy()
            pieces = [tokenizer.decode(x) for x in tokenized['input_ids'][0]][1:-1]
            hidden, pieces = fuse_wordpieces(hidden, pieces)
            lhs.append(hidden)
            decoded_sentences.append(pieces)

    # Tuples mirror what the former zip(*...) unpacking returned.
    return (tuple(lhs), tuple(decoded_sentences))

In [9]:
def meanie(sentence_vectors, hidden_size=768):
    """Average the token vectors of every sentence into one sentence embedding.

    Args:
        sentence_vectors: Iterable of arrays shaped ``(batch, n_tokens, hidden)``
            (the notebook passes ``batch == 1``). Each array is averaged over
            axis 1.
        hidden_size: Width of the empty array returned when
            ``sentence_vectors`` is empty. Defaults to 768. For non-empty
            input the width is taken from the data, so models with a
            different hidden size work as well.

    Returns:
        numpy.ndarray: The per-sentence means stacked along axis 0, i.e. shape
        ``(n_sentences, hidden)`` for ``batch == 1``; ``(0, hidden_size)`` when
        there are no sentences.
    """
    means = [np.mean(vector, axis=1) for vector in sentence_vectors]
    if not means:
        return np.empty((0, hidden_size))
    # The empty float64 block keeps the result dtype identical to the former
    # np.append-based accumulation while concatenating only once.
    seed = np.empty((0, means[0].shape[1]))
    return np.concatenate([seed] + means, axis=0)

def flatten_list(a_list):
    """Concatenate the items of every sub-iterable of ``a_list`` into one flat list."""
    flat = []
    for sublist in a_list:
        flat.extend(sublist)
    return flat

def flatten_array(list_of_arrays, hidden_size=768):
    """Join per-sentence token arrays into one array along the token axis.

    Args:
        list_of_arrays: Iterable of arrays shaped ``(1, n_tokens, hidden)``.
        hidden_size: Width of the empty array returned when
            ``list_of_arrays`` is empty. Defaults to 768. For non-empty input
            the width is taken from the data, so models with a different
            hidden size work as well.

    Returns:
        numpy.ndarray: Array of shape ``(1, total_tokens, hidden)``;
        ``(1, 0, hidden_size)`` when there is nothing to join.
    """
    arrays = list(list_of_arrays)
    if not arrays:
        return np.empty((1, 0, hidden_size))
    # The empty float64 block keeps the result dtype identical to the former
    # np.append-based accumulation while concatenating only once.
    seed = np.empty((1, 0, np.shape(arrays[0])[2]))
    return np.concatenate([seed] + arrays, axis=1)


def filter_text(text):
    """Return the positions in ``text`` of the words kept by ``PreProcessor``.

    ``text`` is joined with single spaces and passed to
    ``PreProcessor.preprocess_text(..., remove_short_long=True)``. The items of
    that result are then matched, in order, against ``text``: each one is
    searched for starting right after the previous match, and the position of
    its first occurrence is recorded. Items that are not found are skipped.

    Args:
        text: Sequence of word strings.

    Returns:
        list[int]: Ascending positions into ``text``.
    """
    kept_words = PreProcessor().preprocess_text(' '.join(text), remove_short_long=True)

    positions = []
    cursor = 0  # first position that may still be matched
    for word in kept_words:
        probe = cursor
        while probe < len(text):
            if word == text[probe]:
                positions.append(probe)
                cursor = probe + 1
                break
            probe += 1
    return positions
    
# Run every per-document stage in a pool of two worker processes.
with Pool(2) as pool:
    output = pool.map(create_embedding_sentence, df['abstract_text'])
    embeddings, decoded_text = zip(*output)
    decoded_words = pool.map(flatten_list, decoded_text)
    indices = pool.map(filter_text, decoded_words)
    word_embeddings = pool.map(flatten_array, embeddings)
    sentence_embeddings = pool.map(meanie, embeddings)

# Keep only the words (and their vectors along axis 1) selected by filter_text.
temp1 = [np.take(vectors, kept, axis=1) for vectors, kept in zip(word_embeddings, indices)]
temp2 = [[words[x] for x in kept] for words, kept in zip(decoded_words, indices)]

word_embeddings = temp1
decoded_words = temp2

In [11]:
# Attach the results to `df` as new columns (one entry per row, in row order).
# Filtered word vectors and the matching filtered words:
df['word_embeddings'] = word_embeddings
df['decoded_text'] = decoded_words
# Unfiltered per-sentence word lists as returned by create_embedding_sentence:
df['decoded_sentences'] = decoded_text
# One mean vector per sentence:
df['sentence_embeddings'] = sentence_embeddings
# Document vector = mean over that document's sentence vectors (axis 0).
df['document_embeddings'] = df['sentence_embeddings'].apply(lambda x: np.mean(x, axis=0))

In [12]:
# Persist the enriched frame next to the input file as "pre_embedding.pkl".
with open(src_dir + "/data/" + seed_name + "/pre_embedding.pkl", 'wb') as outfile:
    pickle.dump(df, outfile)