In [1]:
%pip install transformers
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import re

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [2]:
df = pd.read_csv('examples.tsv', delimiter='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,Analysis of Relative Gene Expression Data Usin...,The two most commonly used methods to analyze ...
1,1,Deep Residual Learning for Image Recognition,Deeper neural networks are more difficult to t...
2,2,A short history ofSHELX,An account is given of the development of the ...
3,3,Basic local alignment search tool,"A new approach to rapid sequence comparison, b..."
4,4,,Random forests are a combination of tree predi...


In [3]:
papers = list()
for index, row in df.iterrows():
    if row['title'] and row['abstract']:
        sentences = re.split(r"\b[.!?;]\s", str(row['abstract']))
        sentences = [sentence for sentence in sentences if len(sentence) > 0]
        papers.append(
            {
                'title': row['title'],
                'sentences': sentences
            }
        )

In [4]:
df.iloc[0]['abstract']

'The two most commonly used methods to analyze data from real-time, quantitative PCR experiments are absolute quantification and relative quantification. Absolute quantification determines the input copy number, usually by relating the PCR signal to a standard curve. Relative quantification relates the PCR signal of the target transcript in a treatment group to that of another sample such as an untreated control. The 2 −ΔΔ C T method is a convenient way to analyze the relative changes in gene expression from real-time quantitative PCR experiments. The purpose of this report is to present the derivation, assumptions, and applications of the 2 −ΔΔ C T method. In addition, we present the derivation and applications of two variations of the 2 −ΔΔ C T method that may be useful in the analysis of real-time, quantitative PCR data. '

In [5]:
papers[167]

{'title': 'BEDTools: a flexible suite of utilities for comparing genomic features',
 'sentences': ['Testing for correlations between different sets of genomic features is a fundamental task in genomics research',
  'However, searching for overlaps between features with existing web-based methods is complicated by the massive datasets that are routinely produced with current sequencing technologies',
  'Fast and flexible tools are therefore required to ask complex questions of these data in an efficient manner.This article introduces a new software suite for the comparison, manipulation and annotation of genomic features in Browser Extensible Data (BED) and General Feature Format (GFF) format',
  'BEDTools also supports the comparison of sequence alignments in BAM format to both BED and GFF features',
  'The tools are extremely efficient and allow the user to compare large datasets (e.g',
  'next-generation sequencing data) with both public and custom genome annotation tracks',
  'BEDTo

In [6]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [7]:
tokens = {'input_ids': [], 'attention_mask': []}
for paper in papers[:3]:
    for sentence in paper['sentences']:
        # encode each sentence and append to dictionary
        new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
        
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [8]:
outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [9]:
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-5.3435e-01,  1.6028e-01, -6.1676e-02,  ..., -5.4697e-01,
           1.2346e-01,  4.1495e-01],
         [-5.0971e-01, -2.3434e-01,  1.6776e-01,  ..., -6.6199e-01,
           2.4208e-01, -1.2140e-02],
         [-5.9011e-01, -4.8241e-01, -4.6855e-01,  ..., -4.3545e-01,
           3.7199e-01, -4.9600e-01],
         ...,
         [-4.2932e-01, -1.3987e-02,  1.8443e-01,  ..., -7.2213e-02,
           3.2792e-01,  2.8302e-01],
         [-2.0810e-01, -5.7676e-02,  1.0728e-01,  ..., -3.1878e-02,
           2.3129e-01,  3.6510e-01],
         [-1.6877e-01,  4.9826e-03,  1.2360e-02,  ...,  3.4051e-02,
           1.3897e-01,  1.6843e-01]],

        [[-2.4747e-01, -1.4939e-01,  5.9418e-01,  ..., -4.4344e-01,
          -2.6041e-02,  1.2963e+00],
         [ 2.3194e-01, -2.6046e-01,  7.3989e-01,  ..., -4.4468e-01,
          -2.6440e-01,  1.4715e+00],
         [ 1.8986e-01, -2.1698e-02,  7.3000e-01,  ..., -4.6517e-01,
          -1.4789e-01,  1.3726e+00],
         ...,
         [-8.4648e-03, -2

In [10]:
embeddings.shape

torch.Size([25, 128, 768])

In [11]:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([25, 128])

In [12]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([25, 128, 768])

In [13]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [14]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([25, 128, 768])

In [15]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([25, 768])

In [16]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([25, 768])

In [17]:
mean_pooled = summed / summed_mask
mean_pooled

tensor([[-0.4531,  0.0464,  0.0839,  ..., -0.4959,  0.1306,  0.4264],
        [ 0.0896, -0.2147,  0.7119,  ..., -0.2331, -0.2212,  1.3551],
        [ 0.0097, -0.3247,  0.0984,  ..., -0.2072, -0.3513,  0.6373],
        ...,
        [ 0.2771,  0.1674,  1.2680,  ..., -0.6421, -0.9161,  0.1106],
        [-0.3551,  0.0754,  0.4234,  ..., -0.5930, -0.8753, -0.2367],
        [-0.1512,  0.1455,  1.7145,  ..., -0.9219, -0.6154,  0.1346]],
       grad_fn=<DivBackward0>)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

array([[0.6963165 , 0.5571815 , 0.7574753 , 0.5389335 , 0.7437722 ,
        0.3428052 , 0.5574206 , 0.46375385, 0.5090538 , 0.45180053,
        0.46239728, 0.46018794, 0.46551833, 0.49259296, 0.5856715 ,
        0.52892464, 0.46284395, 0.49887925, 0.5108942 , 0.65644646,
        0.62603164, 0.4288961 , 0.6689023 , 0.49857748]], dtype=float32)