In [30]:
# %pip install transformers
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import re
import numpy as np

In [2]:
df = pd.read_csv('examples.tsv', delimiter='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,Analysis of Relative Gene Expression Data Usin...,The two most commonly used methods to analyze ...
1,1,Deep Residual Learning for Image Recognition,Deeper neural networks are more difficult to t...
2,2,A short history ofSHELX,An account is given of the development of the ...
3,3,Basic local alignment search tool,"A new approach to rapid sequence comparison, b..."
4,4,,Random forests are a combination of tree predi...


In [3]:
papers = list()
for index, row in df.iterrows():
    if row['title'] and row['abstract']:
        sentences = re.split(r"\b[.!?;]\s", str(row['abstract']))
        sentences = [sentence for sentence in sentences if len(sentence) > 0]
        papers.append(
            {
                'title': row['title'],
                'sentences': sentences
            }
        )

In [4]:
df.iloc[0]['abstract']

'The two most commonly used methods to analyze data from real-time, quantitative PCR experiments are absolute quantification and relative quantification. Absolute quantification determines the input copy number, usually by relating the PCR signal to a standard curve. Relative quantification relates the PCR signal of the target transcript in a treatment group to that of another sample such as an untreated control. The 2 −ΔΔ C T method is a convenient way to analyze the relative changes in gene expression from real-time quantitative PCR experiments. The purpose of this report is to present the derivation, assumptions, and applications of the 2 −ΔΔ C T method. In addition, we present the derivation and applications of two variations of the 2 −ΔΔ C T method that may be useful in the analysis of real-time, quantitative PCR data. '

In [5]:
num_papers = 100
sentences = list()
sent_index = dict()
idx = 0
for paper in papers[:num_papers]:
    for sentence in paper["sentences"]:
        sentences.append(sentence)
        sent_index[idx] = paper["title"]
        idx += 1

In [6]:
len(sentences)

553

In [7]:
for i in range(5,7):
    print(sent_index[i])

Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Deep Residual Learning for Image Recognition


In [8]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

In [9]:
tokens = {'input_ids': [], 'attention_mask': []}
for sentence in sentences:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=32,
                                        truncation=True, padding='max_length',
                                        return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])
        
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [10]:
outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [11]:
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-0.5344,  0.1603, -0.0617,  ..., -0.5470,  0.1235,  0.4150],
         [-0.5097, -0.2343,  0.1678,  ..., -0.6620,  0.2421, -0.0121],
         [-0.5901, -0.4824, -0.4686,  ..., -0.4355,  0.3720, -0.4960],
         ...,
         [-0.6544, -0.0614,  0.5382,  ..., -0.3798,  0.0354,  0.5687],
         [-0.2953, -0.0849, -0.1251,  ..., -0.2406,  0.2773, -0.0399],
         [-0.2897, -0.1275, -0.1079,  ..., -0.2202,  0.2456, -0.1119]],

        [[-0.2475, -0.1494,  0.5942,  ..., -0.4434, -0.0260,  1.2963],
         [ 0.2319, -0.2605,  0.7399,  ..., -0.4447, -0.2644,  1.4715],
         [ 0.1899, -0.0217,  0.7300,  ..., -0.4652, -0.1479,  1.3726],
         ...,
         [ 0.0323, -0.3013,  0.4158,  ..., -0.0622, -0.0673,  0.8302],
         [ 0.0589, -0.2688,  0.5042,  ..., -0.0274, -0.0200,  0.8514],
         [ 0.0743, -0.2702,  0.5423,  ..., -0.0065, -0.0458,  0.8897]],

        [[ 0.1749, -0.3280,  0.1347,  ..., -0.1967, -0.3613,  0.8629],
         [ 0.0086, -0.2718,  0.6160,  ..., -0

In [12]:
embeddings.shape

torch.Size([553, 32, 768])

In [13]:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([553, 32])

In [14]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([553, 32, 768])

In [15]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [16]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([553, 32, 768])

In [17]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([553, 768])

In [18]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([553, 768])

In [19]:
mean_pooled = summed / summed_mask
mean_pooled

tensor([[-0.4531,  0.0464,  0.0839,  ..., -0.4959,  0.1306,  0.4264],
        [ 0.0896, -0.2147,  0.7119,  ..., -0.2331, -0.2212,  1.3551],
        [ 0.0097, -0.3247,  0.0984,  ..., -0.2072, -0.3513,  0.6373],
        ...,
        [-0.8234, -0.3840,  0.1930,  ..., -0.7831, -0.4611,  0.2648],
        [-0.2612,  0.0569,  2.0655,  ...,  0.3290,  0.5111, -0.2976],
        [-0.3987, -0.0754,  0.0825,  ..., -0.2219,  0.9837, -0.3423]],
       grad_fn=<DivBackward0>)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
sims = cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

In [36]:
sims = sims.flatten()
for i in range(7):
    print(sent_index[i])

Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Deep Residual Learning for Image Recognition


In [37]:
print(sentences[0])
print("\n\n")
print(sentences[np.argmax(sims[7:])])

The two most commonly used methods to analyze data from real-time, quantitative PCR experiments are absolute quantification and relative quantification



These results are compared to the free volume equation of state and to a four‐term virial coefficient expansion


In [38]:
print(sent_index[0])
print(sent_index[np.argmax(sims[7:])])

Analysis of Relative Gene Expression Data Using Real-Time Quantitative PCR and the 2−ΔΔCT Method
Equation of State Calculations by Fast Computing Machines


In [40]:
idx = np.argmax(sims[7:])
print(idx)
print(sims[idx])


249
0.50008583
