In [1]:
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import json
import os

In [3]:
# Small example corpus: every sentence is embedded and compared pairwise
# against all the others further down in the notebook.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell."
]

In [4]:
# df = pd.read_csv('/home/heptagon/Desktop/sentence-similarity/data/information.csv')
# df.head(3)

In [10]:
# if file_extension.endswith(".csv"):   # load csv file & convert to dataframe
#     df = pd.read_csv(filepath)

# if file_extension.endswith(".xlsx"):    # load excel file & convert to dataframe
#     df = pd.read_excel(filepath)

# if file_extension.endswith(".txt"):    # load text file into dataframe
#     df = pd.read_csv(filepath, sep=',', header=None)
    
# if file_extension.endswith(".json"):   # open json file
#     with open(filepath,'r') as file:
#         data = json.load(file)
#     df = pd.DataFrame(data)    # loading into a DataFrame
    


In [11]:
# sentences = []
# for key_str in tqdm(df.Occupation):
#     sentences.append(key_str)


In [12]:
# Sanity check: number of sentences that will be embedded.
len(sentences)

6

In [13]:
# Checkpoint used for both the tokenizer and the encoder; kept in one place
# so the two can never drift apart.
MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize the whole batch in a single call (the tokenizer's `__call__` is the
# recommended API in place of a per-sentence `encode_plus` loop). Every
# sentence is truncated/padded to exactly 128 tokens, so the returned tensors
# are already stacked as (num_sentences, 128).
encoded = tokenizer(
    sentences,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Keep only the two tensors the rest of the notebook uses, as a plain dict
# that can be unpacked straight into the model call.
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

In [14]:
# One row of token ids per sentence, each padded/truncated to max_length.
tokens['input_ids'].shape

torch.Size([6, 128])

In [15]:
# Inference only: disable autograd so the forward pass does not build a
# computation graph (saves memory; the results are never back-propagated).
with torch.no_grad():
    outputs = model(**tokens)

# Show which tensors the model returned.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [16]:
# Per-token hidden states from the final encoder layer; these are mean-pooled
# into one vector per sentence in the cells that follow.
embeddings = outputs['last_hidden_state']
embeddings

tensor([[[-6.9229e-02,  6.2300e-01,  3.5371e-02,  ...,  8.0334e-01,
           1.6314e+00,  3.2812e-01],
         [ 3.6730e-02,  6.8419e-01,  1.9460e-01,  ...,  8.4759e-02,
           1.4747e+00, -3.0080e-01],
         [-1.2142e-02,  6.5431e-01, -7.2717e-02,  ..., -3.2600e-02,
           1.7717e+00, -6.8121e-01],
         ...,
         [ 1.9532e-01,  1.1085e+00,  3.3905e-01,  ...,  1.2826e+00,
           1.0114e+00, -7.2754e-02],
         [ 9.0217e-02,  1.0288e+00,  3.2973e-01,  ...,  1.2940e+00,
           9.8650e-01, -1.1125e-01],
         [ 1.2404e-01,  9.7365e-01,  3.9329e-01,  ...,  1.1359e+00,
           8.7685e-01, -1.0435e-01]],

        [[-3.2124e-01,  8.2512e-01,  1.0554e+00,  ..., -1.8555e-01,
           1.5169e-01,  3.9366e-01],
         [-7.1457e-01,  1.0297e+00,  1.1217e+00,  ...,  3.3118e-02,
           2.3820e-01, -1.5632e-01],
         [-2.3522e-01,  1.1353e+00,  8.5941e-01,  ..., -4.3096e-01,
          -2.7242e-02, -2.9677e-01],
         ...,
         [-5.4000e-01,  3

In [17]:
# (num_sentences, sequence_length, hidden_size)
embeddings.shape

torch.Size([6, 128, 768])

In [18]:
# The attention mask produced by the tokenizer; it is used below to weight
# token embeddings during pooling.
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([6, 128])

In [19]:
# Add a trailing axis to the mask and broadcast it to the shape of
# `embeddings`, then cast to float so it can be multiplied element-wise
# with the hidden states.
expanded_mask = attention_mask.unsqueeze(-1).expand(embeddings.shape)
mask = expanded_mask.float()
mask.shape

torch.Size([6, 128, 768])

In [20]:
# Element-wise product with the mask: positions where the mask is 0 are
# zeroed out and therefore contribute nothing to the sum taken next.
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([6, 128, 768])

In [21]:
# Collapse the token axis (dim 1) by summing the masked token embeddings,
# leaving one vector per sentence.
summed = torch.sum(masked_embeddings, dim=1)
summed.shape

torch.Size([6, 768])

In [22]:
# Sum of the mask along the token axis, i.e. the divisor for the mean.
token_counts = mask.sum(1)
# Clamp to a tiny positive floor so the division in the next cell can never
# divide by zero.
summed_mask = torch.clamp(token_counts, min=1e-9)
summed_mask.shape

torch.Size([6, 768])

In [23]:
# Mean pooling: masked sum divided by the (clamped) mask sum gives one
# embedding vector per sentence.
mean_pooled = summed / summed_mask
mean_pooled

tensor([[ 0.0745,  0.8637,  0.1795,  ...,  0.7734,  1.7247, -0.1803],
        [-0.3715,  0.9729,  1.0840,  ..., -0.2552, -0.2759,  0.0358],
        [-0.5030,  0.7950, -0.1240,  ...,  0.1441,  0.9704, -0.1791],
        [-0.0132,  0.9773,  1.4516,  ..., -0.8462, -1.4004, -0.4118],
        [-0.2019,  0.0597,  0.8603,  ..., -0.0100,  0.8431, -0.0841],
        [-0.2131,  1.0175, -0.8833,  ...,  0.7371,  0.1947, -0.3011]],
       grad_fn=<DivBackward0>)

In [24]:
# Convert the pooled sentence embeddings from a PyTorch tensor to a NumPy
# array for scikit-learn.
# The value is recomputed from `summed` / `summed_mask` instead of from
# `mean_pooled` itself so this cell is idempotent: re-running it no longer
# fails once `mean_pooled` has already been rebound to an ndarray.
# `.detach()` drops any autograd graph and `.cpu()` is a no-op on CPU tensors
# but keeps the conversion valid if the tensors live on another device.
mean_pooled = (summed / summed_mask).detach().cpu().numpy()

In [25]:
# Inspect the NumPy version of the pooled sentence embeddings.
mean_pooled

array([[ 0.0744615 ,  0.86369663,  0.17946403, ...,  0.77344006,
         1.7247488 , -0.18027496],
       [-0.37146333,  0.9729013 ,  1.0839937 , ..., -0.25521275,
        -0.2759373 ,  0.03575867],
       [-0.50298226,  0.794986  , -0.12402522, ...,  0.14406362,
         0.97037494, -0.17911562],
       [-0.01324306,  0.9772857 ,  1.451594  , ..., -0.846165  ,
        -1.4004318 , -0.41184372],
       [-0.20192645,  0.05970357,  0.8602745 , ..., -0.01000803,
         0.84306246, -0.0840771 ],
       [-0.21311913,  1.0174934 , -0.8832755 , ...,  0.73710376,
         0.19469155, -0.30111268]], dtype=float32)

In [26]:
# # calculate cosine_similarity
# cosine_similarity(
#     [mean_pooled[0]],
#     mean_pooled[1:]
# )

In [27]:
%%time

# Pairwise cosine similarity of every sentence embedding with every other.
# With the second argument omitted, scikit-learn compares the input with
# itself, giving a square (num_sentences, num_sentences) matrix.
result = cosine_similarity(mean_pooled)
print(result)

[[1.         0.33088908 0.72192585 0.17475504 0.44709677 0.5548364 ]
 [0.33088908 0.99999964 0.24826953 0.2923194  0.20174855 0.2950728 ]
 [0.72192585 0.24826953 1.         0.25110355 0.5565801  0.41768277]
 [0.17475504 0.2923194  0.25110355 0.99999994 0.26012164 0.13192454]
 [0.44709677 0.20174855 0.5565801  0.26012164 1.0000002  0.22627155]
 [0.5548364  0.2950728  0.41768277 0.13192454 0.22627155 0.9999998 ]]
CPU times: user 2.22 ms, sys: 333 µs, total: 2.55 ms
Wall time: 2.29 ms


In [37]:
# while True:
# Minimum cosine similarity for two sentences to be treated as similar.
SIMILARITY_THRESHOLD = 0.6

# (i, j) index pairs of the similarity matrix whose score exceeds the
# threshold. Self-matches on the diagonal are included (a sentence's
# similarity with itself is ~1), and a matching pair shows up in both orders.
rows = np.argwhere(result > SIMILARITY_THRESHOLD)
rows

array([[0, 0],
       [0, 2],
       [1, 1],
       [2, 0],
       [2, 2],
       [3, 3],
       [4, 4],
       [5, 5]])

In [38]:
# Group the matching pairs by their first index:
#   {sentence index: [indices of sentences similar to it, itself included]}
# The original special case for i == j appended the same value as the general
# case, so a single append covers both. Indices are converted to plain Python
# ints so the dict prints as {0: [0, 2], ...} regardless of NumPy version.
temp_dict = {}
for row_idx, col_idx in rows:
    temp_dict.setdefault(int(row_idx), []).append(int(col_idx))

print(temp_dict)

# if temp_dict[i[0]] in temp_dict.values():


{0: [0, 2], 1: [1], 2: [0, 2], 3: [3], 4: [4], 5: [5]}


In [30]:
# Same grouping as the index-based version, but storing the sentence text:
#   {sentence index: [texts of the sentences similar to it, itself included]}
# NOTE: this rebinds `temp_dict`, replacing the index-based mapping.
# NOTE(review): `rows` must already be defined when this cell runs; run the
# notebook top to bottom so the output reflects the current threshold.
temp_dict = {}
for row_idx, col_idx in rows:
    # The original i == j branch appended the same sentence as the general
    # case, so one append handles self-matches and cross-matches alike.
    temp_dict.setdefault(int(row_idx), []).append(sentences[int(col_idx)])
print(temp_dict)

{0: ['Three years later, the coffin was still full of Jello.', 'The person box was packed with jelly many dozens of months later.', 'He found a leprechaun in his walnut shell.'], 1: ['The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.'], 2: ['Three years later, the coffin was still full of Jello.', 'The person box was packed with jelly many dozens of months later.', 'It took him a month to finish the meal.'], 3: ["Standing on one's head at job interviews forms a lasting impression."], 4: ['The person box was packed with jelly many dozens of months later.', 'It took him a month to finish the meal.'], 5: ['Three years later, the coffin was still full of Jello.', 'He found a leprechaun in his walnut shell.']}


In [31]:
# import csv

# import pandas as pd
# import numpy as np
# from scipy import spatial
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')


# def get_embeddings(filename):
#     with open(filename) as csv_file:
#         # read the csv file
#         csv_reader = csv.reader(csv_file)

#     # now we can use this csv files into the pandas
#     df = pd.DataFrame([csv_reader], index=None)

#     df_embedding = df.assign(embeddings=df['Lyric'].apply(
#         lambda x: model.encode(str(x))))
#     print(df_embedding)
#     return df_embedding


# def get_similarity_score(inp, filename):
#     data = get_embeddings(filename)
#     inp_vector = model.encode(inp)
#     s = data['embeddings'].apply(
#         lambda x: 1 - spatial.distance.cosine(x, inp_vector))
#     data = data.assign(similarity=s)
#     return (data.sort_values('similarity', ascending=False))


# if __name__ == '__main__':

#     filename = 'lyrics.csv'     # csv file name

#     print(get_similarity_score('thinking about you', filename))
