In [1]:
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import json
import os

In [3]:
# sentences = [
#     "Three years later, the coffin was still full of Jello.",
#     "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
#     "The person box was packed with jelly many dozens of months later.",
#     "Standing on one's head at job interviews forms a lasting impression.",
#     "It took him a month to finish the meal.",
#     "He found a leprechaun in his walnut shell."
# ]

In [4]:
# Load the occupation table that the rest of the notebook works on and preview it.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# on the original author's machine as written.
df = pd.read_csv('/home/heptagon/Desktop/sentence-similarity/information.csv')
df.head(3)

Unnamed: 0,Employed by this Industry,Code,Occupation,Projected Growth (2021-2031),Projected Job Openings (2021-2031)
0,89%,27-3011.00,Broadcast Announcers and Radio Disc Jockeys,Decline,2100
1,85%,39-3021.00,Motion Picture Projectionists,Much faster than average,700
2,80%,27-3023.00,"News Analysts, Reporters, and Journalists",Decline,4900


In [5]:
# Occupation titles, in row order, as a plain list (progress bar kept via tqdm).
sentences = list(tqdm(df.Occupation))


100%|███████████████████████████████████████| 66/66 [00:00<00:00, 368607.28it/s]


In [6]:
# if file_extension.endswith(".csv"):   # load csv file & convert to dataframe
#     df = pd.read_csv(filepath)

# if file_extension.endswith(".xlsx"):    # load excel file & convert to dataframe
#     df = pd.read_excel(filepath)

# if file_extension.endswith(".txt"):    # load text file into dataframe
#     df = pd.read_csv(filepath, sep=',', header=None)
    
# if file_extension.endswith(".json"):   # open json file
#     with open(filepath,'r') as file:
#         data = json.load(file)
#     df = pd.DataFrame(data)    # loading into a DataFrame
    


In [7]:
# Sanity check: number of occupation titles collected from the DataFrame.
len(sentences)

66

In [8]:
# Pretrained checkpoint used for both the tokenizer and the encoder.
MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize every sentence in a single batched call instead of looping over the
# legacy `encode_plus` API and stacking the per-sentence tensors by hand.
# Each sentence is truncated/padded to exactly 128 tokens, so the resulting
# tensors have shape (len(sentences), 128).
encoded = tokenizer(
    sentences,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Keep only the two entries the rest of the notebook consumes, as a plain dict
# that can be unpacked straight into the model call.
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

In [9]:
# Sanity check: one row of token ids per sentence, each padded to max_length=128.
tokens['input_ids'].shape

torch.Size([66, 128])

In [10]:
# Forward pass for inference only. Disabling autograd avoids building and
# retaining the computation graph for every activation, which is pure memory
# overhead here because nothing is ever back-propagated.
with torch.no_grad():
    outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [11]:
# Token-level hidden states from the model output; these are mean-pooled into
# one vector per sentence in the following cells.
# NOTE(review): displaying the full tensor produces a very large output.
embeddings = outputs.last_hidden_state
embeddings

tensor([[[ 0.0608,  0.2513,  0.7702,  ...,  0.9841, -0.1151, -0.6919],
         [ 0.9133,  0.1304,  1.0075,  ...,  0.7064,  0.2474, -0.8052],
         [ 0.3163,  0.4403,  0.6640,  ...,  0.6056, -0.0948, -0.6583],
         ...,
         [ 0.2646,  0.0861,  0.6944,  ...,  0.8370, -0.0601, -0.4292],
         [ 0.2012,  0.1064,  0.6938,  ...,  0.9053, -0.0914, -0.4583],
         [ 0.1990,  0.1777,  0.8084,  ...,  0.8776, -0.0473, -0.4129]],

        [[-0.0499, -0.0384,  1.8612,  ..., -0.0534,  0.2067, -0.2082],
         [ 0.6296,  0.1262,  1.4102,  ..., -0.1588,  0.1785,  0.3420],
         [ 1.0020, -0.1525,  1.3424,  ..., -0.2481,  0.1890,  0.1235],
         ...,
         [ 0.1118, -0.2418,  1.4215,  ...,  0.0388,  0.2120,  0.1326],
         [ 0.1635, -0.1824,  1.4771,  ...,  0.0860,  0.1856,  0.0857],
         [ 0.0577, -0.2924,  1.4674,  ...,  0.0199,  0.0755,  0.2070]],

        [[ 0.0287,  0.5112,  0.6561,  ..., -0.0769,  0.5289, -0.7474],
         [ 0.6630, -0.0700,  0.7891,  ..., -0

In [12]:
# Sanity check: (sentences, tokens per sentence, hidden size).
embeddings.shape

torch.Size([66, 128, 768])

In [13]:
# Attention mask produced by the tokenizer; used below to exclude padding
# positions from the mean-pooling step.
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([66, 128])

In [14]:
# Add a trailing axis to the attention mask and broadcast it to the shape of
# the embeddings, as floats, so the two can be multiplied element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.shape

torch.Size([66, 128, 768])

In [15]:
# Zero out the hidden states at masked (padding) positions.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.shape

torch.Size([66, 128, 768])

In [16]:
# Sum the surviving token vectors along the token axis (dim 1).
summed = masked_embeddings.sum(dim=1)
summed.shape

torch.Size([66, 768])

In [17]:
# Count of unmasked tokens per sentence (repeated across the hidden axis),
# floored at 1e-9 so the division in the next step can never hit zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.shape

torch.Size([66, 768])

In [18]:
# Mean pooling: summed token vectors divided by the number of real tokens.
mean_pooled = torch.div(summed, summed_mask)
mean_pooled

tensor([[ 0.3616,  0.0936,  0.9224,  ...,  0.9126, -0.0742, -0.7194],
        [ 0.5937, -0.0762,  1.6927,  ..., -0.0575,  0.0362, -0.0171],
        [ 0.4736,  0.3709,  0.6660,  ..., -0.1013,  0.5430, -1.0005],
        ...,
        [ 0.0290,  0.0665,  1.3636,  ..., -0.7722, -0.9682,  0.0473],
        [ 0.6357,  0.0324,  1.8577,  ..., -0.5765, -0.7784, -0.7426],
        [ 0.2514,  0.7104,  1.6318,  ..., -0.3017, -0.4447,  0.1687]],
       grad_fn=<DivBackward0>)

In [19]:
# Convert from PyTorch tensor to numpy array.
# Guarded so the cell can be re-run: after the first run `mean_pooled` is
# already an ndarray, which has no `.detach()`. `.cpu()` is a no-op for CPU
# tensors and makes the conversion work if the tensor lives on another device.
if torch.is_tensor(mean_pooled):
    mean_pooled = mean_pooled.detach().cpu().numpy()

In [20]:
# Inspect the pooled sentence embeddings after conversion to a numpy array.
mean_pooled

array([[ 0.36157602,  0.09358482,  0.92236423, ...,  0.91257954,
        -0.07423987, -0.7193927 ],
       [ 0.593666  , -0.07615194,  1.6927055 , ..., -0.05754603,
         0.03617936, -0.01708179],
       [ 0.47358528,  0.3708792 ,  0.66595584, ..., -0.1012688 ,
         0.54300445, -1.0005151 ],
       ...,
       [ 0.02898995,  0.06645408,  1.3635979 , ..., -0.7722109 ,
        -0.9681676 ,  0.04727154],
       [ 0.63572997,  0.0324267 ,  1.8576798 , ..., -0.576498  ,
        -0.7783937 , -0.7426126 ],
       [ 0.25139147,  0.7104294 ,  1.6318375 , ..., -0.30172223,
        -0.44473323,  0.16868499]], dtype=float32)

In [21]:
# # calculate cosine_similarity
# cosine_similarity(
#     [mean_pooled[0]],
#     mean_pooled[1:]
# )

In [22]:
%%time

# All-pairs cosine similarity between sentence embeddings; with a single
# argument scikit-learn compares the rows of the matrix against themselves.
result = cosine_similarity(mean_pooled)
print(result)

[[0.9999995  0.40466464 0.5272178  ... 0.36013797 0.38598788 0.36719847]
 [0.40466464 1.0000001  0.36913183 ... 0.37598765 0.49970976 0.51804054]
 [0.5272178  0.36913183 1.0000002  ... 0.400357   0.43337142 0.5019165 ]
 ...
 [0.36013797 0.37598765 0.400357   ... 1.0000002  0.49113446 0.6049912 ]
 [0.38598788 0.49970976 0.43337142 ... 0.49113446 0.9999999  0.57435197]
 [0.36719847 0.51804054 0.5019165  ... 0.6049912  0.57435197 1.0000002 ]]
CPU times: user 2 µs, sys: 4 ms, total: 4.01 ms
Wall time: 4.71 ms


In [23]:
# Index pairs (i, j) whose cosine similarity exceeds 0.6. The matrix compares
# the embeddings with themselves, so each sentence's self-pair (i, i) is
# included as well.
above_threshold = result > 0.6
rows = np.argwhere(above_threshold)
rows

array([[ 0,  0],
       [ 0,  3],
       [ 0, 21],
       ...,
       [65, 62],
       [65, 63],
       [65, 65]])

In [24]:
# Group the above-threshold pairs by their first index:
# {sentence index: [indices of similar sentences, itself included]}.
temp_dict = {}
for src_idx, match_idx in rows:
    temp_dict.setdefault(src_idx, []).append(match_idx)

print(temp_dict)

# if temp_dict[i[0]] in temp_dict.values():


{0: [0, 3, 21, 30], 1: [1, 6, 12, 16, 19, 25], 2: [2, 3, 9], 3: [0, 2, 3, 5, 6, 7, 8, 9, 11, 13, 15, 16, 17, 19, 20, 21, 24, 25, 26, 30, 33, 35, 37, 39, 41, 42, 50, 59, 62, 64], 4: [4, 5, 21, 29, 33, 38, 41, 45], 5: [3, 4, 5, 7, 9, 13, 21, 24, 25, 28, 30, 32, 33, 37, 39, 41, 42, 46, 56, 59, 62, 63], 6: [1, 3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 19, 25, 26, 35, 43, 47, 62, 64, 65], 7: [3, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 19, 20, 22, 24, 25, 27, 32, 33, 34, 35, 36, 37, 39, 42, 43, 44, 46, 47, 50, 51, 53, 54, 55, 56, 58, 59, 62, 64, 65], 8: [3, 6, 7, 8, 9, 10, 11, 15, 19, 25, 27, 35, 37, 39, 43, 47, 50, 54, 55, 62, 64, 65], 9: [2, 3, 5, 6, 7, 8, 9, 10, 11, 15, 17, 20, 21, 24, 25, 33, 35, 36, 37, 39, 42, 43, 46, 47, 50, 51, 55, 56, 58, 59, 62, 64, 65], 10: [7, 8, 9, 10, 35, 38, 43, 47, 51, 54, 55, 56], 11: [3, 6, 7, 8, 9, 11, 12, 15, 19, 35, 36, 42, 47, 48, 58, 62, 65], 12: [1, 6, 7, 11, 12, 16, 23, 25], 13: [3, 5, 6, 13, 21, 25, 26, 28, 30, 33, 62], 14: [14, 60], 15: [3, 6, 7, 8, 9, 11, 15,

In [25]:
# Same grouping as the index version, but storing the matched sentence text:
# {sentence index: [similar sentences, the sentence itself included]}.
temp_dict = {}
for src_idx, match_idx in rows:
    temp_dict.setdefault(src_idx, []).append(sentences[match_idx])
print(temp_dict)

{0: ['Broadcast Announcers and Radio Disc Jockeys', 'Broadcast Technicians', 'Radio, Cellular, and Tower Equipment Installers and Repairers', 'Radio Frequency Identification Device Specialists'], 1: ['Motion Picture Projectionists', 'Producers and Directors', 'Film and Video Editors', 'Camera Operators, Television, Video, and Film', 'Actors', 'Audio and Video Technicians'], 2: ['News Analysts, Reporters, and Journalists', 'Broadcast Technicians', 'Media Technical Directors/Managers'], 3: ['Broadcast Announcers and Radio Disc Jockeys', 'News Analysts, Reporters, and Journalists', 'Broadcast Technicians', 'Telecommunications Line Installers and Repairers', 'Producers and Directors', 'Media Programming Directors', 'Talent Directors', 'Media Technical Directors/Managers', 'Editors', 'Sound Engineering Technicians', 'Costume Attendants', 'Camera Operators, Television, Video, and Film', 'Media and Communication Equipment Workers, All Other', 'Actors', 'Media and Communication Workers, All Ot

In [30]:
# Inspect the grouped lists of similar sentences (one list per dictionary key).
temp_dict.values()

dict_values([['Broadcast Announcers and Radio Disc Jockeys', 'Broadcast Technicians', 'Radio, Cellular, and Tower Equipment Installers and Repairers', 'Radio Frequency Identification Device Specialists'], ['Motion Picture Projectionists', 'Producers and Directors', 'Film and Video Editors', 'Camera Operators, Television, Video, and Film', 'Actors', 'Audio and Video Technicians'], ['News Analysts, Reporters, and Journalists', 'Broadcast Technicians', 'Media Technical Directors/Managers'], ['Broadcast Announcers and Radio Disc Jockeys', 'News Analysts, Reporters, and Journalists', 'Broadcast Technicians', 'Telecommunications Line Installers and Repairers', 'Producers and Directors', 'Media Programming Directors', 'Talent Directors', 'Media Technical Directors/Managers', 'Editors', 'Sound Engineering Technicians', 'Costume Attendants', 'Camera Operators, Television, Video, and Film', 'Media and Communication Equipment Workers, All Other', 'Actors', 'Media and Communication Workers, All Ot

In [32]:
# Attach each row's list of similar occupations. `temp_dict` is keyed by the
# position of the sentence in `sentences`, which was built from `df.Occupation`
# in row order, so look each row up by position rather than relying on the
# dictionary's insertion order. A row with no entry gets an empty list instead
# of causing a length-mismatch error.
df['Similar Text'] = [temp_dict.get(pos, []) for pos in range(len(df))]

In [33]:
# Inspect the table with the new 'Similar Text' column attached.
df

Unnamed: 0,Employed by this Industry,Code,Occupation,Projected Growth (2021-2031),Projected Job Openings (2021-2031),Similar Text
0,89%,27-3011.00,Broadcast Announcers and Radio Disc Jockeys,Decline,2100,"[Broadcast Announcers and Radio Disc Jockeys, ..."
1,85%,39-3021.00,Motion Picture Projectionists,Much faster than average,700,"[Motion Picture Projectionists, Producers and ..."
2,80%,27-3023.00,"News Analysts, Reporters, and Journalists",Decline,4900,"[News Analysts, Reporters, and Journalists, Br..."
3,73%,27-4012.00,Broadcast Technicians,Decline,2800,"[Broadcast Announcers and Radio Disc Jockeys, ..."
4,68%,49-2022.00,Telecommunications Equipment Installers and Re...,Faster than average,22500,[Telecommunications Equipment Installers and R...
...,...,...,...,...,...,...
61,10%,15-1251.00,Computer Programmers,Decline,9600,"[Desktop Publishers, Web and Digital Interface..."
62,10%,27-3042.00,Technical Writers,Average,5400,"[Broadcast Technicians, Telecommunications Lin..."
63,10%,17-2061.00,Computer Hardware Engineers,Average,5300,[Telecommunications Line Installers and Repair...
64,10%,27-1027.00,Set and Exhibit Designers,Average,2700,"[Broadcast Technicians, Producers and Director..."


In [34]:
# Persist the enriched table. `index=False` stops pandas from writing the row
# index as an extra unnamed column; the input file already appears to carry an
# 'Unnamed: 0' column from such a round trip, and each further save/load would
# add another one.
# NOTE(review): absolute, machine-specific output path.
df.to_csv('/home/heptagon/Desktop/sentence-similarity/information_similar_text.csv', index=False)

In [26]:
# import csv

# import pandas as pd
# import numpy as np
# from scipy import spatial
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')


# def get_embeddings(filename):
#     with open(filename) as csv_file:
#         # read the csv file
#         csv_reader = csv.reader(csv_file)

#     # load the rows read from the CSV file into a pandas DataFrame
#     df = pd.DataFrame([csv_reader], index=None)

#     df_embedding = df.assign(embeddings=df['Lyric'].apply(
#         lambda x: model.encode(str(x))))
#     print(df_embedding)
#     return df_embedding


# def get_similarity_score(inp, filename):
#     data = get_embeddings(filename)
#     inp_vector = model.encode(inp)
#     s = data['embeddings'].apply(
#         lambda x: 1 - spatial.distance.cosine(x, inp_vector))
#     data = data.assign(similarity=s)
#     return (data.sort_values('similarity', ascending=False))


# if __name__ == '__main__':

#     filename = 'lyrics.csv'     # csv file name

#     print(get_similarity_score('thinking about you', filename))
