In [1]:
import pandas as pd
import os
from glob import glob
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: NVIDIA A100-PCIE-40GB


In [7]:
path = '../coded_speeches/'
files = sorted(glob(path+'*_ak.xlsx'))
print(files)


['../coded_speeches/108_ak.xlsx', '../coded_speeches/10_ak.xlsx', '../coded_speeches/137_ak.xlsx', '../coded_speeches/138_ak.xlsx', '../coded_speeches/139_ak.xlsx', '../coded_speeches/140_ak.xlsx', '../coded_speeches/142_ak.xlsx', '../coded_speeches/143_ak.xlsx', '../coded_speeches/144_ak.xlsx', '../coded_speeches/145_ak.xlsx', '../coded_speeches/14_ak.xlsx', '../coded_speeches/150_ak.xlsx', '../coded_speeches/152_ak.xlsx', '../coded_speeches/153_ak.xlsx', '../coded_speeches/154_ak.xlsx', '../coded_speeches/155_ak.xlsx', '../coded_speeches/156_ak.xlsx', '../coded_speeches/157_ak.xlsx', '../coded_speeches/158_ak.xlsx', '../coded_speeches/15_ak.xlsx', '../coded_speeches/163_ak.xlsx', '../coded_speeches/164_ak.xlsx', '../coded_speeches/19_ak.xlsx', '../coded_speeches/21_ak.xlsx', '../coded_speeches/23_ak.xlsx', '../coded_speeches/24_ak.xlsx', '../coded_speeches/25_ak.xlsx', '../coded_speeches/27_ak.xlsx', '../coded_speeches/30B_ak.xlsx', '../coded_speeches/31_ak.xlsx', '../coded_speeches/

In [8]:
cols = pd.read_excel(files[1], skiprows=7).drop(columns=['Unnamed: 0']).columns
print(len(cols))
data = pd.DataFrame(columns=[*cols])

temp_values = []
for file in files:
    temp = pd.read_excel(file, skiprows=7).drop(columns=['Unnamed: 0'])
    temp_values.append(temp)
    
new_cols = dict(zip(cols, ["sentence", "Metaphor","Simile","Rhetorical questions", "Stories / anecdotes", "Contrasts", "Lists", "Repetition", "Moral conviction", "Sentiment of the collective", "Setting high expectations", "Confidence in goals"]))
                    # ['sentence', 'metaphor', 'simile', 'question', 'reference', 'figure_of_speech', 'lists', 'repetition', 'personal_statement', 'value_statement', 'explicit_goal', 'believe_statement']))

data = pd.concat(temp_values).rename(columns = new_cols)
data

12


Unnamed: 0,sentence,Metaphor,Simile,Rhetorical questions,Stories / anecdotes,Contrasts,Lists,Repetition,Moral conviction,Sentiment of the collective,Setting high expectations,Confidence in goals
0,"All right, good afternoon everybody, and welco...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Im here with Paul Mounds and Roland Cook Ill i...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Just to give you our briefing.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"First of all, in terms of where we are in the ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Um, another 1200 people tested positive at a 3...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
284,I firmly believe our mission is the same: a sa...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
285,I look forward to working with you all in the ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
286,Thank you all for your work as public servants.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
287,God bless you.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

In [27]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

tokens = {'input_ids': [], 'attention_mask': []}

for sentence in data.sentence: #sentences: #
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence,
                                       max_length=149,
                                       truncation=True,
                                       padding='max_length',
                                       return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])


In [28]:
tokens

{'input_ids': tensor([[  101,  2035,  2157,  ...,     0,     0,     0],
         [  101, 10047,  2182,  ...,     0,     0,     0],
         [  101,  2074,  2000,  ...,     0,     0,     0],
         ...,
         [  101,  4067,  2017,  ...,     0,     0,     0],
         [  101,  2643, 19994,  ...,     0,     0,     0],
         [  101,  1998,  2643,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [29]:
outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [30]:
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-0.7114,  0.1948,  1.7890,  ...,  0.4281,  0.4396,  0.0871],
         [-0.8220,  0.1542,  1.7806,  ...,  0.4716,  0.5846, -0.0493],
         [-0.1947,  0.3117,  1.8465,  ...,  0.3116,  0.4400, -0.0890],
         ...,
         [-0.1679,  0.1908,  1.4436,  ...,  0.3110,  0.7330, -0.2288],
         [-0.4778,  0.3167,  1.6465,  ...,  0.3068,  0.3765,  0.0219],
         [-0.0943,  0.1934,  1.5249,  ...,  0.2464,  0.6616, -0.3687]],

        [[-0.5336,  1.2916,  1.2991,  ..., -0.3011,  0.9787,  0.6496],
         [-0.4193,  1.4947,  0.9722,  ..., -0.3570,  1.0593,  0.4234],
         [-0.5266,  1.2534,  1.1156,  ..., -0.1521,  0.9748,  0.5544],
         ...,
         [ 0.1739,  0.7020,  0.6922,  ..., -0.0747,  0.8065, -0.2749],
         [ 0.0251,  1.0979,  1.0688,  ..., -0.2234,  0.6631, -0.0346],
         [ 0.0693,  1.1019,  1.1264,  ..., -0.2567,  0.6959, -0.0401]],

        [[ 0.0762, -0.1074,  2.3503,  ...,  0.1116, -0.3783,  0.1668],
         [ 0.3385, -0.2229,  2.1474,  ..., -0

In [31]:
embeddings.shape

torch.Size([5925, 149, 768])

In [32]:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([5925, 149])

In [33]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([5925, 149, 768])

In [34]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [35]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([5925, 149, 768])

In [36]:
masked_embeddings

tensor([[[-0.7114,  0.1948,  1.7890,  ...,  0.4281,  0.4396,  0.0871],
         [-0.8220,  0.1542,  1.7806,  ...,  0.4716,  0.5846, -0.0493],
         [-0.1947,  0.3117,  1.8465,  ...,  0.3116,  0.4400, -0.0890],
         ...,
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000]],

        [[-0.5336,  1.2916,  1.2991,  ..., -0.3011,  0.9787,  0.6496],
         [-0.4193,  1.4947,  0.9722,  ..., -0.3570,  1.0593,  0.4234],
         [-0.5266,  1.2534,  1.1156,  ..., -0.1521,  0.9748,  0.5544],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000]],

        [[ 0.0762, -0.1074,  2.3503,  ...,  0.1116, -0.3783,  0.1668],
         [ 0.3385, -0.2229,  2.1474,  ..., -0

In [37]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([5925, 768])

In [38]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([5925, 768])

In [39]:
summed_mask

tensor([[22., 22., 22.,  ..., 22., 22., 22.],
        [16., 16., 16.,  ..., 16., 16., 16.],
        [ 9.,  9.,  9.,  ...,  9.,  9.,  9.],
        ...,
        [12., 12., 12.,  ..., 12., 12., 12.],
        [ 6.,  6.,  6.,  ...,  6.,  6.,  6.],
        [12., 12., 12.,  ..., 12., 12., 12.]])

In [40]:
mean_pooled = summed / summed_mask
mean_pooled

tensor([[-0.4715,  0.2704,  1.7815,  ...,  0.3355,  0.4884, -0.1272],
        [-0.2590,  1.2567,  1.0579,  ..., -0.3197,  0.9744,  0.2253],
        [ 0.1154, -0.1998,  2.2266,  ...,  0.2250, -0.4676, -0.0172],
        ...,
        [ 0.2021,  0.7994,  1.4249,  ...,  0.1985, -0.8231,  0.2869],
        [ 0.3602,  0.3985,  2.6640,  ...,  0.6533, -0.0673,  0.1198],
        [-0.4116,  0.4883,  0.4389,  ..., -0.0656,  0.0272, -0.5090]],
       grad_fn=<DivBackward0>)

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate


AttributeError: 'numpy.ndarray' object has no attribute 'detach'

In [48]:
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

array([0.74474734, 0.7056477 , 0.74258924, 0.832778  , 0.74243563,
       0.757377  ], dtype=float32)

In [63]:
similarities = pd.DataFrame(cosine_similarity(
    mean_pooled
))

In [64]:
similarities

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5915,5916,5917,5918,5919,5920,5921,5922,5923,5924
0,1.000000,0.482315,0.539548,0.270033,0.321339,0.372275,0.209080,0.420433,0.329566,0.399162,...,0.382432,0.265092,0.389649,0.422999,0.525015,0.455656,0.591651,0.501383,0.435362,0.406074
1,0.482315,1.000000,0.380242,0.420791,0.285500,0.378000,0.279478,0.287651,0.335097,0.236579,...,0.109086,0.285455,0.404042,0.293651,0.338417,0.208758,0.309797,0.199518,0.263104,0.283418
2,0.539548,0.380242,1.000000,0.508724,0.034013,0.483690,0.170454,0.597477,0.250988,0.334286,...,0.525182,0.201829,0.476673,0.327965,0.199285,0.310883,0.397520,0.421711,0.532285,0.271947
3,0.270033,0.420791,0.508724,1.000000,0.131673,0.566762,0.260344,0.402828,0.233060,0.413735,...,0.474286,0.183959,0.413786,0.324119,0.179560,0.145713,0.196488,0.145052,0.337623,0.203689
4,0.321339,0.285500,0.034013,0.131673,1.000000,0.385955,0.436009,0.259814,0.485782,0.302233,...,0.046259,0.248319,0.309219,0.340503,0.517919,0.181491,0.373292,0.192808,0.166256,0.249057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5920,0.455656,0.208758,0.310883,0.145713,0.181491,0.313879,0.341948,0.285803,0.147127,0.642333,...,0.558408,0.479228,0.394025,0.313824,0.497622,1.000000,0.517000,0.607723,0.631624,0.720904
5921,0.591651,0.309797,0.397520,0.196488,0.373292,0.382894,0.335793,0.419410,0.176808,0.429384,...,0.307559,0.311661,0.369906,0.392328,0.676277,0.517000,1.000000,0.543064,0.403320,0.394194
5922,0.501383,0.199518,0.421711,0.145052,0.192808,0.371203,0.224236,0.396928,0.148387,0.461251,...,0.492570,0.325035,0.455352,0.465523,0.417250,0.607723,0.543064,1.000000,0.648152,0.524719
5923,0.435362,0.263104,0.532285,0.337623,0.166256,0.358936,0.210689,0.401272,0.287279,0.586112,...,0.659249,0.225752,0.571199,0.257648,0.399903,0.631624,0.403320,0.648152,1.000000,0.580427


In [88]:
similarities[(similarities>0.7) & (similarities!=1.0) & (similarities<0.999999)] #rounding errors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5915,5916,5917,5918,5919,5920,5921,5922,5923,5924
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5920,,,,,,,,,,,...,,,,,,,,,,0.720904
5921,,,,,,,,,,,...,,,,,,,,,,
5922,,,,,,,,,,,...,,,,,,,,,,
5923,,,,,,,,,,,...,,,,,,,,,,
