# BERT Transformer - Sentence Similarity

In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# pandas settings: never truncate columns when a frame is displayed
pd.options.display.max_columns = None

In [2]:
# loading data set from the CSV that sits next to the notebook
csv_path = 'mother_jones_mass_shootings_1982-2023.csv'
df_cases = pd.read_csv(csv_path)

# previewing the first rows
df_cases.head()

Unnamed: 0,case,location,date,summary,fatalities,injured,total_victims,location.1,age_of_shooter,prior_signs_mental_health_issues,mental_health_details,weapons_obtained_legally,where_obtained,weapon_type,weapon_details,race,gender,sources,mental_health_sources,sources_additional_age,latitude,longitude,type,year
0,Michigan State University shooting,"East Lansing, Michigan",2/13/2023,"Anthony D. McRae, 43, opened fire at Berkey Ha...",3,5,8,School,43,-,-,yes,-,semiautomatic handguns,-,Black,M,https://www.cnn.com/us/live-news/michigan-stat...,-,-,-,-,Mass,2023
1,Half Moon Bay spree shooting,"Half Moon Bay, California",1/23/2023,"Chunli Zhao, 67, suspected of carrying out the...",7,1,8,workplace,67,-,-,-,-,semiautomatic handgun,-,Asian,M,https://www.cnn.com/us/live-news/half-moon-bay...,-,-,-,-,Spree,2023
2,LA dance studio mass shooting,"Monterey Park, California",1/21/2023,"Huu Can Tran, 72, fled the scene in a white va...",11,10,21,Other,72,yes,"According to the LA Times, ""Two law enforcemen...",-,-,semiautomatic assault weapon (Details pending),-,Asian,M,https://www.latimes.com/california/story/2023-...,https://www.latimes.com/california/story/2023-...,-,-,-,Mass,2023
3,Virginia Walmart shooting,"Chesapeake, Virginia",11/22/2022,"Andre Bing, 31, who worked as a supervisor at ...",6,6,12,Workplace,31,-,-,-,-,semiautomatic handgun,-,Black,M,https://www.washingtonpost.com/dc-md-va/2022/1...,-,-,-,-,Mass,2022
4,LGBTQ club shooting,"Colorado Springs, Colorado",11/19/2022,"Anderson L. Aldrich, 22, wore body armor and o...",5,25,30,Other,22,yes,Aldrich reportedly had a history of menacing b...,-,-,semiautomatic rifle; semiautomatic handgun,-,White,M,https://coloradosun.com/2022/11/20/club-q-shoo...,https://www.nytimes.com/2022/11/20/us/colorado...,-,-,-,Mass,2022


In [3]:
# keeping only the first four columns; .copy() makes the subset independent
# of df_cases so later changes do not touch the raw frame
df_cases_summary = df_cases.iloc[:, :4].copy()

In [4]:
# column types of the summary frame (inspection only, nothing is modified)
df_cases_summary.dtypes

case        object
location    object
date        object
summary     object
dtype: object

In [5]:
# converting dtypes: mapping of column name -> target dtype
dict_convert = dict(date='datetime64[ns]')

# astype returns a new frame; only the columns named in the mapping are cast
df_cases_summary = df_cases_summary.astype(dtype=dict_convert)
print(df_cases_summary.dtypes)

case                object
location            object
date        datetime64[ns]
summary             object
dtype: object


In [6]:
# collecting every case summary into a plain Python list, in row order
list_cases_summary = list(df_cases_summary['summary'])

# previewing the first five summaries
print(list_cases_summary[:5])

['Anthony D. McRae, 43, opened fire at Berkey Hall and the MSU union, according to local police. Following an intense manhunt in the area, he was found dead from a self-inflicted gunshot wound, police said.', 'Chunli Zhao, 67, suspected of carrying out the attacks at a mushroom farm and near a trucking facility, was apprehended by police. Zhao reportedly worked at the mushroom farm.', 'Huu Can Tran, 72,\xa0fled the scene in a white van and later shot himself to death as police closed in.', 'Andre Bing, 31, who worked as a supervisor at the store, opened fire on co-workers and then fatally shot himself, according to local authorities.', 'Anderson L. Aldrich, 22, wore body armor and opened fire upon entering the club as a dance party was underway; he was subdued by unarmed patrons who tackled him amid the carnage and held him down until police arrived.']


In [7]:
# checkpoint name shared by the tokenizer and the model so the two always match
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

# loading the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# loading the pre-trained model
model = AutoModel.from_pretrained(model_name)

In [8]:
# tokenizing every case summary in one batched call (replaces the deprecated
# per-sentence `encode_plus` loop and the manual torch.stack); each summary is
# truncated / padded to exactly 100 tokens, so every returned tensor already
# has one row per summary and 100 columns
encoded = tokenizer(
    list_cases_summary,
    max_length=100,
    truncation=True,
    padding='max_length',
    return_tensors='pt')

# keeping only the two inputs that are passed on to the model, under the same
# keys the rest of the notebook reads
tokens = {'input_ids': encoded['input_ids'],
          'attention_mask': encoded['attention_mask']}

In [9]:
# running tokens through model; this is inference only, so gradient tracking
# is switched off to avoid building (and keeping in memory) an autograd graph
with torch.no_grad():
    model_outputs = model(**tokens)

# showing which output container the model returned
type(model_outputs)

transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

In [10]:
# retrieving last hidden state from the model output
text_embeddings = model_outputs.last_hidden_state
text_embeddings

tensor([[[ 0.0626,  0.2279, -0.0783,  ..., -0.0746, -0.3474, -0.5830],
         [-0.1216,  0.3103, -0.5912,  ..., -0.6785, -0.5178, -1.5699],
         [-0.9428,  0.3043,  0.3378,  ..., -0.2616, -0.4112, -0.2871],
         ...,
         [-0.1356,  0.1915, -0.1093,  ..., -0.1365, -0.1776, -0.1400],
         [-0.1325,  0.1970, -0.0926,  ..., -0.1461, -0.1710, -0.1646],
         [-0.1018,  0.1579, -0.0858,  ..., -0.1707, -0.1438, -0.1899]],

        [[ 0.2279,  0.2430,  0.0702,  ..., -0.2046, -0.2835, -0.1278],
         [ 0.2710, -0.1799, -0.0650,  ...,  0.1422, -2.0253, -0.8202],
         [-0.2051, -0.4086, -0.2413,  ..., -0.1062,  0.0648,  0.5318],
         ...,
         [-0.2075, -0.1459,  0.2435,  ...,  0.0043, -0.0589,  0.2288],
         [-0.1920, -0.1383,  0.2663,  ..., -0.0045, -0.0507,  0.2099],
         [-0.1567, -0.1624,  0.2872,  ..., -0.0299, -0.0425,  0.1851]],

        [[ 0.0305,  0.1763, -0.3852,  ...,  0.4632, -0.4403, -0.0806],
         [-0.4423,  0.4096,  0.0305,  ...,  0

In [11]:
# displaying shape of the last hidden state
text_embeddings.shape

torch.Size([140, 100, 384])

In [12]:
# assigning attention mask produced during tokenization and checking its shape
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([140, 100])

In [13]:
# resizing attention mask: add a trailing axis, broadcast it to the shape of
# the embeddings and cast to float so the two can be multiplied element-wise
mask = attention_mask[..., None].expand_as(text_embeddings).float()
mask.shape

torch.Size([140, 100, 384])

In [14]:
# inspecting the expanded mask (presumably 1 = real token, 0 = padding added
# by the tokenizer -- confirm against the tokenizer settings)
print(mask)

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [15]:
# applying the attention mask: element-wise product of embeddings and mask,
# which zeroes every position where the mask is 0
masked_embeddings = torch.mul(text_embeddings, mask)
masked_embeddings.shape

torch.Size([140, 100, 384])

In [16]:
# inspecting the embeddings after the mask has been applied
print(masked_embeddings)

tensor([[[ 0.0626,  0.2279, -0.0783,  ..., -0.0746, -0.3474, -0.5830],
         [-0.1216,  0.3103, -0.5912,  ..., -0.6785, -0.5178, -1.5699],
         [-0.9428,  0.3043,  0.3378,  ..., -0.2616, -0.4112, -0.2871],
         ...,
         [-0.0000,  0.0000, -0.0000,  ..., -0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000, -0.0000,  ..., -0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000, -0.0000,  ..., -0.0000, -0.0000, -0.0000]],

        [[ 0.2279,  0.2430,  0.0702,  ..., -0.2046, -0.2835, -0.1278],
         [ 0.2710, -0.1799, -0.0650,  ...,  0.1422, -2.0253, -0.8202],
         [-0.2051, -0.4086, -0.2413,  ..., -0.1062,  0.0648,  0.5318],
         ...,
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000]],

        [[ 0.0305,  0.1763, -0.3852,  ...,  0.4632, -0.4403, -0.0806],
         [-0.4423,  0.4096,  0.0305,  ...,  0

In [17]:
# summing the masked embeddings along axis 1, collapsing that axis
summed_embeddings = masked_embeddings.sum(dim=1)
summed_embeddings.shape

torch.Size([140, 384])

In [18]:
# counting the positions that receive attention (mask summed along axis 1);
# the lower bound of 1e-9 keeps the later division from dividing by zero
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.shape

torch.Size([140, 384])

In [19]:
# inspecting the per-row mask totals used as the divisor for mean pooling
print(summed_mask)

tensor([[53., 53., 53.,  ..., 53., 53., 53.],
        [41., 41., 41.,  ..., 41., 41., 41.],
        [27., 27., 27.,  ..., 27., 27., 27.],
        ...,
        [27., 27., 27.,  ..., 27., 27., 27.],
        [31., 31., 31.,  ..., 31., 31., 31.],
        [32., 32., 32.,  ..., 32., 32., 32.]])


In [20]:
# calculating dense vectors by mean pooling:
# summed embeddings divided by the number of values given attention
mean_pooled = torch.div(summed_embeddings, summed_mask)
print(mean_pooled)

tensor([[ 0.0747,  0.1777, -0.1114,  ..., -0.1926, -0.1617, -0.1582],
        [-0.0963,  0.1076,  0.0373,  ..., -0.0981, -0.1569,  0.1062],
        [ 0.0174,  0.3368, -0.4481,  ...,  0.3432, -0.1583,  0.0934],
        ...,
        [ 0.0372,  0.1361, -0.1849,  ...,  0.0863,  0.0880, -0.0638],
        [ 0.2252,  0.1714, -0.2246,  ..., -0.2780, -0.0119, -0.1841],
        [-0.0882,  0.2136, -0.4408,  ...,  0.0027, -0.3168,  0.0586]],
       grad_fn=<DivBackward0>)


In [21]:
# storing case names as a list; read from df_cases_summary, the frame the
# summaries were taken from, so names and vectors are guaranteed to share the
# same row order
list_cases = df_cases_summary['case'].tolist()

# converting mean pooled to NumPy array; torch.as_tensor accepts a tensor as
# well as an ndarray, so re-running this cell after the conversion still works
# instead of failing on `.detach()`
mean_pooled = torch.as_tensor(mean_pooled).detach().numpy()

In [22]:
# creating function to return case names and vector combinations
def list_combinations(x, y):
    """Return all 2-element combinations of `x` and of `y`.

    Parameters
    ----------
    x, y : iterable
        Items to pair up (here: case names and their vectors).

    Returns
    -------
    tuple of (list, list)
        The pairs built from `x` and the pairs built from `y`, each in the
        order produced by `itertools.combinations`. When `x` and `y` have the
        same length, the i-th pair of one lines up with the i-th pair of the
        other.
    """
    pairs_x, pairs_y = (list(combinations(items, 2)) for items in (x, y))
    return pairs_x, pairs_y

In [23]:
# assigning case and vector combinations (pairs of case names and pairs of
# mean-pooled vectors, both built by list_combinations)
case_combinations, vector_combinations = list_combinations(list_cases, mean_pooled)
# showing the first name pair next to the first vector pair
print(case_combinations[0], vector_combinations[0])

('Michigan State University shooting', 'Half Moon Bay spree shooting') (array([ 7.47392774e-02,  1.77651465e-01, -1.11362457e-01,  1.57698557e-01,
        1.38210133e-01,  6.33460656e-02,  4.22190391e-02, -1.34048029e-03,
       -1.81915611e-01,  2.45709777e-01,  1.29961437e-02,  2.10031923e-02,
       -9.02394727e-02,  2.20664442e-01,  8.66822302e-02, -3.42523269e-02,
       -2.28874326e-01,  7.44825825e-02, -1.11496069e-01, -8.14281404e-02,
       -9.95013397e-03,  2.23477870e-01, -1.58526659e-01, -6.24246523e-02,
        1.56078739e-02, -1.87496990e-02,  5.63595220e-02, -1.09594157e-02,
       -1.50498509e-01, -7.14748427e-02,  8.75643268e-02, -2.81634629e-01,
       -7.87040964e-02,  4.19440754e-02,  2.86191314e-01,  1.93254054e-02,
        1.83150053e-01,  2.71915048e-01, -9.94178578e-02, -4.07547243e-02,
       -2.13996351e-01,  1.58732146e-01,  3.07464480e-01,  2.29178712e-01,
       -1.07297339e-01,  4.13783379e-02, -1.68944821e-01, -1.50620982e-01,
        1.86852425e-01,  1.9

In [27]:
# creating function to return dataframe with cases and similarity
def text_cosine_similarity(text, vectors):
    """Build a dataframe of case pairs and their cosine similarity.

    Parameters
    ----------
    text : iterable
        Items whose first two elements are the names of the compared cases.
    vectors : iterable
        Items whose first two elements are the vectors of those cases; the
        i-th item belongs to the i-th item of `text`.

    Returns
    -------
    pandas.DataFrame
        Columns `case_1`, `case_2` and `similarity_score` (rounded to four
        decimals), one row per (text, vectors) pair yielded by `zip`.
    """
    first_cases, second_cases, scores = [], [], []

    # walking the name pairs and vector pairs in lockstep
    for names, pair in zip(text, vectors):
        first_cases.append(names[0])
        second_cases.append(names[1])

        # cosine_similarity works on 2-D inputs and returns a matrix, so each
        # vector is wrapped in a list and the single cell [0][0] is read back
        score = cosine_similarity([pair[0]], [pair[1]])[0][0]
        scores.append(round(score, 4))

    # assembling the columns into the resulting dataframe
    return pd.DataFrame({'case_1': first_cases,
                         'case_2': second_cases,
                         'similarity_score': scores})

In [28]:
# scoring every pair of cases; the result gets its own name so the raw
# `df_cases` frame loaded at the top of the notebook is not overwritten and
# earlier cells that read it can still be re-run
df_similarity = text_cosine_similarity(case_combinations, vector_combinations)

# retrieving top 10 most similar cases
df_top = df_similarity.nlargest(10, 'similarity_score')
df_top

Unnamed: 0,case_1,case_2,similarity_score
5928,Florida awning manufacturer shooting,Accent Signage Systems shooting,0.7191
4096,Mercy Hospital shooting,Fort Hood shooting 2,0.6991
3101,Pensacola Naval base shooting,Chattanooga military recruitment center,0.6913
3129,Pensacola Naval base shooting,Northern Illinois University shooting,0.6691
2447,Orange office complex shooting,Caltrans maintenance yard shooting,0.6613
7952,Accent Signage Systems shooting,Standard Gravure shooting,0.661
527,Virginia Walmart shooting,Caltrans maintenance yard shooting,0.6573
4062,Mercy Hospital shooting,T&T Trucking shooting,0.656
5942,Florida awning manufacturer shooting,Atlantis Plastics shooting,0.6545
8739,Northern Illinois University shooting,Welding shop shooting,0.6532


In [29]:
# exporting top 10 most similar cases to a CSV in the working directory
# (`index` is left at its default, so the frame's index is written as well)
df_top.to_csv('top_similar_cases.csv')