In [1]:
# Data Structures
import numpy  as np
import pandas as pd

# Model loading and tensor operations
import torch
from transformers import AutoTokenizer, AutoModel

# Keyword extraction
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the cluster-definitions workbook into a DataFrame and preview the first rows
CLUSTERS_WORKBOOK = 'Clusters definitions_vf01.xlsx'
df = pd.read_excel(CLUSTERS_WORKBOOK)
df.head()

Unnamed: 0,index,Criteria,Cluster,Cluster_name,Cluster code,Cluster definition,Description_all
0,1,33,5,Communication,COMMUN,Communication needs for adequate and proper wa...,Communication. Communication is the biggest ch...
1,2,21,8,Project requirements,PROJRE,A project's success or failure depends on the ...,Reporting requirement. Clear software requirem...
2,3,21,17,Project management,PROJMA,Project management performance questions: exte...,Effective leadership. The teams may be formed ...
3,4,19,14,Team relationship,TEAMRE,Teamwork is based on team member relationships...,Team issues. Within the global team context th...
4,5,18,19,Personality dimensions,PERSDI,"Emotional stability (i.e., calm, steady, self-...",Religion and political individual attitudes (p...


In [3]:
# Load the pretrained sentence-embedding checkpoint: `tokenizer` converts text
# into token ids and `model` converts token ids into per-token hidden states.
# AutoTokenizer and AutoModel are imported in the notebook's first cell so that
# all dependencies are declared in one place.
model_name = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [4]:
# Pull out the column holding the free-text definition of each cluster
sentences = df.loc[:, 'Cluster definition']

In [5]:
# Display the selected column to eyeball the cluster definitions
sentences

0     Communication needs for adequate and proper wa...
1     A project's success or failure depends on the ...
2     Project management performance questions: exte...
3     Teamwork is based on team member relationships...
4     Emotional stability (i.e., calm, steady, self-...
5     Organizational commitments. Employees feel com...
6     Specialty ability of the teams. The gap in the...
7     Handling of data describes competence in the h...
8     Requirements change management is a rich commu...
9     Criticality of the task. Criticality is the im...
10    Knowledge interchange rate is a process of exc...
11    Software changes are inevitable due to the dyn...
12    Organization: structure. The organizational st...
13    Architectural design practices are about imple...
14    The stakeholder relationship is associated wit...
15    Social facilities. Inequality manifests in the...
16    Gender segregation at work is widespread; with...
17    A Green or Sustainable Product Life Manage

In [6]:
# Plain Python list of the definitions, iterated over during tokenization
sentences_list = df['Cluster definition'].tolist()

In [7]:
# Display the full list of definitions (untruncated text, unlike the Series preview)
sentences_list

["Communication needs for adequate and proper ways of communication in general. In addition, the reduced communication frequency with the project team members became a problem due to the need for more informal or face-to-face contact. Each culture has its standards, styles, and moral principles, which can provoke communication related issues when individual belonging from different cultural background communicates with another one. Temporal issues are related to the time difference between teams that work Delayed feedback and responses are problematic and restrict the possibility of synchronous interaction, cooperation, and confidential assessment. Loss of tacit knowledge due to replacement of onshore with offshore staff. Reduced opportunities for synchronous communication were also a significant risk factor in GSD. Has the team met or talked personally? This event grows the relationship between people at different sites, increasing the efficient outsourcing relationships in organizati

In [8]:
# Accumulators for the tokenizer output: one list per field, filled sentence by sentence
tokens = dict(input_ids=[], attention_mask=[])


In [9]:
# Tokenize every definition to a fixed length so the per-sentence results can
# later be stacked into a single batch tensor.
MAX_SEQ_LENGTH = 384  # NOTE(review): presumably the model's maximum input length - confirm

# Start from empty lists so that re-running this cell does not append the same
# sentences a second time (or fail because the lists were already stacked).
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences_list:
    # Calling the tokenizer directly replaces the deprecated `encode_plus`;
    # the keyword arguments are unchanged.
    new_tokens = tokenizer(sentence, max_length=MAX_SEQ_LENGTH,
                           truncation=True, padding='max_length',
                           return_tensors='pt',
                           return_attention_mask=True)

    # return_tensors='pt' yields a leading batch axis of size 1; [0] drops it.
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

In [10]:
# Inspect the token ids collected so far (still a Python list of 1-D tensors here)
tokens['input_ids']

[tensor([    0,  4811,  3795,  2009, 11710,  2002,  5376,  3975,  2001,  4811,
          2003,  2240,  1016,  2003,  2808,  1014,  2000,  4363,  4811,  6079,
          2011,  2000,  2626,  2140,  2376,  2154,  1041,  3295,  2353,  2004,
          2000,  2346,  2009,  2066, 11904,  2034,  2231,  1015,  2004,  1015,
          2231,  3971,  1016,  2173,  3230,  2042,  2053,  4785,  1014,  6786,
          1014,  2002,  7195,  6485,  1014,  2033,  2068, 27899,  4811,  3145,
          3318,  2047,  3269,  7499,  2017,  2371,  3455,  4285, 10643,  2019,
          2011,  2182,  2032,  1016, 15854,  3318,  2028,  3145,  2004,  2000,
          2055,  4493,  2094,  2784,  2012,  2151,  8398, 12251,  2002, 10964,
          2028, 18640,  2002, 21577,  2000,  6065,  2001, 26355,  8097, 17179,
          2275,  8294,  1014,  6796,  1014,  2002, 18781,  7671,  1016,  3283,
          2001, 11941, 26247,  3720,  2353,  2004,  6114,  2001,  2010, 19212,
          2011, 12199,  3099,  1016,  4363,  6699,  

In [11]:
# Collapse each list of per-sentence tensors into one stacked batch tensor
for key in ('input_ids', 'attention_mask'):
    tokens[key] = torch.stack(tokens[key])


In [12]:
# Checking the variables: a notebook cell renders only its last expression, so
# the type check is made explicit with an assertion and the shape is displayed.
assert isinstance(tokens['input_ids'], torch.Tensor)
tokens['input_ids'].shape

torch.Size([25, 384])

In [13]:
# Run the tokenized batch through the model. This is inference only, so
# gradient tracking is switched off to avoid building an autograd graph for
# every activation.
with torch.no_grad():
    outputs = model(**tokens)
outputs.keys()


odict_keys(['last_hidden_state', 'pooler_output'])

In [14]:
# The dense vector representations of our text are contained in the outputs'
# `last_hidden_state` tensor (one vector per token of every sentence).
embeddings = outputs.last_hidden_state
embeddings.shape

torch.Size([25, 384, 768])

In [15]:
# Display the raw per-token hidden states taken from the model output
embeddings

tensor([[[ 0.1317,  0.0752, -0.1132,  ...,  0.1299, -0.0700,  0.0078],
         [ 0.1518, -0.1253, -0.1091,  ...,  0.0503, -0.2181,  0.0616],
         [ 0.2307, -0.0992, -0.1023,  ...,  0.1091, -0.0753, -0.0610],
         ...,
         [ 0.0374, -0.1356, -0.1270,  ..., -0.0070,  0.0313, -0.0184],
         [ 0.0366,  0.0035, -0.0527,  ...,  0.2252, -0.0130,  0.0382],
         [ 0.0480,  0.0903, -0.0634,  ...,  0.1836, -0.0138, -0.0481]],

        [[ 0.1590,  0.0625, -0.0387,  ...,  0.0623,  0.1466, -0.0530],
         [ 0.1888, -0.0075, -0.0409,  ..., -0.0011,  0.1412,  0.0216],
         [ 0.1341,  0.4550,  0.0109,  ..., -0.0041,  0.0700, -0.0309],
         ...,
         [ 0.2395,  0.1673,  0.0049,  ...,  0.0154,  0.1551,  0.0117],
         [ 0.0710, -0.2543, -0.1233,  ..., -0.0280,  0.1192, -0.0459],
         [ 0.1710,  0.0297, -0.0781,  ...,  0.0694,  0.1325, -0.0807]],

        [[ 0.0726, -0.0731, -0.1003,  ...,  0.0304, -0.0376,  0.0371],
         [ 0.0576,  0.4526, -0.0370,  ..., -0

In [16]:
# Fetch the attention mask produced by the tokenizer and check its dimensions
# (the actual reshaping to the embeddings' shape happens in the next cell)
attention = tokens['attention_mask']
attention.size()

torch.Size([25, 384])

In [17]:
# Add a trailing axis to the mask and broadcast it to the embeddings' shape,
# as floats, so the two tensors can be multiplied element-wise
mask = attention[..., None].expand_as(embeddings).to(torch.float32)
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1., 

In [18]:
# Apply the attention mask: the element-wise product scales every token
# vector by its mask value
mask_embeddings = torch.mul(embeddings, mask)
mask_embeddings

tensor([[[ 0.1317,  0.0752, -0.1132,  ...,  0.1299, -0.0700,  0.0078],
         [ 0.1518, -0.1253, -0.1091,  ...,  0.0503, -0.2181,  0.0616],
         [ 0.2307, -0.0992, -0.1023,  ...,  0.1091, -0.0753, -0.0610],
         ...,
         [ 0.0374, -0.1356, -0.1270,  ..., -0.0070,  0.0313, -0.0184],
         [ 0.0366,  0.0035, -0.0527,  ...,  0.2252, -0.0130,  0.0382],
         [ 0.0480,  0.0903, -0.0634,  ...,  0.1836, -0.0138, -0.0481]],

        [[ 0.1590,  0.0625, -0.0387,  ...,  0.0623,  0.1466, -0.0530],
         [ 0.1888, -0.0075, -0.0409,  ..., -0.0011,  0.1412,  0.0216],
         [ 0.1341,  0.4550,  0.0109,  ..., -0.0041,  0.0700, -0.0309],
         ...,
         [ 0.2395,  0.1673,  0.0049,  ...,  0.0154,  0.1551,  0.0117],
         [ 0.0710, -0.2543, -0.1233,  ..., -0.0280,  0.1192, -0.0459],
         [ 0.1710,  0.0297, -0.0781,  ...,  0.0694,  0.1325, -0.0807]],

        [[ 0.0726, -0.0731, -0.1003,  ...,  0.0304, -0.0376,  0.0371],
         [ 0.0576,  0.4526, -0.0370,  ..., -0

In [19]:
# Shape check: the element-wise masking keeps the embeddings' dimensions
mask_embeddings.shape

torch.Size([25, 384, 768])

In [20]:
# Sum the masked token vectors along axis 1, leaving one vector per sentence
summed = mask_embeddings.sum(dim=1)
summed.shape

torch.Size([25, 768])

In [21]:
# Display the per-sentence sums of the masked token vectors
summed

tensor([[ 16.8987, -27.6957, -36.9562,  ...,  30.3001,   3.4418,   1.6699],
        [ 62.6257, -24.1880, -24.9794,  ...,  -1.2664,  49.6596,  -6.7724],
        [ 23.9693,  -8.3128, -28.6554,  ...,   4.1733,  26.4848, -12.8923],
        ...,
        [  4.2820, -40.5016, -26.5494,  ...,  -2.6433,   5.4979,  -1.2843],
        [-34.1677, -32.7682, -29.4033,  ..., -17.9574,  32.1449, -17.9390],
        [  7.5624,  14.1438,  -0.8781,  ...,  19.6955,  -4.8225,  -0.9024]],
       grad_fn=<SumBackward1>)

In [22]:
# Count, per sentence and hidden dimension, how many positions the mask keeps;
# the lower bound of 1e-9 keeps the later division away from zero
counts = mask.sum(dim=1).clamp(min=1e-9)
counts.shape


torch.Size([25, 768])

In [23]:
# Display the per-sentence counts of attended positions
counts

tensor([[384., 384., 384.,  ..., 384., 384., 384.],
        [384., 384., 384.,  ..., 384., 384., 384.],
        [384., 384., 384.,  ..., 384., 384., 384.],
        ...,
        [384., 384., 384.,  ..., 384., 384., 384.],
        [303., 303., 303.,  ..., 303., 303., 303.],
        [191., 191., 191.,  ..., 191., 191., 191.]])

In [24]:
# Mean pooling: divide the summed token vectors by the number of attended
# positions to get one averaged embedding per sentence
mean_pooled = torch.div(summed, counts)
mean_pooled.shape

torch.Size([25, 768])

In [25]:
# Display the mean-pooled sentence embeddings
mean_pooled

tensor([[ 0.0440, -0.0721, -0.0962,  ...,  0.0789,  0.0090,  0.0043],
        [ 0.1631, -0.0630, -0.0651,  ..., -0.0033,  0.1293, -0.0176],
        [ 0.0624, -0.0216, -0.0746,  ...,  0.0109,  0.0690, -0.0336],
        ...,
        [ 0.0112, -0.1055, -0.0691,  ..., -0.0069,  0.0143, -0.0033],
        [-0.1128, -0.1081, -0.0970,  ..., -0.0593,  0.1061, -0.0592],
        [ 0.0396,  0.0741, -0.0046,  ...,  0.1031, -0.0252, -0.0047]],
       grad_fn=<DivBackward0>)

In [26]:
# The final operations - cosine similarity between the first definition's
# embedding and the embedding of every other definition.
# `cosine_similarity` is already imported in the notebook's first cell.

# Convert to NumPy only while `mean_pooled` is still a tensor, so re-running
# this cell does not fail on an array that has no `.detach()`.
if isinstance(mean_pooled, torch.Tensor):
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Plain ASCII identifiers: the mathematical-italic look-alikes used previously
# only resolved to `mean_pooled` through Python's identifier normalization.
data_25g = cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)
data_25g  # data_25g holds a single row: similarity of definition 0 to each other definition.

array([[0.46013463, 0.6306762 , 0.6923963 , 0.4492907 , 0.5837878 ,
        0.5882536 , 0.4037146 , 0.5387015 , 0.55183405, 0.5135999 ,
        0.33149904, 0.45323727, 0.39357653, 0.5245671 , 0.5724843 ,
        0.4065375 , 0.21237576, 0.52212936, 0.57209784, 0.43324545,
        0.5079555 , 0.3336914 , 0.21317077, 0.33262378]], dtype=float32)

In [None]:
# Flip the single-row similarity result into a column (one similarity per row)
data_25gT = np.transpose(data_25g)

In [None]:
# Display the transposed similarity values
data_25gT

In [None]:
# Wrap the similarity column in a DataFrame so it can be exported
dfdata_25gT = pd.DataFrame(data=data_25gT)

In [None]:
# Write the similarities to an Excel workbook, leaving out the row index
dfdata_25gT.to_excel(excel_writer="saida.xlsx", index=False)