In [1]:
import pandas as pd
import numpy as np
from typing import *
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import torch
import sklearn
from sentence_transformers import SentenceTransformer

In [2]:
import sklearn.cluster

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    # print(last_hidden.shape)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [16]:
# tokenizer = AutoTokenizer.from_pretrained(r"Embedding_Model/gte_base")
# model = AutoModel.from_pretrained(r"Embedding_Model/gte_base")
model: SentenceTransformer = SentenceTransformer(r'Embedding_Model\gte_base_sentence')
model = model.to(device)

In [17]:
data_input = pd.read_csv(r"Generated_Result\40k.csv", sep=",")

In [18]:
data_input.fillna("None", inplace=True)
data_input.head()

Unnamed: 0,project,risk,stakeholder,idx
0,external dose assessment project|measurement e...,accident|air dose|environmental changes,participants|city officials in Minamisoma and ...,10000
1,land treatment unit,pollution migration,State of Sao Paulo|Brazil,10001
2,Design of a bioretention plantertrench infiltr...,Intercepting all of the runoff from a syntheti...,Urban retrofit project,10002
3,Construction site,Ecological environment damage low work efficie...,Surrounding environment (pollution monitoring ...,10004
4,Investigating the differences in urinary conce...,Air pollution|specifically PM25 and ozone|pose...,s Stud,10005


In [19]:
stupid: Dict[str, List] = {}
for col in ["project", "risk", "stakeholder"]:
  temp = data_input.loc[:, col].values.tolist()
  blyat = []
  for tep_val in temp:
    if tep_val[0] != tep_val[0] or tep_val=="None": continue
    tep_val = tep_val.split("|")
    blyat = [*blyat, *tep_val]
  stupid[col] = blyat

# stupid_copy = {col: [val for val in data_input[col].str.split("|", expand=True).stack().dropna()]\
#            for col in ["Project", "Risk", "Stakeholder"]}
print(len(stupid["project"]), len(stupid["risk"]), len(stupid["stakeholder"]))

67548 77362 55630


In [23]:
temp_project = [stupid["risk"][val: val+5] for val in range(0, len(stupid["risk"]), 5)]

In [None]:
# stupid["stakeholder"][:5]
temp_project[:5]

In [38]:
import math
class Node:
  def __init__(self, name, embeddings):
    self.name: str =name
    self.embeddings: np.ndarray = embeddings

  def __call__(self):
    return self.embeddings

def embedding_value(col, max_lim=1000, threshold=200):
    lens = max_lim if max_lim > threshold else len(stupid[col])
    output_project = []
    for item in range(0, math.ceil(lens/threshold)):
        centre = stupid[col][item*threshold:(item+1)*threshold if (item+1)*threshold<lens-1 else lens-1]
        if lens - item*threshold < 2_000:
          print(item, len(centre), "left", lens - item*threshold)
        # Tokenize the input texts
        res = model.encode(centre, batch_size=len(centre), convert_to_numpy=1) # return a numpy vector in threhold * embedding dim shape
        # should I use class Node
        output_project.extend([Node(*new_one) for new_one in zip(centre, [*res])])
        del res, centre
    return output_project # use multi-processing next time

In [39]:
columns = data_input.columns.to_list()
print(columns)
# projectRes = embedding_value(col=columns[0], max_lim=1000, threshold=200)
# len(projectRes)

['project', 'risk', 'stakeholder', 'idx']


In [40]:
from importlib import reload
# import word_embedding_acc as weacc
# reload(weacc)
len(stupid["risk"])

77362

In [41]:
# %%time
import concurrent.futures
import gc

counter = 0
def main():
    data = []
    projectRes, newShape = [], 0
    with concurrent.futures.ThreadPoolExecutor() as executor:
        projectRes = executor.submit(embedding_value, col=columns[1], max_lim=10, threshold=200).result()
    # projectRes = embedding_value(col = columns[1], max_lim=10, threshold=200)
    newShape = len(projectRes)
    for items in range(newShape):
        pidx: Node = projectRes[items]
        data.append([pidx.name, *pidx.embeddings])
    dataset = pd.DataFrame(data, columns=["name", *[f"embedding_{val}" for val in range(768)]])
    return dataset

if __name__ == "__main__":
    dataset = main()
    gc.collect()
    print(dataset.shape)

377 200 left 1962
378 200 left 1762
379 200 left 1562
380 200 left 1362
381 200 left 1162
382 200 left 962
383 200 left 762
384 200 left 562
385 200 left 362
386 161 left 162
(77361, 769)


In [42]:
dataset.to_csv(rf"Generated_Result/optimized embedding/{columns[1]}_{dataset.shape[0]}.csv", index=False)
dataset.head()

Unnamed: 0,name,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,accident,-0.01959,-0.018509,-0.036323,-0.023507,0.057477,0.02133,0.061816,0.046516,-0.017038,...,0.022243,0.004786,-0.00361,-0.021985,0.046742,-0.037195,-0.016547,-0.001641,0.004159,0.008347
1,air dose,0.02934,0.040698,-0.005331,-0.036518,0.037146,0.008199,0.060314,0.017974,0.006976,...,-0.023678,-0.008291,-0.007202,-0.036296,0.012963,-0.014799,0.018555,0.028678,0.022601,0.019587
2,environmental changes,-0.001443,-0.012442,0.000558,-0.018727,0.088015,-0.001502,0.0318,0.064791,-0.042642,...,0.017494,0.006163,-0.014204,-0.024825,0.032355,-0.049312,-0.016147,-0.014306,0.016398,-0.006852
3,pollution migration,-5e-06,-0.014417,0.006132,0.000865,0.073386,0.028526,0.004738,0.042137,-0.033695,...,0.011427,-0.004061,-0.005336,-0.024035,0.031248,-0.044846,0.01825,-0.002414,-0.015324,-0.001739
4,Intercepting all of the runoff from a syntheti...,0.03453,-0.034736,0.046489,-0.012127,0.049874,0.016386,0.044432,0.041485,-0.032075,...,0.011834,0.034384,-0.053902,-0.032633,0.007691,-0.034596,0.006485,0.000472,-0.007849,0.015651


In [43]:
import gc
del dataset
gc.collect()
torch.cuda.empty_cache()

### Code to Find simlarity

In [44]:
from sentence_transformers.util import cos_sim

df = pd.read_csv(r"Generated_Result\optimized embedding\risk_77361.csv")
col = df.columns
# df = df.drop(col[0], axis=1)
df.head()

Unnamed: 0,name,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,accident,-0.01959,-0.018509,-0.036323,-0.023507,0.057477,0.02133,0.061816,0.046516,-0.017038,...,0.022243,0.004786,-0.00361,-0.021985,0.046742,-0.037195,-0.016547,-0.001641,0.004159,0.008347
1,air dose,0.02934,0.040698,-0.005331,-0.036518,0.037146,0.008199,0.060314,0.017974,0.006976,...,-0.023678,-0.008291,-0.007202,-0.036296,0.012963,-0.014799,0.018555,0.028678,0.022601,0.019587
2,environmental changes,-0.001443,-0.012442,0.000558,-0.018727,0.088015,-0.001502,0.0318,0.064791,-0.042642,...,0.017494,0.006163,-0.014204,-0.024825,0.032355,-0.049312,-0.016147,-0.014306,0.016398,-0.006852
3,pollution migration,-5e-06,-0.014417,0.006132,0.000865,0.073386,0.028526,0.004738,0.042137,-0.033695,...,0.011427,-0.004061,-0.005336,-0.024035,0.031248,-0.044846,0.01825,-0.002414,-0.015324,-0.001739
4,Intercepting all of the runoff from a syntheti...,0.03453,-0.034736,0.046489,-0.012127,0.049874,0.016386,0.044432,0.041485,-0.032075,...,0.011834,0.034384,-0.053902,-0.032633,0.007691,-0.034596,0.006485,0.000472,-0.007849,0.015651


In [45]:
data = torch.tensor(df.iloc[:,1:].values).to(device)

In [47]:
from multiprocessing.pool import ThreadPool
import concurrent.futures

def wordTracing(indices: np.array, preIdx: int, df: pd.Series=df) -> dict[int, list[str]]:
  keys, cluster = np.unique(indices[:,0]), {}
  for key in keys:
    clt = indices[indices[:,0]==key][:,1]
    cluster[key+preIdx] = df.iloc[:,0].values[clt].tolist()
  return cluster

def dictUpate(obj: dict[int, list[str]], target: dict[int, list[str]]) -> dict:
  for key, val in target.items():
      if key in obj.keys():
          obj[key].extend(val)
      else:
          obj[key] = val
  return obj

def _validation(shape: tuple[int], data: dict[int, list[str]]) -> None:
   dataLen = sum([len(val) for val in data.values()])
   if abs(shape[0]-dataLen) < 2: return True
   raise ValueError("The data length is not equal to the shape of the input data")

storage: dict = {}
def logRegister(data: tuple[dict[int, list[str], tuple[int]]])->dict:
    global storage
    Rdata, shape = data
    _validation(shape, Rdata)
    return dictUpate(storage, Rdata)
    # raise NotImplementedError("The function is expected either adding value up of dict cluster or append it")

def sim2idxFetching(data: torch.tensor, pieces: torch.tensor, preIdx: int, confi: float, *args)->set[dict[int:Union[list, np.array]], tuple[int, int]]:
  """
  designed suspected to support for threading accerlation
  """
  coSim = cos_sim(data, pieces) # tensor[dim=2]
  wordMask = torch.gt(coSim, confi)
  indices: torch.tensor = torch.nonzero(wordMask, as_tuple=False).cpu().numpy()
  # delete simlarity on word itself
  conMask = (indices[:, 0]+preIdx)<indices[:,1] # tensor[n, 2], set as idx
  indices: np.array = indices[conMask]
  return wordTracing(indices, preIdx), indices.shape

def siMultiAcc(data: torch.tensor, piecesNum: int, confi: float)->Union[list,None]:
  global storage
  lenth, pool = len(data), ThreadPool()
  lenths, pieces = [num for num in range(0, lenth, int(lenth/piecesNum))], int(lenth/piecesNum)
  for val in lenths:
    if val+pieces >= lenth: 
       pieces = lenth - val
    # temp1 = sim2idxFetching(data[val:val+pieces], data, val, confi)
    # logRegister(temp1)
    pool.apply_async(sim2idxFetching, args=(data[val:val+pieces], data, val, confi), callback=logRegister)
  pool.close()
  pool.join()
  return storage

In [48]:
storage=siMultiAcc(data, 300, 0.88)

In [49]:
ast, zeroNum = 0, 0
for k, v in storage.items(): 
    ast+=len(v)
    if len(v)<=1: zeroNum+=1
print(ast, len(storage.keys()), zeroNum)

# project 2174677 48058 7346
# risk 7422715 64419 6056
# stakeholder 6390279 44901 3768

7422715 64419 6056


In [67]:
testingData = data[:1000]

resotre = torch.gt((cos_sim(testingData, testingData)), 0.88)
Nindices = torch.nonzero(resotre, as_tuple=False).cpu().numpy()
Nindices = Nindices[Nindices[:,0]<Nindices[:,1]]
Nindices.shape

(3422, 2)

In [75]:
# checkingItem = list(storage.keys())
# storage[checkingItem[46]]
# print(isinstance(str("hello"), str), str("hello"))
# testTorch = Nindices
# reverseTest = testTorch[:, ::-1]
# np.where(np.all(testTorch==reverseTest, axis=1))
storage[zeroNum]

['and local communities involved in the study and implementation of the']

In [50]:
certain=[]
for idx in storage.keys():
    vaList = [df.iloc[idx,0], *storage[idx]]
    lenList = [len(v) for v in vaList if isinstance(v, str)]
    if len(lenList)<=1: continue
    cenIdx = lenList.index(min(lenList))
    vaList = [vaList.pop(cenIdx), *vaList]
    certain.append(vaList)
writeDf = pd.DataFrame(certain)

In [52]:
writeDf = writeDf.fillna("N")
writeDf.columns=["shortest word", "original word"]+list(writeDf.columns[2:])

In [53]:
writeDf.to_csv(rf"Generated_Result\riskSim_77361_{0.88}.csv", index=False)
writeDf.head()

Unnamed: 0,shortest word,original word,2,3,4,5,6,7,8,9,...,2304,2305,2306,2307,2308,2309,2310,2311,2312,2313
0,smoke,smoking habits,Smoking,smoke,Smoking,cigarette smoke,smoke,Habits,smoke,smoke,...,N,N,N,N,N,N,N,N,N,N
1,none,none detected,none detected,none identified,none detected,none detected,none detected,none detected,none detected,none detected,...,detection,none detected,none detected,none detected,N,N,N,N,N,N
2,Fragile,fragile relationships,forced relationships,fragile,potential to damage relationships,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
3,complexity,increasing complexity,Complexity,Complexity of problems,Complexities,complexity,Complexity,computational complexity,organisational complexity,project complexity,...,N,N,N,N,N,N,N,N,N,N
4,risk,Accident risk potential,high accident risk potential,High accident risk locations,accident risk,risks,safety risk assessment,high risk of accidents,a risk analysis,risks,...,N,N,N,N,N,N,N,N,N,N


In [54]:
import gc

del data, storage
gc.collect()
torch.cuda.empty_cache()
# beside stakeholder, rest of them shall be reloaded again
# torch.save(torch.cat(output_project, axis=0), "Embedding_Model/gte_large_embedding/three_ouput1_risk.pt")

In [None]:
full_emd, atten_mask = outputs.last_hidden_state, batch_dict['attention_mask']
print(full_emd.shape, atten_mask.shape, embeddings.shape)

In [None]:
# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())

[[69.65817260742188, 88.03556060791016, 68.79690551757812]]
