In [2]:
import pandas as pd
import numpy as np
from typing import *
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import torch
import sklearn
from sentence_transformers import SentenceTransformer

In [3]:
import sklearn.cluster

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    # print(last_hidden.shape)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [5]:
# tokenizer = AutoTokenizer.from_pretrained(r"Embedding_Model/gte_base")
# model = AutoModel.from_pretrained(r"Embedding_Model/gte_base")
model: SentenceTransformer = SentenceTransformer(r'Embedding_Model\gte_base_sentence')
model = model.to(device)

In [None]:
data_input = pd.read_csv(r"Generated_Result\40k.csv", sep=",")

In [7]:
data_input.fillna("None", inplace=True)
data_input.head()

Unnamed: 0,project,risk,stakeholder,idx
0,external dose assessment project|measurement e...,accident|air dose|environmental changes,participants|city officials in Minamisoma and ...,10000
1,land treatment unit,pollution migration,State of Sao Paulo|Brazil,10001
2,Design of a bioretention planter/trench infilt...,Intercepting all of the runoff from a syntheti...,Urban retrofit project,10002
3,,,,10003
4,Investigating the differences in urinary conce...,Air pollution|specifically PM2.5 and ozone|pos...,Stud,10005


In [8]:
stupid: Dict[str, List] = {}
for col in ["project", "risk", "stakeholder"]:
  temp = data_input.loc[:, col].values.tolist()
  blyat = []
  for tep_val in temp:
    if tep_val[0] != tep_val[0] or tep_val=="None": continue
    tep_val = tep_val.split("|")
    blyat = [*blyat, *tep_val]
  stupid[col] = blyat

# stupid_copy = {col: [val for val in data_input[col].str.split("|", expand=True).stack().dropna()]\
#            for col in ["Project", "Risk", "Stakeholder"]}
print(len(stupid["project"]), len(stupid["risk"]), len(stupid["stakeholder"]))

51197 53611 41968


In [None]:
temp_project = [stupid["risk"][val: val+5] for val in range(0, len(stupid["risk"]), 5)]

In [None]:
# stupid["stakeholder"][:5]

In [None]:
import math
class Node:
  def __init__(self, name, embeddings):
    self.name: str =name
    self.embeddings: np.ndarray = embeddings

  def __call__(self):
    return self.embeddings

def embedding_value(col, max_lim=1000, threshold=200):
    lens = max_lim if max_lim > threshold else len(stupid[col])
    output_project = []
    for item in range(0, math.ceil(lens/threshold)):
        centre = stupid[col][item*threshold:(item+1)*threshold if (item+1)*threshold<lens-1 else lens-1]
        if lens - item*threshold < 2_000:
          print(item, len(centre), "left", lens - item*threshold)
        # Tokenize the input texts
        res = model.encode(centre, batch_size=len(centre), convert_to_numpy=1) # return a numpy vector in threhold * embedding dim shape
        output_project.extend([Node(*new_one) for new_one in zip(centre, [*res])])
    return output_project # use multi-processing next time

In [None]:
columns = data_input.columns.to_list()
print(columns)
# projectRes = embedding_value(col=columns[0], max_lim=1000, threshold=200)
# len(projectRes)

['project', 'risk', 'stakeholder', 'idx']


In [None]:
from importlib import reload
# import word_embedding_acc as weacc
# reload(weacc)

In [None]:
# %%time
import concurrent.futures
import gc

counter = 0
def main():
    data = []
    projectRes, newShape = [], 0
    with concurrent.futures.ThreadPoolExecutor() as executor:
        projectRes = executor.submit(embedding_value, col=columns[2], max_lim=10, threshold=200).result()
    # projectRes = embedding_value(col = columns[1], max_lim=10, threshold=200)
    newShape = len(projectRes)
    for items in range(newShape):
        pidx: Node = projectRes[items]
        data.append([pidx.name, *pidx.embeddings])
    dataset = pd.DataFrame(data, columns=["name", *[f"embedding_{val}" for val in range(768)]])
    return dataset

if __name__ == "__main__":
    dataset = main()
    gc.collect()
    print(dataset.shape)

200 200 left 1968
201 200 left 1768
202 200 left 1568
203 200 left 1368
204 200 left 1168
205 200 left 968
206 200 left 768
207 200 left 568
208 200 left 368
209 167 left 168
(41967, 769)


In [None]:
dataset.to_csv(rf"/content/drive/MyDrive/CoLab/Llama2/{columns[2]}_{dataset.shape[0]}.csv", index=False)
dataset.head()

Unnamed: 0,name,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,participants,-0.016678,-0.026659,0.001124,0.003264,0.039721,0.029079,0.044729,0.02192,-0.027886,...,-0.004901,0.001474,-0.012618,-0.055273,0.006341,-0.006275,0.011817,-0.003702,0.023715,-0.014546
1,city officials in Minamisoma and three other c...,-0.009912,-0.020571,0.013642,0.004861,-0.00333,0.012202,0.0249,0.017151,-0.02888,...,0.008761,-0.033951,-0.034549,-0.033243,0.045119,-0.033248,0.011001,-0.00325,0.0313,-0.012715
2,State of Sao Paulo,0.012836,-0.031345,-0.006634,0.032957,0.04684,0.03614,0.033692,0.026611,-0.02519,...,0.039229,-0.015854,0.001034,-0.026716,0.032323,-0.026598,-0.025396,-0.003875,0.031559,0.005412
3,Brazil,0.000632,0.01514,0.004691,0.046276,0.045533,0.012279,0.025348,0.028964,-0.00808,...,0.045257,-0.020082,-0.010256,-0.039007,0.031437,-0.043322,-0.021757,-0.00134,0.046231,0.024015
4,Urban retrofit project,0.028076,-0.047304,0.017173,0.018277,0.067573,0.044029,0.046983,0.027641,-0.012649,...,0.024481,0.000205,-0.013029,-0.015881,0.031574,-0.016904,0.014212,-0.01204,0.010977,-0.009301


In [4]:
import gc
gc.collect()

20

### Code to Find simlarity

In [4]:
from sentence_transformers.util import cos_sim

df = pd.read_csv(r"Generated_Result\project_1000.csv")
col = df.columns
# df = df.drop(col[0], axis=1)
df.head()

Unnamed: 0,name,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,external dose assessment project,0.057842,0.014634,-0.038342,-0.015296,0.050132,-0.000727,0.034195,-0.004186,-0.010972,...,0.008032,-0.006561,0.003851,-0.054879,0.003959,-0.033597,-0.006155,0.012678,-0.002262,0.004868
1,measurement every hour for two weeks,0.010446,0.009351,0.022793,-0.034335,0.043989,0.023018,0.057842,0.025674,-0.035095,...,-0.022321,0.003317,-0.006444,-0.046955,0.013387,0.00659,0.010199,-0.00111,0.02689,0.00811
2,land treatment unit,0.031785,-0.011074,0.002117,0.001701,0.069107,0.027693,0.046104,0.038738,0.022972,...,0.014123,0.005151,0.016073,-0.0024,0.0166,-0.014453,0.006358,0.018079,-0.011331,-0.001639
3,Design of a bioretention planter/trench infilt...,0.022572,-0.008976,-0.025084,0.002828,0.064042,0.008688,0.050499,0.001198,-0.018325,...,-0.004005,0.021712,-0.012523,-0.013642,0.020244,-0.018446,0.015984,0.029632,-0.044049,-0.027612
4,Investigating the differences in urinary conce...,0.045364,-0.001862,-0.030748,0.038176,0.061989,-0.010115,0.036649,0.033987,-0.023255,...,-0.019685,0.008672,-0.004035,-0.024788,0.029516,-0.057607,-0.028398,0.025369,0.029742,-0.036338


In [5]:
data = torch.tensor(df.iloc[:,1:].values).to(device)

In [47]:
from multiprocessing.pool import ThreadPool
import concurrent.futures

def wordTracing(indices: np.array, preIdx: int, df: pd.Series=df) -> dict[int, list[str]]:
  keys, cluster = np.unique(indices[:,0]), {}
  for key in keys:
    clt = indices[indices[:,0]==key][:,1]
    cluster[key+preIdx] = df.iloc[:,0].values[clt].tolist()
  return cluster

def dictUpate(obj: dict[int, list[str]], target: dict[int, list[str]]) -> dict:
  for key, val in target.items():
      if key in obj.keys():
          obj[key].extend(val)
      else:
          obj[key] = val
  return obj

def _validation(shape: tuple[int], data: dict[int, list[str]]) -> None:
   dataLen = sum([len(val) for val in data.values()])
   if abs(shape[0]-dataLen) < 2: return True
   raise ValueError("The data length is not equal to the shape of the input data")

storage: dict = {}
def logRegister(data: tuple[dict[int, list[str], tuple[int]]])->dict:
    global storage
    Rdata, shape = data
    _validation(shape, Rdata)
    return dictUpate(storage, Rdata)
    # raise NotImplementedError("The function is expected either adding value up of dict cluster or append it")

def sim2idxFetching(data: torch.tensor, pieces: torch.tensor, preIdx: int, confi: float, *args)->set[dict[int:Union[list, np.array]], tuple[int, int]]:
  """
  designed suspected to support for threading accerlation
  """
  coSim = cos_sim(data, pieces) # tensor[dim=2]
  wordMask = torch.gt(coSim, confi)
  indices: torch.tensor = torch.nonzero(wordMask, as_tuple=False).numpy()
  # delete simlarity on word itself
  conMask = (indices[:, 0]+preIdx)!=indices[:,1] # tensor[n, 2], set as idx
  indices: np.array = indices[conMask]
  return wordTracing(indices, preIdx), indices.shape

def siMultiAcc(data: torch.tensor, piecesNum: int, confi: float)->Union[list,None]:
  global storage
  lenth, pool = len(data), ThreadPool()
  lenths, pieces = [num for num in range(0, lenth, int(lenth/piecesNum))], int(lenth/piecesNum)
  for val in lenths:
    if val+pieces >= lenth: 
       pieces = lenth - val
    temp1 = sim2idxFetching(data[val:val+pieces], data, val, confi)
    logRegister(temp1)
    # pool.apply_async(sim2idxFetching, args=(data[val:val+pieces], data, val, confi), callback=logRegister)
  pool.close()
  pool.join()
  return storage

In [66]:
storage=siMultiAcc(data, 20, 0.88)

In [75]:
ast = 0
for _, v in storage.items(): ast+=len(v)
print(ast, len(storage.keys()))

988 282


In [70]:
resotre = torch.gt((cos_sim(data, data)), 0.88)
Nindices = torch.nonzero(resotre, as_tuple=False).numpy()
Nindices = Nindices[Nindices[:,0]!=Nindices[:,1]]
Nindices.shape

(798, 2)

In [None]:
Nindices

In [84]:
certain=[]
for idx in storage.keys():
    vaList = [df.iloc[idx,0], *storage[idx]]
    lenList = [len(v) for v in vaList]
    cenIdx = lenList.index(min(lenList))
    vaList = [vaList.pop(cenIdx), *vaList]
    certain.append(vaList)
writeDf = pd.DataFrame(certain)

In [85]:
writeDf = writeDf.fillna("N")
writeDf.columns=["shortest word", "original word"]+list(writeDf.columns[2:])

In [86]:
writeDf.to_csv(rf"Generated_Result\project_1000_{0.88}_simExample.csv", index=False)

In [None]:
# beside stakeholder, rest of them shall be reloaded again
torch.save(torch.cat(output_project, axis=0), "Embedding_Model/gte_large_embedding/three_ouput1_risk.pt")

In [None]:
torch.cuda.empty_cache()

In [None]:
full_emd, atten_mask = outputs.last_hidden_state, batch_dict['attention_mask']
print(full_emd.shape, atten_mask.shape, embeddings.shape)

In [None]:
# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())

[[69.65817260742188, 88.03556060791016, 68.79690551757812]]
