In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import pandas_gbq
import random

In [None]:
# Read the whole reference table from BigQuery into a DataFrame.
reference_df = pandas_gbq.read_gbq("SELECT * FROM consumption.table_reference")

# Pre-trained multilingual sentence-embedding model.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

# Encode every reference name. The encoder is asked for a numpy array, which is
# then turned into one Python list per row so it can be stored in a column.
reference_names = reference_df['name_value'].tolist()
embeddings = model.encode(reference_names, convert_to_numpy=True)
reference_df['embeddings'] = embeddings.tolist()


Downloading: 100%|██████████|


In [None]:
# Number of reference names pulled as the base of the synthetic test set.
TARGET_SAMPLE_SIZE = 1000

# Extract TARGET_SAMPLE_SIZE base records (1,000 by default).
# NOTE(review): LIMIT without ORDER BY lets BigQuery return an arbitrary subset,
# so the sampled names may differ between runs -- confirm that is acceptable.
target_query = f"""
SELECT name_value
FROM `consumption.table_reference`
LIMIT {int(TARGET_SAMPLE_SIZE)}
"""
target_df = pandas_gbq.read_gbq(target_query)

# Modification functions
def modify_text(text, rng=None):
    """Return ``text`` with one randomly chosen perturbation applied.

    Exactly one of three perturbations is picked per call:

    * append the literal suffix ``"@#"``;
    * drop every third character (1-based positions 3, 6, 9, ...);
    * truncate each whitespace-separated word to its first three characters,
      leaving the words listed in ``common_terms`` untouched.

    Args:
        text: The string to perturb.
        rng: Optional object exposing ``choice(sequence)`` (for example a
            ``random.Random`` instance) used to pick the perturbation. When
            omitted, the module-level ``random`` generator is used.

    Returns:
        The perturbed string.
    """
    # Words the abbreviation strategy keeps intact. Stored in lower case and
    # compared case-insensitively so that "University" matches 'university'.
    common_terms = ('university', 'research', 'institute')

    def add_special_characters(text):
        """Append the two-character suffix "@#"."""
        return text + "@#"

    def omit_characters(text):
        """Remove every third character, counting positions from 1."""
        return ''.join(c for i, c in enumerate(text) if (i + 1) % 3 != 0)

    def abbreviate_words(text):
        """Cut each word to three characters unless it is a common term."""
        return ' '.join(
            word if word.lower() in common_terms else word[:3]
            for word in text.split()
        )

    # Randomly choose one perturbation to apply.
    chooser = random if rng is None else rng
    modification_function = chooser.choice(
        [add_special_characters, omit_characters, abbreviate_words]
    )
    return modification_function(text)

# Seed the RNG so that re-running this cell on the same rows picks the same
# perturbation for each name.
random.seed(42)
target_df['modified_ror'] = target_df['name_value'].apply(modify_text)

# Embed the perturbed names with the same model used for the reference names
# and store one list of floats per row.
modified_names = target_df['modified_ror'].tolist()
target_df['embeddings'] = model.encode(modified_names, convert_to_numpy=True).tolist()


Downloading: 100%|██████████|


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Turn the per-row embedding lists back into numpy arrays.
ref_embeddings = np.array(reference_df['embeddings'].tolist())
target_embeddings = np.array(target_df['embeddings'].tolist())


def _best_reference_match(target_emb):
    """Return the highest-scoring reference name and its cosine similarity."""
    similarities = cosine_similarity([target_emb], ref_embeddings)[0]
    top_match_idx = np.argmax(similarities)
    return {
        'original': reference_df.iloc[top_match_idx]['name_value'],
        'similarity': similarities[top_match_idx],
    }


# Score each target embedding against all reference embeddings, one at a time.
matches = [_best_reference_match(target_emb) for target_emb in target_embeddings]

# Attach the winning reference name and its score to the target frame.
target_df['best_match'] = [match['original'] for match in matches]
target_df['similarity_score'] = [match['similarity'] for match in matches]


In [None]:
# Show the target frame: original name, modified name, embedding, best match and similarity score.
target_df

Unnamed: 0,name_value,modified_ror,embeddings,best_match,similarity_score
0,Thailand Science Research and Innovation,ThilndScene esarh ndInovtin,"[-0.014762087725102901, 0.02025180123746395, 0...",Náttúrufræðistofnun,0.722130
1,Azacycles (Czechia),Azcyle (zehi),"[-0.0048642405308783054, 0.017687104642391205,...",Azacycles (Czechia),0.813416
2,"All India Institute of Ayurveda, New Delhi",All Ind Ins of Ayu New Del,"[0.041992682963609695, 0.01915094442665577, 0....","All India Institute of Ayurveda, New Delhi",0.582827
3,World Water Watch,WoldWaerWach,"[0.00441273208707571, 0.017883552238345146, -0...",Vilans,0.648154
4,Electronic Navigation Research Institute,Ele Nav Res Ins,"[-0.01648768037557602, -0.010164789855480194, ...",Náttúrufræðistofnun,0.621461
...,...,...,...,...,...
995,German Doctors,German Doctors@#,"[-0.05446318909525871, -0.033359602093696594, ...",German Doctors,0.896681
996,Grand Valley State University,Grand Valley State University@#,"[-0.03233146667480469, 0.030547451227903366, -...",Grand Valley State University,0.910969
997,Community Foundation of South Puget Sound,Comuit FunatonofSothPuetSond,"[0.00996410846710205, 0.0014225002378225327, 0...",Pothecary Witham Weld Solicitors,0.660583
998,Communication Matters,Communication Matters@#,"[-0.09123321622610092, 0.023889178410172462, -...",Communication Matters,0.853543
