### Imports

In [2]:
from pathlib import Path

import pandas as pd
from dotenv import dotenv_values
from rapidfuzz import fuzz, process
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [3]:
config = dotenv_values("../.env")

### Data

In [5]:
results_dir = "."
print("results_dir:", results_dir)

results_dir: .


In [9]:
class DataSet:
    def __init__(self, file_path: Path, x_column: str, y_column: str):
        self.file_path = file_path
        self.x_column = x_column
        self.y_column = y_column

        self._df = self.read_data()

    def read_data(self) -> pd.DataFrame:
        df = pd.read_excel(self.file_path)
        df = df.apply(lambda x: x.str.strip())
        df = df.dropna()
        return df

    @property
    def unique_x(self) -> list[str]:
        return self.df[self.x_column].unique().tolist()

    @property
    def unique_y(self) -> list[str]:
        return self.df[self.y_column].unique().tolist()

    @property
    def df(self) -> pd.DataFrame:
        return self._df

    def __len__(self) -> int:
        return len(self.df)

In [11]:
ds_old = DataSet(
    file_path="Employer Match Data - BCG.xlsx",
    x_column="EID SCANNED EMPLOYER NAME",
    y_column="TML EMPLOYER NAME",
)
ds_new = DataSet(
    file_path="Employer Match Data - BCG - extended.xlsx",
    x_column="EID Employer Name",
    y_column="TML List Names",
)

In [12]:
"""print("Old employers:")
print("x:", ds_old.unique_x)
print("y:", ds_old.unique_y)

print("\nNew employers:")
print("x:", ds_new.unique_x)
print("y:", ds_new.unique_y)
"""
print("\n----------------")
print(f"Old count (x): {len(ds_old.unique_x)}")
print(f"Old count (y): {len(ds_old.unique_y)}")

print(f"\nNew count (x): {len(ds_new.unique_x)}")
print(f"New count (y): {len(ds_new.unique_y)}")

print("\n----------------")
print(f"Total common (y): {len(set(ds_old.unique_y) & set(ds_new.unique_y))}")
print(f"Total unique (y): {len(set(ds_old.unique_y) | set(ds_new.unique_y))}")



----------------
Old count (x): 159
Old count (y): 24849

New count (x): 9485
New count (y): 5767

----------------
Total common (y): 1627
Total unique (y): 28989


### Code

In [14]:
def save_similarities(df: pd.DataFrame, model_name: str, file_name: str):
    model_name_flat = (
        model_name.replace("/", "_").replace("-", "_").replace(".", "_").strip("_")
    )
    save_path = results_dir / f"{file_name}__{model_name_flat}.csv"

    df.to_csv(save_path, index=False)

    print(f"\n💾 Results saved to {save_path.name}")

In [15]:
def calculate_similarities_embedings(
    model_name: str,
    query: list[str],
    corpus: list[str],
    top_n: int = 5,
) -> pd.DataFrame:
    print(f"🤖 Model {model_name} loading...")
    model = SentenceTransformer(model_name)

    print("🔍 Query embedding...")
    query_embedding = model.encode(query)
    print("📊 query embedding shape:", query_embedding.shape)

    print("📚 Corpus embedding...")
    corpus_embeddings = model.encode(corpus)
    print("📊 corpus embeddings shape:", corpus_embeddings.shape)

    print("🧮 Similarity calculation...")
    similarities = model.similarity(query_embedding, corpus_embeddings)
    print("📊 similarities shape:", similarities.shape)

    print("📋 Sorting...")
    similarities = similarities.sort(axis=1, descending=True)
    top_indices = similarities.indices
    top_scores = similarities.values

    print("📝 Results...")
    results = []
    for i, query_text in enumerate(query):
        row = {"query": query_text}

        for j in range(top_n):
            row[f"hit_{j + 1}"] = corpus[top_indices[i][j]]
            row[f"score_{j + 1}"] = float(top_scores[i][j])

        results.append(row)

    return pd.DataFrame(results)

In [16]:
def calculate_similarities_fuzzy(
    scorer_method,
    query: list[str],
    corpus: list[str],
    top_n: int = 5,
) -> pd.DataFrame:
    results = []
    for _, query_text in tqdm(enumerate(query), total=len(query)):
        row = {"query": query_text}

        matches = process.extract(query_text, corpus, scorer=scorer_method, limit=top_n)

        for j, match in enumerate(matches):
            row[f"hit_{j + 1}"] = match[0]
            row[f"score_{j + 1}"] = float(match[1])

        results.append(row)

    return pd.DataFrame(results)

In [17]:
query = ds_new.unique_x
corpus = list(set(ds_old.unique_y) | set(ds_new.unique_y))

file_name = ds_new.file_path.stem

print("Query len:", len(query))
print("Corpus len:", len(corpus))

print("File name:", file_name)

AttributeError: 'str' object has no attribute 'stem'

##### Sentence similarity

In [None]:
model_name = "BAAI/bge-large-en-v1.5"

df_embeddings_top = calculate_similarities_embedings(
    model_name=model_name,
    query=query,
    corpus=corpus,
    top_n=5,
)

save_similarities(
    df=df_embeddings_top,
    model_name=model_name,
    file_name=file_name,
)

##### Fuzzy matching

In [None]:
scorer_method = fuzz.WRatio

df_fuzzy_top = calculate_similarities_fuzzy(
    scorer_method=scorer_method,
    query=query,
    corpus=corpus,
    top_n=5,
)

save_similarities(
    df=df_fuzzy_top,
    model_name=str(scorer_method.__name__),
    file_name=file_name,
)
