### Imports

In [17]:
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import rapidfuzz
from dotenv import dotenv_values
from nltk.tokenize import word_tokenize
from openai import OpenAI
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Read the key/value settings stored in the .env file one directory up.
config = dotenv_values("../.env")

### Data

In [None]:
# Folder the input workbooks are read from (RAW_DATA_DIR setting).
data_dir = Path(config["RAW_DATA_DIR"])
print("data_dir:", data_dir)

# Folder the result CSV files are written to (RESULTS_DIR setting).
results_dir = Path(config["RESULTS_DIR"])
print("results_dir:", results_dir)

In [4]:
class DataSet:
    """Table of name pairs loaded from an Excel workbook.

    The workbook is read once at construction time, cleaned, and kept in
    memory. ``x_column`` and ``y_column`` name the two columns whose distinct
    values are exposed through ``unique_x`` and ``unique_y``.
    """

    def __init__(self, file_path: Path, x_column: str, y_column: str):
        """Remember the source path and column names, then load the data.

        Args:
            file_path: Location of the Excel workbook to read.
            x_column: Header of the column exposed by ``unique_x``.
            y_column: Header of the column exposed by ``unique_y``.
        """
        self.file_path = file_path
        self.x_column = x_column
        self.y_column = y_column

        self._df = self.read_data()

    @staticmethod
    def _clean_frame(df: pd.DataFrame) -> pd.DataFrame:
        """Strip whitespace from text columns and drop incomplete rows.

        Only object/string columns go through ``.str.strip()``; calling the
        ``.str`` accessor on a numeric or datetime column raises
        ``AttributeError``, so those columns are passed through untouched.
        Non-string entries inside a text column come back from ``.str.strip()``
        as missing values and are therefore removed by ``dropna`` together
        with rows that were already incomplete.
        """

        def strip_text(column: pd.Series) -> pd.Series:
            is_text = pd.api.types.is_object_dtype(
                column
            ) or pd.api.types.is_string_dtype(column)
            return column.str.strip() if is_text else column

        return df.apply(strip_text).dropna()

    def read_data(self) -> pd.DataFrame:
        """Read the workbook at ``file_path`` and return the cleaned frame."""
        return self._clean_frame(pd.read_excel(self.file_path))

    @property
    def unique_x(self) -> list[str]:
        """Distinct values of ``x_column`` in order of first appearance."""
        return self.df[self.x_column].unique().tolist()

    @property
    def unique_y(self) -> list[str]:
        """Distinct values of ``y_column`` in order of first appearance."""
        return self.df[self.y_column].unique().tolist()

    @property
    def df(self) -> pd.DataFrame:
        """The cleaned data frame loaded at construction time."""
        return self._df

    def __len__(self) -> int:
        """Number of rows that survived cleaning."""
        return len(self.df)

In [5]:
# Original workbook; x/y are the column headers used in that file.
ds_old = DataSet(
    file_path=data_dir / "Employer Match Data - BCG.xlsx",
    x_column="EID SCANNED EMPLOYER NAME",
    y_column="TML EMPLOYER NAME",
)
# Extended workbook; it uses different column headers than the original one.
ds_new = DataSet(
    file_path=data_dir / "Employer Match Data - BCG - extended.xlsx",
    x_column="EID Employer Name",
    y_column="TML List Names",
)

In [None]:
# Fetch each unique-value list once; every property access recomputes it.
old_x, old_y = ds_old.unique_x, ds_old.unique_y
new_x, new_y = ds_new.unique_x, ds_new.unique_y

print("Old employers:")
print("x:", old_x)
print("y:", old_y)

print("\nNew employers:")
print("x:", new_x)
print("y:", new_y)

print("\n----------------")
print(f"Old count (x): {len(old_x)}")
print(f"Old count (y): {len(old_y)}")

print(f"\nNew count (x): {len(new_x)}")
print(f"New count (y): {len(new_y)}")

# Overlap and union of the y-values found in the two workbooks.
shared_y = set(old_y) & set(new_y)
combined_y = set(old_y) | set(new_y)

print("\n----------------")
print(f"Total common (y): {len(shared_y)}")
print(f"Total unique (y): {len(combined_y)}")

In [None]:
# Names to be matched: every distinct x-value of the extended workbook.
query = ds_new.unique_x
# Candidate pool: union of the y-values of both workbooks. The union is sorted
# because the iteration order of a set of strings changes between interpreter
# runs (hash randomization); an unsorted corpus would make argmax tie-breaks
# and corpus positions differ from run to run.
corpus = sorted(set(ds_old.unique_y) | set(ds_new.unique_y))

# Stem of the extended workbook's file name; used as prefix of the result files.
file_name = ds_new.file_path.stem

print("Query len:", len(query))
print("Corpus len:", len(corpus))

print("File name:", file_name)

In [8]:
def save_similarities(df: pd.DataFrame, model_name: str, file_name: str) -> None:
    """Write a result table to ``<file_name>__<model_name>.csv`` in ``results_dir``.

    Args:
        df: Table to save; written without its index.
        model_name: Label of the matching method. ``/``, ``-`` and ``.`` are
            replaced by ``_`` and leading/trailing ``_`` are removed so the
            label is safe to embed in a file name.
        file_name: Prefix of the output file name.
    """
    # One pass over the string instead of three chained replace() calls.
    model_name_flat = model_name.translate(str.maketrans("/-.", "___")).strip("_")
    save_path = results_dir / f"{file_name}__{model_name_flat}.csv"

    # to_csv does not create missing folders, so make sure the target exists.
    save_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(save_path, index=False)

    print(f"\n💾 Results saved to {save_path.name}")

### TF-IDF

In [9]:
# Fit the TF-IDF vocabulary on the corpus only, then project the query names
# into that same vector space.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", use_idf=True, norm="l2")
database_tfidf = tfidf_vectorizer.fit_transform(corpus)
texts_tfidf = tfidf_vectorizer.transform(query)
# One row per query name, one column per corpus name.
cosine_similarities = cosine_similarity(texts_tfidf, database_tfidf)
# Column index of the highest-scoring corpus name for each query row.
# NOTE(review): a query with no token in the fitted vocabulary presumably
# yields an all-zero row, for which argmax returns index 0 - confirm whether
# such rows should be filtered by score downstream.
top_match_indices = np.argmax(cosine_similarities, axis=1)

In [None]:
# Best corpus name per query and the similarity it was selected with.
tfidf_hits = np.array(corpus)[top_match_indices]
tfidf_row_ids = np.arange(len(query))
tfidf_scores = cosine_similarities[tfidf_row_ids, top_match_indices]

top_matches_tfidf = pd.DataFrame(
    {"input": query, "hit_1": tfidf_hits, "score_1": tfidf_scores}
)

save_similarities(top_matches_tfidf, "tfidf", file_name)

### BM25

In [None]:
# Fetch the "punkt_tab" NLTK resource (presumably the tokenizer data that
# word_tokenize relies on - confirm against the installed NLTK version).
nltk.download('punkt_tab')

In [13]:
# Split every corpus and query name into word tokens.
corpus_tokenized = [word_tokenize(name) for name in corpus]
query_tokenized = [word_tokenize(name) for name in query]

# Index the corpus, then score every corpus entry against each query:
# one row per query name, one column per corpus name.
bm25 = BM25Okapi(corpus_tokenized)
scores_matrix = np.array([bm25.get_scores(tokens) for tokens in query_tokenized])
# Column index of the highest-scoring corpus name for each query row.
best_match_indices = scores_matrix.argmax(axis=1)

In [None]:
# One record per query name: the best BM25 hit and its score.
bm25_records = [
    {
        "input": name,
        "hit_1": corpus[hit_idx],
        "score_1": scores_matrix[row, hit_idx],
    }
    for row, (name, hit_idx) in enumerate(zip(query, best_match_indices))
]
top_matches_bm25 = pd.DataFrame(bm25_records)

save_similarities(top_matches_bm25, "bm25", file_name)

### Embeddings

In [None]:
# OpenAI API client, authenticated with the OPENAI_API_KEY setting.
client = OpenAI(api_key=config["OPENAI_API_KEY"])

In [30]:
def get_embeddings(data, model="text-embedding-ada-002"):
    """Return one embedding vector per input text, in input order.

    Args:
        data: List of strings to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list with one embedding (list of floats) per entry of ``data``;
        an empty list when ``data`` is empty.
    """
    # Nothing to embed: skip the API round trip.
    if len(data) == 0:
        return []

    # Report only the batch size; echoing the whole payload floods the output.
    print(f"Embedding {len(data)} text(s) with {model}")
    response = client.embeddings.create(input=data, model=model)
    return [e.embedding for e in response.data]

In [21]:
def process_by_chunk(data, process_chunk, chunk_size):
    """Apply ``process_chunk`` to consecutive slices of ``data`` and join the results.

    Args:
        data: Sliceable sequence to process.
        process_chunk: Callable that takes one slice of ``data`` and returns
            an array-like result for it.
        chunk_size: Maximum number of items per slice; must be positive.

    Returns:
        The per-slice results concatenated along the first axis, or an empty
        array when ``data`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    total = len(data)
    if total == 0:
        # np.concatenate refuses an empty list of arrays.
        return np.array([])

    processed_chunks = []
    for start_idx in range(0, total, chunk_size):
        # Clamp first so the progress line never reports an index past the end.
        end_idx = min(start_idx + chunk_size, total)
        print(f"Processing {start_idx}-{end_idx} of {total}", end="\r")

        processed_chunks.append(process_chunk(data[start_idx:end_idx]))

    # Finish the carriage-return progress line so later output starts cleanly.
    print()
    return np.concatenate(processed_chunks)

In [None]:
# Embed the query names in batches of 1000 so each API request stays bounded.
query_embeddings = process_by_chunk(query, get_embeddings, 1000)

In [None]:
# Embed the corpus names in batches of 1000 items per get_embeddings call.
corpus_embeddings = process_by_chunk(corpus, get_embeddings, 1000)

In [37]:
# Cosine similarity of every query embedding against every corpus embedding:
# one row per query name, one column per corpus name.
similarity_scores = cosine_similarity(query_embeddings, corpus_embeddings)

In [None]:
# Select the best corpus entry per query from the embedding similarities
# themselves, so each reported hit matches the score shown next to it.
embedding_match_indices = np.argmax(similarity_scores, axis=1)

top_matches_embeddings = pd.DataFrame([
    {
        "input": name,
        "hit_1": corpus[hit_idx],
        "score_1": similarity_scores[row, hit_idx],
    }
    for row, (name, hit_idx) in enumerate(zip(query, embedding_match_indices))
])

save_similarities(top_matches_embeddings, "text-embedding-ada-002", file_name)

### Fuzzy

In [39]:
def find_best_fuzzy_match(query, database, threshold=50):
    """Find the entry of ``database`` most similar to ``query``.

    Similarity is measured with ``rapidfuzz.fuzz.ratio``; candidates scoring
    below ``threshold`` are ignored.

    Returns:
        ``[match, score]`` for the best candidate, or ``[None, None]`` when
        no candidate reaches ``threshold``.
    """
    best = rapidfuzz.process.extractOne(
        query, database, scorer=rapidfuzz.fuzz.ratio, score_cutoff=threshold
    )
    if not best:
        return [None, None]

    # extractOne yields (choice, score, position); the position is not needed.
    choice, score, _ = best
    return [choice, score]

In [40]:
# Start from an empty frame so the columns come out as input, hit_1, score_1.
top_matches_fuzzy = pd.DataFrame(columns=["input", "hit_1", "score_1"])
# One [match, score] pair per query name; a pair is [None, None] when no
# corpus entry reaches the cutoff used by find_best_fuzzy_match.
top_matches_fuzzy[["hit_1", "score_1"]] = [
    find_best_fuzzy_match(t, corpus) for t in query
]
top_matches_fuzzy["input"] = query

In [None]:
# Write the fuzzy-matching results next to the other methods' result files.
save_similarities(top_matches_fuzzy, "fuzzy_ratio", file_name)
