In [91]:
from utils import Utils
from tqdm import tqdm 
import re
import random

In [None]:
# Build the project helper, pointing it at the .env file one directory up.
# NOTE(review): presumably Utils reads connection settings/credentials from
# that file — confirm in utils.py.
utils = Utils("../.env")

In [None]:
# Fetch the full dataset through the helper. The following cell calls
# .keys() on the result, so it is consumed as a mapping.
vals = utils.get_full()

In [None]:
# The mapping's keys are the raw vendor strings that get normalised below.
vendors = list(vals.keys())
# Preview the first ten entries as the cell output.
vendors[0:10]

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from geonamescache import GeonamesCache

In [None]:
# Fetch the NLTK resources this notebook relies on (tokenizer models and the
# stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')

stemmer = SnowballStemmer("english")

# Token-level stop words: English function words plus corporate boilerplate
# and US state names/abbreviations, none of which help tell vendors apart.
stop_words = set(stopwords.words("english"))
custom_stopwords = {
    "llc", "inc", "corp", "company", "corporation", "group", 
    "limited", "technologies", "solutions", "systems", 
    "enterprises", "international", "global", "services",
    "industries", "manufacturing", "partners", "holdings"
}
us_states = ["alabama", "alaska", "arizona", "arkansas", "california", "colorado", "connecticut", "delaware", "florida", "georgia", "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana", "maine", "maryland", "massachusetts", "michigan", "minnesota", "mississippi", "missouri", "montana", "nebraska", "nevada", "new hampshire", "new jersey", "new mexico", "new york", "north carolina", "north dakota", "ohio", "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina", "south dakota", "tennessee", "texas", "utah", "vermont", "virginia", "washington", "west virginia", "wisconsin", "wyoming"]
us_states_abbreviations = ["al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy"]

stop_words.update(custom_stopwords)
stop_words.update(us_states)
stop_words.update(us_states_abbreviations)

# Phrase-level place names (lower-cased city names) consumed by remove_city.
gc = GeonamesCache()
place_names = {
    place_data["name"].lower() for place_data in gc.get_cities().values()
}

# stop_words is checked one token at a time in token_and_stem, so multi-word
# states such as "new hampshire" or "north carolina" can never match there.
# Register them as place names as well so the phrase-aware remove_city strips
# them; otherwise fragments like "new" or "north" survive normalisation.
place_names.update(state for state in us_states if " " in state)

In [None]:
def remove_city(word, places=None, max_words=3):
    """Remove known place names from a space-separated string.

    The string is split on single spaces and scanned left to right. At each
    position the longest candidate phrase (``max_words`` tokens, then one
    fewer, down to a single token) is looked up in ``places``; the first hit
    is dropped and scanning resumes after it. Tokens that start no known
    place name are kept in their original order.

    Args:
        word: Text to clean. It is compared verbatim against ``places``, so
            it should already be in the same case as the entries there.
        places: Container of place-name phrases supporting ``in``. Defaults
            to the notebook-level ``place_names`` set.
        max_words: Longest phrase, in tokens, to try at each position.
            Values below 1 disable removal entirely.

    Returns:
        The remaining tokens joined by single spaces. Empty tokens produced
        by consecutive spaces in the input are preserved.
    """
    if places is None:
        # Resolved at call time so the notebook-level set can be rebuilt
        # without redefining this function.
        places = place_names

    tokens = word.split(" ")
    kept_tokens = []
    i = 0
    while i < len(tokens):
        # Longest phrase first, so "salt lake city" wins over "salt lake".
        for size in range(max_words, 0, -1):
            if i + size <= len(tokens) and " ".join(tokens[i:i + size]) in places:
                i += size
                break
        else:
            # No phrase starting here is a place name: keep the token.
            kept_tokens.append(tokens[i])
            i += 1

    return " ".join(kept_tokens)

In [None]:
def token_and_stem(item):
    """Normalise a vendor string into a space-joined sequence of stems.

    Steps, in order: lower-case the text, drop standalone runs of three or
    more digits, replace every non-alphanumeric character with a space,
    strip place names with ``remove_city``, tokenize, discard stop words and
    stem what remains.

    Args:
        item: Raw vendor string.

    Returns:
        The surviving stems joined by single spaces (possibly empty).
    """
    text = item.lower()
    text = re.sub(r'\b\d{3,}\b', '', text)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = remove_city(text)
    return ' '.join(
        stemmer.stem(tok)
        for tok in word_tokenize(text)
        if tok not in stop_words
    )

In [None]:
def function_tester(func, data):
    """Apply ``func`` to every item and group the inputs by its output.

    Prints the number of inputs and the number of distinct outputs so the
    amount of collapsing achieved by ``func`` is visible at a glance.

    Args:
        func: Callable mapping an item to a hashable key.
        data: Sized iterable of items to process.

    Returns:
        Dict mapping each distinct ``func`` result to the list of original
        items that produced it, in encounter order.
    """
    grouped = {}
    progress = tqdm(data, desc="Processing", unit="item")
    for original in progress:
        grouped.setdefault(func(original), []).append(original)

    print("Original Length: ", len(data))
    print("New Length: ", len(grouped))
    return grouped

In [None]:
# Normalise every vendor name and group the original strings by their
# normalised key; function_tester also prints the before/after counts.
processed_dict = function_tester(token_and_stem, vendors)

In [None]:
# Alphabetical list of the normalised keys; the matching cells further down
# iterate over this list.
sorted_keys = sorted(processed_dict)

# Print each normalised key next to the raw vendor strings that map to it.
for stem_key in sorted_keys:
    originals = processed_dict[stem_key]
    print(f"{stem_key}: {originals}")



In [None]:
from fuzzywuzzy import fuzz
# Minimum fuzz.ratio score (0-100) for two normalised keys to be reported as
# a possible duplicate.
FUZZY_MATCH_THRESHOLD = 80

# For every normalised key, list the other keys whose fuzzy string similarity
# reaches the threshold. Candidates come from sorted_keys: the previously used
# name `unique_stems` is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
potential_matches = {
    word1: [
        word2
        for word2 in sorted_keys
        if word1 != word2 and fuzz.ratio(word1, word2) >= FUZZY_MATCH_THRESHOLD
    ]
    for word1 in sorted_keys
}
potential_matches

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def _first_token_vector(text):
    """Return the model's first-position output vector for ``text`` as a NumPy array."""
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        result = model(**encoded)
    # First output tensor -> first (only) batch item -> first token position.
    return result[0][0][0].numpy()


# One embedding per normalised key, in the same order as sorted_keys.
embeddings = [_first_token_vector(key) for key in sorted_keys]

# Full pairwise similarity matrix; row/column k corresponds to sorted_keys[k].
cosine_similarities = cosine_similarity(embeddings, embeddings)
similarity_threshold = 0.9

# For every key, collect the other keys whose embedding similarity reaches
# the threshold.
potential_matches = {
    key: [
        other
        for col, other in enumerate(sorted_keys)
        if cosine_similarities[row][col] >= similarity_threshold and key != other
    ]
    for row, key in enumerate(sorted_keys)
}
potential_matches


In [None]:
# Re-display the match dictionary. In a top-to-bottom run this is the
# embedding-based result, since that cell is the last one to assign
# potential_matches.
potential_matches