In [None]:
from collections import defaultdict
import difflib
import spacy
import pandas as pd

### Importing Data

In [None]:
# Location of the input CSV file; the loaded frame is stored in `df`.
path_to_csv = ""  # add your path here

# Fail fast with an actionable message when the placeholder above was left
# empty, instead of letting read_csv fail on an empty file name.
if not path_to_csv:
    raise ValueError(
        "path_to_csv is empty: set it to the location of your CSV file"
    )
df = pd.read_csv(path_to_csv)

### Grouping Markers: Levenshtein distance

In [None]:
# Collect the first 500 rows of the "Keyword" column as a plain Python list.
# Missing values are dropped: a NaN float is not a string and would break the
# string-similarity comparisons run on these keywords.
list_keywords = df["Keyword"].iloc[:500].dropna().tolist()
# De-duplicate while keeping first-occurrence order. A set() would yield an
# order that changes between kernel sessions (string hash randomisation), and
# any order-dependent processing of this list would then not be reproducible.
unique_keywords = list(dict.fromkeys(list_keywords))

# Configurable similarity cut-off: the higher the value, the more similar two
# keywords have to be in order to be grouped together.
# NOTE: despite the name, the score compared against this threshold is
# difflib.SequenceMatcher.ratio(), not a true Levenshtein edit distance.
levenshtein_threshold = 0.6
similar_groups = []

# Greedy single-pass grouping: every keyword joins the FIRST existing group
# that contains a sufficiently similar member; otherwise it starts a new group.
for keyword in unique_keywords:
    # tracks whether the keyword was placed into an existing group
    found_group = False

    for group in similar_groups:
        # the keyword matches a group when its similarity to at least one
        # member of that group is strictly above the threshold
        matches_group = any(
            difflib.SequenceMatcher(None, keyword, existing_keyword).ratio()
            > levenshtein_threshold
            for existing_keyword in group
        )
        if matches_group:
            group.append(keyword)
            found_group = True
            # stop scanning the remaining groups so the keyword ends up in
            # exactly one group rather than in every group it resembles
            break

    # no existing group matched: the keyword seeds a new group
    if not found_group:
        similar_groups.append([keyword])

# print groups (one list of keywords per line)
print("Groups based on Levenshtein Distance:")
for group in similar_groups:
    print(group)

### Grouping Markers: Pretrained Pipeline

In [None]:
# load spaCy model with word vectors
nlp = spacy.load("en_core_web_md")

# Configurable similarity cut-off: the higher the value, the more similar two
# keywords have to be in order to be grouped together.
similarity_threshold = 0.8

# Run the pipeline once per keyword and reuse the parsed documents below;
# calling nlp() inside the comparison loop would re-process the same strings
# for every single pairwise comparison.
keyword_docs = {keyword: nlp(keyword) for keyword in unique_keywords}

# Maps a group's seed keyword (the first keyword placed in it) to the list of
# keywords assigned to that group. Note that this rebinds `similar_groups`.
similar_groups = defaultdict(list)

# Greedy single-pass grouping: every keyword joins the FIRST existing group
# that contains a sufficiently similar member; otherwise it seeds a new group.
for keyword1 in unique_keywords:
    # tracks whether the keyword was placed into an existing group
    group_found = False
    doc1 = keyword_docs[keyword1]

    for group, group_keywords in similar_groups.items():
        # the keyword matches a group when its similarity to at least one
        # member of that group reaches the threshold
        if any(
            doc1.similarity(keyword_docs[keyword2]) >= similarity_threshold
            for keyword2 in group_keywords
        ):
            group_keywords.append(keyword1)
            group_found = True
            # stop scanning: the keyword belongs to exactly one group
            break

    # no existing group matched: the keyword seeds a new group under its own name
    if not group_found:
        similar_groups[keyword1].append(keyword1)

# Print each group's seed keyword together with its member keywords
for group, keywords in similar_groups.items():
    print(f"Group: {group} - Similar Keywords: {', '.join(keywords)}")