In [9]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
import json

In [5]:
# Cleaning helpers at three levels of aggressiveness (raw, light, moderate),
# plus the shared NLTK resources those helpers rely on.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests

def clean_raw(text):
    """Lowercase ``text`` and split it on whitespace, with no other cleaning.

    Args:
        text: Value to tokenize. Non-string values are converted with ``str``.
            Missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text.

    Returns:
        list[str]: Lowercased, whitespace-separated tokens; ``[]`` when
        ``text`` is missing or blank.
    """
    # Without this guard str(None) / str(nan) would produce the bogus tokens
    # 'none' / 'nan', which could then "match" genuine query words.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []
    return str(text).lower().split()

def clean_light(text):
    """Lowercase ``text``, strip punctuation/symbols and drop stop words.

    Args:
        text: Value to tokenize. Non-string values are converted with ``str``.
            Missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text.

    Returns:
        list[str]: Remaining tokens in their original order; ``[]`` when
        ``text`` is missing or nothing survives the cleaning.
    """
    # Without this guard str(None) / str(nan) would produce the bogus tokens
    # 'none' / 'nan', which could then "match" genuine query words.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []
    # Remove everything that is neither a word character nor whitespace.
    # \w is used rather than [a-z] on purpose so that digits are kept.
    text = re.sub(r'[^\w\s]', '', str(text).lower())
    tokens = text.split()
    return [w for w in tokens if w not in stop_words]

def clean_moderate(text):
    """Apply ``clean_light`` to ``text`` and lemmatize every surviving token.

    Each token is passed to ``lemmatizer.lemmatize`` with its default
    arguments.

    Returns:
        list: One lemmatized entry per token returned by ``clean_light``,
        in the same order.
    """
    return list(map(lemmatizer.lemmatize, clean_light(text)))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\schel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\schel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Create keyword extraction function
def extract_keywords(df, fields, cleaner_fn):
    """Count tokens that a row's query shares with each of its text fields.

    For every row, the ``'query'`` value and each column named in ``fields``
    are tokenized with ``cleaner_fn``. A token present in both the query and
    a field is counted once for that (row, field) pair, so a token shared
    with several fields of the same row is counted once per field.

    Args:
        df: DataFrame containing a ``'query'`` column and every column listed
            in ``fields``.
        fields: Iterable of column names to compare against the query.
        cleaner_fn: Callable mapping a cell value to an iterable of tokens.

    Returns:
        collections.Counter: Mapping of token -> number of (row, field) pairs
        in which the token appears in both the query and the field.

    Raises:
        KeyError: If ``'query'`` or one of ``fields`` is not a column of ``df``.
    """
    keyword_counter = Counter()

    # Walk the needed columns in lockstep instead of df.iterrows(): no Series
    # is built per row and each value keeps its own column's dtype.
    columns = [df[field] for field in fields]
    for query, *field_values in zip(df['query'], *columns):
        query_tokens = set(cleaner_fn(query))
        if not query_tokens:
            # Nothing can be shared with an empty query; skip the field work.
            continue

        for value in field_values:
            # intersection() returns a set, so each shared token adds 1.
            keyword_counter.update(query_tokens.intersection(cleaner_fn(value)))

    return keyword_counter

In [7]:
# Read the feature table from parquet (path is relative to the notebook's working directory)
df_load = pd.read_parquet("../data/processed/df_features.parquet")
# Work on a copy so df_load stays exactly as it was read from disk
df = df_load.copy()

In [8]:
# Product text columns whose tokens are compared against the query
text_columns = [
    'product_title',
    'product_description',
    'product_bullet_point',
    'product_brand',
    'product_color',
]

# Run the extraction once per cleaning level (raw -> light -> moderate)
raw_keywords, light_keywords, moderate_keywords = [
    extract_keywords(df, text_columns, cleaner)
    for cleaner in (clean_raw, clean_light, clean_moderate)
]

# Show the ten most frequent shared keywords for each cleaning level
for heading, counts in (
    ("Raw Top 10:", raw_keywords),
    ("Light Cleaned Top 10:", light_keywords),
    ("Moderate Cleaned Top 10:", moderate_keywords),
):
    print(heading, counts.most_common(10))


Raw Top 10: [('for', 114), ('therapy', 51), ('cold', 50), ('charger', 42), ('coffee', 41), ('tobacco', 39), ('pillow', 36), ('throw', 35), ('smoking', 34), ('black', 33)]
Light Cleaned Top 10: [('therapy', 53), ('cold', 50), ('black', 44), ('coffee', 42), ('charger', 42), ('tobacco', 40), ('pillow', 36), ('throw', 35), ('smoking', 34), ('calcium', 34)]
Moderate Cleaned Top 10: [('pillow', 86), ('dress', 71), ('charger', 67), ('therapy', 53), ('cold', 50), ('black', 44), ('coffee', 42), ('tobacco', 40), ('bottle', 35), ('throw', 35)]


In [10]:
# Convert each keyword Counter to a plain dict and write it out as JSON
raw_keywords_dict = dict(raw_keywords)
light_keywords_dict = dict(light_keywords)
moderate_keywords_dict = dict(moderate_keywords)

# One artifact file per cleaning level, written in the same order as before
for artifact_path, keyword_dict in (
    ("../src/text_processing/artifacts/raw_keywords.json", raw_keywords_dict),
    ("../src/text_processing/artifacts/light_keywords.json", light_keywords_dict),
    ("../src/text_processing/artifacts/moderate_keywords.json", moderate_keywords_dict),
):
    with open(artifact_path, "w") as f:
        json.dump(keyword_dict, f, indent=2)