In [9]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
import json

In [5]:
# Define functions to allow for different levels of cleaning
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_raw(text):
    return str(text).lower().split()

def clean_light(text):
    text = re.sub(r'[^\w\s]', '', str(text).lower())  # remove punctuation and symbols
    # text = re.sub(r'[^a-z\s]', '', str(text).lower())  # commented out to allow for numbers
    tokens = text.split()
    return [w for w in tokens if w not in stop_words]

def clean_moderate(text):
    tokens = clean_light(text)
    return [lemmatizer.lemmatize(w) for w in tokens]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\schel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\schel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
df_load = pd.read_parquet("../data/processed/df.parquet")
df = df_load.copy()

In [6]:
# Create keyword extraction function
def extract_keywords(df, fields, cleaner_fn):
    keyword_counter = Counter()

    for _, row in df.iterrows():
        query_tokens = set(cleaner_fn(row['query']))
        
        for field in fields:
            field_tokens = set(cleaner_fn(row[field]))
            shared = query_tokens.intersection(field_tokens)
            keyword_counter.update(shared)
    
    return keyword_counter

In [8]:
# Product text fields to clean
text_columns = ['product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color']

# Clean and extract keywords
raw_keywords = extract_keywords(df, text_columns, clean_raw)
light_keywords = extract_keywords(df, text_columns, clean_light)
moderate_keywords = extract_keywords(df, text_columns, clean_moderate)

print("Raw Top 10:", raw_keywords.most_common(10))
print("Light Cleaned Top 10:", light_keywords.most_common(10))
print("Moderate Cleaned Top 10:", moderate_keywords.most_common(10))


Raw Top 10: [('for', 114), ('therapy', 51), ('cold', 50), ('charger', 42), ('coffee', 41), ('tobacco', 39), ('pillow', 36), ('throw', 35), ('smoking', 34), ('black', 33)]
Light Cleaned Top 10: [('therapy', 53), ('cold', 50), ('black', 44), ('coffee', 42), ('charger', 42), ('tobacco', 40), ('pillow', 36), ('throw', 35), ('smoking', 34), ('calcium', 34)]
Moderate Cleaned Top 10: [('pillow', 86), ('dress', 71), ('charger', 67), ('therapy', 53), ('cold', 50), ('black', 44), ('coffee', 42), ('tobacco', 40), ('bottle', 35), ('throw', 35)]


In [10]:
# Save extracted keywords to json dict
raw_keywords_dict = dict(raw_keywords)
light_keywords_dict = dict(light_keywords)
moderate_keywords_dict = dict(moderate_keywords)

with open("../src/text_processing/artifacts/raw_keywords.json", "w") as f:
    json.dump(raw_keywords_dict, f, indent=2)

with open("../src/text_processing/artifacts/light_keywords.json", "w") as f:
    json.dump(light_keywords_dict, f, indent=2)

with open("../src/text_processing/artifacts/moderate_keywords.json", "w") as f:
    json.dump(moderate_keywords_dict, f, indent=2)

#### Try simple combining step for baseline (without imputation first)

#### Future Optimization Notes

* Text Component Ordering: Consider reordering fields in `combine_product_text()` to prioritize short, essential fields first:
  - Current: Title > Description > Features > Brand > Color  
  - Better: Title > Brand > Color > Features > Description
  - Prevents truncation of critical search attributes (brand/color) when hitting character limits

In [15]:
# Text combination and processing functions (future src/text_processing/)
def simple_clean_text(text):
    """Minimal text cleaning for baseline."""
    if pd.isna(text):
        return ""
    return str(text).lower().strip()

# max_chars at 2000 to begin but may need to adjust truncations to be below ~512 for transformer models
def combine_product_text(row, max_chars=2000, use_imputation=False):
    """Combine product text fields into single string for embedding."""
    # Get text fields
    title = simple_clean_text(row['product_title'])
    description = simple_clean_text(row['product_description'])
    bullet_point = simple_clean_text(row['product_bullet_point'])
    brand = simple_clean_text(row['product_brand'])
    color = simple_clean_text(row['product_color'])
    
    # Optional imputation (for later comparison)
    if use_imputation:
        # Use bullet_point for missing description
        if not description and bullet_point:
            description = bullet_point
        # Use description for missing bullet_point (truncated)
        if not bullet_point and description:
            bullet_point = description[:500]
    
    # Combine with clear separators
    components = []
    if title:
        components.append(f"Title: {title}")
    if description:
        components.append(f"Description: {description}")
    if bullet_point:
        components.append(f"Bullets: {bullet_point}")
    if brand:
        components.append(f"Brand: {brand}")
    if color:
        components.append(f"Color: {color}")
    
    combined = " | ".join(components)
    
    # Truncate if too long
    if len(combined) > max_chars:
        combined = combined[:max_chars].rsplit(' ', 1)[0] + "..."
    
    return combined

In [14]:
df.head(1)

Unnamed: 0,example_id,query,query_id,product_id,split,product_title,product_description,product_bullet_point,product_brand,product_color
0,20232,1 cup coffee maker without water reservoir,711,B07GV2S1GS,train,"Keurig K-Mini Coffee Maker, Single Serve K-Cup...",,"FITS ANYWHERE: Less than 5 inches wide, perfec...",Keurig,Black


In [16]:
# Test the function
print("Testing combine_product_text function:")
sample_row = df.iloc[0]
print(f"Query: {sample_row['query']}")
print(f"Combined text (baseline): {combine_product_text(sample_row)[:200]}...")
print(f"Combined text (with imputation): {combine_product_text(sample_row, use_imputation=True)[:200]}...")

Testing combine_product_text function:
Query: 1 cup coffee maker without water reservoir
Combined text (baseline): Title: keurig k-mini coffee maker, single serve k-cup pod coffee brewer, 6 to 12 oz. brew sizes, matte black | Bullets: fits anywhere: less than 5 inches wide, perfect for small spaces
your perfect am...
Combined text (with imputation): Title: keurig k-mini coffee maker, single serve k-cup pod coffee brewer, 6 to 12 oz. brew sizes, matte black | Description: fits anywhere: less than 5 inches wide, perfect for small spaces
your perfec...


In [17]:
# Create baseline and comparison datasets
## Baseline: minimal cleaning, no imputation
df_baseline = df.copy()
df_baseline['combined_text'] = df_baseline.apply(
    lambda row: combine_product_text(row, max_chars=2000, use_imputation=False), 
    axis=1
)

## Comparison: with imputation (for later analysis)
df_imputed = df.copy()
df_imputed['combined_text'] = df_imputed.apply(
    lambda row: combine_product_text(row, max_chars=2000, use_imputation=True), 
    axis=1
)

print("Dataset comparison:")
print(f"Baseline combined text - Min: {df_baseline['combined_text'].str.len().min()}, "
      f"Max: {df_baseline['combined_text'].str.len().max()}, "
      f"Mean: {df_baseline['combined_text'].str.len().mean():.1f}")

print(f"Imputed combined text - Min: {df_imputed['combined_text'].str.len().min()}, "
      f"Max: {df_imputed['combined_text'].str.len().max()}, "
      f"Mean: {df_imputed['combined_text'].str.len().mean():.1f}")

# Check how many rows have empty combined text
empty_baseline = (df_baseline['combined_text'].str.len() == 0).sum()
empty_imputed = (df_imputed['combined_text'].str.len() == 0).sum()
print(f"\nRows with empty combined text - Baseline: {empty_baseline}, Imputed: {empty_imputed}")


Dataset comparison:
Baseline combined text - Min: 15, Max: 2002, Mean: 1109.2
Imputed combined text - Min: 15, Max: 2002, Mean: 1276.7

Rows with empty combined text - Baseline: 0, Imputed: 0


In [20]:
# Save preliminarily processed datasets
df_baseline.to_parquet("../data/processed/df_baseline_clean.parquet")
df_imputed.to_parquet("../data/processed/df_imputed_clean.parquet")