# The Home Depot - Use Case Study (Content recommender system)

## Library initialization and Data Load

In [1]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Download necessary NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Load stopwords and initialize lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load datasets
content_data = pd.read_csv('data/content_data_MASTER.csv')
label_data = pd.read_csv('data/labels_MASTER.csv', header=None)
test_data = pd.read_csv('data/test_MASTER.csv')

# Set the header using the row at index 726
label_data.columns = label_data.iloc[726].values
label_data = label_data.drop(index=726)

# Display the first few rows for exploration
print(content_data.head())
print(label_data.head())
print(test_data.head())

                                    slug  \
0         best-grow-lights-for-seedlings   
1          best-wall-decor-for-your-home   
2  how-to-kill-weeds-early-in-the-season   
3               best-pellets-for-smoking   
4                best-work-light-for-you   

                                   title  
0         Best Grow Lights for Seedlings  
1          Best Wall Decor for Your Home  
2  How to Kill Weeds Early in the Season  
3             Grill Pellets Buying Guide  
4                Best Work Light for You  
                                          searchTerm  \
0                                   #4 #14 connector   
1  03-09 z1000 04-12 z750 05-07 z750s 11-12 z750r...   
2                                1 gal #380c-3 paint   
3                                   1 gal #610 paint   
4                                 1 gal #ae-38 paint   

                                slug     Label  
0             types-of-pipe-fittings  RELEVANT  
1     best-air-filters-for-your-home  RELE

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shaki\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shaki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shaki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Exploratory Data Analysis

In [2]:
# 1. Basic Information
print("Basic Information of the Content Data:")
print(content_data.info())

print("\nBasic Information of the Labels Data:")
print(label_data.info())

print("\nBasic Information of the Test Data:")
print(test_data.info())

Basic Information of the Content Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3191 entries, 0 to 3190
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   slug    3191 non-null   object
 1   title   3191 non-null   object
dtypes: object(2)
memory usage: 50.0+ KB
None

Basic Information of the Labels Data:
<class 'pandas.core.frame.DataFrame'>
Index: 853 entries, 0 to 853
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   searchTerm  853 non-null    object
 1   slug        853 non-null    object
 2   Label       853 non-null    object
dtypes: object(3)
memory usage: 26.7+ KB
None

Basic Information of the Test Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   searchTerm  5 non-null      object
dtypes: object(1)
memory

In [3]:
# 2. Summary statistics
print("\nSummary of Content Data:")
print(content_data.describe())

print("\nSummary of Labels Data:")
print(label_data.describe())

print("\nSummary of Test Data:")
print(test_data.describe())


Summary of Content Data:
                        slug                        title
count                   3191                         3191
unique                  3141                         3146
top     garage-storage-ideas  How to Get Rid of Fire Ants
freq                       3                            3

Summary of Labels Data:
              searchTerm                 slug     Label
count                853                  853       853
unique               853                  499         2
top     #4 #14 connector  bathroom-tile-ideas  RELEVANT
freq                   1                   11       612

Summary of Test Data:
                                   searchTerm
count                                       5
unique                                      5
top     34 in. to 36 in. x 72 in. shower door
freq                                        1


In [4]:
# 4. Distribution of 'Label' column in the labels data
print("\nDistribution of the 'Label' column in Labels Data:")
print(label_data['Label'].value_counts())


Distribution of the 'Label' column in Labels Data:
Label
RELEVANT        612
NOT RELEVANT    241
Name: count, dtype: int64


In [5]:
# 5. Check for duplicates in the content data
print("\nDuplicate Titles in Content Data:")
print(content_data[content_data.duplicated(subset='title', keep=False)])


Duplicate Titles in Content Data:
                                                   slug  \
43                         how-to-clean-an-air-purifier   
46                   best-safety-equipment-for-painting   
125             how-to-insulate-windows-in-cold-weather   
212   the-best-ice-cream-makers-for-homemade-frozen-...   
216                         how-to-paint-interior-doors   
...                                                 ...   
2669                       front-yard-landscaping-ideas   
2715  5-clever-kitchen-upgrades-you-never-knew-you-n...   
2730                               best-carpet-for-pets   
3107             how-to-install-hexagon-tile-backsplash   
3108             how-to-install-hexagon-tile-backsplash   

                                                  title  
43                         How to Clean an Air Purifier  
46                   Best Safety Equipment for Painting  
125             How to Insulate Windows in Cold Weather  
212   The Best Ice Cream

In [6]:
# 6. Frequent search terms in the labels data
print("\nTop 10 Frequent Search Terms in Labels Data:")
print(label_data['searchTerm'].value_counts().head(10))


Top 10 Frequent Search Terms in Labels Data:
searchTerm
#4 #14 connector                         1
klein tools electronics                  1
kohler 2-piece 1.28 toilet in biscuit    1
kohler 22.25 in. sink in sandbar         1
laguna point patio furniture             1
laminate floor sealant                   1
lbl lighting 1-light led                 1
led sunlite uv light bulb                1
led vanity light                         1
legrand conduit elbow                    1
Name: count, dtype: int64


In [7]:
# 7. Frequent content titles in the content data
print("\nTop 10 Frequent Content Titles in Content Data:")
print(content_data['title'].value_counts().head(10))


Top 10 Frequent Content Titles in Content Data:
title
How to Get Rid of Fire Ants         3
Garage Storage Ideas                3
How to Hang a Wallpaper Border      2
How to Clean Stainless Steel        2
How to Put on a Duvet Cover         2
How to Build a Chicken Coop         2
How to Care for Orchids             2
How to Pack for a Move              2
Types of Sandpaper and Abrasives    2
How to Clean a Toilet               2
Name: count, dtype: int64


## Data Cleaning

In [8]:
#labels.to_csv('labels_check.csv', index=False)
#content.to_csv('content_check.csv', index=False)
#test.to_csv('test_check.csv', index=False)

In [9]:
# Define preprocessing function with stopword removal, lemmatization, and POS tagging
def preprocess_text(text):
    text = text.lower()
    tokens = text.split()
    processed_tokens = []
    for word, tag in pos_tag(tokens):
        if word not in stop_words:
            pos = tag[0].lower()
            pos = pos if pos in ['a', 'n', 'v'] else 'n'  # Use 'n' (noun) for all other cases
            lemmatized_word = lemmatizer.lemmatize(word, pos)
            processed_tokens.append(lemmatized_word)
    return ' '.join(processed_tokens)

# Apply preprocessing to content titles, search terms, and labels
content_data['preprocessed_title'] = content_data['title'].apply(preprocess_text)
label_data['preprocessed_searchTerm'] = label_data['searchTerm'].apply(preprocess_text)
test_data['preprocessed_searchTerm'] = test_data['searchTerm'].apply(preprocess_text)

## Model Building

In [10]:
# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [11]:
# Function to get BERT embeddings for a sentence
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    hidden_states = outputs.hidden_states
    sentence_embedding = torch.mean(hidden_states[-1], dim=1)
    return sentence_embedding

# Get BERT embeddings for all content titles
content_embeddings = []
for title in content_data['preprocessed_title']:
    content_embeddings.append(get_bert_embeddings(title).detach().numpy())

content_embeddings = np.vstack(content_embeddings)

In [12]:
# Function to generate recommendations for a search term using cosine similarity
def generate_recommendations(search_term, content_embeddings, content_data, top_n=5, similarity_threshold=0.12):
    search_embedding = get_bert_embeddings(search_term).detach().numpy()
    cosine_similarities = cosine_similarity(search_embedding, content_embeddings).flatten()
    
    # Filter out content below the similarity threshold
    valid_indices = np.where(cosine_similarities >= similarity_threshold)[0]
    
    if len(valid_indices) == 0:
        return pd.DataFrame()  # Return empty DataFrame if no content passes the threshold
    
    top_indices = valid_indices[np.argsort(cosine_similarities[valid_indices])[-top_n:][::-1]]
    
    recommendations = content_data.iloc[top_indices].copy()  # Copy to avoid warnings
    recommendations.loc[:, 'similarity_score'] = cosine_similarities[top_indices]
    
    return recommendations[['slug', 'title', 'similarity_score']]

In [13]:
# Generate recommendations for labeled dataset and evaluate
y_true = []
y_pred = []

for i, row in label_data.iterrows():
    search_term = row['preprocessed_searchTerm']
    slug = row['slug']
    true_label = row['Label'] == 'RELEVANT'
    
    # Get recommendations for the search term with top_n set to 5 and fine-tuned threshold
    recommendations = generate_recommendations(search_term, content_embeddings, content_data, top_n=5, similarity_threshold=0.12)
    
    # Check if the relevant content slug is in the top recommendations
    predicted_label = slug in recommendations['slug'].values
    y_true.append(true_label)
    y_pred.append(predicted_label)

# Evaluate model accuracy
print("Classification Report:")
print(classification_report(y_true, y_pred))
print("Accuracy Score:", accuracy_score(y_true, y_pred))

Classification Report:
              precision    recall  f1-score   support

       False       0.31      0.98      0.47       241
        True       0.96      0.15      0.26       612

    accuracy                           0.39       853
   macro avg       0.64      0.57      0.37       853
weighted avg       0.78      0.39      0.32       853

Accuracy Score: 0.38569753810082064


In [14]:
# Generate recommendations for each search term in the test data
recommendations_dict = {}
for search_term in test_data['preprocessed_searchTerm']:
    recommendations_dict[search_term] = generate_recommendations(search_term, content_embeddings, content_data)

# Display recommendations for each search term in the test set
for search_term, recs in recommendations_dict.items():
    print(f"Search Term: {search_term}")
    print(recs)
    print("\n")

Search Term: 34 in. 36 in. x 72 in. shower door
                                    slug  \
3117     six-step-walk-in-shower-install   
873      how-to-clean-glass-shower-doors   
1746  how-to-install-a-pivot-shower-door   
376      types-of-shower-bases-and-walls   
330              the-best-shower-curtain   

                                          title  similarity_score  
3117            Six Step Walk-In Shower Install          0.691530  
873             How to Clean Glass Shower Doors          0.663976  
1746         How To Install a Pivot Shower Door          0.656824  
376             Types of Shower Bases and Walls          0.655347  
330   The Best Shower Curtain for Your Bathroom          0.652931  


Search Term: outdoor prelit christmas tree
                                       slug  \
2991  festive-outdoor-christmas-decorations   
730           how-to-water-a-christmas-tree   
1213                     DIY-holiday-pallet   
2760   traditional-nostalgic-christmas-tree   