# The North Face ecommerce

## Preprocessing of textual data

Importing libraries.

In [43]:
# Data manipulation
import pandas as pd

# ML
from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# NLP
import spacy
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS

# Visualization
import matplotlib.pyplot as plt
import wordcloud

# OS
import os

# RegEx
import re

Loading data.

In [None]:
# Locate the CSV file under the current working directory
data_path = os.getcwd() + "/data/sample-data.csv"

# Read it into a DataFrame, using the "id" column as the index
corpus = pd.read_csv(data_path, index_col="id")

# Preview the first rows
corpus.head()

Unnamed: 0_level_0,description
id,Unnamed: 1_level_1
1,Active classic boxers - There's a reason why o...
2,Active sport boxer briefs - Skinning up Glory ...
3,Active sport briefs - These superbreathable no...
4,"Alpine guide pants - Skin in, climb ice, switc..."
5,"Alpine wind jkt - On high ridges, steep ice an..."


Exploring the first documents of the corpus to determine which preprocessing steps are needed.

In [45]:
# Print the first documents (at most 5) in full.
# A positional slice of the column never raises, so this also works when the
# corpus holds fewer than 5 rows, and it avoids building a row Series per document.
for description in corpus["description"].iloc[:5]:
    print(description)

Active classic boxers - There's a reason why our boxers are a cult favorite - they keep their cool, especially in sticky situations. The quick-drying, lightweight underwear takes up minimal space in a travel pack. An exposed, brushed waistband offers next-to-skin softness, five-panel construction with a traditional boxer back for a classic fit, and a functional fly. Made of 3.7-oz 100% recycled polyester with moisture-wicking performance. Inseam (size M) is 4 1/2". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>"Silky Capilene 1 fabric is ultralight, breathable and quick-to-dry"</li> <li>"Exposed, brushed elastic waistband for comfort"</li> <li>5-panel construction with traditional boxer back</li> <li>"Inseam (size M) is 4 1/2"""</li></ul><br><br><b>Fabric: </b>3.7-oz 100% all-recycled polyester with Gladiodor natural odor control for the garment. Recyclable through the Common Threads Recycling Program<br><br><b>Weight: </b>99 g (3.5 oz)<br><br>

Cleaning the texts: removing HTML markup, numbers and special characters, and lowercasing every character.

In [46]:
# Patterns are compiled once and reused for every document
_html_tag_re = re.compile("<[A-Za-z/]+>")
_non_letter_re = re.compile("[^A-Za-z]+")


def _clean_description(doc):
    """Replace HTML tags, then every run of non-letter characters, by a space and lowercase the result."""
    without_markup = _html_tag_re.sub(" ", doc)
    return _non_letter_re.sub(" ", without_markup).lower()


# Remove HTML markup, numbers and special characters, and lower every character
corpus["description_cleaned"] = corpus["description"].apply(_clean_description)

Checking the result post-cleaning.

In [47]:
# Print 5 first documents
for i in range(5):
    print(corpus.iloc[i]["description_cleaned"])

active classic boxers there s a reason why our boxers are a cult favorite they keep their cool especially in sticky situations the quick drying lightweight underwear takes up minimal space in a travel pack an exposed brushed waistband offers next to skin softness five panel construction with a traditional boxer back for a classic fit and a functional fly made of oz recycled polyester with moisture wicking performance inseam size m is recyclable through the common threads recycling program details silky capilene fabric is ultralight breathable and quick to dry exposed brushed elastic waistband for comfort panel construction with traditional boxer back inseam size m is fabric oz all recycled polyester with gladiodor natural odor control for the garment recyclable through the common threads recycling program weight g oz made in mexico 
active sport boxer briefs skinning up glory requires enough movement without your boxers deciding to poach their own route the form fitting active sport bo

Tokenizing and lemmatizing the cleaned descriptions using the `spacy` pipeline `en_core_web_sm`. Removing the English stop words contained in the `spacy.lang.en.stop_words` module.

In [None]:
# Load the English spaCy pipeline en_core_web_sm
nlp = en_core_web_sm.load()

# Stream the cleaned descriptions through nlp.pipe so spaCy processes them in
# batches instead of one pipeline call per row. For each document, keep the
# lemma of every token whose text is not in the stop words set.
corpus["token_cleaned"] = [
    [token.lemma_ for token in doc if token.text not in STOP_WORDS]
    for doc in nlp.pipe(corpus["description_cleaned"])
]

corpus.head()

Unnamed: 0_level_0,description,description_cleaned,token_cleaned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Active classic boxers - There's a reason why o...,active classic boxers there s a reason why our...,"[active, classic, boxer, s, reason, boxer, cul..."
2,Active sport boxer briefs - Skinning up Glory ...,active sport boxer briefs skinning up glory re...,"[active, sport, boxer, brief, skin, glory, req..."
3,Active sport briefs - These superbreathable no...,active sport briefs these superbreathable no f...,"[active, sport, brief, superbreathable, fly, b..."
4,"Alpine guide pants - Skin in, climb ice, switc...",alpine guide pants skin in climb ice switch to...,"[alpine, guide, pant, skin, climb, ice, switch..."
5,"Alpine wind jkt - On high ridges, steep ice an...",alpine wind jkt on high ridges steep ice and a...,"[alpine, wind, jkt, high, ridge, steep, ice, a..."


Preparing the text for encoding by detokenizing it (joining the tokens of each document back into a single string).

In [None]:
# Rebuild one space-separated string per document from its list of cleaned tokens
corpus["vectorizer_ready"] = corpus["token_cleaned"].apply(" ".join)

corpus.head()

Unnamed: 0_level_0,description,description_cleaned,token_cleaned,vectorizer_ready
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Active classic boxers - There's a reason why o...,active classic boxers there s a reason why our...,"[active, classic, boxer, s, reason, boxer, cul...",active classic boxer s reason boxer cult favor...
2,Active sport boxer briefs - Skinning up Glory ...,active sport boxer briefs skinning up glory re...,"[active, sport, boxer, brief, skin, glory, req...",active sport boxer brief skin glory require mo...
3,Active sport briefs - These superbreathable no...,active sport briefs these superbreathable no f...,"[active, sport, brief, superbreathable, fly, b...",active sport brief superbreathable fly brief m...
4,"Alpine guide pants - Skin in, climb ice, switc...",alpine guide pants skin in climb ice switch to...,"[alpine, guide, pant, skin, climb, ice, switch...",alpine guide pant skin climb ice switch rock t...
5,"Alpine wind jkt - On high ridges, steep ice an...",alpine wind jkt on high ridges steep ice and a...,"[alpine, wind, jkt, high, ridge, steep, ice, a...",alpine wind jkt high ridge steep ice alpine ja...


Encoding the descriptions with a TF-IDF transformation using `TfidfVectorizer`.

In [None]:
# Instantiate the TF-IDF vectorizer with its "english" stop words setting
vectorizer = TfidfVectorizer(stop_words="english")

# Fit the vectorizer on the detokenized corpus, encode it, and convert the
# result to a dense array
corpus_vectorized = vectorizer.fit_transform(corpus["vectorizer_ready"]).toarray()

Creating a DataFrame in order to manipulate the data easily.

In [None]:
# Label each row "product_<id>" using the actual ids of the initial corpus
# instead of assuming they run contiguously from 1. The vectorizer was fitted
# on a column of corpus, so rows of corpus_vectorized follow the order of corpus.index.
product_list = [f"product_{product_id}" for product_id in corpus.index]

# Create a DataFrame from the vectorized corpus. Columns are the words of the
# vectorizer vocabulary and rows are the product descriptions.
corpus_vectorized_df = pd.DataFrame(
    data=corpus_vectorized,
    index=product_list,
    columns=vectorizer.get_feature_names_out(),
)

corpus_vectorized_df.head()

Unnamed: 0,abandon,ability,able,abrasion,abrasive,abroad,absolute,absorb,absorption,abstract,...,yosemite,young,yvon,zinger,zip,zipper,zippered,zipping,zips,zone
product_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
product_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
product_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
product_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.15216,0.176855,0.0,0.0,0.0
product_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147104,0.0,0.0,0.0,0.0


## Groups of products with similar descriptions 