1. As a first step, you must pre-process the documents. In particular, for the text fields (title,
description) you should apply:
● Stop-word removal
● Tokenization
● Punctuation removal
● Stemming
● and... anything else you think is needed (bonus point)

In [3]:
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [18]:
import json
import regex

In [12]:
# Relative path to the raw product catalogue (a JSON array of product dictionaries).
dataset_path = '../../data/fashion_products_dataset.json'
# Pass the encoding explicitly: without it open() falls back to the platform's
# locale encoding (e.g. cp1252 on Windows), which can mis-decode or reject
# non-ASCII characters in a UTF-8 JSON file.
with open(dataset_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
print(f"Total products in dataset: {len(data)}")

# Peek at the first product's title as a quick sanity check of the load.
data_sample_title = data[0]['title']
print(f"Sample product title: {data_sample_title}")

Total products in dataset: 28080
Sample product title: Solid Women Multicolor Track Pants


In [23]:
def extract_product_details(details):
    """
    Extracts only the descriptive values from structured product_details.
    Example input: [{"Color": "Blue"}, {"Material": "Cotton"}]
    Output: "Blue Cotton"

    Argument:
    details -- a list of dictionaries mapping an attribute name to its value;
               None or an empty list is treated as "no details"

    Returns:
    a single string with every attribute value joined by one space, in input
    order (an empty string when there are no details)
    """
    if not details:
        return ""
    values = []
    for category in details:
        # Skip missing values and coerce the rest to text, so that str.join
        # does not fail on numeric (or otherwise non-string) attributes.
        values.extend(str(value) for value in category.values() if value is not None)
    return " ".join(values)

# Try the helper on the first product: flatten its structured details into one string.
sample_details = data[0]['product_details']
extracted_details = extract_product_details(sample_details)
print(f"Extracted product details: {extracted_details}")

Extracted product details: 1005COMBO2 Elastic Side Pockets Cotton Blend Solid Multicolor


In [24]:
def build_terms(document):
    """
    Preprocess the document text (title + description + product_details extracted) removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.

    Argument:
    document -- a dictionary with 'title', 'description' and 'product_details' keys

    Returns:
    tokens - a list of tokens corresponding to the input text after the preprocessing
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    text = document['title'] + ' ' + document['description'] + ' ' + extract_product_details(document['product_details'])
    text = text.lower()
    # Turn every character that is neither alphanumeric nor whitespace into a
    # space, so punctuation acts as a token boundary instead of gluing words.
    text = ''.join(char if char.isalnum() or char.isspace() else ' ' for char in text)
    # split() without an argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never produces empty strings; split(" ") would leave words
    # separated only by a newline or tab fused into a single token.
    terms = text.split()
    # Drop stop words first, then stem what is left.
    return [stemmer.stem(term) for term in terms if term not in stop_words]

# Run the text preprocessing on the first product and show the resulting tokens.
sample_document = data[0]
sample_terms = build_terms(sample_document)
print(f"Sample document terms: {sample_terms}")

Sample document terms: ['solid', 'women', 'multicolor', 'track', 'pant', 'yorker', 'trackpant', 'made', '100', 'rich', 'comb', 'cotton', 'give', 'rich', 'look', 'design', 'comfort', 'skin', 'friendli', 'fabric', 'itch', 'free', 'waistband', 'great', 'year', 'round', 'use', 'proudli', 'made', 'india', '1005combo2', 'elast', 'side', 'pocket', 'cotton', 'blend', 'solid', 'multicolor']


In [None]:
def build_metadata(document):
    """
    Preprocess the document other fields (category, sub_category, brand, seller) removing stop words,
    transforming in lowercase and return the tokens of the text.
    Unlike the title/description text, these tokens are not stemmed.

    Argument:
    document -- a dictionary with 'category', 'sub_category', 'brand', 'seller' keys

    Returns:
    tokens - a list of tokens corresponding to the input text after the preprocessing
    """
    stop_words = set(stopwords.words('english'))

    text = document['category'] + ' ' + document['sub_category'] + ' ' + document['brand'] + ' ' + document['seller']
    text = text.lower()
    # Turn every character that is neither alphanumeric nor whitespace into a
    # space, so punctuation acts as a token boundary instead of gluing words.
    text = ''.join(char if char.isalnum() or char.isspace() else ' ' for char in text)
    # split() without an argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never produces empty strings; split(" ") would leave words
    # separated only by a newline or tab fused into a single token.
    terms = text.split()
    return [term for term in terms if term not in stop_words]

# Run the metadata preprocessing on the first product and show the resulting tokens.
sample_document = data[0]
sample_metadata = build_metadata(sample_document)
print(f"Sample document metadata: {sample_metadata}")

Sample document metadata: ['clothing', 'accessories', 'bottomwear', 'york', 'shyam', 'enterprises']


In [28]:
def preprocess_document(document):
    """
    Build the preprocessed record for one product: the token lists produced by
    build_terms and build_metadata, plus a copy of the raw product fields.

    Argument:
    document -- a dictionary with the raw product fields ('pid', 'title', 'description',
                'brand', 'category', 'sub_category', 'product_details', 'seller',
                'out_of_stock', 'selling_price', 'discount', 'actual_price',
                'average_rating', 'url')

    Returns:
    a new dictionary holding 'pid', 'tokens' and 'metadata_tokens', followed by
    the raw fields copied unchanged from the input document
    """
    # Raw fields carried over as-is, in the order they appear in the result.
    copied_fields = (
        'title', 'description', 'brand', 'category', 'sub_category',
        'product_details', 'seller', 'out_of_stock', 'selling_price',
        'discount', 'actual_price', 'average_rating', 'url',
    )

    text_tokens = build_terms(document)
    meta_tokens = build_metadata(document)

    record = {
        'pid': document['pid'],
        'tokens': text_tokens,
        'metadata_tokens': meta_tokens,
    }
    for field in copied_fields:
        record[field] = document[field]
    return record

# Apply the full preprocessing pipeline to every product, keeping dataset order.
preprocessed_data = list(map(preprocess_document, data))
print(f"Total preprocessed products: {len(preprocessed_data)}")

Total preprocessed products: 28080
