* We expect the input to be the data produced by 010_data_consolidation

In [1]:
# Load data

In [2]:
# Directory holding the prepared data (expected to come from 010_data_consolidation).
# NOTE(review): `input` shadows the Python builtin of the same name. A later cell reads
# this variable, so any rename has to update that cell as well.
input = "prepared-data-20221013"

In [3]:
import datasets as ds

In [4]:
# Read the "train" split stored under the prepared-data directory and display its summary.
dataset = ds.load_from_disk(f"{input}/train")
dataset

Dataset({
    features: ['__typename', 'id', 'usItemId', 'fitmentLabel', 'name', 'checkStoreAvailabilityATC', 'seeShippingEligibility', 'brand', 'type', 'shortDescription', 'weightIncrement', 'imageInfo.thumbnailUrl', 'imageInfo.size', 'canonicalUrl', 'externalInfo', 'itemType', 'category.path', 'badges.flags', 'badges.tags', 'classType', 'averageRating', 'numberOfReviews', 'esrb', 'mediaRating', 'salesUnitType', 'sellerId', 'sellerName', 'hasSellerBadge', 'isEarlyAccessItem', 'earlyAccessEvent', 'annualEvent', 'availabilityStatusV2.display', 'availabilityStatusV2.value', 'groupMetaData.groupType', 'groupMetaData.groupSubType', 'groupMetaData.numberOfComponents', 'groupMetaData.groupComponents', 'productLocation', 'fulfillmentSpeed', 'offerId', 'preOrder.isPreOrder', 'preOrder.preOrderMessage', 'preOrder.preOrderStreetDateMessage', 'pac', 'priceInfo.priceRange', 'priceInfo.currentPrice.price', 'priceInfo.currentPrice.priceString', 'priceInfo.currentPrice.variantPriceString', 'priceInfo

In [5]:
# Keep only the columns used from here on and drop every other one.
keep_columns = ["id", "shortDescription", "category_id"]
columns_to_delete = {
    column for column in dataset.column_names if column not in keep_columns
}
dataset = dataset.remove_columns(sorted(columns_to_delete))
dataset

Dataset({
    features: ['id', 'shortDescription', 'category_id'],
    num_rows: 50243
})

In [6]:
def get_training_corpus(data=None, batch_size=1000):
    """Yield the short descriptions in consecutive batches.

    Args:
        data: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"shortDescription"`` entry. Defaults to
            the notebook-level ``dataset``.
        batch_size: Maximum number of descriptions per yielded batch.

    Yields:
        The ``"shortDescription"`` values of each slice, in order. The last
        batch may be shorter. Descriptions are passed through unchanged.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if data is None:
        # Looked up at call time so the default follows the current `dataset`.
        data = dataset
    for start in range(0, len(data), batch_size):
        yield data[start : start + batch_size]["shortDescription"]

In [7]:
training_filename = "training-data.txt"
# Write the corpus as plain text: every ';'-separated part of a description
# ends up on its own line, and each description is terminated by a newline.
with open(training_filename, "w", encoding="utf-8") as corpus_file:
    for row in dataset:
        corpus_file.write(row["shortDescription"].replace(";", "\n") + "\n")

# Fine-tuning
Let's follow https://huggingface.co/course/chapter3/3?fw=pt. Before fine-tuning, we first train a WordPiece tokenizer on the product descriptions.

In [8]:
!pip install transformers > /dev/null  # Shut up if there's no error

In [9]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
# Create the tokenizer around a WordPiece model constructed without a vocabulary;
# "[UNK]" is the unknown-word token and is also listed in the trainer's special tokens.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [10]:
# Use the ready-made BERT normalizer with lowercasing enabled.
# NOTE(review): the next cell assigns tokenizer.normalizer again, so this setting
# does not survive a top-to-bottom run of the notebook.
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [11]:
# Assemble the normalizer step by step: NFD decomposition, lowercasing,
# then accent stripping. This replaces any normalizer set before.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ]
)
# Try it out on the first description.
tokenizer.normalizer.normalize_str(dataset[0]["shortDescription"])

'delicious caramel macchiato flavored brew convenient coffee pods for single pod brewers helps burn calories & supports increased fat burning'

In [12]:
# Pre-tokenization: use the BERT pre-tokenizer, then show how it splits the
# normalized form of the first description (pieces with their character offsets).
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.pre_tokenizer.pre_tokenize_str(
    tokenizer.normalizer.normalize_str(dataset[0]["shortDescription"])
)

[('delicious', (0, 9)),
 ('caramel', (10, 17)),
 ('macchiato', (18, 27)),
 ('flavored', (28, 36)),
 ('brew', (37, 41)),
 ('convenient', (42, 52)),
 ('coffee', (53, 59)),
 ('pods', (60, 64)),
 ('for', (65, 68)),
 ('single', (69, 75)),
 ('pod', (76, 79)),
 ('brewers', (80, 87)),
 ('helps', (88, 93)),
 ('burn', (94, 98)),
 ('calories', (99, 107)),
 ('&', (108, 109)),
 ('supports', (110, 118)),
 ('increased', (119, 128)),
 ('fat', (129, 132)),
 ('burning', (133, 140))]

In [13]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
# There are 710000 (non-unique) words in training_filename. Let's assume that roughly 1% (about 7000)
# of them are related to the product and use that as the vocabulary size.
# NOTE(review): this comment previously read "10%=7000", but 10% of 710000 is 71000; confirm whether
# 1% / 7000 or 10% / 71000 was intended.
# Note: It could make sense to remove vocabulary related to sentiment (e.g. "delicious", "convenient")
trainer = trainers.WordPieceTrainer(vocab_size=7000, special_tokens=special_tokens)

In [14]:
# Train on the in-memory batches of descriptions produced by get_training_corpus().
# NOTE(review): the following cell replaces tokenizer.model and trains again from the
# text file, so the result of this run is discarded.
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [15]:
# Swap in a new WordPiece model and train it from the text file written earlier,
# in which every ';'-separated part of a description is a separate line.
tokenizer.model = models.WordPiece(unk_token="[UNK]")
tokenizer.train([training_filename], trainer=trainer)






In [16]:
# Persist the trained tokenizer to disk.
# NOTE(review): the file name says 20201013 while the input directory is
# prepared-data-20221013 — possibly a typo. Confirm before renaming, since other
# notebooks may load this exact path.
tokenizer.save("tokenizer-dataset-20201013")

In [17]:
# Encode the first description, then print the Encoding summary, its tokens
# and the matching vocabulary ids, one per line.
encoded = tokenizer.encode(dataset[0]["shortDescription"])
print(encoded, encoded.tokens, encoded.ids, sep="\n")

Encoding(num_tokens=21, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['delicious', 'caramel', 'macchiato', 'flavored', 'brew', 'convenient', 'coffee', 'pods', 'for', 'single', 'pod', 'brewers', 'helps', 'burn', 'calories', '&', 'supports', 'increase', '##d', 'fat', 'burning']
[238, 715, 3365, 450, 513, 651, 174, 990, 147, 792, 843, 1809, 2105, 5453, 302, 10, 2687, 6843, 91, 338, 6539]


In [18]:
# Attention mask of the example encoding (all ones in the output below).
print(encoded.attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [19]:
# Overflowing encodings attached to the example encoding (an empty list in the output below).
print(encoded.overflowing)

[]
