In [1]:
import pandas as pd
import os
import torch
from transformers import RobertaTokenizer, RobertaForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoTokenizer, create_optimizer, AdamWeightDecay, TFAutoModelForMaskedLM
from transformers.keras_callbacks import PushToHubCallback
from datasets import Dataset
from tokenizers import Tokenizer
import string
import tensorflow as tf
import math



In [11]:
# !pip install datasets
# !pip install tf-keras
#!pip install --upgrade huggingface_hub
#! pip install 'transformers[torch]' accelerate
#! pip install torch torchvision torchaudio
#! pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [3]:
'''# Pull data from repository
examples_path = os.path.join('..', 'data', 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join('..', 'data', 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join('..', 'data', 'shopping_queries_dataset_sources.csv')

# Import files and concatentate
examples = pd.read_parquet(examples_path)
products = pd.read_parquet(products_path)
sources = pd.read_csv(sources_path)

examples_products = pd.merge(
    examples,
    products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

examples_products = examples_products[examples_products['product_locale'] == 'us']

task_4 = examples_products[examples_products['large_version'] == 1]'''

"# Pull data from repository\nexamples_path = os.path.join('..', 'data', 'shopping_queries_dataset_examples.parquet')\nproducts_path = os.path.join('..', 'data', 'shopping_queries_dataset_products.parquet')\nsources_path = os.path.join('..', 'data', 'shopping_queries_dataset_sources.csv')\n\n# Import files and concatentate\nexamples = pd.read_parquet(examples_path)\nproducts = pd.read_parquet(products_path)\nsources = pd.read_csv(sources_path)\n\nexamples_products = pd.merge(\n    examples,\n    products,\n    how='left',\n    left_on=['product_locale','product_id'],\n    right_on=['product_locale', 'product_id']\n)\n\nexamples_products = examples_products[examples_products['product_locale'] == 'us']\n\ntask_4 = examples_products[examples_products['large_version'] == 1]"

In [2]:
from datasets import load_dataset

# load Google Shopping dataset from hugging face
# link to dataset: https://huggingface.co/datasets/Marqo/google-shopping-general-eval/viewer?sql=--+The+SQL+console+is+powered+by+DuckDB+WASM+and+runs+entirely+in+the+browser.%0A--+Get+started+by+typing+a+query+or+selecting+a+view+from+the+options+below.%0ASELECT+*+FROM+data+LIMIT+10%3B
pretrain_dataset = load_dataset('Marqo/google-shopping-general-eval')


In [3]:
pretrain_dataset = pretrain_dataset['data']
pretrain_dataset = pretrain_dataset.remove_columns(['image', 'item_ID', 'position'])

In [4]:
pretrain_dataset[:2]

{'query': ['Adaptive Drinking Straws', "Baby Boys' Outerwear Jackets"],
 'title': ['Swig Reusable Straws + Cleaning Brush (Jeepers Creepers + Black Glitter)',
  "Carter's Baby Boy's Hooded Sweater Jacket Size 12 Months Beige Fleece"]}

In [5]:
from collections import Counter
import re

# Function to tokenize and clean text (remove punctuation, lowercasing, etc.)
def tokenize(text):
    # Convert to lowercase and remove non-alphabetic characters
    words = re.findall(r'\b\w+\b', text.lower())
    return words

# Apply tokenization and count frequencies
word_counter = Counter()

# Pull queries column to find misspellings
queries = pretrain_dataset['query']

for text in queries:
    words = tokenize(text)
    word_counter.update(words)

# View the most common words
print(word_counter.most_common(10))  # Adjust the number to see more/less

[('automotive', 62180), ('s', 59374), ('replacement', 52626), ('children', 22013), ('books', 19676), ('women', 19647), ('sports', 17840), ('travel', 17836), ('accessories', 17691), ('kits', 16576)]


In [6]:
from spellchecker import SpellChecker

# initialize spellchecker
spell = SpellChecker()

# loop through word frequencies to find words that might be misspelled
misspelled_words = {word: freq for word, freq in word_counter.items() if word not in spell}

# Sort misspelled words dict
sorted_misspelled = sorted(misspelled_words.items(), key= lambda x: x[1])

# Print misspelled words and their frequencies
for word, freq in sorted_misspelled[:30]:  # Adjust the number as needed
    print(f'Misspelled Word: {word}, Frequency: {freq}')

Misspelled Word: dns, Frequency: 11
Misspelled Word: lifejackets, Frequency: 18
Misspelled Word: jicama, Frequency: 19
Misspelled Word: pashminas, Frequency: 19
Misspelled Word: oled, Frequency: 20
Misspelled Word: henley, Frequency: 21
Misspelled Word: marsala, Frequency: 21
Misspelled Word: paracord, Frequency: 22
Misspelled Word: mortadella, Frequency: 23
Misspelled Word: bodyboards, Frequency: 25
Misspelled Word: eggnogs, Frequency: 28
Misspelled Word: lisbon, Frequency: 28
Misspelled Word: craniomandibular, Frequency: 29
Misspelled Word: temporomandibular, Frequency: 29
Misspelled Word: clawfoot, Frequency: 30
Misspelled Word: skooters, Frequency: 32
Misspelled Word: shilajit, Frequency: 32
Misspelled Word: edamame, Frequency: 32
Misspelled Word: venice, Frequency: 33
Misspelled Word: kauai, Frequency: 33
Misspelled Word: kettlebells, Frequency: 33
Misspelled Word: quickbooks, Frequency: 33
Misspelled Word: yellowstone, Frequency: 33
Misspelled Word: darbukas, Frequency: 33
Misspe

In [7]:
# Combine relevant text columns for training
pretrain_dataset = pretrain_dataset.map(lambda example: {
    'text': str(example['query']) + " " + str(example['title'])
})

In [8]:
# Define function to remove special characters and lowercase the text
puncts = string.punctuation
def process_text(batch, puncts):
    # Remove punctuation and lowercase each text in batch
    texts = []
    for text in batch['text']:
        text = str(text)  # Ensure it's a string
        for punc in puncts:
            text = text.replace(punc, '')
        text = text.lower()  # Convert to lowercase
        texts.append(text)
    return {'text': texts}


In [9]:
# Apply the text processing with batch=True and disable caching
pretrain_dataset = pretrain_dataset.map(
    lambda batch: process_text(batch, puncts), 
    batched=True, 
    load_from_cache_file=False
)

Map: 100%|██████████| 982700/982700 [00:06<00:00, 143313.11 examples/s]


In [10]:
pretrain_dataset

Dataset({
    features: ['query', 'title', 'text'],
    num_rows: 982700
})

In [11]:
print(pretrain_dataset['text'][:5])

['adaptive drinking straws swig reusable straws  cleaning brush jeepers creepers  black glitter', 'baby boys outerwear jackets carters baby boys hooded sweater jacket size 12 months beige fleece', 'mixed drinkware sets mismatched colors wine  water glasses collection set of 4 vintage', 'rod end bearings aurora mg8 rod end bearing 12', 'hanukkah music a hanukkah suite']


In [14]:
# Load pre-trained Distilled RoBerta tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
#model = RobertaForCausalLM.from_pretrained('distilroberta-base')


loading file vocab.json from cache at /Users/thomasburns/.cache/huggingface/hub/models--distilroberta-base/snapshots/fb53ab8802853c8e4fbdbcd0529f21fc6f459b2b/vocab.json
loading file merges.txt from cache at /Users/thomasburns/.cache/huggingface/hub/models--distilroberta-base/snapshots/fb53ab8802853c8e4fbdbcd0529f21fc6f459b2b/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/thomasburns/.cache/huggingface/hub/models--distilroberta-base/snapshots/fb53ab8802853c8e4fbdbcd0529f21fc6f459b2b/tokenizer_config.json
loading file tokenizer.json from cache at /Users/thomasburns/.cache/huggingface/hub/models--distilroberta-base/snapshots/fb53ab8802853c8e4fbdbcd0529f21fc6f459b2b/tokenizer.json
loading configuration file config.json from cache at /Users/thomasburns/.cache/huggingface/hub/models--distilroberta-base/snapshots/fb53ab8802853c8e4fbdbcd0529f21fc6f459b2b/config.json
Mod

In [15]:
# Create function to tokenize text
def preprocess_function(examples):
    return tokenizer(examples['text'],
                     truncation=True,
                     max_length=512)

In [16]:
# Tokenize dataset
tokenized_shopping = pretrain_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=['text', 'title', 'query']
)

In [17]:
block_size=128
chunk_size=128

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [18]:
# Apply group_texts function over dataset
lm_dataset = tokenized_shopping.map(group_texts, batched=True, num_proc=4)

In [19]:
lm_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 158160
})

In [20]:
# decode ids to ensure data can be recovered
tokenizer.decode(lm_dataset[1]["input_ids"])

' dry eyes advanced relief 10 ml</s><s>cover stock paper bright color paper colorful cardstock  85’’ x 11’’ letter paper size 65lb cover </s><s>fashion photography rare vintage american eccentric fashion photographer  cameras</s><s>ice hockey masks  shields hockey shield  replacement lenses  prizm clear</s><s>adobe certification adobe photoshop dasturi paperback</s><s>space fleet science fiction voyaging volume one the plague star a graphic novel book</s><s>kids microscopes discovery kids discovery mindblown microscope set 48piece with durable metal </s><s>ceiling fan pull chain ornaments space imaginext ion crab'

In [21]:
# decode labels to ensure they match ids
tokenizer.decode(lm_dataset[1]["labels"])

' dry eyes advanced relief 10 ml</s><s>cover stock paper bright color paper colorful cardstock  85’’ x 11’’ letter paper size 65lb cover </s><s>fashion photography rare vintage american eccentric fashion photographer  cameras</s><s>ice hockey masks  shields hockey shield  replacement lenses  prizm clear</s><s>adobe certification adobe photoshop dasturi paperback</s><s>space fleet science fiction voyaging volume one the plague star a graphic novel book</s><s>kids microscopes discovery kids discovery mindblown microscope set 48piece with durable metal </s><s>ceiling fan pull chain ornaments space imaginext ion crab'

In [22]:
# Initialize data collator to randomly mask some of the tokens in each batch (15%)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [23]:
# Test masking to ensure it worked
samples = [lm_dataset[i] for i in range(2)]

for chunk in data_collator(samples)['input_ids']:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s>adapt<mask> drinking straws swig reusable straw<mask> <mask> brush jeepers creepers  black glitter</s><s>baby boys outerwear<mask> carters baby boys hooded sweater jacket<mask><mask> months beige flee<mask></s><s>mixed drinkware sets<mask>atched colors wine  water glasses collection Obst of 4 vintage</s><s>rod end bearings aurora mg8 rod<mask> bearing 12</s><s>hanukkah music a<mask>anukkah suite</s><s>climbing active protection hardware metolius climbing  mountaineering<mask> ultralight<mask> cam set</s><s>dry<mask> relief products lot of cradle<mask>rop vet plus for<mask>'

'>>> <mask><mask> advanced relief 10 ml</s><s>cover stock paper bright color paper colorful cardstock  85’’ x 11’<mask><mask> letter paper size 65lb cover </s><s>fashion photography rare vintage<mask>an eccentric fashion photographer<mask><mask></s><s>ice hockey masks  shields hockey shield  replacement lenses <mask>zm clear</s><s>adobe certification adobe photoshop<mask><mask>uri paperback</s><s>space fle

In [24]:
# Split dataset into 70% train and 30% test
train_test_split = lm_dataset.train_test_split(test_size=0.30)

# Access the train and test splits
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Display the number of samples in each set
print(f"Train set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Train set size: 110712
Test set size: 47448


In [25]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 110712
})

In [26]:
def compute_metrics(eval_pred):
    # The eval_pred object contains predictions and label_ids (true labels).
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    loss = eval_pred.metrics["eval_loss"]  # Get the evaluation loss
    perplexity = math.exp(loss)  # Compute perplexity from loss
    return {"perplexity": perplexity}

In [32]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/Users/thomasburns/opt/miniconda3/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/Users/thomasburns/opt/miniconda3/lib/python3.9/site-packages/huggingface_hub/commands/huggingface_cli.py", line 57, in main
  

In [33]:
# Pull pre-trained model from huggingface hub
model = TFAutoModelForMaskedLM.from_pretrained("twburns/group12_mlm_Distilled_Roberta")

loading configuration file config.json from cache at /Users/thomasburns/.cache/huggingface/hub/models--twburns--group12_mlm_Distilled_Roberta/snapshots/65e9ed7fd0f0980ea6f8873d2013fbc2b0eb3942/config.json
Model config RobertaConfig {
  "_name_or_path": "twburns/group12_mlm_Distilled_Roberta",
  "architectures": [
    "RobertaForCausalLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file model.safetensors from cache at /Users/thomasburns

In [34]:
training_args = TrainingArguments(
    output_dir="group12_mlm_Distilled_Roberta",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01, 
    remove_unused_columns = False,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    report_to='none'
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# trainer.train()

PyTorch: setting up devices


AttributeError: 'TFRobertaForMaskedLM' object has no attribute 'to'

In [30]:

eval_results = trainer.evaluate()

print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

100%|██████████| 5931/5931 [17:20<00:00,  5.70it/s]

Perplexity: 23.00





In [None]:
# initialize Adam optimizer
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
# Convert datasets to tf.data.Dataset format

tf_train_set = model.prepare_ft_dataset(
    train_dataset,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator
)

tf_test_set = model.prepare_tf.dataset(
    test_dataset,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator
)

In [None]:
# Configure the model for training
model.compile(optimizer=optimizer)

In [None]:
callback = PushToHubCallback(
    output_dir="group12_mlm_Distilled_Roberta",
    tokenizer=tokenizer
)

In [None]:
model.fit(x=tf_train_set, validation_dat=tf_test_set, epochs=5, callbacks=[callback])