In [43]:
import pandas as pd
import os
import torch
import torchvision
from torch import autocast
from transformers import RobertaTokenizer, RobertaForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoTokenizer, AdamWeightDecay, AutoModelForMaskedLM, default_data_collator, RobertaTokenizer, RobertaForMaskedLM
from transformers.keras_callbacks import PushToHubCallback
from datasets import Dataset
from tokenizers import Tokenizer
import string
import tensorflow as tf
import math
from torch.utils.data import DataLoader
import collections
import numpy as np
from huggingface_hub import notebook_login
#import dask.dataframe as dd

# Update to Pandas to Dask

In [5]:
# !pip install datasets
# !pip install tf-keras
#!pip install --upgrade huggingface_hub
#! pip install 'transformers[torch]' accelerate
#! pip install torch torchvision torchaudio
#! pip install pyspellchecker
#! pip install ipywidgets



<div style="background-color: #2B5269; color: white; padding: 10px; ">
<h1> Pre-processing <h1>
</div>

In [6]:
from datasets import load_dataset

# load Google Shopping dataset from hugging face
# link to dataset: https://huggingface.co/datasets/Marqo/google-shopping-general-eval/viewer?sql=--+The+SQL+console+is+powered+by+DuckDB+WASM+and+runs+entirely+in+the+browser.%0A--+Get+started+by+typing+a+query+or+selecting+a+view+from+the+options+below.%0ASELECT+*+FROM+data+LIMIT+10%3B
pretrain_dataset = load_dataset('Marqo/google-shopping-general-eval')


Resolving data files:   0%|          | 0/45 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/45 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/44 [00:00<?, ?it/s]

In [7]:
pretrain_dataset = pretrain_dataset['data']
pretrain_dataset = pretrain_dataset.remove_columns(['image', 'item_ID', 'position'])

In [8]:
pretrain_dataset[:2]

{'query': ['Adaptive Drinking Straws', "Baby Boys' Outerwear Jackets"],
 'title': ['Swig Reusable Straws + Cleaning Brush (Jeepers Creepers + Black Glitter)',
  "Carter's Baby Boy's Hooded Sweater Jacket Size 12 Months Beige Fleece"]}

In [9]:
from collections import Counter
import re

# Function to tokenize and clean text (remove punctuation, lowercasing, etc.)
def tokenize(text):
    # Convert to lowercase and remove non-alphabetic characters
    words = re.findall(r'\b\w+\b', text.lower())
    return words

# Apply tokenization and count frequencies
word_counter = Counter()

# Pull queries column to find misspellings
queries = pretrain_dataset['query']

for text in queries:
    words = tokenize(text)
    word_counter.update(words)

# View the most common words
print(word_counter.most_common(10))  # Adjust the number to see more/less

[('automotive', 62180), ('s', 59374), ('replacement', 52626), ('children', 22013), ('books', 19676), ('women', 19647), ('sports', 17840), ('travel', 17836), ('accessories', 17691), ('kits', 16576)]


In [10]:
from spellchecker import SpellChecker

# initialize spellchecker
spell = SpellChecker()

# loop through word frequencies to find words that might be misspelled
misspelled_words = {word: freq for word, freq in word_counter.items() if word not in spell}

# Sort misspelled words dict
sorted_misspelled = sorted(misspelled_words.items(), key= lambda x: x[1])

# Print misspelled words and their frequencies
for word, freq in sorted_misspelled[:30]:  # Adjust the number as needed
    print(f'Misspelled Word: {word}, Frequency: {freq}')

Misspelled Word: dns, Frequency: 11
Misspelled Word: lifejackets, Frequency: 18
Misspelled Word: jicama, Frequency: 19
Misspelled Word: pashminas, Frequency: 19
Misspelled Word: oled, Frequency: 20
Misspelled Word: henley, Frequency: 21
Misspelled Word: marsala, Frequency: 21
Misspelled Word: paracord, Frequency: 22
Misspelled Word: mortadella, Frequency: 23
Misspelled Word: bodyboards, Frequency: 25
Misspelled Word: eggnogs, Frequency: 28
Misspelled Word: lisbon, Frequency: 28
Misspelled Word: craniomandibular, Frequency: 29
Misspelled Word: temporomandibular, Frequency: 29
Misspelled Word: clawfoot, Frequency: 30
Misspelled Word: skooters, Frequency: 32
Misspelled Word: shilajit, Frequency: 32
Misspelled Word: edamame, Frequency: 32
Misspelled Word: venice, Frequency: 33
Misspelled Word: kauai, Frequency: 33
Misspelled Word: kettlebells, Frequency: 33
Misspelled Word: quickbooks, Frequency: 33
Misspelled Word: yellowstone, Frequency: 33
Misspelled Word: darbukas, Frequency: 33
Misspe

In [11]:
# Combine relevant text columns for training
pretrain_dataset = pretrain_dataset.map(lambda example: {
    'text': str(example['query']) + " " + str(example['title'])
})

In [12]:
# Define function to remove special characters and lowercase the text
puncts = string.punctuation
def process_text(batch, puncts):
    # Remove punctuation and lowercase each text in batch
    texts = []
    for text in batch['text']:
        text = str(text)  # Ensure it's a string
        for punc in puncts:
            text = text.replace(punc, '')
        text = text.lower()  # Convert to lowercase
        texts.append(text)
    return {'text': texts}


In [13]:
# Apply the text processing with batch=True and disable caching
pretrain_dataset = pretrain_dataset.map(
    lambda batch: process_text(batch, puncts), 
    batched=True, 
    load_from_cache_file=False
)

Map:   0%|          | 0/982700 [00:00<?, ? examples/s]

In [14]:
pretrain_dataset

Dataset({
    features: ['query', 'title', 'text'],
    num_rows: 982700
})

In [15]:
print(pretrain_dataset['text'][:5])

['adaptive drinking straws swig reusable straws  cleaning brush jeepers creepers  black glitter', 'baby boys outerwear jackets carters baby boys hooded sweater jacket size 12 months beige fleece', 'mixed drinkware sets mismatched colors wine  water glasses collection set of 4 vintage', 'rod end bearings aurora mg8 rod end bearing 12', 'hanukkah music a hanukkah suite']


<div style="background-color: #2B5269; color: white; padding: 10px; ">
<h1> Training Distilled Roberta Model<h1>
</div>

In [16]:
# Load pre-trained Distilled RoBerta tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

model = RobertaForCausalLM.from_pretrained('distilroberta-base')

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [17]:
# Create function to tokenize text
def preprocess_function(examples):
    return tokenizer(examples['text'],
                     truncation=True,
                     max_length=512)

In [18]:
# Tokenize dataset
tokenized_shopping = pretrain_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=['text', 'title', 'query']
)

Map (num_proc=4):   0%|          | 0/982700 [00:00<?, ? examples/s]

In [19]:
block_size=128
chunk_size=128

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [20]:
# Apply group_texts function over dataset
lm_dataset = tokenized_shopping.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/982700 [00:00<?, ? examples/s]

In [21]:
lm_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 158160
})

In [22]:
# decode ids to ensure data can be recovered
tokenizer.decode(lm_dataset[1]["input_ids"])

' dry eyes advanced relief 10 ml</s><s>cover stock paper bright color paper colorful cardstock  85’’ x 11’’ letter paper size 65lb cover </s><s>fashion photography rare vintage american eccentric fashion photographer  cameras</s><s>ice hockey masks  shields hockey shield  replacement lenses  prizm clear</s><s>adobe certification adobe photoshop dasturi paperback</s><s>space fleet science fiction voyaging volume one the plague star a graphic novel book</s><s>kids microscopes discovery kids discovery mindblown microscope set 48piece with durable metal </s><s>ceiling fan pull chain ornaments space imaginext ion crab'

In [23]:
# decode labels to ensure they match ids
tokenizer.decode(lm_dataset[1]["labels"])

' dry eyes advanced relief 10 ml</s><s>cover stock paper bright color paper colorful cardstock  85’’ x 11’’ letter paper size 65lb cover </s><s>fashion photography rare vintage american eccentric fashion photographer  cameras</s><s>ice hockey masks  shields hockey shield  replacement lenses  prizm clear</s><s>adobe certification adobe photoshop dasturi paperback</s><s>space fleet science fiction voyaging volume one the plague star a graphic novel book</s><s>kids microscopes discovery kids discovery mindblown microscope set 48piece with durable metal </s><s>ceiling fan pull chain ornaments space imaginext ion crab'

In [24]:
# Dynamically pad sentences to the longest length of batch
tokenizer.pad_token = tokenizer.eos_token

# Initialize data collator to randomly mask some of the tokens in each batch (15%)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [25]:
# Test masking to ensure it worked
samples = [lm_dataset[i] for i in range(2)]

for chunk in data_collator(samples)['input_ids']:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s>adaptive drinking straws perishig<mask> straw<mask>  cleaning brush jeepers creepers  black glitter</s><s><mask> boys outerwear jackets carters inspired boys hood<mask> sweater jacket sizeotive months beige fleece</s><s>mixed drink<mask> sets mism<mask> colors wine <mask> glasses collection set of 4 vintage</s><s>rod end bearings aurora mg8 rod<mask> bearing 12</s><s>hanukkah music a hanukkah suite</s><s>climbing active protection hardware metolius<mask>  mountaineering equipment ultralight power camIDENT</s><s>dry eye relief products lot of<mask> idrop vet plus for moderate'

'>>>  dry eyes advanced relief<mask> ml</s><s><mask> stock paper bright color paper<mask> cardstock  85�<mask>’ x 11’’ letter paper size<mask>lb cover </s><s>ption photography rare vintage american eccentric fashion<mask>  cameras</s><s><mask> hockey masks  shields hockey shield  replacement lenses  prizm clear</s><s>adobeATHER<mask>obe patriarchoshop d<mask>uri paperback</s><s>space fleet science fictio

In [26]:
# Split dataset into 70% train and 30% test
train_test_split = lm_dataset.train_test_split(test_size=0.30)

# Access the train and test splits
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Display the number of samples in each set
print(f"Train set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Train set size: 110712
Test set size: 47448


In [27]:
train_test_split

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 110712
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 47448
    })
})

In [28]:
# Pull pre-trained model and tokenizer from huggingface hub
model = RobertaForCausalLM.from_pretrained("twburns/group12_mlm_Distilled_Roberta")
tokenizer = AutoTokenizer.from_pretrained('twburns/group12_mlm_Distilled_Roberta')

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [29]:
def compute_metrics(eval_pred):
    # The eval_pred object contains predictions and label_ids (true labels).
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    loss = eval_pred.metrics["eval_loss"]  # Get the evaluation loss
    perplexity = math.exp(loss)  # Compute perplexity from loss
    return {"perplexity": perplexity}

In [30]:
'''training_args = TrainingArguments(
    output_dir="group12_mlm_Distilled_Roberta",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    save_steps=100_000,
    save_total_limit=2,
    weight_decay=0.01, 
    remove_unused_columns = False,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)'''

'training_args = TrainingArguments(\n    output_dir="group12_mlm_Distilled_Roberta",\n    eval_strategy="epoch",\n    learning_rate=2e-5,\n    num_train_epochs=5,\n    save_steps=100_000,\n    save_total_limit=2,\n    weight_decay=0.01, \n    remove_unused_columns = False,\n    per_device_train_batch_size=16,\n    per_device_eval_batch_size=16\n    #push_to_hub=True,\n)\n\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    compute_metrics=compute_metrics,\n    train_dataset=train_dataset,\n    eval_dataset=test_dataset,\n    data_collator=data_collator,\n    tokenizer=tokenizer,\n)'

In [31]:
#trainer.train()

In [32]:

#eval_results = trainer.evaluate()

#print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [33]:
# Move the model to GPU (if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [34]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> Group 12 Pretrained Distilled Roberta number of parameters: {round(distilbert_num_parameters)}M'")

'>>> Group 12 Pretrained Distilled Roberta number of parameters: 82M'


In [44]:
# Load pre-trained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("twburns/group12_mlm_Distilled_Roberta")
model = RobertaForMaskedLM.from_pretrained("twburns/group12_mlm_Distilled_Roberta")



In [77]:
text = "Adjustable Car <mask> Holder for Easy Navigation"

In [72]:
inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits

# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Adjustable Car holder Holder for Easy Navigation'
'>>> Adjustable Car Holder Holder for Easy Navigation'
'>>> Adjustable Car holders Holder for Easy Navigation'
'>>> Adjustable Car lock Holder for Easy Navigation'
'>>> Adjustable Car opener Holder for Easy Navigation'


In [78]:
# Function to filter out duplicates in the final output
def get_unique_replacements(text, top_tokens):
    seen_replacements = set()
    results = []

    for token in top_tokens:
        decoded_token = tokenizer.decode([token]).strip()
        # Check if this replacement has already been used
        if decoded_token not in seen_replacements:
            seen_replacements.add(decoded_token)
            # Replace the mask token and store the result
            result = text.replace(tokenizer.mask_token, decoded_token)
            results.append(result)

    return results

# Sample the top tokens and generate unique replacements
top_5_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()
unique_replacements = get_unique_replacements(text, top_5_tokens)

# Print unique replacements
for replacement in unique_replacements:
    print(f"'>>> {replacement}'")


'>>> Adjustable Car holder Holder for Easy Navigation'
'>>> Adjustable Car Holder Holder for Easy Navigation'
'>>> Adjustable Car holders Holder for Easy Navigation'
'>>> Adjustable Car lock Holder for Easy Navigation'
'>>> Adjustable Car opener Holder for Easy Navigation'
'>>> Adjustable Car clip Holder for Easy Navigation'
'>>> Adjustable Car rack Holder for Easy Navigation'
'>>> Adjustable Car mount Holder for Easy Navigation'
'>>> Adjustable Car cover Holder for Easy Navigation'
'>>> Adjustable Car handle Holder for Easy Navigation'


<div style="background-color: #2B5269; color: white; padding: 10px; ">
<h1> Testing on Amazon Dataset <h1>
</div>

In [34]:
examples_path = os.path.join('..', 'data', 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join('..', 'data', 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join('..', 'data', 'shopping_queries_dataset_sources.csv')

examples = pd.read_parquet(examples_path)
products = pd.read_parquet(products_path)
sources = pd.read_csv(sources_path)

In [35]:
# Pull pre-trained model and tokenizer from huggingface hub
model = RobertaForCausalLM.from_pretrained("twburns/group12_mlm_Distilled_Roberta")
tokenizer = AutoTokenizer.from_pretrained('twburns/group12_mlm_Distilled_Roberta')

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [36]:
examples_products = pd.merge(
    examples,
    products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

examples_products = examples_products[examples_products['product_locale'] == 'us']

examples_products.head()


Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,,WhisperCeiling fans feature a totally enclosed...,Panasonic,White
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train,Delta Electronics RAD80L BreezRadiance 80 CFM ...,This pre-owned or refurbished product has been...,Quiet operation at 1.5 sones\nBuilt-in thermos...,DELTA ELECTRONICS (AMERICAS) LTD.,White
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train,Panasonic FV-08VRE2 Ventilation Fan with Reces...,,The design solution for Fan/light combinations...,Panasonic,White


In [39]:
# Sample a random n number of queries
n = 10  # Specify the number of random queries you want to sample
random_queries = examples_products.sample(n=n, random_state=2000)  
queries = random_queries['query'].tolist()

# Sample 1,000 random products
subset_product_titles = examples_products['product_title'].sample(n=1000, random_state=2000).tolist()


In [40]:
try:
    subset_product_titles = examples_products['product_title'].sample(n=1000, random_state=2000).tolist()
except Exception as e:
    print(f"Error occurred: {e}")

In [41]:
# Create a function to generate predictions
def generate_predictions(queries, product_titles, k, batch_size):
    all_results = []  # This will hold the final results

    # Process each query
    for query in queries:
        query_results = []  # Store results for the current query
        seen_titles = set()  # Track seen product titles

        # Prepare input texts for the current query in batches
        for i in range(0, len(product_titles), batch_size):
            batch_titles = product_titles[i:i + batch_size]
            input_texts = [f"{query} <mask> {title}" for title in batch_titles]
            inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt')

            # Move inputs to the appropriate device
            inputs = {key: val.to(device) for key, val in inputs.items()}

            # Make sure to use the model in evaluation mode
            model.eval()

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
            
            # Process each title's predictions for the current batch
            for j in range(len(batch_titles)):
                mask_index = (inputs['input_ids'][j] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]

                if mask_index.numel() == 0:  # Check if mask token was found
                    print(f"No mask token found for input: {input_texts[j]}")
                    continue  # Skip this input if no mask token was found
                
                mask_logits = logits[j, mask_index.item()]
                
                # Get the top_k predictions
                top_k_indices = torch.topk(mask_logits, k).indices
                predicted_tokens = tokenizer.convert_ids_to_tokens(top_k_indices)

                # Ensure the product title is unique for this query
                product_title = batch_titles[j]
                if product_title not in seen_titles:
                    seen_titles.add(product_title)  # Mark this title as seen
                    query_results.append({
                        'query': query,
                        'product_title': product_title,
                        'predicted_tokens': predicted_tokens,
                        'logits': mask_logits  # Store logits for sorting
                    })
        
        # Sort query results based on the relevance (logits) and limit to top k unique results
        query_results.sort(key=lambda x: x['logits'].max().item(), reverse=True)  # Sort by max logit
        top_k_results = []  # To collect unique results for this query

        for result in query_results:
            if len(top_k_results) < k:  # Limit to k results
                if result['product_title'] not in {r['product_title'] for r in top_k_results}:
                    top_k_results.append(result)

        # Append the unique results for this query to the overall results
        all_results.extend(top_k_results[:k])  # Ensure only top k are taken

    return all_results

In [42]:
# Test the model on the sampled queries
# Return top 5 results
k = 5
batch_size = 16
all_results = []

all_results = generate_predictions(queries, subset_product_titles, k, batch_size)

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(all_results)

# Display the results
print(results_df)

                                    query  \
0                beach waver curling iron   
1                beach waver curling iron   
2                beach waver curling iron   
3                beach waver curling iron   
4                beach waver curling iron   
5   long shelves for bedroom wall mounted   
6   long shelves for bedroom wall mounted   
7   long shelves for bedroom wall mounted   
8   long shelves for bedroom wall mounted   
9   long shelves for bedroom wall mounted   
10                              key chain   
11                              key chain   
12                              key chain   
13                              key chain   
14                              key chain   
15                                  spode   
16                                  spode   
17                                  spode   
18                                  spode   
19                                  spode   
20     rae dunn christmas reindeer snacks   
21     rae

<div style="background-color: #2B5269; color: white; padding: 10px; ">
<h1> Fine-Tuning <h1>
</div>

In [None]:
pretrain_dataset

Dataset({
    features: ['query', 'title', 'text'],
    num_rows: 982700
})

In [None]:
# Split pretrained dataset into 70% train and 30% test
train_test_split = pretrain_dataset.train_test_split(test_size=0.30)

# Access the train and test splits
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Display the number of samples in each set
print(train_test_split)

DatasetDict({
    train: Dataset({
        features: ['query', 'title', 'text'],
        num_rows: 687890
    })
    test: Dataset({
        features: ['query', 'title', 'text'],
        num_rows: 294810
    })
})


In [None]:
sample = train_test_split["train"].shuffle(seed=2006).select(range(3))

for row in sample:
    print(f"\n'>>> Text: {row['text']}'")


'>>> Text: baseball  softball pitching machines sports attack hack attack baseball pitching machine 1001100 with extended legs'

'>>> Text: climbing slings  runners 1 nylon runner sling'

'>>> Text: contact grills nexgrill 4burner propane gas grill in black with side burner and stainless steel '


In [None]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_dataset = train_test_split.map(
    tokenize_function, batched=True, remove_columns=["text", "query", "title"]
)
tokenized_dataset

Map:   0%|          | 0/687890 [00:00<?, ? examples/s]

Map:   0%|          | 0/294810 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 687890
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 294810
    })
})

In [None]:
tokenizer.model_max_length

512

In [None]:
'''# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_dataset["train"]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")'''

'# Slicing produces a list of lists for each feature\ntokenized_samples = tokenized_dataset["train"]\n\nfor idx, sample in enumerate(tokenized_samples["input_ids"]):\n    print(f"\'>>> Review {idx} length: {len(sample)}\'")'

In [None]:
# Concatenate all tokenized samples and print total length
'''concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.column_names
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")'''

'concatenated_examples = {\n    k: sum(tokenized_samples[k], []) for k in tokenized_samples.column_names\n}\ntotal_length = len(concatenated_examples["input_ids"])\nprint(f"\'>>> Concatenated reviews length: {total_length}\'")'

In [None]:
'''chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")'''

'chunks = {\n    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]\n    for k, t in concatenated_examples.items()\n}\n\nfor chunk in chunks["input_ids"]:\n    print(f"\'>>> Chunk length: {len(chunk)}\'")'

In [None]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True)
lm_dataset

Map:   0%|          | 0/687890 [00:00<?, ? examples/s]

Map:   0%|          | 0/294810 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 110732
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 47427
    })
})

In [None]:
wwm_probability = 0.2

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [None]:
samples = [lm_dataset["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s>baby<mask> protection<mask><mask><mask> baby noise canceling headphones<mask> protection<mask><mask><mask> up to</s><s>cloth diaper sprayers znts bidet<mask><mask> for toilet handheld cloth diaper sprayer 40650030</s><s>bottled  canned coffee<mask> ucc coffee with milk original blend  113<mask><mask></s><s>fishing<mask> sets 134 pcs fishing tool kit fishing<mask><mask> equipment fishing pliers kit fish hook</s><s>automotive replacement heater<mask><mask> ac  heater relays  hd truck mei1246</s><s><mask><mask> fu <mask><mask> chi uniform bottoms<mask> martial arts tai<mask> trousers<mask><mask> le'

'>>> e kung fu wing chun pants</s><s>toy kitchen sets 2pieces wooden kids kitchen playset with light and sound set shop and smile</s><s>xbox 360 game keyboards turtle beach<mask><mask> keyboard</s><s>kids  baby door hangers baby do not<mask>  teal<mask> hanger</s><s>kayak  canoe<mask> racks field  stream deluxe kayak<mask> kit x5 <mask><mask> canoe carrier kit x1</s><s><mask><mask><m

In [None]:
#notebook_login()

In [None]:
train_size = 10_000
test_size = int(0.1 * train_size)

# Downsize sample for faster training
downsampled_dataset = lm_dataset["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=2006
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [None]:
batch_size = 8

model_checkpoint = "group12_mlm_Distilled_Roberta"
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]


In [None]:
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-google",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=False,
    no_cuda=True,
    logging_steps=logging_steps,
    #use_mps_device=True
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [None]:
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

ValueError: Column name ['word_ids'] not in the dataset. Current columns in the dataset: ['input_ids', 'attention_mask', 'labels']

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 16
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [None]:
from torch.optim import AdamW
# initialize Adam optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import get_full_repo_name

model_name = "group12_mlm_Distilled_Roberta-finetuned-google"
repo_name = get_full_repo_name(model_name)
repo_name

'twburns/group12_mlm_Distilled_Roberta-finetuned-google'

In [None]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
python(3609) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(3610) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(3614) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(3631) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(3632) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(3633) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
/Users/thomasburns/Documents/Repos/esci-shopping-queries/pre-training/group12_mlm_Distilled_Roberta-finetuned-google is already a clone of https://huggingface.co/twburns/group12_mlm_Distilled_Roberta-finetuned-google. Make sure you pull the latest changes with `repo.git_pull()`.
python(3634) MallocStackLogging: can't tur

In [None]:
torch.cuda.empty_cache()  # Clear any cached memory

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        # Move batch to the MPS device
        batch = {k: v.to('mps') for k, v in batch.items()}
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        # Move batch to the MPS device
        batch = {k: v.to(accelerator.device) for k, v in batch.items}
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/785 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
print(f"Model device: {next(model.parameters()).device}")
print(f"Batch device: {next(iter(batch.values())).device}")

Model device: mps:0
Batch device: cpu
