# Imports & Setup

## Packages/Libraries

In [17]:
# Step 1: Install the Hugging Face Transformers library
!pip install transformers
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.37.1-py2.py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.7/251.7 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wa

In [18]:
# Step 2: Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

import pandas as pd
import numpy as np
import wandb

In [21]:
wandb.login()

SyntaxError: ignored

## Data

In [5]:
sephora_data = 'https://raw.githubusercontent.com/torrileigh/sephora_cleaned_11.26.23/main/cosmetic_p11.26.23.csv'
sephora_df = pd.read_csv(sephora_data)

sephora_df.head()

Unnamed: 0,Label,brand,name,price,rank,ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,ALGENIST,GENIUS Ultimate Anti-Aging Cream,$112.00,4.3,"Water, Caprylic/Capric Triglyceride, Hydrogent...",1,1,1,1,1
1,Moisturizer,ALGENIST,Overnight Restorative Cream,$94.00,4.4,"Water, Isopropyl Isostearate, Butyrospermum Pa...",0,0,0,0,0
2,Moisturizer,ALGENIST,SUBLIME DEFENSE Ultra Lightweight UV Defense F...,$28.00,4.4,"Cyclopentasiloxane, Water, Alcohol Denat., Gly...",0,0,0,0,0
3,Moisturizer,ALGENIST,POWER Recharging Night Pressed Serum,$95.00,4.4,"Cocos Nucifera (Coconut) Water, Water (Aqua, E...",1,1,1,1,1
4,Moisturizer,ALGENIST,Firming & Lifting Neck Cream,$98.00,3.9,"Water, Isononyl Isononanoate, Dimethicone, But...",0,0,0,0,0


# Model Main

Reminders:

sephora_df

## Tokenizing Input

In [6]:
# Step 3: Load the pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Step 4: Simple test with the model
def test_model(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.nn.functional.softmax(logits, dim=-1)
    return predictions

In [8]:
# Test with a sample text
sample_text = "Hello, this is a test sentence for the BERT model."
predictions = test_model(sample_text, tokenizer, model)
print(predictions)

tensor([[0.5569, 0.4431]])


In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [15]:
# Example input
text = "Looking for a moisturizer for sensitive skin under $30"
original_labels = ["O", "O", "O", "B-PRODUCT_TYPE", "O", "B-SKIN_CONCERN", "I-SKIN_CONCERN", "O", "B-MAX_PRICE"]

# Tokenize the input
tokens = tokenizer.tokenize(text)

# Align the labels with tokens
new_labels = []
label_index = 0
prev_label = None

for token in tokens:
    if token.startswith("##"):
        # For subword tokens, use the previous label, changing B- to I- if necessary
        if prev_label and prev_label.startswith("B-"):
            new_label = "I-" + prev_label.split('-')[-1]
        else:
            new_label = prev_label
    else:
        # This is a new word, assign the label and move to the next label
        if label_index < len(original_labels):
            new_label = original_labels[label_index]
            label_index += 1
        else:
            new_label = "O"

    new_labels.append(new_label)
    prev_label = new_label

# Adjust for special tokens [CLS] and [SEP]
new_labels = ["O"] + new_labels + ["O"]

# Print the tokens with their labels
print(list(zip(["[CLS]"] + tokens + ["[SEP]"], new_labels)))


[('[CLS]', 'O'), ('looking', 'O'), ('for', 'O'), ('a', 'O'), ('moist', 'B-PRODUCT_TYPE'), ('##uri', 'I-PRODUCT_TYPE'), ('##zer', 'I-PRODUCT_TYPE'), ('for', 'O'), ('sensitive', 'B-SKIN_CONCERN'), ('skin', 'I-SKIN_CONCERN'), ('under', 'O'), ('$', 'B-MAX_PRICE'), ('30', 'O'), ('[SEP]', 'O')]


In [16]:
#confirming tokenizes correctly

# Example input
text = "Recommended serum for wrinkle prevention with long-lasting protection"
original_labels = ["O", "B-PRODUCT_TYPE", "I-PRODUCT_TYPE", "O", "B-PURPOSE", "O", "O", "B-LONG_LASTING", "I-LONG_LASTING", "O", "O"]

# Tokenize the input
tokens = tokenizer.tokenize(text)

# Align the labels with tokens
new_labels = []
label_index = 0
prev_label = None

for token in tokens:
    if token.startswith("##"):
        # For subword tokens, use the previous label, changing B- to I- if necessary
        if prev_label and prev_label.startswith("B-"):
            new_label = "I-" + prev_label.split('-')[-1]
        else:
            new_label = prev_label
    else:
        # This is a new word, assign the label and move to the next label
        if label_index < len(original_labels):
            new_label = original_labels[label_index]
            label_index += 1
        else:
            new_label = "O"

    new_labels.append(new_label)
    prev_label = new_label

# Adjust for special tokens [CLS] and [SEP]
new_labels = ["O"] + new_labels + ["O"]

# Print the tokens with their labels
print(list(zip(["[CLS]"] + tokens + ["[SEP]"], new_labels)))

[('[CLS]', 'O'), ('recommended', 'O'), ('serum', 'B-PRODUCT_TYPE'), ('for', 'I-PRODUCT_TYPE'), ('wr', 'O'), ('##ink', 'O'), ('##le', 'O'), ('prevention', 'B-PURPOSE'), ('with', 'O'), ('long', 'O'), ('-', 'B-LONG_LASTING'), ('lasting', 'I-LONG_LASTING'), ('protection', 'O'), ('[SEP]', 'O')]


## Weights & Biases