In [None]:
# Install notebook dependencies (Colab).
# - `%pip` installs into the environment of the running kernel.
# - Each package is listed once; the original cell installed transformers,
#   datasets and seqeval several times over.
# - The extras spec is quoted so the shell cannot glob-expand the brackets.
%pip install -q --upgrade transformers
%pip install -q datasets scikit-learn seqeval evaluate "huggingface_hub[hf_xet]"

Collecting hf-xet<2.0.0,>=1.1.1 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.1.2
Collecting transformers
  Downloading transformers-4.52.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.52.2-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalli

In [None]:
import json
import numpy as np
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.model_selection import train_test_split
import evaluate

# Load data
# Each record is indexed by "tokens" and "labels" further down, so both are
# presumably parallel per-word lists -- verify against the JSON file.
# The encoding is explicit so the read does not depend on the platform default.
with open("ner_training_data_final_v2.json", encoding="utf-8") as f:
    data = json.load(f)

# Extract labels
# Sorted so that the label <-> id assignment is stable across runs.
all_labels = sorted({
    label
    for item in data
    if "labels" in item
    for label in item["labels"]
})
label2id = {label: i for i, label in enumerate(all_labels)}
id2label = {i: label for label, i in label2id.items()}

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(data)

# Split
# A fixed seed keeps the 90/10 partition identical between reruns, so metrics
# from different runs are comparable.
train_test = dataset.train_test_split(test_size=0.1, seed=42)
datasets = DatasetDict({
    'train': train_test['train'],
    'test': train_test['test']
})

# Tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align word labels to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per example)
            and "labels" (one list of label names per example, indexed by
            word position).

    Returns:
        The tokenizer output with an added "labels" entry. Every sub-token
        receives the id of its source word's label; positions whose word id
        is ``None`` receive -100.
    """
    encoding = tokenizer(
        examples["tokens"],  # list of token lists
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )

    aligned = []
    for row in range(len(examples["tokens"])):
        # word_ids() maps each sub-token position back to its word index.
        aligned.append([
            -100 if word_idx is None else label2id[examples["labels"][row][word_idx]]
            for word_idx in encoding.word_ids(batch_index=row)
        ])

    encoding["labels"] = aligned
    return encoding


# Apply tokenizer
# batched=True hands tokenize_and_align_labels a dict of columns per batch,
# which is the shape that function indexes into.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

# Load model
# The label maps are stored in the model config so that saved checkpoints can
# report label names rather than bare ids.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metrics
# seqeval scorer used by compute_metrics.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Score token-classification predictions with the seqeval metric.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-label
            scores on axis 2; ``labels`` holds label ids, with -100 marking
            positions to ignore.

    Returns:
        A flat dict of scalar metrics. Top-level entries returned by the
        metric keep their keys. Each nested per-entity dict is expanded into
        ``"<ENTITY>_<stat>"`` keys, so every value can be logged as a scalar
        (nested dicts are dropped by the Trainer's scalar loggers).
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (-100) and convert ids back to label names.
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100]
                  for pred, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_preds, references=true_labels)

    flat = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for stat, stat_value in value.items():
                flat[f"{key}_{stat}"] = stat_value
        else:
            flat[key] = value
    return flat

# Training args
# Evaluation and checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir="./model_output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch",
)

# Trainer
# `processing_class` replaces the deprecated `tokenizer` argument, which
# emitted a warning at construction time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Save model
trainer.save_model("./model_output")
# `safe_serialization` controls model weight serialization and has no effect
# on a tokenizer, so it is not passed here.
tokenizer.save_pretrained("./model_output")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mritviksharma4[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Color,Composition,Gender,Size,Subcategory,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0006,8.2e-05,"{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5452}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 6598}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5292}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5532}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 7942}",1.0,1.0,1.0,1.0
2,0.0001,3.3e-05,"{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5452}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 6598}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5292}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5532}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 7942}",1.0,1.0,1.0,1.0
3,0.0,2e-05,"{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5452}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 6598}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5292}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5532}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 7942}",1.0,1.0,1.0,1.0


Trainer is attempting to log a value of "{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5452}" of type <class 'dict'> for key "eval/COLOR" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 6598}" of type <class 'dict'> for key "eval/COMPOSITION" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5292}" of type <class 'dict'> for key "eval/GENDER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 5532}" of type <class 'dict'> for key "eval/SIZE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect 

('./model_output/tokenizer_config.json',
 './model_output/special_tokens_map.json',
 './model_output/vocab.txt',
 './model_output/added_tokens.json',
 './model_output/tokenizer.json')

In [None]:
# trainer.save_model("./model_output")
from safetensors.torch import load_file
import torch
# Re-serialize the saved weights: read the safetensors file written to
# ./model_output and write the same state dict out with torch.save as
# pytorch_model.bin alongside it.
model_path = "./model_output/model.safetensors"
model_state_dict = load_file(model_path)
torch.save(model_state_dict, "./model_output/pytorch_model.bin")

In [None]:
import json
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet', quiet=True)
lemmatizer = WordNetLemmatizer()

# Base color -> list of marketing/variant color names.
# The original note says these were taken from color_mappings.json; here they
# are an inline sample.
# NOTE: several variants are listed under more than one base color (for
# example "Charcoal" under both Black and Grey, "Obsidian Tide" under Black,
# Blue and Green). Any structure that flattens this to one base per variant
# can keep only one of those assignments.
color_mappings = {
    "Beige": ["Beige", "Cream", "Vanilla Sand", "Whipped Ivory", "Burnt Ivory", "Ivory Noir", "Nude"],
    "Black": [
        "Black",
        "Obsidian Black",
        "Obsidian Noir",
        "Velvét Noir",
        "Snow Obsidian",
        "Charcoal Mist",
        "Noiré Bellé",
        "Obsidian Fade",
        "Charcoal",
        "Black & White",
        "Noir Eclairs",
        "Obsidian Tide",
        "Monochrome Grid",
        "Noir",
        "Ivory Noir"
    ],
    "Blue": [
        "Blue",
        "Light Blue",
        "Royal Blue",
        "Royal Deep Blue",
        "Bluebell Bronze",
        "Blue River",
        "Cloudy Sky",
        "Iceline Indigo",
        "Sea Blue",
        "Light Powder Blue",
        "Celestial Mist",
        "Ocean",
        "Arctic Oasis",
        "Obsidian Tide",
        "Regal Horizon",
        "Indigo Stripes",
        "Navy Blue",
        "Misty Blue"
    ],
    "Brown": [
        "Brown",
        "Leopard",
        "Cocoa Luxe",
        "Chocolate Brown",
        "Mocha",
        "Golden Mocha",
        "Coffee Mocha",
        "Cocoa Drape",
        "Hazlenut",
        "Latte",
        "Velvét Espresso",
        "Taupe",
        "Light Brown",
        "Khaki",
        "Dark Brown",
        "Chestnut Brown",
        "Coffee Americano"
    ],
    "Gold": ["Gold", "Golden Drift", "Golden Ivory"],
    "Green": [
        "Green",
        "Forrest Greens",
        "Olive Green",
        "Thorneleaf",
        "Dark Green",
        "Celestial Mist",
        "Ocean",
        "Golden Tide",
        "Obsidian Tide",
        "Shadow Pine",
        "Sage Tide",
        "Emerald Green",
        "Moss Green"
    ],
    "Grey": ["Grey", "Slate Grey", "Charcoal", "Eucalyptus Mist", "Crimson Ash", "Silver"],
    "Orange": [
        "Orange",
        "Sunrise",
        "Rosy Ember",
        "Molten Ember",
        "Solar Veil",
        "Amber Solstice",
        "Citrus Créme"
    ],
    "Pink": [
        "Pink",
        "Sugar Rose",
        "Blushing Snow",
        "Berry Luxe",
        "Cherry Blossom Mint",
        "Sugar Pearl",
        "Mauve",
        "Bubblegum",
        "Orchid Pink"
    ],
    "Purple": ["Purple", "Lavender", "Lavender Sky", "Violet Cream", "Deep Purple", "Amethyst"],
    "Red": [
        "Red",
        "Maroon",
        "Crimson Pearl",
        "Burgundy",
        "Oxblood Luxe",
        "Crimson Dusk",
        "Crimson Thread"
    ],
    "Turquoise": ["Turquoise"],
    "White": [
        "White",
        "Snow Obsidian",
        "Lemon Lace",
        "Sugar Rose",
        "Lavender Sky",
        "Blushing Snow",
        "White Ember",
        "Black & White",
        "Arctic Oasis",
        "Regal Horizon",
        "Sage Tide",
        "Monochrome Grid",
        "Golden Ivory",
        "Misty Blue"
    ],
    "Yellow": [
        "Yellow",
        "Leopard",
        "Bluebell Bronze",
        "Golden Mocha",
        "Lemon Lace",
        "Sunshine",
        "Crazy Yellow",
        "Crimson Dusk",
        "Golden Tide",
        "Amber Solstice",
        "Noir Eclairs",
        "Regal Horizon"
    ],
    "Multi-Colored": [
        "Cherry Blossom Mint",
        "Sugar Pearl",
        "Iceline Indigo",
        "Bronze Obsidian",
        "Molten Ember",
        "Sugar Bloom",
        "Bluebell Bronze",
        "Multi-Colored",
        "Crimson Ash"
    ]
}

import unicodedata


def _strip_accents(text):
    """Return ``text`` with combining diacritical marks removed ('é' -> 'e')."""
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")


# Flatten variant to base color dict (all lowercase keys).
# Every variant is registered twice: as written, and with accents stripped.
# The NER model is an uncased BERT whose tokenizer removes accents, so the
# text it yields for "Velvét Noir" is "velvet noir"; without the accent-free
# alias such variants could never be looked up.
# A variant listed under several base colors keeps the LAST base it appears
# under, because later dict entries overwrite earlier ones.
variant_to_base_color = {
    key: base_color.lower()
    for base_color, variants in color_mappings.items()
    for variant in variants
    for key in (variant.lower(), _strip_accents(variant.lower()))
}

# Gender normalization map
# Lowercase surface form -> canonical gender value used in the search query.
gender_map = {
    "men": "men", "male": "men", "guys": "men", "man": "men",
    "women": "women", "female": "women", "lady": "women", "girls": "women", "woman": "women",
    "unisex": "unisex"
}

# Size groups
# Group name -> concrete size codes. map_sizes accepts either a group name
# (expanded to all of its codes) or an individual code.
size_groups = {
    "small": ["xxs", "xs", "s"],
    "medium": ["m"],
    "large": ["l", "xl", "xxl"],
    "onesize": ["onesize"]
}

# Subcategory slug -> terms that resolve_subcategory searches for inside the
# detected subcategory text.
# NOTE: "jackets" repeats the terms of "jackets-and-blazers", so a jacket or
# blazer mention matches both slugs.
product_items = {
    "t-shirts-and-shirts": ["t-shirt", "shirt"],
    "dresses": ["dress", "gown"],
    "sweaters-and-cardigans": ["sweater", "cardigan"],
    "jackets-and-blazers": ["jacket", "blazer"],
    "trousers": ["trousers", "pants"],
    "caps-and-scarves": ["cap", "scarf", "hat"],
    "ear-rings-and-bracelets": ["earrings", "bracelet", "ear ring", "ear - ring"],
    "bags": ["bag", "backpack", "handbag", "hand bag", "hand - bag"],
    "heels-and-sandals": ["heels", "sandals"],
    "shoes": ["shoes", "sneakers"],
    "sweatshirts-and-hoodies": ["sweatshirt", "hoodie"],
    "jackets": ["jacket", "blazer"]
}

# Words stripped from the FRONT of a color phrase by clean_color_tokens.
intent_words = {"i", "want", "to", "buy", "get", "need", "would", "like", "looking", "for", "a"}
# Words stripped from the END of a color phrase by clean_color_tokens.
color_suffixes = {"colored", "tone", "shade", "color", "coloured", "colour"}

def clean_color_tokens(color_tokens):
    """Normalize raw color phrases by trimming filler words from both ends.

    Each phrase is lowercased and split on whitespace. Leading words found in
    ``intent_words`` and trailing words found in ``color_suffixes`` are
    discarded. Phrases with nothing left are dropped.

    Args:
        color_tokens: iterable of color phrases (strings).

    Returns:
        List of cleaned, lowercase, single-space-joined phrases.
    """
    result = []
    for raw in color_tokens:
        parts = raw.lower().split()
        start, stop = 0, len(parts)
        # Advance past leading intent words.
        while start < stop and parts[start] in intent_words:
            start += 1
        # Pull back past trailing color suffixes.
        while stop > start and parts[stop - 1] in color_suffixes:
            stop -= 1
        if start < stop:
            result.append(" ".join(parts[start:stop]))
    return result

def resolve_color(color_tokens):
    """Map detected color tokens to a base color.

    The tokens are first tried as one space-joined phrase, then one at a
    time in order. The first candidate found in ``variant_to_base_color``
    (compared in lowercase) wins.

    Args:
        color_tokens: list of color strings.

    Returns:
        The base color for the first matching candidate, or ``None`` if
        nothing matches.
    """
    candidates = [" ".join(color_tokens)]
    candidates.extend(color_tokens)
    for candidate in candidates:
        key = candidate.lower()
        if key in variant_to_base_color:
            return variant_to_base_color[key]
    return None

def resolve_subcategory(subcats, gender):
    """Resolve the first detected subcategory phrase to catalog slug(s).

    Terms from ``product_items`` are searched for inside the lowercased
    phrase. A hit that lies entirely inside a longer hit is ignored, so
    "sweatshirt" resolves to the sweatshirt slug only, not also to the shirt
    slug through the embedded "shirt".

    Args:
        subcats: list of detected subcategory phrases; only the first is used.
        gender: canonical gender ("men"/"women"/...) or ``None``. Only
            consulted when the phrase is exactly "jackets".

    Returns:
        ``None`` when nothing matches, a single slug when exactly one slug
        matches, otherwise a list of slugs in ``product_items`` order.
    """
    if not subcats:
        return None
    subcat = subcats[0].lower()

    # Record every occurrence of every term as (start, end, slug).
    occurrences = []
    for key, values in product_items.items():
        for term in values:
            start = subcat.find(term)
            while start != -1:
                occurrences.append((start, start + len(term), key))
                start = subcat.find(term, start + 1)

    def _shadowed(start, end):
        # True when a strictly longer occurrence fully covers this span.
        return any(
            o_start <= start and end <= o_end and (o_end - o_start) > (end - start)
            for o_start, o_end, _ in occurrences
        )

    # Keep a slug if at least one of its occurrences is not shadowed.
    matched_keys = [
        key
        for key in product_items
        if any(k == key and not _shadowed(s, e) for s, e, k in occurrences)
    ]

    # Gender specific filtering example for jackets
    if subcat == "jackets":
        if gender == "men":
            matched_keys = [k for k in matched_keys if "jacket" in k and "blazer" not in k]
        elif gender == "women":
            matched_keys = [k for k in matched_keys if "blazer" in k]
    if not matched_keys:
        return None
    if len(matched_keys) == 1:
        return matched_keys[0]
    return matched_keys

def map_gender(gender_tokens):
    """Normalize the first detected gender token via ``gender_map``.

    Args:
        gender_tokens: list of gender strings (may be empty or ``None``).

    Returns:
        The canonical gender for the first token, or ``None`` when there are
        no tokens or the first token is not a known surface form.
    """
    if gender_tokens:
        # Only the first token is considered; lookup is case-insensitive.
        return gender_map.get(gender_tokens[0].lower())
    return None

def map_sizes(size_tokens):
    """Expand detected size tokens into concrete size codes.

    A token naming a group in ``size_groups`` expands to all codes of that
    group. A token that is itself a known code is kept. Anything else is
    ignored. Matching is case-insensitive.

    Args:
        size_tokens: list of size strings (may be empty or ``None``).

    Returns:
        De-duplicated list of size codes in first-seen order, so the
        generated query is deterministic from run to run.
    """
    if not size_tokens:
        return []
    # Built once per call instead of once per token.
    known_sizes = {sz for group in size_groups.values() for sz in group}
    ordered = {}  # dict keys serve as an insertion-ordered set
    for token in size_tokens:
        token_lower = token.lower()
        if token_lower in size_groups:
            ordered.update(dict.fromkeys(size_groups[token_lower]))
        elif token_lower in known_sizes:
            ordered[token_lower] = None
    return list(ordered)

def build_opensearch_query(ner_output):
    """Turn aggregated NER output into an OpenSearch bool/must query.

    Args:
        ner_output: iterable of dicts carrying "entity_group" and "word".

    Returns:
        ``{"query": {"bool": {"must": [...]}}}`` with one clause per attribute
        that resolved, in the order gender, subcategory, size, composition,
        color. Attributes that do not resolve contribute no clause.
    """
    # Bucket the detected words by lowercase entity group.
    by_entity = {}
    for entity in ner_output:
        by_entity.setdefault(entity["entity_group"].lower(), []).append(entity["word"])

    def any_of(field, values):
        # A bool/should over one field: one match clause per value.
        return {"bool": {"should": [{"match": {field: value}} for value in values]}}

    clauses = []

    # Gender: searched in both the gender and the category fields.
    gender = map_gender(by_entity.get("gender", []))
    if gender:
        clauses.append({
            "bool": {
                "should": [
                    {"match": {"gender": gender}},
                    {"match": {"category": gender}},
                ]
            }
        })

    # Subcategory: either a single slug or a list of alternatives.
    subcategory = resolve_subcategory(by_entity.get("subcategory"), gender)
    if subcategory:
        if isinstance(subcategory, list):
            clauses.append(any_of("subcategory", subcategory))
        else:
            clauses.append({"match": {"subcategory": subcategory}})

    # Size: any of the expanded size codes.
    sizes = map_sizes(by_entity.get("size", []))
    if sizes:
        clauses.append(any_of("sizeOptions", sizes))

    # Composition: only the first detected phrase is used.
    compositions = by_entity.get("composition", [])
    if compositions:
        clauses.append({"match": {"compositionAndCare": compositions[0].lower()}})

    # Color: clean the phrases, then resolve them to a base color.
    base_color = resolve_color(clean_color_tokens(by_entity.get("color", [])))
    if base_color:
        clauses.append({"match": {"baseColor": base_color}})

    return {"query": {"bool": {"must": clauses}}}


In [None]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer saved under ./model_output.
# aggregation_strategy="simple" returns grouped entities with the
# "entity_group" and "word" fields that build_opensearch_query reads.
ner = pipeline("ner", model="./model_output", tokenizer="./model_output", aggregation_strategy="simple")
query = "sweatshirt for men white"
output = ner(query)
print(output)
# Convert the detected entities into an OpenSearch query and pretty-print it.
query_json = build_opensearch_query(output)
print("\n📦 OpenSearch Query:\n", json.dumps(query_json, indent=2))

Device set to use cuda:0


[{'entity_group': 'SUBCATEGORY', 'score': np.float32(0.9999796), 'word': 'sweatshirt', 'start': 0, 'end': 10}, {'entity_group': 'SUBCATEGORY', 'score': np.float32(0.94678724), 'word': 'for', 'start': 11, 'end': 14}, {'entity_group': 'GENDER', 'score': np.float32(0.9999826), 'word': 'men', 'start': 15, 'end': 18}, {'entity_group': 'COLOR', 'score': np.float32(0.99997556), 'word': 'white', 'start': 19, 'end': 24}]

📦 OpenSearch Query:
 {
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {
                "match": {
                  "gender": "men"
                }
              },
              {
                "match": {
                  "category": "men"
                }
              }
            ]
          }
        },
        {
          "bool": {
            "should": [
              {
                "match": {
                  "subcategory": "t-shirts-and-shirts"
                }
              },
              

In [None]:
# Archive the whole output directory. This is recursive, so the per-epoch
# checkpoint-* folders (including optimizer and scheduler state) are packed
# alongside the final model files.
!zip -r /content/model.zip /content/model_output/

  adding: content/model_output/ (stored 0%)
  adding: content/model_output/checkpoint-5626/ (stored 0%)
  adding: content/model_output/checkpoint-5626/model.safetensors (deflated 7%)
  adding: content/model_output/checkpoint-5626/rng_state.pth (deflated 25%)
  adding: content/model_output/checkpoint-5626/vocab.txt (deflated 53%)
  adding: content/model_output/checkpoint-5626/optimizer.pt (deflated 28%)
  adding: content/model_output/checkpoint-5626/tokenizer_config.json (deflated 75%)
  adding: content/model_output/checkpoint-5626/config.json (deflated 52%)
  adding: content/model_output/checkpoint-5626/scheduler.pt (deflated 56%)
  adding: content/model_output/checkpoint-5626/tokenizer.json (deflated 71%)
  adding: content/model_output/checkpoint-5626/trainer_state.json (deflated 76%)
  adding: content/model_output/checkpoint-5626/special_tokens_map.json (deflated 42%)
  adding: content/model_output/checkpoint-5626/training_args.bin (deflated 51%)
  adding: content/model_output/model.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the archive to Drive. The source is the absolute path the zip step
# wrote to, so this does not depend on the current working directory.
!cp /content/model.zip "/content/drive/My Drive/Colab Notebooks/"