In [5]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.functional import softmax
import pandas as pd
from tqdm import tqdm
from transformers import pipeline



In [98]:
def merge_subtokens(ner_results):
    merged_tokens = []
    current_token = ""
    current_tag = None

    for ent in ner_results:
        word = ent['word']
        tag = ent['entity']

        if word.startswith("##"):
            # This is a subtoken; append without the '##'
            current_token += word[2:]
        else:
            # If we have a current token, save it first
            if current_token:
                merged_tokens.append((current_token, current_tag))
            # Start a new token
            current_token = word
            current_tag = tag
    
    # Append the last token
    if current_token:
        merged_tokens.append((current_token, current_tag))
    
    return merged_tokens

def normalize_tags(merged_tokens):
    # Replace all non-entity tags with 'O'
    # Also, convert B-, I-, E-, S- tags to a simpler tag or keep them
    normalized = []
    for token, tag in merged_tokens:
        if tag == 'O' or tag is None:
            normalized.append((token, 'O'))
        else:
            # Remove B-, I-, E-, S- prefixes to get base tag
            # or you can keep prefixes if you want more detail
            if '-' in tag:
                tag_base = tag.split('-', 1)[1]
            else:
                tag_base = tag
            normalized.append((token, tag))
    return normalized

def normalize_B_to_E_spans(final_tokens):
    """
    Fix tag sequences from B-B-...-E to B-I-I-...-E,
    leaving S- and O tags untouched.
    """
    normalized_tokens = []
    i = 0
    while i < len(final_tokens):
        token, tag = final_tokens[i]

        if tag.startswith("B-"):
            entity_type = tag[2:]
            span = [(token, tag)]  # start with B-

            j = i + 1
            # Collect tokens until matching E-<entity_type>
            while j < len(final_tokens):
                next_token, next_tag = final_tokens[j]
                if next_tag == f"E-{entity_type}":
                    span.append((next_token, next_tag))
                    break
                elif next_tag.startswith("B-"):
                    # Invalid B- inside span, treat as I-
                    span.append((next_token, f"I-{entity_type}"))
                else:
                    # Everything else in span is I-
                    span.append((next_token, f"I-{entity_type}"))
                j += 1

            normalized_tokens.extend(span)
            i = j + 1  # move past the E- tag
        else:
            # Leave S- and O as is
            normalized_tokens.append((token, tag))
            i += 1

    return normalized_tokens

def fix_multiple_E_tags(final_tokens):
    """
    Corrects sequences like:
    B-Entity, E-Entity, E-Entity, ...  =>  B-, I-, ..., E-
    
    All other tags are preserved.
    """
    corrected_tokens = []
    i = 0
    while i < len(final_tokens):
        token, tag = final_tokens[i]

        # Check for pattern: B-Entity followed by 2+ E-Entity tags
        if tag.startswith("B-"):
            entity_type = tag[2:]

            # Look ahead to check for multiple E-Entity
            j = i + 1
            e_span = []
            while j < len(final_tokens):
                next_token, next_tag = final_tokens[j]
                if next_tag == f"E-{entity_type}":
                    e_span.append((next_token, next_tag))
                    j += 1
                else:
                    break

            # If multiple E-Entity found, correct them
            if len(e_span) > 1:
                corrected_tokens.append((token, tag))  # Keep the B-
                for k in range(len(e_span) - 1):  # Convert all but last E- to I-
                    corrected_tokens.append((e_span[k][0], f"I-{entity_type}"))
                corrected_tokens.append((e_span[-1][0], f"E-{entity_type}"))  # Last one stays E-
                i = j  # Move past corrected sequence
                continue  # skip to next iteration
            else:
                corrected_tokens.append((token, tag))  # No issue
                i += 1
        else:
            corrected_tokens.append((token, tag))
            i += 1

    return corrected_tokens


In [109]:
from transformers import TokenClassificationPipeline

def get_full_ner_tags(pipeline, text):
    # Tokenize with offset_mapping
    tokenized_inputs = pipeline.tokenizer(
        text,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        is_split_into_words=False
    )

    # Extract offset_mapping and remove it from inputs to the model
    offset_mapping = tokenized_inputs["offset_mapping"]
    model_inputs = {k: v for k, v in tokenized_inputs.items() if k != "offset_mapping"}

    # Run model
    with torch.no_grad():
        outputs = pipeline.model(**model_inputs)

    logits = outputs.logits
    predictions = torch.argmax(softmax(logits, dim=-1), dim=-1).squeeze().tolist()
    tokens = pipeline.tokenizer.convert_ids_to_tokens(model_inputs["input_ids"].squeeze())
    offset_mapping = offset_mapping.squeeze().tolist()

    id2label = pipeline.model.config.id2label

    full_results = []
    for token, pred_id, offset in zip(tokens, predictions, offset_mapping):
        if offset == [0, 0]:
            continue  # Skip special tokens
        tag = id2label[pred_id]
        full_results.append({"word": token, "entity": tag})
    return full_results



In [203]:
# Load CSV
df = pd.read_csv("Listing_Titles.tsv", keep_default_na=False, na_values=None, sep="\t")

In [204]:
test_df = df[5000:30000].copy()
del df

In [205]:
test_df['Title'] = test_df['Title'].str.replace(r'\s{2,}', ' ', regex=True)
test_df['Title'] = test_df['Title'].str.replace('\xa0', ' ', regex=True)
test_df = test_df[test_df['Category']==1].copy()

In [206]:
test_df

Unnamed: 0,Record Number,Category,Title
5000,5001,1,OPEL ASTRA H 1.7 CDTI-SET 2 Bremsscheiben 4 Be...
5001,5002,1,Satz Gabelfedern Für BMW F 800GS Adventure Tei...
5002,5003,1,Vorderachse Bremsscheiben und Bremsbeläge für ...
5003,5004,1,Für Ford Puma - Bremsscheiben Bremsen Bremstro...
5006,5007,1,"!! Bremsensatz VA, CITROËN XSARA (N1), XSARA (..."
...,...,...,...
29994,29995,1,2x VAICO BREMSSCHEIBEN �345mm SET HINTEN F�R B...
29995,29996,1,2x Bremsscheiben + Bremsbeläge vorne VW Set Sa...
29996,29997,1,2 BREMSSCHEIBEN Ø 211mm 4 LOCH + Beläge VORNE ...
29997,29998,1,STARK Bremsensatz Bremsscheiben + Beläge Belüf...


In [208]:
# Step 1: Load model and tokenizer
model_dir = "ner_model_artifacts_for_cat_1"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)
model.eval()

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

Device set to use cpu


In [209]:
record_numbers = list(sorted(set(test_df['Record Number'])))

In [210]:
#RN = 29991
final_dataset = []
for RN in record_numbers[:5]:
    print(RN)
    input_text = test_df[test_df['Record Number']==RN]['Title'].str.strip().unique()[0]

    ner_results = ner_pipeline(input_text)

    #ner_results = get_full_ner_tags(ner_pipeline, input_text)

    merged_tokens = merge_subtokens(ner_results)

    #fixed_BBB_tokens = normalize_B_to_E_spans(merged_tokens)

    #fixed_EEE_tokens = fix_multiple_E_tags(fixed_BBB_tokens)
    final_dataset.append([RN, merged_tokens])

5001
5002
5003
5004


  return forward_call(*args, **kwargs)


5007


In [211]:
final_dataset

[[5001,
  [('OPEL', 'S-Kompatible_Fahrzeug_Marke'),
   ('ASTRA', 'B-Kompatibles_Fahrzeug_Modell'),
   ('H', 'I-Kompatibles_Fahrzeug_Modell'),
   ('1', 'I-Kompatibles_Fahrzeug_Modell'),
   ('.', 'I-Kompatibles_Fahrzeug_Modell'),
   ('7', 'I-Kompatibles_Fahrzeug_Modell'),
   ('CDTI', 'E-Kompatibles_Fahrzeug_Modell'),
   ('-', 'E-Kompatibles_Fahrzeug_Modell'),
   ('SET', 'S-Produktart'),
   ('2', 'S-Anzahl_Der_Einheiten'),
   ('Bremsscheiben', 'S-Im_Lieferumfang_Enthalten'),
   ('4', 'S-Anzahl_Der_Einheiten'),
   ('Beläge', 'S-Im_Lieferumfang_Enthalten'),
   ('VA', 'S-Einbauposition')]],
 [5002,
  [('Satz', 'S-Produktart'),
   ('Gabelfedern', 'S-Im_Lieferumfang_Enthalten'),
   ('BMW', 'S-Kompatible_Fahrzeug_Marke'),
   ('F', 'B-Kompatibles_Fahrzeug_Modell'),
   ('800GS', 'B-Kompatibles_Fahrzeug_Modell'),
   ('Adventure', 'E-Kompatibles_Fahrzeug_Modell'),
   ('31427726307', 'S-Herstellernummer'),
   ('.', 'S-Herstellernummer')]],
 [5003,
  [('Vorderachse', 'S-Einbauposition'),
   ('Bremssc

# Check per Record Number

In [219]:
# check 5007 in 1 and 5008 in 2
RN = 29993
input_text = test_df[test_df['Record Number']==RN]['Title'].str.strip().unique()[0]

ner_results = ner_pipeline(input_text)

#ner_results = get_full_ner_tags(ner_pipeline, input_text)

merged_tokens = merge_subtokens(ner_results)

fixed_BBB_tokens = normalize_B_to_E_spans(merged_tokens)

fixed_EEE_tokens = fix_multiple_E_tags(fixed_BBB_tokens)

In [221]:
input_text

'BOSCH Vordere Bremsscheiben & Bremsbeläge Satz für Peugeot 207 1.6 HDI'

In [222]:
for entity in ner_results:
    print(entity)

{'entity': 'S-Hersteller', 'score': 0.9998349, 'index': 1, 'word': 'B', 'start': 0, 'end': 1}
{'entity': 'S-Hersteller', 'score': 0.9998503, 'index': 2, 'word': '##OS', 'start': 1, 'end': 3}
{'entity': 'S-Hersteller', 'score': 0.9998492, 'index': 3, 'word': '##C', 'start': 3, 'end': 4}
{'entity': 'S-Hersteller', 'score': 0.9998448, 'index': 4, 'word': '##H', 'start': 4, 'end': 5}
{'entity': 'S-Einbauposition', 'score': 0.49839234, 'index': 5, 'word': 'Vorder', 'start': 6, 'end': 12}
{'entity': 'S-Im_Lieferumfang_Enthalten', 'score': 0.38078278, 'index': 6, 'word': '##e', 'start': 12, 'end': 13}
{'entity': 'S-Im_Lieferumfang_Enthalten', 'score': 0.99987686, 'index': 7, 'word': 'Brem', 'start': 14, 'end': 18}
{'entity': 'S-Im_Lieferumfang_Enthalten', 'score': 0.99988866, 'index': 8, 'word': '##ss', 'start': 18, 'end': 20}
{'entity': 'S-Im_Lieferumfang_Enthalten', 'score': 0.99988115, 'index': 9, 'word': '##che', 'start': 20, 'end': 23}
{'entity': 'S-Im_Lieferumfang_Enthalten', 'score': 0

In [181]:
for token, tag in merged_tokens:
    print(f"{token}\t{tag}")

1x	S-Anzahl_Der_Einheiten
INA	S-Hersteller
Zahnriemensatz	S-Produktart
Citroen	S-Kompatible_Fahrzeug_Marke
Jumper	B-Kompatibles_Fahrzeug_Modell
244	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
244D	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
2	I-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
.	E-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
1	B-Kompatibles_Fahrzeug_Modell
244	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
244D	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
2	I-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
.	E-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell


In [182]:
for token, tag in fixed_BBB_tokens:
    print(f"{token}\t{tag}")

1x	S-Anzahl_Der_Einheiten
INA	S-Hersteller
Zahnriemensatz	S-Produktart
Citroen	S-Kompatible_Fahrzeug_Marke
Jumper	B-Kompatibles_Fahrzeug_Modell
244	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
244D	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
2	I-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
.	E-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
1	B-Kompatibles_Fahrzeug_Modell
244	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
244D	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
2	I-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
.	E-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell


In [183]:
for token, tag in fixed_EEE_tokens:
    print(f"{token}\t{tag}")

1x	S-Anzahl_Der_Einheiten
INA	S-Hersteller
Zahnriemensatz	S-Produktart
Citroen	S-Kompatible_Fahrzeug_Marke
Jumper	B-Kompatibles_Fahrzeug_Modell
244	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
244D	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
2	I-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
.	E-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
1	B-Kompatibles_Fahrzeug_Modell
244	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
244D	I-Kompatibles_Fahrzeug_Modell
/	I-Kompatibles_Fahrzeug_Modell
2	I-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell
.	E-Kompatibles_Fahrzeug_Modell
2	E-Kompatibles_Fahrzeug_Modell


In [192]:
cleaned_entities = merge_ner_tokens(ner_results)
for e in cleaned_entities:
    print(e)

{'entity': 'Anzahl_Der_Einheiten', 'text': '1'}
{'entity': 'Anzahl_Der_Einheiten', 'text': 'x'}
{'entity': 'Hersteller', 'text': 'IN'}
{'entity': 'Hersteller', 'text': 'A'}
{'entity': 'Produktart', 'text': 'Zahn'}
{'entity': 'Produktart', 'text': 'rie'}
{'entity': 'Produktart', 'text': 'mens'}
{'entity': 'Produktart', 'text': 'atz'}
{'entity': 'Kompatible_Fahrzeug_Marke', 'text': 'Cit'}
{'entity': 'Kompatible_Fahrzeug_Marke', 'text': 'roe'}
{'entity': 'Kompatible_Fahrzeug_Marke', 'text': 'n'}
{'entity': 'Kompatibles_Fahrzeug_Modell', 'text': 'er244/244D/22'}
{'entity': 'Kompatibles_Fahrzeug_Modell', 'text': '1244/244D/22'}
