In [1]:
import pandas as pd

# Load CSV
df = pd.read_csv("Listing_Titles.tsv", keep_default_na=False, na_values=None, sep="\t")

In [2]:
test_df = df[5000:30000].copy()

In [3]:
del df

In [4]:
test_df

Unnamed: 0,Record Number,Category,Title
5000,5001,1,OPEL ASTRA H 1.7 CDTI-SET 2 Bremsscheiben 4 Be...
5001,5002,1,Satz Gabelfedern Für BMW F 800GS Adventure Tei...
5002,5003,1,Vorderachse Bremsscheiben und Bremsbeläge für ...
5003,5004,1,Für Ford Puma - Bremsscheiben Bremsen Bremstr...
5004,5005,2,1x DOLZ WASSERPUMPE + ZAHNRIEMENSATZ KD046 910...
...,...,...,...
29995,29996,1,2x Bremsscheiben + Bremsbeläge vorne VW Set Sa...
29996,29997,1,2 BREMSSCHEIBEN Ø 211mm 4 LOCH + Beläge VORNE ...
29997,29998,1,STARK Bremsensatz Bremsscheiben + Beläge Belüf...
29998,29999,1,Bremsschlauch TRISCAN 8150 29339 NEU günstig k...


In [5]:
test_df['Title'] = test_df['Title'].str.replace(r'\s{2,}', ' ', regex=True)
test_df['Title'] = test_df['Title'].str.replace('\xa0', ' ', regex=True)
test_df = test_df[test_df['Category']==1]

In [6]:
test_df

Unnamed: 0,Record Number,Category,Title
5000,5001,1,OPEL ASTRA H 1.7 CDTI-SET 2 Bremsscheiben 4 Be...
5001,5002,1,Satz Gabelfedern Für BMW F 800GS Adventure Tei...
5002,5003,1,Vorderachse Bremsscheiben und Bremsbeläge für ...
5003,5004,1,Für Ford Puma - Bremsscheiben Bremsen Bremstro...
5006,5007,1,"!! Bremsensatz VA, CITROËN XSARA (N1), XSARA (..."
...,...,...,...
29994,29995,1,2x VAICO BREMSSCHEIBEN �345mm SET HINTEN F�R B...
29995,29996,1,2x Bremsscheiben + Bremsbeläge vorne VW Set Sa...
29996,29997,1,2 BREMSSCHEIBEN Ø 211mm 4 LOCH + Beläge VORNE ...
29997,29998,1,STARK Bremsensatz Bremsscheiben + Beläge Belüf...


In [7]:
#import csv
#test_df.to_csv('quiz_data.tsv', sep="\t", index=False)

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.functional import softmax
import pandas as pd
from tqdm import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Step 1: Load model and tokenizer
model_dir = "ner_model_artifacts_for_cat_1"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)
model.eval()

# Step 2: Load test data
#test_df = pd.read_csv("test_data.csv")  # Adjust filename as needed



BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [10]:
# Step 3: Define helper functions to decode BIOES tags into aspect chunks
def extract_aspects(tokens, tags):
    aspects = []
    current_aspect = None
    aspect_tokens = []

    for token, tag in zip(tokens, tags):
        if tag == "O":
            if current_aspect:
                aspects.append((current_aspect, ' '.join(aspect_tokens)))
                current_aspect = None
                aspect_tokens = []
        elif tag.startswith("B-") or tag.startswith("S-"):
            if current_aspect:
                aspects.append((current_aspect, ' '.join(aspect_tokens)))
            current_aspect = tag[2:]
            aspect_tokens = [token]
            if tag.startswith("S-"):  # Single-token aspect
                aspects.append((current_aspect, token))
                current_aspect = None
                aspect_tokens = []
        elif tag.startswith("I-") or tag.startswith("E-"):
            if current_aspect:
                aspect_tokens.append(token)
            if tag.startswith("E-"):
                if current_aspect:
                    aspects.append((current_aspect, ' '.join(aspect_tokens)))
                    current_aspect = None
                    aspect_tokens = []
    if current_aspect:
        aspects.append((current_aspect, ' '.join(aspect_tokens)))
    return aspects



In [11]:
# Step 4: Perform inference and write output
output_rows = []

label_map = {i: label for i, label in enumerate(model.config.id2label.values())}

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    record_number = row["Record Number"]
    category = row["Category"]
    title = row["Title"]

    # Tokenize with alignment to original words
    inputs = tokenizer(title, return_tensors="pt", truncation=True, is_split_into_words=False)
    with torch.no_grad():
        outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    word_ids = inputs.word_ids()

    # Group predictions by original words
    # Group predictions at the word level
    word_predictions = []
    current_word_idx = None
    current_word_tokens = []
    current_word_labels = []

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue  # Skip special tokens like [CLS], [SEP]

        token = tokens[idx]
        label = label_map[predictions[idx]]

        if word_idx != current_word_idx:
            # Save previous word prediction
            if current_word_tokens:
                # Use the first label (common for NER, or use majority voting)
                word = tokenizer.convert_tokens_to_string(current_word_tokens).strip()
                word_predictions.append((word, current_word_labels[0]))
            # Start new word
            current_word_tokens = [token]
            current_word_labels = [label]
            current_word_idx = word_idx
        else:
            # Continuation of current word (subword)
            current_word_tokens.append(token)
            current_word_labels.append(label)

    # Add final word
    if current_word_tokens:
        word = tokenizer.convert_tokens_to_string(current_word_tokens).strip()
        word_predictions.append((word, current_word_labels[0]))

    # Extract aspect name/value pairs
    aspect_chunks = extract_aspects([wp[0] for wp in word_predictions], [wp[1] for wp in word_predictions])
    for aspect_name, aspect_value in aspect_chunks:
        output_rows.append([record_number, category, aspect_name, aspect_value])



  return forward_call(*args, **kwargs)
100%|████████████████████████████████████████████████████████████████████████████| 12500/12500 [32:09<00:00,  6.48it/s]


In [12]:
output_df = pd.DataFrame(output_rows, columns=["Record Number", "Category", "Aspect Name", "Aspect Value"])

In [13]:
output_df

Unnamed: 0,Record Number,Category,Aspect Name,Aspect Value
0,5001,1,Kompatible_Fahrzeug_Marke,OPEL
1,5001,1,Kompatibles_Fahrzeug_Modell,ASTRA H 1 . 7 CDTI
2,5001,1,Produktart,SET
3,5001,1,Anzahl_Der_Einheiten,2
4,5001,1,Im_Lieferumfang_Enthalten,Bremsscheiben
...,...,...,...,...
114736,29998,1,Bremsscheibenart,Belüftet
114737,29999,1,Im_Lieferumfang_Enthalten,Bremsschlauch
114738,29999,1,Kompatible_Fahrzeug_Marke,TRISCAN
114739,29999,1,Herstellernummer,8150


In [25]:
#29993
output_df[output_df['Record Number']==29993]

Unnamed: 0,Record Number,Category,Aspect Name,Aspect Value
114689,29993,1,Hersteller,BOSCH
114690,29993,1,Einbauposition,Vordere
114691,29993,1,Im_Lieferumfang_Enthalten,Bremsscheiben
114692,29993,1,Im_Lieferumfang_Enthalten,Bremsbeläge
114693,29993,1,Produktart,Satz
114694,29993,1,Kompatible_Fahrzeug_Marke,Peugeot
114695,29993,1,Kompatibles_Fahrzeug_Modell,207 1 . 6 HDI


In [22]:
output_df.to_csv('output_df_cat1.csv', index=False)

In [21]:
# Step 5: Save to file
#output_df.to_csv("ner_submission.tsv", sep="\t", index=False, header=False, encoding="utf-8")
#output_df.to_csv('ner_submission_cat1.tsv', sep="\t", index=False)

In [15]:
import csv
output_df.to_csv('ner_submission_cat1.tsv', sep="\t", index=False, quoting=csv.QUOTE_NONE)