In [1]:
import pandas as pd

# Load CSV
df = pd.read_csv("Listing_Titles.tsv", keep_default_na=False, na_values=None, sep="\t")

In [2]:
test_df = df[5000:30000].copy()

In [3]:
del df

In [21]:
test_df

Unnamed: 0,Record Number,Category,Title
5004,5005,2,1x DOLZ WASSERPUMPE + ZAHNRIEMENSATZ KD046 910...
5005,5006,2,Wasserpumpe + Zahnriemensatz GK K981089G für G...
5007,5008,2,1x INA Zahnriemensatz für Citroen Jumper 244/2...
5009,5010,2,Zahnriemensatz SNR KD484.07 für KIA HYUNDAI TU...
5011,5012,2,ZAHNRIEMENSATZ SET KIT BOSCH 1 987 948 946 P F...
...,...,...,...
29989,29990,2,1x SKF Zahnriemensatz u.a. für Honda CR-V 1 RD...
29990,29991,2,Gates Zahnriemen Set Für Vauxhall Astravan Ast...
29991,29992,2,BOSCH 1987946931 Wasserpumpe + Zahnriemensatz
29993,29994,2,MOTOR STEUERKETTE SATZ VOLL FAI AUTOPARTS TCK7...


In [5]:
test_df['Title'] = test_df['Title'].str.replace(r'\s{2,}', ' ', regex=True)
test_df['Title'] = test_df['Title'].str.replace('\xa0', ' ', regex=True)
test_df = test_df[test_df['Category']==2]

In [6]:
test_df

Unnamed: 0,Record Number,Category,Title
5004,5005,2,1x DOLZ WASSERPUMPE + ZAHNRIEMENSATZ KD046 910...
5005,5006,2,Wasserpumpe + Zahnriemensatz GK K981089G für G...
5007,5008,2,1x INA Zahnriemensatz für Citroen Jumper 244/2...
5009,5010,2,Zahnriemensatz SNR KD484.07 für KIA HYUNDAI TU...
5011,5012,2,ZAHNRIEMENSATZ SET KIT BOSCH 1 987 948 946 P F...
...,...,...,...
29989,29990,2,1x SKF Zahnriemensatz u.a. für Honda CR-V 1 RD...
29990,29991,2,Gates Zahnriemen Set Für Vauxhall Astravan Ast...
29991,29992,2,BOSCH 1987946931 Wasserpumpe + Zahnriemensatz
29993,29994,2,MOTOR STEUERKETTE SATZ VOLL FAI AUTOPARTS TCK7...


In [7]:
#import csv
#test_df.to_csv('quiz_data.tsv', sep="\t", index=False)

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.functional import softmax
import pandas as pd
from tqdm import tqdm



In [9]:
# Step 1: Load model and tokenizer
model_dir = "ner_model_artifacts_for_2_large"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)
model.eval()

# Step 2: Load test data
#test_df = pd.read_csv("test_data.csv")  # Adjust filename as needed



BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [10]:
# Step 3: Define helper functions to decode BIOES tags into aspect chunks
def extract_aspects(tokens, tags):
    aspects = []
    current_aspect = None
    aspect_tokens = []

    for token, tag in zip(tokens, tags):
        if tag == "O":
            if current_aspect:
                aspects.append((current_aspect, ' '.join(aspect_tokens)))
                current_aspect = None
                aspect_tokens = []
        elif tag.startswith("B-") or tag.startswith("S-"):
            if current_aspect:
                aspects.append((current_aspect, ' '.join(aspect_tokens)))
            current_aspect = tag[2:]
            aspect_tokens = [token]
            if tag.startswith("S-"):  # Single-token aspect
                aspects.append((current_aspect, token))
                current_aspect = None
                aspect_tokens = []
        elif tag.startswith("I-") or tag.startswith("E-"):
            if current_aspect:
                aspect_tokens.append(token)
            if tag.startswith("E-"):
                if current_aspect:
                    aspects.append((current_aspect, ' '.join(aspect_tokens)))
                    current_aspect = None
                    aspect_tokens = []
    if current_aspect:
        aspects.append((current_aspect, ' '.join(aspect_tokens)))
    return aspects



In [11]:
# Step 4: Perform inference and write output
output_rows = []

label_map = {i: label for i, label in enumerate(model.config.id2label.values())}

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    record_number = row["Record Number"]
    category = row["Category"]
    title = row["Title"]

    # Tokenize with alignment to original words
    inputs = tokenizer(title, return_tensors="pt", truncation=True, is_split_into_words=False)
    with torch.no_grad():
        outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    word_ids = inputs.word_ids()

    # Group predictions by original words
    # Group predictions at the word level
    word_predictions = []
    current_word_idx = None
    current_word_tokens = []
    current_word_labels = []

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue  # Skip special tokens like [CLS], [SEP]

        token = tokens[idx]
        label = label_map[predictions[idx]]

        if word_idx != current_word_idx:
            # Save previous word prediction
            if current_word_tokens:
                # Use the first label (common for NER, or use majority voting)
                word = tokenizer.convert_tokens_to_string(current_word_tokens).strip()
                word_predictions.append((word, current_word_labels[0]))
            # Start new word
            current_word_tokens = [token]
            current_word_labels = [label]
            current_word_idx = word_idx
        else:
            # Continuation of current word (subword)
            current_word_tokens.append(token)
            current_word_labels.append(label)

    # Add final word
    if current_word_tokens:
        word = tokenizer.convert_tokens_to_string(current_word_tokens).strip()
        word_predictions.append((word, current_word_labels[0]))

    # Extract aspect name/value pairs
    aspect_chunks = extract_aspects([wp[0] for wp in word_predictions], [wp[1] for wp in word_predictions])
    for aspect_name, aspect_value in aspect_chunks:
        output_rows.append([record_number, category, aspect_name, aspect_value])



  return forward_call(*args, **kwargs)
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [1:08:01<00:00,  3.06it/s]


In [12]:
output_df = pd.DataFrame(output_rows, columns=["Record Number", "Category", "Aspect Name", "Aspect Value"])

In [29]:
#output_df
test_df[test_df['Record Number']==5008]['Title'].iloc[0]

'1x INA Zahnriemensatz für Citroen Jumper 244/244D/2 2.2 1 244/244D/2 2.2'

In [14]:
output_df[output_df['Record Number']==5008]

Unnamed: 0,Record Number,Category,Aspect Name,Aspect Value
18,5008,2,Anzahl_Der_Einheiten,1x
19,5008,2,Hersteller,INA
20,5008,2,Produktart,Zahnriemensatz
21,5008,2,Kompatible_Fahrzeug_Marke,Citroen
22,5008,2,Kompatibles_Fahrzeug_Modell,Jumper 244 / 244D / 2 2
23,5008,2,Kompatibles_Fahrzeug_Modell,1 244 / 244D / 2 2


In [19]:
output_df.to_csv('output_df_cat2.csv', index=False)

In [27]:
# Step 5: Save to file

#output_df.to_csv("ner_submission.tsv", sep="\t", index=False, header=False, encoding="utf-8")
#output_df.to_csv('ner_submission_cat2.tsv', sep="\t", index=False)

In [16]:
import csv
output_df.to_csv('ner_submission_cat2.tsv', sep="\t", index=False, quoting=csv.QUOTE_NONE)