# Convert to BIOES

In [1]:
import pandas as pd
df_train1 = pd.read_csv('Tagged_Titles_Train.tsv', keep_default_na=False, na_values=None, sep="\t")

df_train1['Title'] = df_train1['Title'].str.replace(r'\s{2,}', ' ', regex=True)
df_train1['Title'] = df_train1['Title'].str.replace('\xa0', ' ', regex=True)


In [2]:
df_train1

Unnamed: 0,Record Number,Category,Title,Token,Tag
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,MINI,Kompatible_Fahrzeug_Marke
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,1.6,Kompatibles_Fahrzeug_Modell
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W10B16A,Herstellernummer
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W11B16A,
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,R50,
...,...,...,...,...,...
56807,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Opel,Kompatible_Fahrzeug_Marke
56808,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Saab,Kompatibles_Fahrzeug_Modell
56809,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,1.8,
56810,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,/,O


In [4]:
def label_entity1(tokens, label, category, title):
    n = len(tokens)
    if n == 1:
        return [(tokens[0][0], category, title, tokens[0][1], f"S-{label}", tokens[0][2])]
    else:
        labeled = []
        for i, (rec, token, orig_idx) in enumerate(tokens):
            if i == 0:
                tag = f"B-{label}"
            elif i == n - 1:
                tag = f"E-{label}"
            else:
                tag = f"I-{label}"
            labeled.append((rec, category, title, token, tag, orig_idx))
        return labeled

def convert_df_to_bioes1(df):
    result_rows = []

    # Save original dtypes
    original_dtypes = df.dtypes.to_dict()

    # Save original index
    df = df.reset_index()  # creates a new column with original index
    df = df.rename(columns={"index": "OrigIndex"})

    for record_id, group in df.groupby("Record Number"):
        group = group.reset_index(drop=True)
        current_entity = []
        current_label = None
        current_category = None
        current_title = None

        for idx, row in group.iterrows():
            token = row['Token']
            tag = row['Tag']
            category = row['Category']
            title = row['Title']
            orig_idx = row['OrigIndex']

            if tag == 'O':
                if current_entity:
                    result_rows.extend(label_entity1(current_entity, current_label, current_category, current_title))
                    current_entity = []
                    current_label = None
                    current_category = None
                    current_title = None
                result_rows.append((record_id, category, title, token, 'O', orig_idx))

            elif pd.notna(tag) and tag != '':
                if current_entity:
                    result_rows.extend(label_entity1(current_entity, current_label, current_category, current_title))
                current_entity = [(record_id, token, orig_idx)]
                current_label = tag
                current_category = category
                current_title = title

            elif pd.isna(tag) or tag == '':
                current_entity.append((record_id, token, orig_idx))

        if current_entity:
            result_rows.extend(label_entity1(current_entity, current_label, current_category, current_title))

    result_df = pd.DataFrame(result_rows, columns=[
        "Record Number", "Category", "Title", "Token", "Tag", "OrigIndex"
    ])

    # Sort by original index and drop the helper column
    result_df = result_df.sort_values(by="OrigIndex").drop(columns=["OrigIndex"]).reset_index(drop=True)

    # Restore original data types carefully:
    for col in ["Record Number", "Category", "Title", "Token", "Tag"]:
        if col in original_dtypes:
            orig_dtype = original_dtypes[col]
            # Check if original dtype is integer
            if pd.api.types.is_integer_dtype(orig_dtype):
                # If the column has any NaNs, use nullable Int64 type; else use original int dtype
                if result_df[col].isnull().any():
                    result_df[col] = result_df[col].astype(pd.Int64Dtype())
                else:
                    result_df[col] = result_df[col].astype(orig_dtype)
            else:
                result_df[col] = result_df[col].astype(orig_dtype)

    return result_df


In [5]:
BIOES_data = convert_df_to_bioes1(df_train1)

In [6]:
#4789
df_train1[df_train1['Record Number']==4036]

Unnamed: 0,Record Number,Category,Title,Token,Tag
45982,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,1x,Anzahl_Der_Einheiten
45983,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,Wasserpumpe,Im_Lieferumfang_Enthalten
45984,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,+,O
45985,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,Zahnriemensatz,Produktart
45986,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,SKF,Hersteller
45987,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,VKMC,Herstellernummer
45988,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,95624,
45989,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,passend,O
45990,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,für,O
45991,4036,2,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,FIAT,Kompatible_Fahrzeug_Marke


In [15]:
BIOES_data[BIOES_data['Record Number']==4036][['Title','Token','Tag']]

Unnamed: 0,Title,Token,Tag
45982,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,1x,S-Anzahl_Der_Einheiten
45983,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,Wasserpumpe,S-Im_Lieferumfang_Enthalten
45984,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,+,O
45985,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,Zahnriemensatz,S-Produktart
45986,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,SKF,S-Hersteller
45987,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,VKMC,B-Herstellernummer
45988,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,95624,E-Herstellernummer
45989,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,passend,O
45990,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,für,O
45991,1x Wasserpumpe + Zahnriemensatz SKF VKMC 95624...,FIAT,S-Kompatible_Fahrzeug_Marke


In [522]:
BIOES_data[BIOES_data['Token']=='AX']

Unnamed: 0,Record Number,Category,Title,Token,Tag
16528,1449,2.0,CONTI Zahnriemen + Rolle Wasserpumpe CITROEN B...,AX,I-Kompatibles_Fahrzeug_Modell
19573,1716,2.0,BOSCH Zahnriemensatz für CITROEN AX Berlingo X...,AX,S-Kompatibles_Fahrzeug_Modell
20804,1827,,,AX,S-None
27251,2399,2.0,Zahnriemensatz mit Wasserpumpe Citroen Berling...,AX,E-Kompatibles_Fahrzeug_Modell
49392,4337,2.0,OEM ZAHNRIEMENSATZ WASSERPUMPE CITROEN AX SAXO...,AX,S-Kompatibles_Fahrzeug_Modell
49980,4387,2.0,CONTITECH Keilrippenriemen 6PK1564 Citroen Ber...,AX,S-Kompatibles_Fahrzeug_Modell
55420,4877,2.0,Gates Zahnriemensatz Spannrolle und Wasserpump...,AX,S-Kompatibles_Fahrzeug_Modell


In [523]:
BIOES_data.loc[20804, :] = df_train1.loc[20804, :].copy()
BIOES_data.loc[20804, 'Tag'] = 'S-Kompatibles_Fahrzeug_Modell'
BIOES_data = BIOES_data.astype({
    "Category": "int64"
})


In [524]:
df1 = df_train1.drop(columns=["Tag"])
df2 = BIOES_data.drop(columns=["Tag"])
df1.equals(df2)

True

In [None]:
import csv
BIOES_data.to_csv('BIOES_train_data.tsv', sep="\t", index=False, quoting=csv.QUOTE_NONE)