# NER Test Case

In [1]:
import pandas as pd

# Path (relative to this notebook) of the CSV file with the product records.
data_path = "../static/ds_ner_test_case.csv"

# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,description,highlights,headline,Brand,Speicherkapazität,Farbe,productId
0,"Das G56JR steckt voller Komponenten, deren Per...",<b> Intel Core i7-4700HQ der vierten Generati...,ASUS G56JR-CN169H,Asus,,Schwarz,52.0
1,Das IdeaPad Yoga 13 ist eines der ersten Ultra...,<b> Ultrabook mit klappbarem Multitouch-Displ...,LENOVO IdeaPad Yoga2 Pro 13 Orange,Lenovo,,Orange,87.0
2,Das Lenovo Flex2-15 ist ein schlankes und leic...,"<b> 15,6""-«Dual-Mode»-Notebook, schwarz</b> <...",LENOVO IdeaPad Flex2-15 schwarz,Lenovo,,Schwarz,100.0
3,Das Lenovo Flex2-15 ist ein schlankes und leic...,"<b> Erschwingliches 15,6""-Dual-Mode-Notebook,...",LENOVO IdeaPad Flex2-15 schwarz,Lenovo,,Schwarz,101.0
4,Das IdeaPad Yoga 13 ist eines der ersten Ultra...,<b> Ultrabook mit klappbarem Multitouch-Displ...,LENOVO IdeaPad Yoga2 Pro 13 Orange,Lenovo,,Orange,107.0


In [2]:
# Distinct values found in each of the three attribute columns.
brands, colors, storage_capacities = (
    df[column].unique() for column in ("Brand", "Farbe", "Speicherkapazität")
)

# Print the three value sets, separated by blank lines.
print(f"{brands=}\n\n{colors=}\n\n{storage_capacities=}")

brands=array(['Asus', 'Lenovo', 'Acer', 'Toshiba', 'Sony', 'Western Digital',
       'Trust', 'HP', 'Keysonic', 'Samsung', 'Hama', 'Buffalo', 'WD',
       'HUAWEI', 'Alcatel', 'ChiliGreen', 'ednet', 'Verbatim', 'HN Power',
       'Dell', 'LogiLink', 'CAT', 'Seagate', 'WIKO', 'Renkforce',
       'Transcend', 'Hitachi', 'Nokia', 'CHERRY', 'Hyper', 'Cellularline',
       'Perixx', 'YOTA', 'Microsoft', 'honor', 'Tylt', 'Genius',
       'Motorola', 'Phicomm', 'Gembird', 'V7 Videoseven', 'Kensington',
       'Intenso', 'Belkin', 'Cyrus', 'Vivanco', 'Medion', 'Leitz',
       'Huawei', 'Logitech', 'Kyocera', 'Odys', 'MadCatz', 'Rapoo',
       'VPRO', 'ZTE', 'VOLTCRAFT', 'Apple', 'Dynabook', 'Skross',
       'Manhattan', 'SpeedLink', 'Elgato', 'G-Technology', 'Digittrade',
       'SanDisk', 'NZXT', 'Silverstone', 'Digitus', 'Zowie',
       'Sound BlasterX', 'Archos', '3Dconnexion', 'Sharkoon', 'spiffy',
       'Geemarc', 'MayaMax', 'Matias', 'TrekStor®', 'Crucial', 'Vakoss',
       'Corsair', '

## Data Preprocessing

In [3]:
# Attribute column -> (entity label, match case-insensitively?).
# The order is the priority used when two matches overlap. Storage values are
# matched case-sensitively; brand and colour are not.
ENTITY_COLUMNS = {
    "Brand": ("Brand", True),
    "Speicherkapazität": ("Storage", False),
    "Farbe": ("Color", True),
}


def find_entity_span(text, value, label, ignore_case=True):
    """Locate the first occurrence of ``value`` inside ``text``.

    Args:
        text: The string to search in.
        value: The attribute value to look for; missing values (NaN/None) and
            empty strings never match.
        label: Entity label to attach to the span.
        ignore_case: Compare lower-cased versions of both strings when True.

    Returns:
        ``(start, end, label)`` with character offsets into ``text``, or
        ``None`` when the value is missing or does not occur.
    """
    if pd.isna(value):
        return None
    original = str(value)
    needle, haystack = original, text
    if ignore_case:
        needle, haystack = original.lower(), text.lower()
    if not needle:
        return None

    start = haystack.find(needle)  # single scan instead of `in` + `.index`
    if start < 0:
        return None
    end = start + len(original)

    # Lower-casing can change the length of a string for a few Unicode
    # characters, which would shift the offsets; accept the span only if it
    # really covers the value in the original text.
    covered = text[start:end]
    if (covered.lower() if ignore_case else covered) != needle:
        return None
    return (start, end, label)


def overlaps_any(span, spans):
    """Return True if ``span`` shares at least one character with any of ``spans``."""
    start, end, _ = span
    return any(start < other_end and other_start < end
               for other_start, other_end, _ in spans)


# Build the training examples: (headline, {"entities": [(start, end, label), ...]}).
train_data = []
columns = ["headline", *ENTITY_COLUMNS]
for headline, *values in df[columns].itertuples(index=False, name=None):
    # A missing headline is read as NaN (a float) and cannot be searched.
    if not isinstance(headline, str):
        continue

    entities = []
    for value, (label, ignore_case) in zip(values, ENTITY_COLUMNS.values()):
        span = find_entity_span(headline, value, label, ignore_case)
        # Overlapping spans would make the later BILUO conversion raise and
        # the whole example would be discarded, so keep only the first one.
        if span is not None and not overlaps_any(span, entities):
            entities.append(span)

    # Headlines without a single match carry no annotation and are left out.
    if entities:
        train_data.append((headline, {"entities": entities}))

In [5]:
# Fix train data
import spacy
from spacy.training import offsets_to_biluo_tags
# Start from an empty German pipeline.
nlp = spacy.blank("de")

# Keep only the examples whose annotations convert to BILUO tags without a
# '-' tag and without raising.
train_data_fixed = []
for text, annotations in train_data:
    tokenized = nlp.make_doc(text)
    spans = annotations.get("entities")
    try:
        biluo_tags = offsets_to_biluo_tags(tokenized, spans)
    except ValueError:
        # The conversion to BILUO tags failed for this example.
        print(f"Skipping misaligned example: {text}")
        continue
    if '-' in biluo_tags:
        continue
    train_data_fixed.append((text, annotations))



In [7]:
# Compare the number of examples before and after the misalignment filter.
print(f"{len(train_data)=} vs {len(train_data_fixed)=}")

len(train_data)=1040 vs len(train_data_fixed)=1033


## Training NER Model

In [9]:
import random

from spacy.training import Example
from spacy.util import fix_random_seed
from tqdm import tqdm

# Training configuration.
n_iter = 50                            # number of passes over the training data
DROPOUT = 0.5                          # dropout rate passed to every update
SEED = 42                              # seed for shuffling and weight initialisation
MODEL_DIR = "../static/ner_model_de"   # where the trained pipeline is written
ENTITY_LABELS = ("Brand", "Storage", "Color")

# Seed the random number generators so repeated runs are comparable.
fix_random_seed(SEED)

# Add the NER component to the pipeline, or reuse it when the cell is re-run.
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

# Register the entity labels with the NER component.
for label in ENTITY_LABELS:
    ner.add_label(label)

# Initialise the model weights; `initialize` replaces the deprecated
# `begin_training` and returns the optimizer used for the updates below.
optimizer = nlp.initialize()

# Shuffle a copy so the list built by the earlier cell keeps its order.
training_examples = list(train_data_fixed)

# Only the NER component is trained; every other component stays disabled.
with nlp.select_pipes(enable=["ner"]):
    for iteration in range(n_iter):
        random.shuffle(training_examples)
        losses = {}

        for text, annotations in tqdm(training_examples):
            # Pair the tokenized text with its gold-standard annotations.
            example = Example.from_dict(nlp.make_doc(text), annotations)

            # One optimisation step per example (batch size 1).
            nlp.update([example], drop=DROPOUT, sgd=optimizer, losses=losses)

        print(f"Iteration {iteration} Losses: {losses}")

# Save the model to disk and report the directory it was actually written to.
nlp.to_disk(MODEL_DIR)
print(f"Model saved to '{MODEL_DIR}' directory.")

100%|██████████| 1033/1033 [00:07<00:00, 129.35it/s]


Iteration 0 Losses: {'ner': 1146.3098181053251}


100%|██████████| 1033/1033 [00:07<00:00, 132.95it/s]


Iteration 1 Losses: {'ner': 414.6862589821014}


100%|██████████| 1033/1033 [00:07<00:00, 132.54it/s]


Iteration 2 Losses: {'ner': 371.9185875817716}


100%|██████████| 1033/1033 [00:07<00:00, 132.63it/s]


Iteration 3 Losses: {'ner': 290.481962048693}


100%|██████████| 1033/1033 [00:07<00:00, 135.04it/s]


Iteration 4 Losses: {'ner': 249.93483108890837}


100%|██████████| 1033/1033 [00:07<00:00, 134.48it/s]


Iteration 5 Losses: {'ner': 224.6221777979352}


100%|██████████| 1033/1033 [00:07<00:00, 135.18it/s]


Iteration 6 Losses: {'ner': 192.72439532188034}


100%|██████████| 1033/1033 [00:07<00:00, 133.44it/s]


Iteration 7 Losses: {'ner': 179.67783609375428}


100%|██████████| 1033/1033 [00:07<00:00, 134.92it/s]


Iteration 8 Losses: {'ner': 181.1791641029686}


100%|██████████| 1033/1033 [00:07<00:00, 132.65it/s]


Iteration 9 Losses: {'ner': 166.31224048357464}


100%|██████████| 1033/1033 [00:07<00:00, 131.55it/s]


Iteration 10 Losses: {'ner': 138.58635815702536}


100%|██████████| 1033/1033 [00:07<00:00, 135.77it/s]


Iteration 11 Losses: {'ner': 156.3742770096351}


100%|██████████| 1033/1033 [00:07<00:00, 134.90it/s]


Iteration 12 Losses: {'ner': 161.5446305689201}


100%|██████████| 1033/1033 [00:07<00:00, 137.74it/s]


Iteration 13 Losses: {'ner': 167.33772711894878}


100%|██████████| 1033/1033 [00:07<00:00, 137.19it/s]


Iteration 14 Losses: {'ner': 135.57958708422407}


100%|██████████| 1033/1033 [00:07<00:00, 130.63it/s]


Iteration 15 Losses: {'ner': 114.49097727519472}


100%|██████████| 1033/1033 [00:07<00:00, 139.66it/s]


Iteration 16 Losses: {'ner': 134.5716054475771}


100%|██████████| 1033/1033 [00:07<00:00, 136.60it/s]


Iteration 17 Losses: {'ner': 129.45844465010921}


100%|██████████| 1033/1033 [00:07<00:00, 137.25it/s]


Iteration 18 Losses: {'ner': 122.72097468550358}


100%|██████████| 1033/1033 [00:07<00:00, 136.96it/s]


Iteration 19 Losses: {'ner': 124.23446873867458}


100%|██████████| 1033/1033 [00:08<00:00, 128.21it/s]


Iteration 20 Losses: {'ner': 121.17117970608406}


100%|██████████| 1033/1033 [00:08<00:00, 126.49it/s]


Iteration 21 Losses: {'ner': 115.71691937114227}


100%|██████████| 1033/1033 [00:07<00:00, 129.71it/s]


Iteration 22 Losses: {'ner': 94.10251168143833}


100%|██████████| 1033/1033 [00:07<00:00, 129.60it/s]


Iteration 23 Losses: {'ner': 125.36364401169853}


100%|██████████| 1033/1033 [00:08<00:00, 126.94it/s]


Iteration 24 Losses: {'ner': 139.91401144066919}


100%|██████████| 1033/1033 [00:08<00:00, 128.18it/s]


Iteration 25 Losses: {'ner': 102.48026799873941}


100%|██████████| 1033/1033 [00:07<00:00, 130.65it/s]


Iteration 26 Losses: {'ner': 104.41035903771079}


100%|██████████| 1033/1033 [00:07<00:00, 131.02it/s]


Iteration 27 Losses: {'ner': 111.53575737423623}


100%|██████████| 1033/1033 [00:08<00:00, 123.73it/s]


Iteration 28 Losses: {'ner': 105.09522747176547}


100%|██████████| 1033/1033 [00:08<00:00, 126.64it/s]


Iteration 29 Losses: {'ner': 123.68367019819748}


100%|██████████| 1033/1033 [00:07<00:00, 130.55it/s]


Iteration 30 Losses: {'ner': 108.81622363589939}


100%|██████████| 1033/1033 [00:07<00:00, 129.43it/s]


Iteration 31 Losses: {'ner': 96.1405321147261}


100%|██████████| 1033/1033 [00:08<00:00, 126.66it/s]


Iteration 32 Losses: {'ner': 122.07032217775965}


100%|██████████| 1033/1033 [00:07<00:00, 130.77it/s]


Iteration 33 Losses: {'ner': 99.86769400487735}


100%|██████████| 1033/1033 [00:08<00:00, 126.36it/s]


Iteration 34 Losses: {'ner': 123.41685847913716}


100%|██████████| 1033/1033 [00:08<00:00, 126.27it/s]


Iteration 35 Losses: {'ner': 105.84975120593717}


100%|██████████| 1033/1033 [00:08<00:00, 127.48it/s]


Iteration 36 Losses: {'ner': 104.70765361586622}


100%|██████████| 1033/1033 [00:08<00:00, 126.24it/s]


Iteration 37 Losses: {'ner': 115.59865287327173}


100%|██████████| 1033/1033 [00:08<00:00, 127.34it/s]


Iteration 38 Losses: {'ner': 113.94406796456633}


100%|██████████| 1033/1033 [00:08<00:00, 124.87it/s]


Iteration 39 Losses: {'ner': 115.70608822406638}


100%|██████████| 1033/1033 [00:08<00:00, 122.44it/s]


Iteration 40 Losses: {'ner': 78.10384894232496}


100%|██████████| 1033/1033 [00:08<00:00, 126.55it/s]


Iteration 41 Losses: {'ner': 87.59132501572384}


100%|██████████| 1033/1033 [00:08<00:00, 127.46it/s]


Iteration 42 Losses: {'ner': 98.88124969004843}


100%|██████████| 1033/1033 [00:08<00:00, 127.41it/s]


Iteration 43 Losses: {'ner': 88.75497940091607}


100%|██████████| 1033/1033 [00:08<00:00, 128.30it/s]


Iteration 44 Losses: {'ner': 101.58478006857567}


100%|██████████| 1033/1033 [00:08<00:00, 126.57it/s]


Iteration 45 Losses: {'ner': 86.09457833276696}


100%|██████████| 1033/1033 [00:08<00:00, 125.89it/s]


Iteration 46 Losses: {'ner': 96.68646715945934}


100%|██████████| 1033/1033 [00:08<00:00, 127.35it/s]


Iteration 47 Losses: {'ner': 87.90907568724109}


100%|██████████| 1033/1033 [00:08<00:00, 125.06it/s]


Iteration 48 Losses: {'ner': 96.01089329183154}


100%|██████████| 1033/1033 [00:08<00:00, 126.41it/s]

Iteration 49 Losses: {'ner': 82.35493677515262}
Model saved to 'ner_model' directory.



