# NER Test Case

In [2]:
import pandas as pd

data_path = "../static/ds_ner_test_case.csv"
df = pd.read_csv(data_path)

df.head()

Unnamed: 0,description,highlights,headline,Brand,Speicherkapazität,Farbe,productId
0,"Das G56JR steckt voller Komponenten, deren Per...",<b> Intel Core i7-4700HQ der vierten Generati...,ASUS G56JR-CN169H,Asus,,Schwarz,52.0
1,Das IdeaPad Yoga 13 ist eines der ersten Ultra...,<b> Ultrabook mit klappbarem Multitouch-Displ...,LENOVO IdeaPad Yoga2 Pro 13 Orange,Lenovo,,Orange,87.0
2,Das Lenovo Flex2-15 ist ein schlankes und leic...,"<b> 15,6""-«Dual-Mode»-Notebook, schwarz</b> <...",LENOVO IdeaPad Flex2-15 schwarz,Lenovo,,Schwarz,100.0
3,Das Lenovo Flex2-15 ist ein schlankes und leic...,"<b> Erschwingliches 15,6""-Dual-Mode-Notebook,...",LENOVO IdeaPad Flex2-15 schwarz,Lenovo,,Schwarz,101.0
4,Das IdeaPad Yoga 13 ist eines der ersten Ultra...,<b> Ultrabook mit klappbarem Multitouch-Displ...,LENOVO IdeaPad Yoga2 Pro 13 Orange,Lenovo,,Orange,107.0


In [13]:
brands = df["Brand"].unique()
colors = df["Farbe"].unique()
storage_capacities = df["Speicherkapazität"].unique()

print(f"{brands=}\n\n{colors=}\n\n{storage_capacities=}")

brands=array(['Asus', 'Lenovo', 'Acer', 'Toshiba', 'Sony', 'Western Digital',
       'Trust', 'HP', 'Keysonic', 'Samsung', 'Hama', 'Buffalo', 'WD',
       'HUAWEI', 'Alcatel', 'ChiliGreen', 'ednet', 'Verbatim', 'HN Power',
       'Dell', 'LogiLink', 'CAT', 'Seagate', 'WIKO', 'Renkforce',
       'Transcend', 'Hitachi', 'Nokia', 'CHERRY', 'Hyper', 'Cellularline',
       'Perixx', 'YOTA', 'Microsoft', 'honor', 'Tylt', 'Genius',
       'Motorola', 'Phicomm', 'Gembird', 'V7 Videoseven', 'Kensington',
       'Intenso', 'Belkin', 'Cyrus', 'Vivanco', 'Medion', 'Leitz',
       'Huawei', 'Logitech', 'Kyocera', 'Odys', 'MadCatz', 'Rapoo',
       'VPRO', 'ZTE', 'VOLTCRAFT', 'Apple', 'Dynabook', 'Skross',
       'Manhattan', 'SpeedLink', 'Elgato', 'G-Technology', 'Digittrade',
       'SanDisk', 'NZXT', 'Silverstone', 'Digitus', 'Zowie',
       'Sound BlasterX', 'Archos', '3Dconnexion', 'Sharkoon', 'spiffy',
       'Geemarc', 'MayaMax', 'Matias', 'TrekStor®', 'Crucial', 'Vakoss',
       'Corsair', '

## Data Preprocessing

In [4]:
train_data = []
for index, row in df.iterrows():
    description = row['description']
    brand = row['Brand']
    storage = row['Speicherkapazität']
    color = row['Farbe']

    entities = []

    # Add brand if present in description
    if pd.notna(brand) and brand.lower() in description.lower():
        start_idx = description.lower().index(brand.lower())
        end_idx = start_idx + len(brand)
        entities.append((start_idx, end_idx, "Brand"))
    
    # Add storage if present in description
    if pd.notna(storage) and str(storage) in description:
        start_idx = description.index(str(storage))
        end_idx = start_idx + len(str(storage))
        entities.append((start_idx, end_idx, "Storage"))
    
    # Add color if present in description
    if pd.notna(color) and color.lower() in description.lower():
        start_idx = description.lower().index(color.lower())
        end_idx = start_idx + len(color)
        entities.append((start_idx, end_idx, "Color"))
    
    if entities:
        train_data.append((description, {"entities": entities}))

In [12]:
# Fix train data
import spacy
from spacy.training import offsets_to_biluo_tags
# Load a blank German model
nlp = spacy.blank("de")

train_data_fixed = []
# fix the train data
for text, annotations in train_data:
        doc = nlp.make_doc(text)
        entities = annotations.get("entities")
        try:
            tags = offsets_to_biluo_tags(doc, entities)
            if '-' not in tags:
                train_data_fixed.append((text, annotations))
        except ValueError:
                # In case there's a problem converting to BILUO tags
                print(f"Skipping misaligned example: {text}")

ImportError: [E048] Can't import language de_core_news_sm or any matching language from spacy.lang: No module named 'spacy.lang.de_core_news_sm'

In [8]:
print(f"{len(train_data)=} vs {len(train_data_fixed)=}")

len(train_data)=775 vs len(train_data_fixed)=690


## Training NER Model

In [10]:
from tqdm import tqdm
from spacy.training import Example
import random

# Add NER pipeline to the model
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add the entity labels to the NER pipeline
ner.add_label("Brand")
ner.add_label("Storage")
ner.add_label("Color")

# Begin training the NER model
optimizer = nlp.begin_training()

# Number of training iterations
n_iter = 50

# Disable other pipelines during training (since we're only interested in 'ner')
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # Only train NER 
    for iteration in range(n_iter):
        random.shuffle(train_data_fixed)  # Shuffle the training data
        losses = {}
        
        for i in tqdm(range(len(train_data_fixed))):
            text, annotations = train_data_fixed[i]
            # Create Example object for the training data
            example = Example.from_dict(nlp.make_doc(text), annotations)
            
            # Update the model
            nlp.update([example], drop=0.5, losses=losses)
        
        print(f"Iteration {iteration} Losses: {losses}")

# Save the model to disk
nlp.to_disk("../static/ner_model_de")
print("Model saved to 'ner_model' directory.")

  4%|▍         | 31/690 [00:02<00:54, 12.08it/s]


KeyboardInterrupt: 