<a href="https://colab.research.google.com/github/sohataher/NLP-Course/blob/main/NLP_Day2_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [7]:
import pandas as pd
import ast
import spacy
from spacy.tokens import DocBin

# Load spaCy model
nlp = spacy.blank("en")

# Load the dataset
df = pd.read_csv("ner.csv")

# Forward fill sentence IDs if needed
df = df.ffill()

# Convert string representations of lists into actual lists
df['POS'] = df['POS'].apply(ast.literal_eval)
df['Tag'] = df['Tag'].apply(ast.literal_eval)

# Check and match lengths
def is_valid_row(row):
    return len(row['Sentence'].split()) == len(row['POS']) == len(row['Tag'])

df = df[df.apply(is_valid_row, axis=1)]

# Split into train and dev
train_df = df.sample(frac=0.8, random_state=42)
dev_df = df.drop(train_df.index)

# Convert to spaCy DocBin
def convert_to_docbin(dataframe, filename):
    db = DocBin()
    for _, row in dataframe.iterrows():
        doc = nlp.make_doc(row['Sentence'])
        ents = []
        words = row['Sentence'].split()
        tags = row['Tag']
        offset = 0
        for word, tag in zip(words, tags):
            start = row['Sentence'].find(word, offset)
            end = start + len(word)
            if tag != "O":
                label = tag.split("-")[-1]
                span = doc.char_span(start, end, label=label)
                if span:
                    ents.append(span)
            offset = end
        doc.ents = ents
        db.add(doc)

    # Save the DocBin object to disk
    db.to_disk(filename)

    # Print out a sample sentence from the DocBin to check the conversion
    print(f"\nConverted {filename}: Sample data from the DocBin:")
    doc_bin = DocBin().from_disk(filename)
    docs = list(doc_bin.get_docs(nlp.vocab))  # Convert generator to list
    if docs:
        sample_doc = docs[0]
        print("Text:", sample_doc.text)
        for ent in sample_doc.ents:
            print(f"Entity: {ent.text}, Label: {ent.label_}")
    else:
        print("No documents found.")

# Convert to DocBin and check sample data
convert_to_docbin(train_df, "train.spacy")
convert_to_docbin(dev_df, "dev.spacy")



Converted train.spacy: Sample data from the DocBin:
Text: On the Republican side , Senator John McCain seems on the verge of clinching his party 's nomination .
Entity: Senator, Label: per
Entity: John, Label: per
Entity: McCain, Label: per

Converted dev.spacy: Sample data from the DocBin:
Text: They marched from the Houses of Parliament to a rally in Hyde Park .
Entity: Hyde, Label: geo
Entity: Park, Label: geo


In [9]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     44.21    1.46    3.78    0.91    0.01
  0     200         21.48   2622.10   75.24   76.93   73.63    0.75
  0     400         60.67   1782.43   78.41   78.03   78.79    0.78
  0     600         36.58   1941.12   81.70   82.50   80.91    0.82
  0     800         45.69   2265.85   82.71   81.09   84.39    0.83
  0    1000         54.29   2509.12   83.92   84.46   83.38    0.84
  0    1200         61.49   2838.34   84.21   84.84   83.59    0.84
  0    1400         81.75   3524.73   84.89   85.69   84.11    0.85
  0    1600        104.66   4176.09   83.43   84.15 

In [11]:
!python -m spacy evaluate ./output/model-best ./dev.spacy

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   88.49 
NER R   87.29 
NER F   87.89 
SPEED   3656  

[1m

          P       R       F
org   83.80   78.97   81.32
geo   86.12   91.06   88.52
gpe   95.39   95.09   95.24
tim   93.47   89.41   91.40
per   90.40   89.47   89.93
art   14.63    4.41    6.78
eve   58.00   23.39   33.33
nat   89.47   35.42   50.75



In [12]:
import spacy

model = spacy.load("./output/model-best")

In [15]:
doc = model("Apple is looking to buy a startup in America")
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple org
America geo
