<a href="https://colab.research.google.com/github/spdsp04/Spacy_Custom_NER_Shiva_Digital_Solution.ipynb/blob/main/Spacy_Custom_NER_Shiva_Digital_Solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Building Custom Named Entity Recognition Model Using Spacy**

Reference tutorial: https://newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3

In [None]:
import spacy

In [None]:
# Download spaCy's large English pipeline (en_core_web_lg; the wheel is ~588 MB).
# The small pipeline can be downloaded instead with:
#   !python -m spacy download en_core_web_sm
# After installation the runtime may need a restart before spacy.load() can
# find the newly installed package.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the installed 'en_core_web_lg' pipeline; the trailing `nlp` expression
# displays the resulting Language object.
nlp = spacy.load("en_core_web_lg")
nlp

<spacy.lang.en.English at 0x7a3e4cc97e20>

In [None]:
# Run the pretrained pipeline on a sample sentence; the returned Doc carries
# the predicted named entities (inspected in the following cells).
doc = nlp("Donad Trump was President of USA")

In [None]:
doc  # The processed Doc; its notebook repr is the original text

Donad Trump was President of USA

In [None]:
type(doc)  # The pipeline returns a spacy.tokens.doc.Doc

spacy.tokens.doc.Doc

In [None]:
doc.ents  # Entity spans predicted for the document

(Donad Trump, USA)

In [None]:
doc.ents[0], type(doc.ents[0])  # First entity and its type (a spacy.tokens.span.Span)

(Donad Trump, spacy.tokens.span.Span)

In [None]:
# Render the document with its named entities highlighted: style="ent" selects
# the entity visualizer and jupyter=True displays the markup inside the notebook.
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

## File Handling and Data Extraction

In [None]:
import zipfile
import os

# Path to the uploaded zip file (adjust the path based on your system)
zip_file_path = '/content/archive.zip'

# Directory where the contents of the zip file will be extracted
extract_dir = '/content/extracted_files'

# Create the target directory. exist_ok=True makes this a no-op when the
# directory is already there (e.g. when the cell is re-run) and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(extract_dir, exist_ok=True)

# Unzip every archive member into the target directory; the context manager
# closes the archive even if extraction fails.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f'Files have been extracted to {extract_dir}')


Files have been extracted to /content/extracted_files


## Loading JSON Data



Download the dataset from the link below:

https://www.kaggle.com/datasets/finalepoch/medical-ner

In [None]:
import json
# Load the annotated medical examples from the extracted JSON file.
# The encoding is stated explicitly so the text is decoded the same way on every
# platform; the annotations are character offsets into this text, so a
# different default encoding could silently shift them.
with open('/content/extracted_files/Corona2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

## Displaying the first example from the data

In [None]:
# Show the first annotated example record in full
data['examples'][0]

{'id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'content': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'metadata': {},
 'annotations': [{'id': '0825a1

In [None]:
data['examples'][0].keys()  # Field names available on an example record

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [None]:
data['examples'][0]['content']  # Raw text of the example that the annotations refer to

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [None]:
data['examples'][0]['annotations'][0]  # First annotation: start/end character offsets, tag_name and annotated value

{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
 'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
 'end': 371,
 'start': 360,
 'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'tag_name': 'Medicine',
 'value': 'Diosmectite',
 'correct': None,
 'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
   'annotator_id': 1,
   'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'name': 'Ashpat123',
   'reason': 'exploration'}],
 'model_annotations': []}

## Preparing Training Data for NER

In [None]:
# Turn every annotated example into a {'text', 'entities'} record. Each entity
# is a (start, end, LABEL) tuple: the annotation's character offsets plus its
# tag name converted to upper case.
training_data = [
    {
        'text': example['content'],
        'entities': [
            (annotation['start'], annotation['end'], annotation['tag_name'].upper())
            for annotation in example['annotations']
        ],
    }
    for example in data['examples']
]

# Print the first converted record as a quick sanity check
print(training_data[0])

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]", 'entities': [(360, 371, 'MEDICINE'), (383, 408, 'MEDICINE'), (104, 112, 'MEDICALCONDITION'), (679,

In [None]:
training_data[0]['text']  # Text of the first converted training record

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [None]:
training_data[0]['entities']  # (start, end, LABEL) tuples of the first training record

[(360, 371, 'MEDICINE'),
 (383, 408, 'MEDICINE'),
 (104, 112, 'MEDICALCONDITION'),
 (679, 689, 'MEDICINE'),
 (6, 23, 'MEDICINE'),
 (25, 37, 'MEDICINE'),
 (461, 470, 'MEDICALCONDITION'),
 (577, 589, 'MEDICINE'),
 (853, 865, 'MEDICALCONDITION'),
 (188, 198, 'MEDICINE'),
 (754, 762, 'MEDICALCONDITION'),
 (870, 880, 'MEDICALCONDITION'),
 (823, 833, 'MEDICINE'),
 (852, 853, 'MEDICALCONDITION'),
 (461, 469, 'MEDICALCONDITION'),
 (535, 543, 'MEDICALCONDITION'),
 (692, 704, 'MEDICINE'),
 (563, 571, 'MEDICALCONDITION')]

In [None]:
training_data[0]['text'][360:371]  # Slicing the text with an entity's (start, end) offsets recovers its surface form

'Diosmectite'

## SpaCy NER Model Preparation

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm
# Create a blank English pipeline. NOTE: this rebinds `nlp`, so the
# en_core_web_lg pipeline loaded earlier is no longer reachable under that name.
nlp = spacy.blank("en") # blank pipeline; used below via nlp.make_doc()
doc_bin = DocBin() # container that collects the annotated Docs for saving to disk

In [None]:
# Utility for filtering overlapping entities
from spacy.util import filter_spans

# Start from an empty DocBin so that re-running this cell rebuilds train.spacy
# from scratch instead of appending every document a second time to a
# container created in an earlier cell.
doc_bin = DocBin()

# Convert each {'text', 'entities'} record into a spaCy Doc with entity spans
for training_example in tqdm(training_data):
    text = training_example['text']        # raw example text
    labels = training_example['entities']  # (start, end, LABEL) character-offset tuples
    # Create the Doc for this text
    doc = nlp.make_doc(text)
    ents = []  # spans that could be aligned to token boundaries
    for start, end, label in labels:
        # Map the character offsets onto a token span; char_span returns None
        # when no span can be aligned for the given offsets.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # Drop overlapping spans before assigning them as the document's entities
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    # Queue the annotated document for serialization
    doc_bin.add(doc)

# Saving the training data in SpaCy's binary format
doc_bin.to_disk("train.spacy")

100%|██████████| 31/31 [00:00<00:00, 540.04it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





## SpaCy Model Configuration and Training

## Special Instruction

In [None]:
# Open the spaCy training quickstart:
# https://spacy.io/usage/training#quickstart
# Copy the base config generated on that page and save it as base_config.cfg
# in the working directory; the next cell reads that file.

In [None]:
# Initializing a configuration file for SpaCy's training
# Expand base_config.cfg into a complete training config saved as config.cfg
# (fill-config auto-fills all remaining values).
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


##### Make the following changes in the config.cfg file so that training finishes within 3-5 minutes
  - Set batch_size = 100 (line 14)
  - Set max_epochs = 10 (line 90)

In [None]:
# Training the SpaCy model using the custom NER data
# Train the NER pipeline described by config.cfg, saving the result to the
# current directory (--output ./).
# NOTE: --paths.dev points at the same file as --paths.train, so the reported
# ENTS_F / ENTS_P / ENTS_R are measured on the training data itself and do not
# estimate performance on unseen text. Use a held-out dev.spacy for a real evaluation.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    153.29    0.55    0.91    0.39    0.01
  7     200        499.25   3640.10   70.00   82.80   60.63    0.70
[38;5;2m✔ Saved pipeline to output directory[0m
model-last


## Testing the Trained Model and Visualization

In [None]:
# Loading the best-performing model after training
# Load the best-scoring pipeline from the "model-best" directory, resolved
# relative to the working directory that was used as the training --output.
nlp_ner = spacy.load("model-best")

In [None]:
# Text to run through the freshly trained NER pipeline
sample_text = "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present."
doc = nlp_ner(sample_text)

# Highlight colour for each entity label, passed to the entity visualizer
colors = {
    "PATHOGEN": "#F67DE3",
    "MEDICINE": "#7DF6D9",
    "MEDICALCONDITION": "#a6e22d",
}
options = dict(colors=colors)

# Draw the predicted entities inline using the colours defined above
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

# Thank You