In [1]:
import spacy
import opendatasets as od 

!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     -------------------------------------- 0.0/400.7 MB 330.3 kB/s eta 0:20:13
     -------------------------------------- 0.1/400.7 MB 469.7 kB/s eta 0:14:13
     ---------------------------------------- 0.3/400.7 MB 1.8 MB/s eta 0:03:46
     ---------------------------------------- 0.9/400.7 MB 4.5 MB/s eta 0:01:29
     ---------------------------------------- 1.8/400.7 MB 7.0 MB/s eta 0:00:57
     ---------------------------------------- 2.6/400.7 MB 8.9 MB/s eta 0:00:45
     --------------------------------------- 3.6/400.7 MB 10.6 MB/s eta 0:00:38
     --------------------------------------- 4.6/400.7 MB 11.8 MB/s eta 0:00:34
      -------------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
nlp = spacy.load("en_core_web_lg")
nlp

<spacy.lang.en.English at 0x238672f11b0>

In [3]:
print([ pipeline[0] for pipeline in nlp.pipeline])

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [4]:
text = """Former president Clinton was accused of sexual harrasment by her secratery"""
doc = nlp(text)
print(type(text))
print(type(doc))

<class 'str'>
<class 'spacy.tokens.doc.Doc'>


In [5]:
print(doc.ents)

(Clinton,)


In [6]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [7]:
import opendatasets as od 
od.download('https://www.kaggle.com/datasets/finalepoch/medical-ner')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Dataset URL: https://www.kaggle.com/datasets/finalepoch/medical-ner
Downloading medical-ner.zip to .\medical-ner


100%|██████████| 26.2k/26.2k [00:00<00:00, 5.43MB/s]







In [8]:
import json

with open('./medical-ner/Corona2.json','r') as f:
  data = json.load(f)

In [9]:
print(data['examples'][0].keys())
print('----'*10)
print(data['examples'][0]['content'])

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])
----------------------------------------
While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]

Diosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.

Racecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94

In [10]:
data['examples'][0]['annotations'][0]

{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
 'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
 'end': 371,
 'start': 360,
 'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'tag_name': 'Medicine',
 'value': 'Diosmectite',
 'correct': None,
 'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
   'annotator_id': 1,
   'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'name': 'Ashpat123',
   'reason': 'exploration'}],
 'model_annotations': []}

In [11]:
# generate training data 
training_data = []
for example in data['examples']:
  temp_dict={}
  temp_dict['text'] = example['content']
  temp_dict['entities'] = []

  for annots in example['annotations']:
    start = annots['start']
    end = annots['end']
    label = annots['tag_name'].lower()
    temp_dict['entities'].append((start, end, label))

  training_data.append(temp_dict)

print(training_data[0])

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]", 'entities': [(360, 371, 'medicine'), (383, 408, 'medicine'), (104, 112, 'medicalcondition'), (679,

In [12]:
training_data[0]['entities']

[(360, 371, 'medicine'),
 (383, 408, 'medicine'),
 (104, 112, 'medicalcondition'),
 (679, 689, 'medicine'),
 (6, 23, 'medicine'),
 (25, 37, 'medicine'),
 (461, 470, 'medicalcondition'),
 (577, 589, 'medicine'),
 (853, 865, 'medicalcondition'),
 (188, 198, 'medicine'),
 (754, 762, 'medicalcondition'),
 (870, 880, 'medicalcondition'),
 (823, 833, 'medicine'),
 (852, 853, 'medicalcondition'),
 (461, 469, 'medicalcondition'),
 (535, 543, 'medicalcondition'),
 (692, 704, 'medicine'),
 (563, 571, 'medicalcondition')]

In [13]:
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin()
doc_bin

<spacy.tokens._serialize.DocBin at 0x238670637c0>

In [14]:
from spacy.util import filter_spans 

for training_example in tqdm(training_data):
  text = training_example['text']
  labels = training_example['entities']
  doc = nlp.make_doc(text)
  ents = []

  for start, end, label in labels:
    span = doc.char_span(start, end, label=label, alignment_mode='contract')
    if span is None:
      print('Skipping entity')
    else:
      ents.append(span)

    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

  doc_bin.to_disk('train.spacy')

 16%|█▌        | 5/31 [00:00<00:00, 39.08it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 29%|██▉       | 9/31 [00:00<00:00, 31.33it/s]

Skipping entity
Skipping entity


 52%|█████▏    | 16/31 [00:00<00:00, 24.19it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 71%|███████   | 22/31 [00:00<00:00, 21.76it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 31/31 [00:01<00:00, 21.04it/s]


In [22]:
!python -m spacy init config ./base_config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [23]:

!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [24]:
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy 

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     76.71    1.05    0.56    8.96    0.01
  0     200       1254.82   4786.80   46.81   52.89   41.97    0.47
  1     400        116.92   1636.91   66.16   69.34   63.26    0.66
  2     600        165.72   1708.91   70.02   73.56   66.80    0.70
  2     800        149.38   1590.23   71.01   63.75   80.13    0.71
  3    1000        160.43   1460.76   70.31   78.29   63.80    0.70
  4    1200        170.78   1589.72   67.99   78.71   59.85    0.68
  5    1400        176.53   1603.29   65.56   83.90   53.80    0.66
  6    1600        179.21   1747.24   71.98   79.57   65.71    0.72
  7    1800        193.39   1828.87   74.95   70.82

In [25]:
nlp_ner = spacy.load("model-best")

doc = nlp_ner("While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.")

colors = {"PATHOGEN": "#F67DE3", "MEDICINE": "#7DF6D9", "MEDICALCONDITION":"#a6e22d"}
options = {"colors": colors} 

spacy.displacy.render(doc, style="ent", options= options, jupyter=True)

In [None]:
# custom NER for resume 
