# spaCy

In [25]:
import json
import traceback
import re
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
import locale

In [None]:

def get_entites_spacy(file_name):
    """Extract NER training entities from a Label Studio JSON export for spaCy fine-tuning.

    Only results whose ``type`` is ``'labels'`` are used. The category is the
    second ``-``-separated part of the result's ``from_name``; ``Soc``, ``Env``
    and ``Gov`` are mapped to ``Social``, ``Environmental`` and ``Governance``.
    Results whose category is ``General`` are skipped.

    If a result cannot be processed (missing key, unknown category, ...), the
    result and the traceback are printed and the remaining results of that
    annotation are not processed.

    Args:
        file_name (str): JSON File location.

    Returns:
        list: One dict per exported item, in the format
            ``{'text': text, 'entities': [(start, end, label), ...]}``.
    """
    esg_map = {'Soc': 'Social', 'Env': 'Environmental', 'Gov': 'Governance'}
    documents = []

    # The file is parsed as one JSON document (not line by line).
    with open(file_name, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    for item in data:
        text = item['data']['text']
        entities = []
        for entry in item['annotations']:
            for res in entry['result']:
                try:
                    if res['type'] != 'labels':
                        continue
                    value = res['value']
                    ner = res['from_name'].split('-')[1]
                    if ner == 'General':
                        continue
                    label = esg_map[ner]
                    entities.append((value['start'], value['end'], label))
                except Exception:
                    # Best effort: report the offending result and give up on the
                    # rest of this annotation instead of aborting the whole file.
                    print(res)
                    traceback.print_exc()
                    break

        documents.append({'text': text, 'entities': entities})
    return documents

In [None]:
# import extract_data as ex

# Concatenate the documents extracted from both JSON files, in file order.
documents = [
    document
    for export_path in ('chev_data.json', 'chev_data2.json')
    for document in get_entites_spacy(export_path)
]

In [1]:
# Read explicitly as UTF-8: without an encoding argument, open() falls back to the
# locale's preferred encoding, which is not guaranteed to be UTF-8.
with open('./ner_data_spacy.txt', 'r', encoding='utf-8') as f:
    data = json.load(f)


def entity_boundaries(data):
    """Locate ESG entity values inside their texts and return character offsets.

    For every record, each entity labelled ``Environmental``, ``Social`` or
    ``Governance`` is searched for in the record's text; the first occurrence
    (if any) is recorded. Entities with other labels, and values that do not
    occur in the text, are ignored.

    Args:
        data (list): Records with a ``'text'`` string and an ``'entities'`` list
            of dicts holding ``'entity'`` (the label) and ``'value'`` (the
            entity text).

    Returns:
        list: ``{'text': text, 'entities': [(start, end, label), ...]}`` dicts,
            one for each record in which at least one entity was located.
    """
    esg_labels = ('Environmental', 'Social', 'Governance')
    lst_data = []
    for record in data:
        text = record['text']
        ent_w_loc = []

        for entity in record['entities']:
            label = entity['entity']
            if label not in esg_labels:
                continue
            # The value is searched for as literal text, not as a regex pattern,
            # so characters such as "(", ")", "." or "+" are matched verbatim and
            # an unbalanced parenthesis cannot raise re.error.
            value = entity['value']
            start = text.find(value)
            if start != -1:
                ent_w_loc.append((start, start + len(value), label))

        if ent_w_loc:
            lst_data.append({'text': text, 'entities': ent_w_loc})

    return lst_data

# Resolve entity offsets for the loaded records; the cell's output is the number kept.
lst_data = entity_boundaries(data)
len(lst_data)

1240

In [3]:
# Empty DocBin, spaCy's serializable container for Doc objects.
doc_bin = DocBin()
# Blank English pipeline.
nlp = spacy.blank("en")

In [5]:
training_data = documents

# Build the training split in a DocBin of its own. Starting from an empty container
# keeps this cell idempotent (re-running it cannot append the same documents twice)
# and keeps the training documents out of any container used for another split.
train_doc_bin = DocBin()
for training_example in tqdm(training_data[:1050]):  # first 1050 documents -> training split
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # char_span returns None when the character offsets cannot be aligned to
        # tokens; such annotations are reported and left out.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # filter_spans resolves overlapping spans before they are assigned to doc.ents.
    doc.ents = filter_spans(ents)
    train_doc_bin.add(doc)

train_doc_bin.to_disk("train.spacy")

 61%|██████    | 637/1050 [00:00<00:00, 2144.50it/s]

Skipping entity
Skipping entity


100%|██████████| 1050/1050 [00:00<00:00, 2285.84it/s]


In [6]:
training_data = documents

# Build the validation split in a DocBin of its own. Adding these documents to the
# container that was already filled and written out as train.spacy would copy every
# training document into val.spacy and leak training data into evaluation.
val_doc_bin = DocBin()
for validation_example in tqdm(training_data[1050:]):  # everything after the first 1050 documents
    text = validation_example['text']
    labels = validation_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # char_span returns None when the character offsets cannot be aligned to
        # tokens; such annotations are reported and left out.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # filter_spans resolves overlapping spans before they are assigned to doc.ents.
    doc.ents = filter_spans(ents)
    val_doc_bin.add(doc)

val_doc_bin.to_disk("val.spacy")

100%|██████████| 190/190 [00:00<00:00, 4131.21it/s]


In [7]:
# Report the locale's preferred text encoding.
preferred_encoding = locale.getpreferredencoding()
print(preferred_encoding)

ANSI_X3.4-1968


In [8]:
# Force UTF-8 as the reported preferred encoding. The replacement keeps the optional
# ``do_setlocale`` parameter of the function it replaces, so callers that pass it
# (e.g. ``locale.getpreferredencoding(False)``) keep working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [24]:
!pip install spacy-transformers -qq

In [10]:
# Expand the partial ./base_config.cfg into a complete training config, saved as config.cfg.
!python -m spacy init fill-config ./base_config.cfg config.cfg 

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
# Download and install the en_core_web_lg pipeline package.
# NOTE(review): the training log in this notebook lists the pipeline as
# ['transformer', 'ner'] - confirm that config.cfg actually references
# en_core_web_lg; if it does not, this ~588 MB download can be dropped.
!python3 -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [13]:
# Train with config.cfg on GPU 0, reading train.spacy / val.spacy and writing the
# results to ./models_05_02.
!python -m spacy train config.cfg --gpu-id 0 --output ./models_05_02 --paths.train ./train.spacy --paths.dev ./val.spacy 

[38;5;2m✔ Created output directory: models_05_02[0m
[38;5;4mℹ Saving to output directory: models_05_02[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-05-03 22:40:09,929] [INFO] Set up nlp object from config
[2023-05-03 22:40:09,947] [INFO] Pipeline: ['transformer', 'ner']
[2023-05-03 22:40:09,951] [INFO] Created vocabulary
[2023-05-03 22:40:09,954] [INFO] Finished initializing nlp object
Downloading (…)lve/main/config.json: 100% 481/481 [00:00<00:00, 2.75MB/s]
Downloading (…)olve/main/vocab.json: 100% 899k/899k [00:00<00:00, 17.5MB/s]
Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 146MB/s]
Downloading (…)/main/tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 57.2MB/s]
Downloading pytorch_model.bin: 100% 501M/501M [00:05<00:00, 86.6MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.

In [None]:
#Final model uploaded to Huggingface
!pip install https://huggingface.co/msr10/en_esg_ner/resolve/main/en_esg_ner-any-py3-none-any.whl

# Using spacy.load().
import spacy
nlp = spacy.load("en_esg_ner")

In [None]:
# Sample sentence to run through the loaded pipeline.
sample_text = "These concerns reflect a converging consensus by policymakers, investors, and companies on growing climate risk, the need to limit global temperature increase to 1.5° C (net zero global greenhouse gas (GHG) emissions by 2050),5 and the impact of such actions to companies."
doc = nlp(sample_text)

# One highlight colour per entity label.
colors = {
    "Social": "#F67DE3",
    "Governance": "#7DF6D9",
    "Environmental": "#a6e22d",
}
options = dict(colors=colors)

# Render the recognised entities inline in the notebook.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)