# Bangla NER
Bangla Named Entity Recognition (NER) using spaCy. The model is trained on a single entity class, PER (PERSON); its main purpose is to extract person names from a text string.

In [None]:
# Use the %pip magic so the packages are installed into the environment of the running kernel.
%pip install spacy
%pip install spacy-transformers



Write the base config file to the working directory. For more information, please see the spaCy training documentation.

In [None]:
%%writefile base_config.cfg
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null
vectors = null
[system]
gpu_allocator = "pytorch"

[nlp]
lang = "bn"
pipeline = ["transformer","ner"]
batch_size = 256

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "csebuetnlp/banglabert"
tokenizer_config = {"use_fast": true}
# max_length = 512

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 256
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256

[initialize]
vectors = ${paths.vectors}

Writing base_config.cfg


Convert the base config into a complete training config file.

In [None]:
# Expand the partial base_config.cfg into config.cfg with all default settings filled in.
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

2023-12-08 09:05:01.836139: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-08 09:05:01.836199: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-08 09:05:01.836235: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-08 09:05:01.843977: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 09:05:04.315105: I tensorflow/compiler/

BILOU and IOB (also written BIO) are two similar tagging formats for NER data.

BILOU data format:

    B = Begin

    L = Last

    I = Inside

    O = Outside

    U = Unit (single-token entity)

IOB data format:

    I = Inside

    O = Outside

    B = Begin

In [None]:
# Explicit %pip magic: does not rely on automagic and installs into the running kernel's environment.
%pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=4fa4e0c8a6bbe578aaeda5e117ee16fc78591c2e2ef845bae0b4b242b8f82bc7
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


Download the data from Hugging Face. For more information, see the [dataset page](https://huggingface.co/datasets/saiful9379/BanglaNER_V1.0).

In [None]:
import os
import wget


def download_data_form_huggingface(url, output_dir):
    """
    Download one file with the ``wget`` package.

    Parameters:
        - url (str): Address of the file to fetch.
        - output_dir (str): Destination handed to ``wget.download`` as ``out``.

    Returns:
        None
    """
    destination = output_dir
    wget.download(url, out=destination)

# Local directory that the dataset files are downloaded into; created if missing.
data = "data"
os.makedirs(data, exist_ok=True)

In [None]:
# Direct-download URLs for the train and validation JSONL files of the BanglaNER_V1.0 dataset.
train_data = "https://huggingface.co/datasets/saiful9379/BanglaNER_V1.0/resolve/main/train.jsonl?download=true"
val_data = "https://huggingface.co/datasets/saiful9379/BanglaNER_V1.0/resolve/main/val.jsonl?download=true"

In [None]:
# Fetch the training file into the data directory.
download_data_form_huggingface(train_data, data)

In [None]:
# Fetch the validation file into the data directory.
download_data_form_huggingface(val_data, data)

# Convert doccano format to spaCy format

In [None]:
import os
import json
import spacy
from tqdm import tqdm
from spacy.tokens import DocBin

# When True, convert_spacy_format renders every converted doc with displaCy in the notebook.
VISUALIZATION_STATUS = False


def save_jsonl(data, output_dir):
    """
    Write every item of ``data`` to a JSON Lines file, one JSON value per line.

    Parameters:
        - data (list): Items to serialise; each one must be JSON-serialisable.
        - output_dir (str): Path of the file to write. Despite the name this
          is a file path, and the file is overwritten if it already exists.

    Returns:
        None
    """
    with open(output_dir, mode="w", encoding="utf-8") as sink:
        for record in data:
            # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes.
            json.dump(record, sink, ensure_ascii=False)
            sink.write("\n")

def read_jsonl_file(file_path):
    """
    Load a JSON Lines file into a list of decoded JSON values.

    Parameters:
        - file_path (str): Path of the JSON Lines file to read.

    Returns:
        - data (list): One decoded value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly (the same encoding save_jsonl writes) so the
    # result does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank and whitespace-only lines carry no record; skip them instead
        # of letting json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]



def convert_spacy_format(data, output_path="data/unkonwn.spacy"):
    """
    Convert (text, annotations) pairs to spaCy Docs and save them as a DocBin.

    Every entity is given as character offsets. When the offsets do not line
    up with token boundaries, they are widened outwards to the nearest space
    on each side and alignment is retried once. An entity that still cannot
    be aligned is counted as skipped and the remaining entities of that text
    are not processed; the text is still stored with the entities collected
    so far. A text that raises during conversion is counted as skipped and is
    not stored.

    Parameters:
        - data (list): List of (text, {"entities": [(start, end, label), ...]}) tuples.
        - output_path (str): The path to save the spaCy DocBin file.

    Returns:
        None
    """
    nlp = spacy.blank("bn")  # blank "bn" pipeline, used here only to tokenize via make_doc
    db = DocBin()
    number_of_skip_entity, processed_line = 0, 0
    for text, annot in tqdm(data):
        try:
            doc = nlp.make_doc(text)
            ents = []
            for start, end, label in annot["entities"]:
                span = doc.char_span(start, end, label=label, alignment_mode="strict")
                if span is None:
                    # Offsets cut through a token: extend the end forward and the
                    # start backward to the nearest space, then try once more.
                    s = doc.text
                    end = end + len(s[end:].split(" ", 1)[0])
                    start = start - len(s[:start].rsplit(" ", 1)[-1])
                    span = doc.char_span(start, end, label=label, alignment_mode="strict")
                if span is None:
                    number_of_skip_entity += 1
                    # NOTE(review): this stops processing the rest of this text's
                    # entities (kept from the original); confirm that is intended.
                    break
                # A span that aligned on the retry is kept as well; previously the
                # widened span was computed and then silently discarded.
                processed_line += 1
                ents.append(span)
            doc.ents = ents  # may raise, e.g. if spaCy rejects the span set (handled below)
            if VISUALIZATION_STATUS:
                spacy.displacy.render(doc, style="ent", jupyter=True)
            db.add(doc)
        except Exception:
            # Best-effort conversion: count the failing text and move on, but let
            # KeyboardInterrupt / SystemExit propagate (a bare except hid those).
            number_of_skip_entity += 1
    db.to_disk(output_path)

    print(f" Spacy Processed file   : {output_path}")
    print(f" No. of Processed line : {processed_line}")
    print(f" No. of Skip Entity  : {number_of_skip_entity}")



def data_convert_spacy_format(file_path):
    """
    Read a JSON Lines annotation file and keep only the records that have labels.

    Parameters:
        - file_path (str): The path to the JSON Lines file.

    Returns:
        - training_data (list): (text, {"entities": label_list}) tuples, one
          for each record whose 'label' list is non-empty.
    """
    training_data = []
    for record in read_jsonl_file(file_path):
        text = record['text']
        entities = record['label']
        if len(entities) == 0:
            # Records without any annotated entity are left out.
            continue
        training_data.append((text, {"entities": entities}))
    return training_data




import glob
input_dir = "./data"
output_dir = "./data"

# Convert every .jsonl file in input_dir into a .spacy DocBin in output_dir.
# sorted() makes the processing order deterministic across runs.
jsonl_files = sorted(glob.glob(input_dir+"/*.jsonl"))
for jsonl_file in jsonl_files:
    file_name = os.path.basename(jsonl_file).split(".")[0]+".spacy"
    # Use a dedicated name here: the download cell binds the global `data` to the
    # download directory, and rebinding it to a list would break re-running that cell.
    annotated_examples = data_convert_spacy_format(jsonl_file)
    convert_spacy_format(annotated_examples, output_path=os.path.join(output_dir, file_name))

100%|██████████| 4483/4483 [00:02<00:00, 2161.33it/s]


 Spacy Processed file   : ./data/train.spacy
 No. of Processed line : 4447
 No. of Skip Entity  : 11


100%|██████████| 1161/1161 [00:00<00:00, 1256.12it/s]


 Spacy Processed file   : ./data/val.spacy
 No. of Processed line : 1146
 No. of Skip Entity  : 3


# Training BanglaNER model

In [None]:
# Evaluate on the held-out validation file (val.spacy), not on the training data,
# so the reported scores and the saved model-best reflect unseen text.
! python -m spacy train config.cfg \
    --gpu-id 0 \
    --output ./models/bangla_ner_model \
    --paths.train ./data/train.spacy \
    --paths.dev ./data/val.spacy

2023-12-08 09:28:36.176569: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-08 09:28:36.176625: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-08 09:28:36.176659: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;2m✔ Created output directory: models/bangla_ner_model[0m
[38;5;4mℹ Saving to output directory: models/bangla_ner_model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
tokenizer_config.json: 100% 119/119 [00:00<00:00, 631kB/s]
config.json: 100% 586/586 [00:00<00:00, 4.10MB/s]
vocab.txt: 100% 528k/528k [00:00<00:00, 2.66MB/s]
special_tokens_map.json: 100% 112/112 

# Inference

In [None]:
# pip install -U spacy
import spacy

# Load the best checkpoint from the training output directory (the Bangla transformer + NER pipeline)
nlp = spacy.load("models/bangla_ner_model/model-best")

text_list = [
    "আব্দুর রহিম নামের কাস্টমারকে একশ টাকা বাকি দিলাম",
    "নতুন বছরে জ্বলছেন আরও একজন—রজার ফেদেরার ।",
    "ডিপিডিসির স্পেশাল টাস্কফোর্সের প্রধান মুনীর চৌধুরী জানান",
    "তিনি মোহাম্মদ বাকির আল-সদর এর ছাত্র ছিলেন।",
    "লিশ ট্র্যাক তৈরির সময় বেশ কয়েকজন শিল্পীর দ্বারা অনুপ্রাণিত হওয়ার কথা স্মরণ করেন, বিশেষ করে ফ্রাঙ্ক সিনাত্রা ।",
]
# Run the loaded pipeline on each sample sentence and print every entity it predicts.
for text in text_list:
    doc = nlp(text)
    print(f"Input: {text}")
    for entity in doc.ents:
        print(f"Entity: {entity.text}, Label: {entity.label_}")
    print("---")


Input: আব্দুর রহিম নামের কাস্টমারকে একশ টাকা বাকি দিলাম
Entity: আব্দুর রহিম, Label: PER
---
Input: নতুন বছরে জ্বলছেন আরও একজন—রজার ফেদেরার ।
---
Input: ডিপিডিসির স্পেশাল টাস্কফোর্সের প্রধান মুনীর চৌধুরী জানান
Entity: মুনীর চৌধুরী, Label: PER
---
Input: তিনি মোহাম্মদ বাকির আল-সদর এর ছাত্র ছিলেন।
Entity: মোহাম্মদ বাকির আল-সদর, Label: PER
---
Input: লিশ ট্র্যাক তৈরির সময় বেশ কয়েকজন শিল্পীর দ্বারা অনুপ্রাণিত হওয়ার কথা স্মরণ করেন, বিশেষ করে ফ্রাঙ্ক সিনাত্রা ।
Entity: ফ্রাঙ্ক সিনাত্রা, Label: PER
---
