In [21]:
import json
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
from spacy import displacy
import os

In [8]:

# Directory containing spaCy JSON files
data_directory = '/home/hp/Documents/Mini_Project/Labelled/spacy'


In [9]:
# Collect all training data from .spacy files
all_training_data = []
for filename in os.listdir(data_directory):
    if filename.endswith('.spacy'):
        file_path = os.path.join(data_directory, filename)
        all_training_data.append(file_path)

In [10]:
# Debugging: Print the number of .spacy files
print(f"Number of training files: {len(all_training_data)}")
if all_training_data:
    print(f"Sample training file: {all_training_data[:2]}")


Number of training files: 47
Sample training file: ['/home/hp/Documents/Mini_Project/Labelled/spacy/74.spacy', '/home/hp/Documents/Mini_Project/Labelled/spacy/84.spacy']


In [11]:
# Create a blank spaCy model
nlp = spacy.blank("en")

# Create the NER component and add it to the pipeline
ner = nlp.add_pipe("ner")

In [13]:
# Load data from .spacy files and add new labels to the NER component
for file_path in all_training_data:
    doc_bin = spacy.tokens.DocBin().from_disk(file_path)
    for doc in doc_bin.get_docs(nlp.vocab):
        for ent in doc.ents:
            ner.add_label(ent.label_)

# Debugging: Print the labels added to the NER component
print("Entity labels in the training data:", ner.labels)


Entity labels in the training data: ()


In [14]:
# Disable other pipeline components (if any)
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [15]:
# Training the NER model
with nlp.disable_pipes(*unaffected_pipes):
    optimizer = nlp.begin_training()
    for iteration in range(20):  # Increase iterations if needed
        random.shuffle(all_training_data)
        losses = {}
        for file_path in all_training_data:
            doc_bin = spacy.tokens.DocBin().from_disk(file_path)
            examples = [Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in doc_bin.get_docs(nlp.vocab)]
            for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.001)):
                nlp.update(batch, drop=0.5, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

Iteration 1, Losses: {}
Iteration 2, Losses: {}
Iteration 3, Losses: {}
Iteration 4, Losses: {}
Iteration 5, Losses: {}
Iteration 6, Losses: {}
Iteration 7, Losses: {}
Iteration 8, Losses: {}
Iteration 9, Losses: {}
Iteration 10, Losses: {}
Iteration 11, Losses: {}
Iteration 12, Losses: {}
Iteration 13, Losses: {}
Iteration 14, Losses: {}
Iteration 15, Losses: {}
Iteration 16, Losses: {}
Iteration 17, Losses: {}
Iteration 18, Losses: {}
Iteration 19, Losses: {}
Iteration 20, Losses: {}


In [16]:
# Save the trained model
nlp.to_disk("trained_model")

In [17]:
# Load the trained model
model_path = "trained_model"
nlp = spacy.load(model_path)

In [18]:
## Function to read the content of a text file
def read_text_file(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return ""


In [19]:
# Function to test and visualize the model on the content of a text file
def test_and_visualize_text_file(nlp, filepath):
    text = read_text_file(filepath)
    if text:
        doc = nlp(text)
        if doc.ents:
            displacy.render(doc, style="ent", jupyter=True)  # Visualize entities in Jupyter notebook
        else:
            print("No entities found in the document.")
            for token in doc:
                print(f"{token.text}: {token.ent_iob_}, {token.ent_type_}")
    else:
        print("No text found to process.")

In [20]:
# Path to the text file you want to test
text_file_path = '/home/hp/Documents/Mini_Project/dataset/IN-Abs/test-data/judgement/78.txt'

# Test and visualize the model on the text file
test_and_visualize_text_file(nlp, text_file_path)

No entities found in the document.
N: O, 
:: O, 
Criminal: O, 
Appeal: O, 
No: O, 
.: O, 
8: O, 
of: O, 
1951: O, 
.: O, 

: O, 
Appeal: O, 
from: O, 
an: O, 
Order: O, 
of: O, 
the: O, 
High: O, 
Court: O, 
of: O, 
Bombay: O, 
(: O, 
Bavdekar: O, 
and: O, 
Chainani: O, 
JJ: O, 
.: O, 
): O, 
dated: O, 
20th: O, 
February: O, 
,: O, 
1950: O, 
,: O, 
in: O, 
Criminal: O, 
Appeal: O, 
No: O, 
.: O, 
106: O, 
of: O, 
1950: O, 
arising: O, 
out: O, 
of: O, 
an: O, 
order: O, 
dated: O, 
9th: O, 
January: O, 
,: O, 
1950: O, 
,: O, 
of: O, 
the: O, 
Presidency: O, 
Magistrate: O, 
,: O, 
19th: O, 
Court: O, 
,: O, 
Esplanade: O, 
,: O, 
Bombay: O, 
,: O, 
in: O, 
Case: O, 
No: O, 
.: O, 
10879: O, 
/: O, 
P: O, 
of: O, 
1949: O, 
.: O, 

: O, 
The: O, 
facts: O, 
are: O, 
stated: O, 
in: O, 
the: O, 
judgment: O, 
.: O, 

: O, 
Iswarlal: O, 
C.: O, 
Dalai: O, 
and: O, 
R.B.: O, 
Dalai: O, 
,: O, 
for: O, 
the: O, 
appellant: O, 
.: O, 

: O, 
C.K.: O, 
Daphtary: O, 
,: O, 
Solicitor: O, 
G