# Importing necessary Libraries

In [None]:
import spacy
import random
import pandas as pd
from spacy.util import minibatch, compounding
from spacy.training.example import Example
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm

# load a new spacy model

In [None]:
# Start from an empty English pipeline created with spacy.blank (no pretrained model is loaded).
nlp = spacy.blank("en") # load a new spacy model

# Load the blank English language model in spaCy

In [None]:
# Create a blank English pipeline with spacy.blank.
# NOTE: this rebinds `nlp`, so any pipeline object previously bound to that
# name is discarded and the cells below operate on this fresh instance.
nlp = spacy.blank("en")

# Create a new entity type for your custom NER

In [None]:
# Create the NER component and register it on the pipeline in a single step.
# nlp.add_pipe() returns the component it added, so labels added to `ner`
# afterwards land on the component the pipeline actually trains and runs.
# (nlp.create_pipe() only builds a detached component that is never part of
# `nlp`, so labels added to it would not reach the pipeline's own "ner" pipe.)
ner = nlp.add_pipe("ner")
ner


<spacy.pipeline.ner.EntityRecognizer at 0x7f4ab38f5ee0>

# Adding the Labels

In [None]:
# Register every entity label the custom NER component should predict.
for entity_label in ("CODE", "MODULE_TITLE", "SEMESTER", "PERIOD", "CREDITS", "GRADE"):
    ner.add_label(entity_label)

1

# Loading the dataset

In [None]:
# TRAIN_DATA collects the (text, annotations) training examples; it starts empty.
# `df` holds the annotated transcript rows read from the CSV file.
TRAIN_DATA = []
df = pd.read_csv("Custom NER dataset module titles transcript(AutoRecovered).csv")


# Define the Entities

In [None]:
# Build one training example per CSV row: the text is the six fields joined by
# single spaces, and each field's character span is recorded as an entity.
# (CSV column, entity label) pairs, in the order they appear in the text.
FIELD_LABELS = [
    ("Code", "CODE"),
    ("Module Title", "MODULE_TITLE"),
    ("Semester", "SEMESTER"),
    ("Period", "PERIOD"),
    ("Credits", "CREDITS"),
    ("Grade", "GRADE"),
]
FIELD_COLUMNS = [column for column, _ in FIELD_LABELS]

# Rebuild the list from scratch so re-running this cell never appends
# duplicate examples to a list left over from a previous run.
TRAIN_DATA = []
for index, row in df.iterrows():
    # Skip incomplete rows: str(NaN) would otherwise be labelled as the
    # literal text "nan".
    if row[FIELD_COLUMNS].isna().any():
        continue

    # Strip surrounding whitespace so no entity span starts or ends on a space.
    values = [str(row[column]).strip() for column in FIELD_COLUMNS]
    if not all(values):
        # An empty field would yield a zero-length entity span.
        continue

    text = " ".join(values)

    entities = []
    current_pos = 0
    for value, (_, label) in zip(values, FIELD_LABELS):
        entities.append((current_pos, current_pos + len(value), label))
        current_pos += len(value) + 1  # +1 for the separating space

    TRAIN_DATA.append((text, {"entities": entities}))

# Disable other pipeline components to focus on NER

In [None]:
# Pipes to keep active, and the names of every other pipe currently in `nlp`.
# NOTE(review): `unaffected_pipes` is only computed here; the cells visible in
# this notebook never pass it to anything that disables those pipes.
pipe_exceptions = ["ner"]
unaffected_pipes = list(
    filter(lambda pipe_name: pipe_name not in pipe_exceptions, nlp.pipe_names)
)

# Train the NER model

In [None]:
# Train the NER model
n_iter = 10


def _to_examples(samples):
    """Convert (text, annotations) pairs into spaCy Example objects."""
    return [
        Example.from_dict(nlp.make_doc(sample_text), sample_annotations)
        for sample_text, sample_annotations in samples
    ]


if not TRAIN_DATA:
    raise ValueError("TRAIN_DATA is empty; build the training examples before training.")

# nlp.initialize() replaces the deprecated nlp.begin_training(). Passing the
# training examples lets the NER component infer its labels from the data.
optimizer = nlp.initialize(lambda: _to_examples(TRAIN_DATA))

for epoch in range(n_iter):
    random.shuffle(TRAIN_DATA)
    losses = {}  # accumulated by nlp.update() over all batches of this epoch
    # Batch size grows from 4 towards 32 by a factor of 1.001 per batch.
    for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
        nlp.update(_to_examples(batch), sgd=optimizer, losses=losses)
    print("Losses:", losses)




Losses: {'ner': 1080.9641413251313}
Losses: {'ner': 81.69892219707003}
Losses: {'ner': 26.13685030728511}
Losses: {'ner': 32.97153806471643}
Losses: {'ner': 2.39907462387285}
Losses: {'ner': 22.977387887101763}
Losses: {'ner': 4.599581692964621e-06}
Losses: {'ner': 1.9882955234192774e-08}
Losses: {'ner': 5.26225924263131e-09}
Losses: {'ner': 2.2673915734306062e-07}


# Save the trained model

In [None]:
# Save the trained pipeline so it can be reloaded later with spacy.load().
nlp.to_disk('IdentifyingModulenamesandgrades')

# Serialize the annotated training docs. An empty DocBin would write a
# "train.spacy" file that contains no documents at all.
doc_bin = DocBin()  # create a DocBin object
for sample_text, sample_annotations in tqdm(TRAIN_DATA):
    train_doc = nlp.make_doc(sample_text)
    spans = []
    for start, end, label in sample_annotations["entities"]:
        # char_span returns None when the offsets do not align with token
        # boundaries; such spans cannot be stored as entities, so skip them.
        span = train_doc.char_span(start, end, label=label)
        if span is not None:
            spans.append(span)
    train_doc.ents = spans
    doc_bin.add(train_doc)
doc_bin.to_disk("train.spacy")  # save the docbin object

# Loading the custom NER model and applying it

In [None]:
# Load the saved custom NER pipeline back from disk as `nlp_ner`.
nlp_ner = spacy.load('IdentifyingModulenamesandgrades')

# Read the whole text file into a single string.
with open("extracted_sections.txt", "r") as file:
    text = file.read()

# Process the document with the reloaded pipeline (`nlp_ner`), not the
# in-memory `nlp` object, so this cell really exercises the model saved on disk.
doc = nlp_ner(text)

# Print every predicted entity with its label.
print("Entities:")
for ent in doc.ents:
    print(ent.text, ent.label_)

Entities:
IT1010 CODE
Introduction to Programming 1 Apr - 2019 4 A
IT1020 Introduction to Computer Systems 1 Apr - 2019 4 C+
IT1030 Mathematics for Computing 1 Apr - 2019 4 B
IT1040 Communication Skills MODULE_TITLE
1 Apr - 2019 3 PERIOD
A- GRADE
IT1050 CODE
Object Oriented Concepts MODULE_TITLE
2 SEMESTER
Oct - 2019 PERIOD
2 CREDITS
A GRADE
IT1060 CODE
Software Process Modeling MODULE_TITLE
2 SEMESTER
Oct - 2019 PERIOD
3 CREDITS
A GRADE
IT1080 CODE
English for Academic Purposes MODULE_TITLE
2 SEMESTER
Oct - 2019 PERIOD
3 CREDITS
B GRADE
IT1090 CODE
Information Systems & Data Modeling MODULE_TITLE
2 SEMESTER
Oct - 2019 4 PERIOD
A GRADE
IT1100 CODE
Internet & Web Technclogies MODULE_TITLE
2 SEMESTER
Oct - 2019 4 PERIOD
A- GRADE
IT2020 CODE
Software Engineering MODULE_TITLE
1 Apr - 2020 4 PERIOD
A GRADE
IT2030 CODE
Object Oriented Programming 1 . Jun - 2020 4 A
IT2040 Database Management Systems 1 Apr - 2020 4 A
IT2050 Computer Networks MODULE_TITLE
1 Jun - 2020 PERIOD
4 CREDITS
A GRADE


# Saving the identified module titles and grades to a CSV file

In [None]:
import pandas as pd

def test_ner_model(file_path, output_file):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    modules = []
    grades = []

    print("Testing NER on file:", file_path)
    print("Entities:")
    for line in lines:
        doc = nlp_ner(line.strip())
        module_title = None
        grade = None
        for ent in doc.ents:
            if ent.label_ == "MODULE_TITLE":
                module_title = ent.text
            elif ent.label_ == "GRADE":
                grade = ent.text

        if module_title and grade:  # Only append if both module title and grade exist
            modules.append(module_title)
            grades.append(grade)

    # Create a DataFrame with modules and grades
    data = {"Module Title": modules, "Grade": grades}
    df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    print("Saved extracted data to", output_file)

# Input text file to scan and destination CSV for the extracted pairs.
file_path, output_file = "extracted_sections.txt", "extracted_data.csv"
test_ner_model(file_path=file_path, output_file=output_file)


Testing NER on file: extracted_sections.txt
Entities:
Saved extracted data to extracted_data.csv
