## 1. Imports

In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
import pandas as pd
import numpy as np
import os
import re
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline



In [4]:
# Reading the dataset
df1 = pd.read_pickle('/content/drive/My Drive/Northwestern MS/453 - Natural Language Processing/imdb/imdb_dataset.pkl')

In [247]:
# Extract the first 10 rows from the imdb reviews
df = pd.Series(df1['review'].head(10), name="unprocessed_review")
df = df.to_frame(name='unprocessed_review')
print(df.head())
print(df.columns)

                                  unprocessed_review
0  One of the other reviewers has mentioned that ...
1  A wonderful little production. <br /><br />The...
2  I thought this was a wonderful way to spend ti...
3  Basically there's a family where a little boy ...
4  Petter Mattei's "Love in the Time of Money" is...
Index(['unprocessed_review'], dtype='object')


## 2. Pre-processing

In [248]:
# Remove extra whitespace and newline characters
def remove_extra_whitespace(text):
    text = text.replace('\n', ' ').replace('\r', ' ')
    return re.sub(r'\s+', ' ', text).strip()

# Handle contractions
def handle_contractions(text):
    contractions = {
        "there's": "there is",
        "it's": "it is",
        "can't": "cannot",
        "won't": "will not",
        "I'm": "I am",
        "he's": "he is",
        "she's": "she is",
        "they're": "they are",
        "we're": "we are",
        "you're": "you are",
        "I've": "I have",
        "you've": "you have",
        "we've": "we have",
        "they've": "they have",
        "I'd": "I would",
        "you'd": "you would",
        "he'd": "he would",
        "she'd": "she would",
        "we'd": "we would",
        "they'd": "they would",
        "I'll": "I will",
        "you'll": "you will",
        "he'll": "he will",
        "she'll": "she will",
        "we'll": "we will",
        "they'll": "they will",
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "hasn't": "has not",
        "haven't": "have not",
        "hadn't": "had not",
        "doesn't": "does not",
        "don't": "do not",
        "didn't": "did not",
        "won't": "will not",
        "wouldn't": "would not",
        "can't": "cannot",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    for contraction, expanded in contractions.items():
        text = text.replace(contraction, expanded)
    return text

In [249]:
# function to clean text
def clean(text):
      text = remove_extra_whitespace(text)
      text = handle_contractions(text)
      # removing paragraph numbers
      text = re.sub('[0-9]+.\t', '', text)
      # removing new line characters
      text = re.sub('\n', '', text)
      text = re.sub('\n', ' ', text)
      # remove apostrophes
      text = re.sub("'s", '', text)
      # replacing hyphen with blank space
      text = re.sub('-', ' ', text)
      text = re.sub('- ', '', text)
      # remove quotation marks
      text = re.sub('\"', '', text)
      # removing slautations
      text = re.sub('Mr\.', 'Mr', text)
      text = re.sub('Mrs\.', 'Mrs', text)
      text = re.sub('Ms\.', 'Ms', text)
      text = re.sub('Dr\.', 'Dr', text)
      # remove whitespaces
      text = text.strip()
      #removing reference to outside text
      text = re.sub('[\(\[].*?[\)\])]', '', text)
      # removing extra spaces
      text = re.sub(' +', ' ', text)
      # remove square brackets
      text = re.sub('\[.*?\]', '', text)
      # remove <> and any text between < and >
      text = re.sub('<.*?>', '', text)
      # remove special characters
      text = re.sub('[^A-Za-z0-9]+', ' ', text)
      return text

In [250]:
# function to split text into sentences
def sentences(text):
      text = re.split('[.?!]', text)
      clean_sent = []
      for sent in text:
        if sent != '':
          clean_sent.append(sent)
      return clean_sent

In [251]:
df['sentences'] = df['unprocessed_review'].apply(clean).apply(sentences)

In [252]:
df.head()

Unnamed: 0,unprocessed_review,sentences
0,One of the other reviewers has mentioned that ...,[One of the other reviewers has mentioned that...
1,A wonderful little production. <br /><br />The...,[A wonderful little production The filming tec...
2,I thought this was a wonderful way to spend ti...,[I thought this was a wonderful way to spend t...
3,Basically there's a family where a little boy ...,[Basically there is a family where a little bo...
4,"Petter Mattei's ""Love in the Time of Money"" is...",[Petter Mattei Love in the Time of Money is a ...


In [253]:
df.sentences[0]

['One of the other reviewers has mentioned that after watching just 1 Oz episode you will be hooked They are right as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the word It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to many Aryans Muslims gangstas Latinos Christians Italians Irish and more so scuffles death stares dodgy dealings and shady agreements are never far away I would say the main appeal of the show is due to the fact that it goes where other shows would not dare Forget pr

In [254]:
# Concatenate list of sentences into a single string
df['sentences'] = df['sentences'].apply(lambda x: ' '.join(x))

## 3. Extracting Entities and Relations with Spacy

In [255]:
# Function to extract entities from text
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [256]:
# Define custom rules for relation extraction
def extract_relations(doc):
    relations = set()  # Use a set to avoid duplicates
    # Build a map of entities to their tokens
    ent_map = {ent: [tok for tok in ent] for ent in doc.ents}

    for ent1 in doc.ents:
        head1 = ent1.root.head  # Get the syntactic head of the first entity
        for ent2 in doc.ents:
            if ent1 == ent2 or (ent2.text, ent1.text) in relations:
                continue  # Skip self relations and already captured in reverse

            head2 = ent2.root.head  # Get the syntactic head of the second entity
            # Entities are related if their heads are the same, or one is a child of the other's head
            if head1 == head2 or ent2.root in head1.children or ent1.root in head2.children:
                relations.add((ent1.text, ent2.text))
            elif head1 in ent_map[ent2] or head2 in ent_map[ent1]:
                relations.add((ent1.text, ent2.text))
            else:
                # Capture relations within a close proximity window
                distance = min(abs(tok1.i - tok2.i) for tok1 in ent_map[ent1] for tok2 in ent_map[ent2])
                if distance <= 5:
                    relations.add((ent1.text, ent2.text))

    return list(relations)  # Convert set back to list for output


In [257]:
# Apply the function to the DataFrame and create a new column
df['spacy_entities'] = df['sentences'].apply(extract_entities)

# Apply the relation extraction function to the DataFrame and create a new column
df['spacy_relations'] = df['sentences'].apply(lambda x: extract_relations(nlp(x)))

In [258]:
# Visualize dependency parsing for the first sentence (optional)
displacy.render(nlp(df['sentences'][0]), style="dep", jupyter=True, options={'distance': 100})

In [259]:
df.spacy_entities[2] # validate output

[('summer weekend', 'DATE'),
 ('Match Point 2 Risk Addiction', 'ORG'),
 ('Woody Allen', 'PERSON'),
 ('one', 'CARDINAL'),
 ('Woody', 'NORP'),
 ('years', 'DATE'),
 ('Scarlet Johanson', 'ORG'),
 ('Devil Wears Prada', 'ORG'),
 ('Superman', 'GPE')]

In [260]:
df.spacy_relations[2] # validate output

[('Devil Wears Prada', 'Superman'),
 ('Woody', 'years'),
 ('one', 'years'),
 ('one', 'Woody')]

In [261]:
df.head() # validate output

Unnamed: 0,unprocessed_review,sentences,spacy_entities,spacy_relations
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...,"[(One, CARDINAL), (just 1 Oz, PERCENT), (GO Tr...","[(Em City, Aryans Muslims), (Latinos Christian..."
1,A wonderful little production. <br /><br />The...,A wonderful little production The filming tech...,"[(BBC, ORG), (Michael Sheen, PERSON), (William...","[(Orton, Halliwell)]"
2,I thought this was a wonderful way to spend ti...,I thought this was a wonderful way to spend ti...,"[(summer weekend, DATE), (Match Point 2 Risk A...","[(Devil Wears Prada, Superman), (Woody, years)..."
3,Basically there's a family where a little boy ...,Basically there is a family where a little boy...,"[(Rambo, PERSON), (first, ORDINAL), (BOOGEYMAN...","[(3, 10)]"
4,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei Love in the Time of Money is a v...,"[(Petter Mattei Love, ORG), (Mr Mattei, PERSON...","[(Mr Mattei, Steve Buscemi Rosario), (Steve Bu..."


## 3. Extracting Entities with Transformers and Relations with Spacy

In [262]:
# Load the pre-trained model and tokenizer from Hugging Face
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a pipeline for named entity recognition
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [263]:
# Function to extract entities using transformers
def extract_entities_transformers(sentence):
    ner_results = ner_pipeline(sentence)
    entities = [(result['word'], result['entity']) for result in ner_results]
    return entities

In [264]:
# Apply the function to the DataFrame and create a new column
df['transformer_entities'] = df['sentences'].apply(extract_entities_transformers)

In [266]:
def extract_relations_spacy(doc, entities):
    relations = []
    visited = set()

    entity_spans = []
    for ent in doc.ents:
        entity_spans.append(ent)

    for ent1 in entity_spans:
        for ent2 in entity_spans:
            if ent1 == ent2 or (ent1, ent2) in visited or (ent2, ent1) in visited:
                continue
            visited.add((ent1, ent2))

            if ent1.root.head == ent2.root.head:
                relations.append((ent1.text, ent1.root.head.lemma_, ent2.text, ent1.label_, ent2.label_))
            elif ent1.root.head == ent2.root or ent1.root == ent2.root.head:
                relations.append((ent1.text, ent1.root.head.lemma_ + ' ' + ent2.root.head.lemma_, ent2.text, ent1.label_, ent2.label_))
            elif ent2 in ent1.subtree or ent1 in ent2.subtree:
                relations.append((ent1.text, "related_to", ent2.text, ent1.label_, ent2.label_))
            else:
                for token in doc:
                    if token.head == ent1.root and token.dep_ in ("prep", "agent", "pobj", "dobj") and token.head == ent2.root:
                        relations.append((ent1.text, token.lemma_, ent2.text, ent1.label_, ent2.label_))
                    elif token.head == ent2.root and token.dep_ in ("prep", "agent", "pobj", "dobj") and token.head == ent1.root:
                        relations.append((ent1.text, token.lemma_, ent2.text, ent1.label_, ent2.label_))
                    elif token.head == ent1.root or token.head == ent2.root:
                        if token.head.head == ent1.root or token.head.head == ent2.root:
                            relations.append((ent1.text, token.lemma_, ent2.text, ent1.label_, ent2.label_))

    return relations

In [267]:
# Apply the relation extraction function to each sentence
def extract_relations_from_entities(row):
    sentence = row['sentences']
    entities = row['transformer_entities']
    doc = nlp(sentence)
    return extract_relations_spacy(doc, entities)

In [268]:
# Apply relation extraction to the dataframe
df['trans_relations'] = df.apply(extract_relations_from_entities, axis=1)

In [292]:
df['transformer_entities'][0]

[('Oz', 'I-MISC'),
 ('Oz', 'I-MISC'),
 ('O', 'I-ORG'),
 ('##Z', 'I-ORG'),
 ('Oswald', 'I-LOC'),
 ('Pen', 'I-ORG'),
 ('##ite', 'I-ORG'),
 ('##ntary', 'I-ORG'),
 ('Emerald', 'I-LOC'),
 ('City', 'I-LOC'),
 ('Em', 'I-LOC'),
 ('City', 'I-LOC'),
 ('A', 'I-MISC'),
 ('##ryan', 'I-MISC'),
 ('Muslims', 'I-MISC'),
 ('Latino', 'I-MISC'),
 ('Christians', 'I-MISC'),
 ('Italians', 'I-MISC'),
 ('Irish', 'I-MISC'),
 ('O', 'I-ORG'),
 ('##Z', 'I-ORG'),
 ('Oz', 'I-MISC'),
 ('Oz', 'I-MISC')]

In [293]:
df['trans_relations'][0]

[('Em City', 'be', 'Italians', 'GPE', 'NORP'),
 ('Aryans Muslims', 'gangsta', 'Latinos Christians', 'NORP', 'NORP'),
 ('Italians', 'be Italians', 'Irish', 'NORP', 'NORP')]

## 4. Cleaning Output

In [269]:
# Display the DataFrame with relations
df['trans_relations'][0]

[('Em City', 'be', 'Italians', 'GPE', 'NORP'),
 ('Aryans Muslims', 'gangsta', 'Latinos Christians', 'NORP', 'NORP'),
 ('Italians', 'be Italians', 'Irish', 'NORP', 'NORP')]

In [270]:
df['spacy_relations'][0]

[('Em City', 'Aryans Muslims'),
 ('Latinos Christians', 'Italians'),
 ('Em City', 'Italians'),
 ('Aryans Muslims', 'Irish'),
 ('Italians', 'Irish'),
 ('Aryans Muslims', 'Latinos Christians'),
 ('the Oswald Maximum Security State Penitentary It', 'Emerald City'),
 ('Aryans Muslims', 'Italians'),
 ('Latinos Christians', 'Irish')]

In [271]:
df['spacy_entities'][0]

[('One', 'CARDINAL'),
 ('just 1 Oz', 'PERCENT'),
 ('GO Trust', 'ORG'),
 ('the Oswald Maximum Security State Penitentary It', 'ORG'),
 ('Emerald City', 'GPE'),
 ('Em City', 'GPE'),
 ('Aryans Muslims', 'NORP'),
 ('Latinos Christians', 'NORP'),
 ('Italians', 'NORP'),
 ('Irish', 'NORP'),
 ('first', 'ORDINAL')]

In [272]:
df['transformer_entities'][0]

[('Oz', 'I-MISC'),
 ('Oz', 'I-MISC'),
 ('O', 'I-ORG'),
 ('##Z', 'I-ORG'),
 ('Oswald', 'I-LOC'),
 ('Pen', 'I-ORG'),
 ('##ite', 'I-ORG'),
 ('##ntary', 'I-ORG'),
 ('Emerald', 'I-LOC'),
 ('City', 'I-LOC'),
 ('Em', 'I-LOC'),
 ('City', 'I-LOC'),
 ('A', 'I-MISC'),
 ('##ryan', 'I-MISC'),
 ('Muslims', 'I-MISC'),
 ('Latino', 'I-MISC'),
 ('Christians', 'I-MISC'),
 ('Italians', 'I-MISC'),
 ('Irish', 'I-MISC'),
 ('O', 'I-ORG'),
 ('##Z', 'I-ORG'),
 ('Oz', 'I-MISC'),
 ('Oz', 'I-MISC')]

The outputs, including entities and relations, from SpaCy and Transformers+SpaCy are in different formats and contain duplicates. It is crucial to clean these outputs to ensure accurate evaluation and comparison.

In [273]:
def clean_entities(entities):
    cleaned_entities = []
    seen = set()
    for entity in entities:
        if len(entity) == 4:
            word, entity_type, start, end = entity
        else:
            word, entity_type = entity
            start, end = None, None  # Placeholder values if start and end are not provided

        # Remove sub-token markers
        word = word.replace("##", "")
        # Combine sub-tokens
        if cleaned_entities and cleaned_entities[-1][1] == entity_type and cleaned_entities[-1][3] == start:
            prev_word, prev_type, prev_start, prev_end = cleaned_entities.pop()
            word = prev_word + word
            start = prev_start

        if (word, entity_type) not in seen:
            cleaned_entities.append((word, entity_type, start, end))
            seen.add((word, entity_type))

    return cleaned_entities

In [274]:
def remove_duplicate_entities(entities):
    seen = set()
    unique_entities = []
    for entity in entities:
        if entity not in seen:
            unique_entities.append(entity)
            seen.add(entity)
    return unique_entities

In [275]:
def remove_duplicate_relations(relations):
    seen = set()
    unique_relations = []
    for relation in relations:
        if relation not in seen:
            unique_relations.append(relation)
            seen.add(relation)
    return unique_relations

In [276]:
def clean_transformer_entities(entities):
    cleaned_entities = []
    buffer = ""
    entity_type = None

    for entity in entities:
        if len(entity) == 4:
            word, entity_type, start, end = entity
        else:
            word, entity_type = entity
            start, end = None, None

        # Remove sub-token markers and merge tokens
        if word.startswith("##"):
            buffer += word[2:]
        else:
            if buffer:
                cleaned_entities.append((buffer, entity_type, start, end))
                buffer = word
            else:
                buffer = word

        # Append the buffered word if it is the end of an entity
        if word != buffer:
            cleaned_entities.append((buffer, entity_type, start, end))
            buffer = ""

    # Append the last buffered entity
    if buffer:
        cleaned_entities.append((buffer, entity_type, start, end))

    # Align entity types with SpaCy
    aligned_entities = []
    for word, entity_type, start, end in cleaned_entities:
        if entity_type == "I-MISC":
            aligned_entities.append((word, "MISC", start, end))
        elif entity_type == "I-ORG":
            aligned_entities.append((word, "ORG", start, end))
        elif entity_type == "I-PER":
            aligned_entities.append((word, "PERSON", start, end))
        elif entity_type == "I-LOC":
            aligned_entities.append((word, "GPE", start, end))
        else:
            aligned_entities.append((word, entity_type, start, end))

    return aligned_entities



In [277]:
# Clean transformer entities with improved function
df['cleaned_transformer_entities'] = df['transformer_entities'].apply(clean_transformer_entities)

# Remove duplicates from both SpaCy and cleaned transformer entities
df['cleaned_spacy_entities'] = df['spacy_entities'].apply(remove_duplicate_entities)
df['cleaned_transformer_entities'] = df['cleaned_transformer_entities'].apply(remove_duplicate_entities)

# Remove duplicate relations
df['cleaned_spacy_relations'] = df['spacy_relations'].apply(remove_duplicate_relations)
df['cleaned_transformer_relations'] = df['trans_relations'].apply(remove_duplicate_relations)


In [278]:
# Display the cleaned DataFrame
print(df[['sentences', 'cleaned_spacy_entities', 'cleaned_transformer_entities', 'cleaned_spacy_relations', 'cleaned_transformer_relations']].head(1))


                                           sentences  \
0  One of the other reviewers has mentioned that ...   

                              cleaned_spacy_entities  \
0  [(One, CARDINAL), (just 1 Oz, PERCENT), (GO Tr...   

                        cleaned_transformer_entities  \
0  [(Oz, MISC, None, None), (Oz, ORG, None, None)...   

                             cleaned_spacy_relations  \
0  [(Em City, Aryans Muslims), (Latinos Christian...   

                       cleaned_transformer_relations  
0  [(Em City, be, Italians, GPE, NORP), (Aryans M...  


In [279]:
df.cleaned_spacy_entities[0]

[('One', 'CARDINAL'),
 ('just 1 Oz', 'PERCENT'),
 ('GO Trust', 'ORG'),
 ('the Oswald Maximum Security State Penitentary It', 'ORG'),
 ('Emerald City', 'GPE'),
 ('Em City', 'GPE'),
 ('Aryans Muslims', 'NORP'),
 ('Latinos Christians', 'NORP'),
 ('Italians', 'NORP'),
 ('Irish', 'NORP'),
 ('first', 'ORDINAL')]

In [280]:
df.cleaned_transformer_entities[0]

[('Oz', 'MISC', None, None),
 ('Oz', 'ORG', None, None),
 ('OZ', 'ORG', None, None),
 ('Oswald', 'ORG', None, None),
 ('Penite', 'ORG', None, None),
 ('ntary', 'ORG', None, None),
 ('Emerald', 'GPE', None, None),
 ('City', 'GPE', None, None),
 ('Em', 'GPE', None, None),
 ('City', 'MISC', None, None),
 ('Aryan', 'MISC', None, None),
 ('Muslims', 'MISC', None, None),
 ('Latino', 'MISC', None, None),
 ('Christians', 'MISC', None, None),
 ('Italians', 'MISC', None, None),
 ('Irish', 'ORG', None, None)]

In [281]:
df.cleaned_spacy_relations[0]

[('Em City', 'Aryans Muslims'),
 ('Latinos Christians', 'Italians'),
 ('Em City', 'Italians'),
 ('Aryans Muslims', 'Irish'),
 ('Italians', 'Irish'),
 ('Aryans Muslims', 'Latinos Christians'),
 ('the Oswald Maximum Security State Penitentary It', 'Emerald City'),
 ('Aryans Muslims', 'Italians'),
 ('Latinos Christians', 'Irish')]

In [282]:
df.cleaned_transformer_relations[0]

[('Em City', 'be', 'Italians', 'GPE', 'NORP'),
 ('Aryans Muslims', 'gangsta', 'Latinos Christians', 'NORP', 'NORP'),
 ('Italians', 'be Italians', 'Irish', 'NORP', 'NORP')]

## 5. Evaluate Performance - Spacy vs Transformers

In [283]:
# Define evaluation functions
def evaluate_entities(true_entities, pred_entities):
    true_set = set((ent[0], ent[1]) for ent in true_entities)
    pred_set = set((ent[0], ent[1]) for ent in pred_entities)

    print(f"Spacy Entities: {true_set}")
    print(f"Transformer Entities: {pred_set}")

    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1

In [284]:
def evaluate_relations(true_relations, pred_relations):
    true_set = set(true_relations)
    pred_set = set(pred_relations)

    print(f"Spacy Relations: {true_set}")
    print(f"Transformer Relations: {pred_set}")

    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1

In [285]:
# Evaluate performances
def evaluate_method(df, pred_entity_col, pred_relation_col, gold_entity_col, gold_relation_col):
    entity_precisions, entity_recalls, entity_f1s = [], [], []
    relation_precisions, relation_recalls, relation_f1s = [], [], []

    for _, row in df.iterrows():
        # Evaluate entities
        true_entities = row[gold_entity_col]
        pred_entities = row[pred_entity_col]
        print(f"Text: {row['sentences']}")
        p, r, f1 = evaluate_entities(true_entities, pred_entities)
        entity_precisions.append(p)
        entity_recalls.append(r)
        entity_f1s.append(f1)

        # Evaluate relations
        true_relations = row[gold_relation_col]
        pred_relations = row[pred_relation_col]
        p, r, f1 = evaluate_relations(true_relations, pred_relations)
        relation_precisions.append(p)
        relation_recalls.append(r)
        relation_f1s.append(f1)

    # Calculate average scores
    avg_entity_precision = sum(entity_precisions) / len(entity_precisions) if entity_precisions else 0
    avg_entity_recall = sum(entity_recalls) / len(entity_recalls) if entity_recalls else 0
    avg_entity_f1 = sum(entity_f1s) / len(entity_f1s) if entity_f1s else 0

    avg_relation_precision = sum(relation_precisions) / len(relation_precisions) if relation_precisions else 0
    avg_relation_recall = sum(relation_recalls) / len(relation_recalls) if relation_recalls else 0
    avg_relation_f1 = sum(relation_f1s) / len(relation_f1s) if relation_f1s else 0

    return {
        "entity_precision": avg_entity_precision,
        "entity_recall": avg_entity_recall,
        "entity_f1": avg_entity_f1,
        "relation_precision": avg_relation_precision,
        "relation_recall": avg_relation_recall,
        "relation_f1": avg_relation_f1
    }

In [286]:
# Use cleaned entities and relations
df['gold_entities'] = df['cleaned_spacy_entities']
df['gold_relations'] = df['cleaned_spacy_relations']

In [287]:
# Evaluate Transformers + SpaCy
transformer_results = evaluate_method(df, 'cleaned_transformer_entities', 'cleaned_transformer_relations', 'gold_entities', 'gold_relations')

print("Transformers+SpaCy Results:", transformer_results)

Text: One of the other reviewers has mentioned that after watching just 1 Oz episode you will be hooked They are right as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the word It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to many Aryans Muslims gangstas Latinos Christians Italians Irish and more so scuffles death stares dodgy dealings and shady agreements are never far away I would say the main appeal of the show is due to the fact that it goes where other shows would not dare Forge