Introduction To NLP @ Esade BAIB

# Named Entity Recognition with GLiNER

## Preliminaries

In [1]:
import pandas as pd
# Notebook-wide pandas display settings: cap how many rows/columns are rendered,
# and allow wide output and long cell contents (up to 300 characters per cell).
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 300)
pd.set_option('display.max_colwidth', 300)
from tqdm import tqdm

In [2]:
# Load the movie reviews CSV (path is relative to the working directory).
# Later cells read the `text` column of this frame.
reviews_df = pd.read_csv('movie_reviews_train.csv')

----

# GLiNER

https://github.com/urchade/GLiNER

## Preliminaries

In [None]:
# Use the %pip magic (not `! pip`) so the package is installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install gliner

In [None]:
from gliner import GLiNER

In [None]:
# Load the pretrained "urchade/gliner_medium" checkpoint; `model` is used as a
# notebook-level global by run_gliner_on_text below.
model = GLiNER.from_pretrained("urchade/gliner_medium")
# NOTE(review): presumably switches the underlying torch module to inference mode — confirm.
model.eval()
# Marker that the model loaded without raising.
print("ok")

## Running GLiNER

In [None]:
def run_gliner_on_text(text, labels = None, print_outputs: bool = True, threshold: float = 0.4):
    """Run the notebook-level GLiNER ``model`` on one text and return its mentions.

    Args:
        text: The string to tag.
        labels: Entity type names to look for. When ``None``, a default list of
            eight labels (person, book, movie, location, date, actor,
            character, device) is used.
        print_outputs: When true, the mentions are also printed as a markdown
            table (via ``DataFrame.to_markdown``).
        threshold: Forwarded as ``threshold=`` to ``model.predict_entities``
            (presumably the minimum score for a mention to be kept — confirm
            against the GLiNER docs). Defaults to 0.4, the value that used to
            be hard-coded here.

    Returns:
        Whatever ``model.predict_entities`` returns; the aggregation helpers in
        this notebook treat it as a list of dicts with ``'text'`` and
        ``'label'`` keys.
    """
    if labels is None:
        # Built on every call so callers can never mutate a shared default list.
        labels = ["person", "book", "movie", "location", "date", "actor", "character", "device"]
    mentions = model.predict_entities(text, labels, threshold=threshold)
    if print_outputs:
        print(pd.DataFrame(mentions).to_markdown())
    return mentions


In [None]:
# Smoke test on a hand-written sentence. The function prints the mention table
# itself; binding the return value to `_` keeps the cell from echoing it as well.
_ = run_gliner_on_text("I went to Barcelona and watched Rambo II on my iPad, I like all movies with Stallone!")

## Aggregating GLiNER entities

In [None]:
def gliner_entity_to_dict(entity_dict, doc_label):
    """Turn one GLiNER mention into a flat record for the aggregation frame.

    Args:
        entity_dict: Mapping with at least the keys ``'text'`` and ``'label'``.
        doc_label: Label of the document the mention came from; it becomes a
            key of the record with value 1 (an indicator column).

    Returns:
        A dict with the raw ``text``, a normalised ``lemma`` (lower-cased,
        surrounding whitespace removed), the ``ner_type`` and the indicator.
    """
    surface = entity_dict['text']
    record = {}
    record['text'] = surface
    record['lemma'] = surface.lower().strip()
    record['ner_type'] = entity_dict['label']
    # Indicator for the originating document's label.
    record[doc_label] = 1
    return record

def entity_aggregation_for_gliner(gliner_docs, doc_labels=None, dedupe_within_doc: bool = False):
    """Aggregate GLiNER mentions from many documents into per-entity counts.

    Mentions are grouped by ``(lemma, ner_type)``, where the lemma is the
    lower-cased, stripped mention text produced by ``gliner_entity_to_dict``.

    Args:
        gliner_docs: Sequence with one iterable of mention dicts per document.
        doc_labels: Optional sequence with one label per document, same length
            as ``gliner_docs``. Defaults to ``'unlabeled'`` for every document.
        dedupe_within_doc: When False (the default, and the historical
            behaviour) every mention is counted, so an entity repeated inside
            one document contributes more than once to ``num_docs`` and
            ``num_<label>``. When True, each ``(lemma, ner_type)`` pair is
            counted at most once per document, so the columns are true
            document counts.

    Returns:
        DataFrame with columns ``lemma``, ``ner_type``, ``num_docs`` and one
        ``num_<label>`` column per distinct label (in order of first
        appearance), sorted by ``num_docs`` descending. If no mentions were
        found, an empty frame with those columns is returned.

    Raises:
        ValueError: If ``doc_labels`` is given and its length differs from
            ``gliner_docs``.
    """
    if doc_labels is None:
        doc_labels = ['unlabeled'] * len(gliner_docs)
    elif len(doc_labels) != len(gliner_docs):
        raise ValueError("Number of doc labels must be the equal to number of gliner docs")

    # Ordered de-duplication: unlike set(), this keeps the num_<label> column
    # order stable between runs.
    unique_labels = list(dict.fromkeys(doc_labels))

    records = []
    for doc, label in zip(gliner_docs, doc_labels):
        seen_in_doc = set()
        for entity_dict in doc:
            record = gliner_entity_to_dict(entity_dict, label)
            if dedupe_within_doc:
                key = (record['lemma'], record['ner_type'])
                if key in seen_in_doc:
                    continue
                seen_in_doc.add(key)
            records.append(record)

    agg_dict = {
        'num_docs': pd.NamedAgg('text', 'count')
    }
    for l in unique_labels:
        agg_dict[f'num_{l}'] = pd.NamedAgg(l, 'sum')

    if not records:
        # Without any mention the frame would have no 'lemma'/'ner_type'
        # columns and the groupby below would raise a KeyError.
        return pd.DataFrame(columns=['lemma', 'ner_type', *agg_dict])

    df = pd.DataFrame(records).fillna(0)
    for l in unique_labels:
        if l not in df.columns:
            # A label whose documents produced no mentions has no indicator
            # column yet; add it so the 'sum' aggregation does not fail.
            df[l] = 0
        # fillna() leaves the 0/1 indicators as floats; keep counts integral.
        df[l] = df[l].astype(int)

    agg = (
        df.groupby(['lemma', 'ner_type'], as_index=False)
        .agg(**agg_dict)
        # Stable sort so entities with equal counts keep a deterministic order.
        .sort_values('num_docs', ascending=False, kind='stable')
    )
    return agg

In [None]:
# Tag the first 5000 reviews (tqdm shows progress). Each element of docs_gliner
# is the return value of run_gliner_on_text for one review; table printing is
# switched off to avoid flooding the output.
docs_gliner = [run_gliner_on_text(t, print_outputs=False) for t in tqdm(reviews_df.text[:5000])]

In [None]:
# Aggregate all mentions by (lemma, ner_type). No doc labels are passed, so every
# review is counted under the default 'unlabeled' label.
agg_gliner = entity_aggregation_for_gliner(docs_gliner)
agg_gliner

In [None]:
# Show only the aggregated entities that were tagged with the "actor" label.
agg_gliner[agg_gliner.ner_type=="actor"]