In [14]:
import pandas as pd
from datasets import load_dataset
from gliner import GLiNER

In [15]:


# Path to your dataset file
file_path = 'sample-1M.jsonl'

# Read only the first 100 lines to save memory
df = pd.read_json(file_path, lines=True, nrows=100)

print(f"Successfully loaded {len(df)} articles.")
if 'title' in df.columns:
    print("Sample Article Title:", df.loc[0, 'title'])
else:
    print("No 'title' column in the dataset")

# Display the first few rows
# print(df.head())

df.head()

Successfully loaded 100 articles.
Sample Article Title: Worcester breakfast club for veterans gives hunger its marching orders


Unnamed: 0,id,content,title,media-type,source,published
0,f7ca322d-c3e8-40d2-841f-9d7250ac72ca,VETERANS saluted Worcester's first ever breakf...,Worcester breakfast club for veterans gives hu...,News,Redditch Advertiser,2015-09-07T10:16:14Z
1,609772bc-0672-4db5-8516-4c025cfd54ca,New Product Gives Marketers Access to Real Key...,Jumpshot Gives Marketers Renewed Visibility In...,News,Virtualization Conference & Expo,2015-09-17T15:00:00Z
2,1aa9d1b0-e6ba-4a48-ad0c-66552d896aac,Home »\rStyle » The Return Of The Nike Air Max...,The Return Of The Nike Air Max Sensation Has 8...,Blog,Streets Connect,2015-09-22T22:54:37Z
3,719699f9-47be-4bc7-969b-b53a881c95ae,NYMag.com Daily Intelligencer Vulture The Cut ...,This New Dating App Will Ruin Your Internet Game,Blog,The Cut,2015-09-16T23:12:11Z
4,a080f99a-07d9-47d1-8244-26a540017b7a,"KUALA LUMPUR, Sept 15 (MySinchew) -- The Kuala...",Pay up or face legal action: DBKL,News,My Sinchew,2015-09-15T10:17:53Z


In [17]:

# import pandas as pd

# ---------------------------------------------------------
# 1. Initialize Stanza Pipeline
# ---------------------------------------------------------
# 'processors': We only need tokenization and NER (Named Entity Recognition).
# 'use_gpu': Set to True if you have a GPU, otherwise False.
print("Initializing Stanza (downloading model if needed)...")
stanza.download('en', processors='tokenize,ner')
nlp = stanza.Pipeline('en', processors='tokenize,ner', use_gpu=False, verbose=False)

# ---------------------------------------------------------
# 2. Define Entity Extraction Function
# ---------------------------------------------------------
def get_entities_stanza(text):
    """
    Extracts specific entity types (PER, ORG, LOC, GPE) from text using Stanza.
    """
    print("1")
    if not isinstance(text, str) or not text.strip():
        return []
    
    # Process text with Stanza
    print("Process text with NLP")
    doc = nlp(text)
    
    # Filter for relevant entity types
    # PERSON: People, including fictional.
    # ORG: Companies, agencies, institutions.
    # GPE: Countries, cities, states.
    # LOC: Non-GPE locations, mountain ranges, bodies of water.
    relevant_types = {'PERSON', 'ORG', 'GPE', 'LOC'}
    
    entities = []
    # Stanza processes text sentence by sentence
    print("Stanza processes text sentence by sentence")
    i = 1
    for sent in doc.sentences:
        # print("Sentence Number ", i)
        # print("Sentence : ", sent)
        for ent in sent.ents:
            # print("Entities :", ent)
            if ent.type in relevant_types:
                entities.append((ent.text, ent.type))
                # print("Type : ", ent.type)
            # print("---------")
        i = i + 1
    print("Entities :", entities)
    return entities

# ---------------------------------------------------------
# 3. Process the DataFrame
# ---------------------------------------------------------
print("Extracting entities from dataframe...")

# Create a combined text column (Title + Content) for better context
# Using .fillna('') to handle potential missing values
print("If content in df.columns")
if 'content' in df.columns:
    df['full_text'] = df['title'].fillna('') + ". " + df['content'].fillna('')
else:
    df['full_text'] = df['title'].fillna('')

# print(df['full_text'])

# Apply the extraction function (This may take a minute for 100 rows)
df['extracted_entities'] = df['full_text'].apply(get_entities_stanza)

# ---------------------------------------------------------
# 4. Display Results
# ---------------------------------------------------------
print("\n" + "="*60)
print("STANZA NER EXTRACTION RESULTS")
print("="*60)

# Display the first 5 results
for index, row in df.head(5).iterrows():
    print(f"Article ID: {index}")
    print(f"Title: {row.get('title', 'No Title')}")
    print(f"Found Entities: {row['extracted_entities']}")
    print("-" * 60)

Initializing Stanza (downloading model if needed)...
Extracting entities from dataframe...
If content in df.columns
1
Process text with NLP
Stanza processes text sentence by sentence
Entities : [('Worcester', 'GPE'), ('VETERANS', 'ORG'), ("Worcester's", 'ORG'), ('The Worcester Breakfast Club', 'ORG'), ('HM Forces Veterans', 'ORG'), ('Dave Carney', 'PERSON'), ('Merrimans Hill', 'GPE'), ('Worcester', 'GPE'), ('Droitwich', 'GPE'), ('Carney', 'PERSON'), ('the Royal Engineers', 'ORG'), ('Bromsgrove', 'GPE'), ('Gloucester', 'GPE'), ('Derek Hardman', 'PERSON'), ('Hull', 'GPE'), ('Andy Wilson', 'PERSON'), ('Newcastle', 'GPE'), ('Germany', 'GPE'), ('Carney', 'PERSON'), ('Royal British Legion', 'ORG'), ('The Postal Order', 'ORG'), ('the Postal Order', 'ORG')]
1
Process text with NLP
Stanza processes text sentence by sentence
Entities : [('SAN FRANCISCO', 'GPE'), ('CA', 'GPE'), ('Marketwired', 'ORG'), ('Jumpshot', 'ORG'), ('Jumpshot', 'ORG'), ('Deren Baker', 'PERSON'), ('Jumpshot', 'ORG'), ('Jump

KeyboardInterrupt: 