In [1]:
import pandas as pd
from gliner import GLiNER
from tqdm import tqdm

In [2]:


# Path to your dataset file
file_path = 'sample-1M.jsonl'

# Read only the first 100 lines to save memory
df = pd.read_json(file_path, lines=True, nrows=100000)

print(f"Successfully loaded {len(df)} articles.")
if 'title' in df.columns:
    print("Sample Article Title:", df.loc[0, 'title'])
else:
    print("No 'title' column in the dataset")

# Display the first few rows
# print(df.head())

df.head()

Successfully loaded 100000 articles.
Sample Article Title: Worcester breakfast club for veterans gives hunger its marching orders


Unnamed: 0,id,content,title,media-type,source,published
0,f7ca322d-c3e8-40d2-841f-9d7250ac72ca,VETERANS saluted Worcester's first ever breakf...,Worcester breakfast club for veterans gives hu...,News,Redditch Advertiser,2015-09-07T10:16:14Z
1,609772bc-0672-4db5-8516-4c025cfd54ca,New Product Gives Marketers Access to Real Key...,Jumpshot Gives Marketers Renewed Visibility In...,News,Virtualization Conference & Expo,2015-09-17T15:00:00Z
2,1aa9d1b0-e6ba-4a48-ad0c-66552d896aac,Home »\rStyle » The Return Of The Nike Air Max...,The Return Of The Nike Air Max Sensation Has 8...,Blog,Streets Connect,2015-09-22T22:54:37Z
3,719699f9-47be-4bc7-969b-b53a881c95ae,NYMag.com Daily Intelligencer Vulture The Cut ...,This New Dating App Will Ruin Your Internet Game,Blog,The Cut,2015-09-16T23:12:11Z
4,a080f99a-07d9-47d1-8244-26a540017b7a,"KUALA LUMPUR, Sept 15 (MySinchew) -- The Kuala...",Pay up or face legal action: DBKL,News,My Sinchew,2015-09-15T10:17:53Z


In [3]:
model = GLiNER.from_pretrained("urchade/gliner_base")

labels = [
    # Core OntoNotes
    "person", "norp", "facility", "organization", "gpe", "location",
    "product", "event", "work_of_art", "law", "language",
    "date", "time", "percent", "money", "quantity", "ordinal", "cardinal",

    # Common extensions
    "religion",
    "political_party",
    "nationality",
    "ethnic_group",
    "title",
    "award",
    "disease",
    "chemical",
    "weapon",
    "vehicle",
    "currency",
    "brand"
]

# # Run NER on titles
# def run_ner(text):
#     if pd.isna(text):
#         return []
#     return model.predict_entities(text, labels)

# df["entities"] = df["title"].apply(run_ner)
batch_size = 32
texts = df["title"].fillna("").astype(str).tolist()
all_entities = []

for i in tqdm(range(0, len(texts), batch_size)):
    print(f"Processing starting index: {i} | Remaining: {len(texts) - i}")

    batch = texts[i:i+batch_size]

    preds = model.run(
        batch,
        labels,
        batch_size=len(batch)  # important
    )

    all_entities.extend(preds)

df["entities"] = all_entities

# Show result
df[["title", "entities"]].head()

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|                                                                       | 0/3125 [00:00<?, ?it/s]


Processing starting index: 0 | Remaining: 100000


TypeError: expected string or bytes-like object

In [None]:
output_path = "ner_result_100k.json"

df[["title", "entities"]].to_json(
    output_path,
    orient="records",
    indent=2,
    force_ascii=False
)

print(f"NER results saved to {output_path}")