In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [8]:
jsonl_path = "/kaggle/input/datasets/tunguz/pubmed-title-abstracts-2019-baseline/PUBMED_title_abstracts_2019_baseline.jsonl"


In [9]:
import json
import pandas as pd
from tqdm import tqdm


In [10]:
hypertension_keywords = [
    "hypertension",
    "high blood pressure",
    "elevated blood pressure",
    "essential hypertension",
    "arterial hypertension"
]

stroke_keywords = [
    "stroke",
    "ischemic stroke",
    "ischaemic stroke",
    "hemorrhagic stroke",
    "haemorrhagic stroke",
    "cerebrovascular accident",
    "brain ischemia",
    "cerebral infarction"
]

def is_relevant_abstract(text: str) -> bool:
    text = text.lower()
    return (
        any(k in text for k in hypertension_keywords)
        or
        any(k in text for k in stroke_keywords)
    )


In [11]:
import json

with open(jsonl_path, "r") as f:
    for i in range(3):
        line = f.readline()
        paper = json.loads(line)
        print(paper.keys())
        print(paper)
        print("-" * 80)


dict_keys(['meta', 'text'])
{'meta': {'pmid': 11409574, 'language': 'eng'}, 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age. Systematic review of the published literature. Out-patient clinics, emergency departments and hospitalisation wards in 23 health centres from 10 countries. Cohort studies reporting the frequency of hypoxaemia in children under 5 years of age with ALRI, and the association between hypoxaemia and the risk of dying. Prevalence of hypoxaemia measured in children with ARI and relative risks for the association between the severity of illness and the frequency of hypoxaemia, and between hypoxaemia and the risk of dying. Seventeen publishe

In [12]:
TARGET_COUNT = 4000 
records = []

with open(jsonl_path, "r") as f:
    for line in tqdm(f):
        if len(records) >= TARGET_COUNT:
            break

        paper = json.loads(line)

        text = paper.get("text", "")
        if not text:
            continue

        if is_relevant_abstract(text):
            records.append({
                "pmid": paper.get("meta", {}).get("pmid", ""),
                "text": text
            })

df = pd.DataFrame(records)

print("Collected abstracts:", len(df))
df.head()


139203it [00:05, 26627.16it/s]

Collected abstracts: 4000





Unnamed: 0,pmid,text
0,11409606,Family education seminars and social functioni...
1,11409639,Local angiotensin II-generating system in vasc...
2,11409640,Diurnal variation of hemodynamic indices in no...
3,11409641,Profiles of patients who control the doses of ...
4,11409643,Non-Linear trends in the blood pressure of Jap...


In [13]:
df.rename(columns={"text": "abstract"}, inplace=True)
df.to_csv("filtered_pubmed.csv", index=False)


In [14]:
import pandas as pd

df = pd.read_csv("filtered_pubmed.csv")

print(df.shape)
df.head()


(4000, 2)


Unnamed: 0,pmid,abstract
0,11409606,Family education seminars and social functioni...
1,11409639,Local angiotensin II-generating system in vasc...
2,11409640,Diurnal variation of hemodynamic indices in no...
3,11409641,Profiles of patients who control the doses of ...
4,11409643,Non-Linear trends in the blood pressure of Jap...


In [15]:
import re

def clean_text(text: str) -> str:
    text = re.sub(r"\s+", " ", text)      # normalize whitespace
    text = text.strip()
    return text.lower()

df["clean_abstract"] = df["abstract"].astype(str).apply(clean_text)

In [16]:
import nltk
nltk.download("punkt")

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
chunks = []
chunk_id = 0

MAX_TOKENS = 400   # safe middle of 300â€“500

for _, row in df.iterrows():
    pmid = row["pmid"]
    sentences = sent_tokenize(row["clean_abstract"])

    current_chunk = []
    current_len = 0

    for sent in sentences:
        sent_len = len(sent.split())

        if current_len + sent_len <= MAX_TOKENS:
            current_chunk.append(sent)
            current_len += sent_len
        else:
            chunks.append({
                "chunk_id": chunk_id,
                "pmid": pmid,
                "chunk_text": " ".join(current_chunk)
            })
            chunk_id += 1

            current_chunk = [sent]
            current_len = sent_len

    if current_chunk:
        chunks.append({
            "chunk_id": chunk_id,
            "pmid": pmid,
            "chunk_text": " ".join(current_chunk)
        })
        chunk_id += 1


In [18]:
chunks_df = pd.DataFrame(chunks)

print("Total chunks:", len(chunks_df))
chunks_df.head()

Total chunks: 4095


Unnamed: 0,chunk_id,pmid,chunk_text
0,0,11409606,family education seminars and social functioni...
1,1,11409639,local angiotensin ii-generating system in vasc...
2,2,11409640,diurnal variation of hemodynamic indices in no...
3,3,11409641,profiles of patients who control the doses of ...
4,4,11409643,non-linear trends in the blood pressure of jap...


In [19]:
chunks_df.to_csv("chunks.csv", index=False)

In [20]:
chunks_df.sample(3)


Unnamed: 0,chunk_id,pmid,chunk_text
3131,3131,12449899,cardiovascular disease costs associated with u...
410,410,11424516,"the herbal preparation, ygd capsules, signific..."
40,40,11411119,[the role of cardiac catheterization for diagn...


In [1]:
!pip install -q transformers torch


In [2]:
from transformers import pipeline

ner = pipeline(
    "ner",
    model="d4data/biomedical-ner-all",
    aggregation_strategy="simple"
)


2026-02-10 16:14:52.750433: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770740092.990459     500 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770740093.059516     500 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770740093.626032     500 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770740093.626090     500 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770740093.626094     500 computation_placer.cc:177] computation placer alr

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


In [3]:
text = """
Hypertension is a major risk factor for ischemic stroke.
ACE inhibitors are commonly prescribed to control blood pressure.
"""

entities = ner(text)
entities


[{'entity_group': 'Sign_symptom',
  'score': 0.9741545,
  'word': 'hyper',
  'start': 1,
  'end': 6},
 {'entity_group': 'Sign_symptom',
  'score': 0.455893,
  'word': '##tension',
  'start': 6,
  'end': 13},
 {'entity_group': 'Medication',
  'score': 0.9993838,
  'word': 'ace inhibitors',
  'start': 58,
  'end': 72}]

This happens because:
Hugging Face NER models are transformer-based,They use WordPiece / BPE tokenization
So "hypertension" becomes: hyper + ##tension

In [4]:
def merge_subword_entities(entities):
    merged = []
    current = None

    for ent in entities:
        word = ent["word"]

        if word.startswith("##"):
            # continuation of previous token
            if current:
                current["entity_text"] += word.replace("##", "")
        else:
            if current:
                merged.append(current)
            current = {
                "entity_text": word,
                "entity_type": ent["entity_group"]
            }

    if current:
        merged.append(current)

    return merged


###Sample NER

In [5]:
text = "Hypertension is treated with ACE inhibitors."

raw_entities = ner(text)
clean_entities = merge_subword_entities(raw_entities)

print("RAW:")
print(raw_entities)

print("\nCLEANED:")
print(clean_entities)


RAW:
[{'entity_group': 'Sign_symptom', 'score': 0.99968183, 'word': 'hyper', 'start': 0, 'end': 5}, {'entity_group': 'Sign_symptom', 'score': 0.9426377, 'word': '##tension', 'start': 5, 'end': 12}, {'entity_group': 'Medication', 'score': 0.9693132, 'word': 'ace', 'start': 29, 'end': 32}]

CLEANED:
[{'entity_text': 'hypertension', 'entity_type': 'Sign_symptom'}, {'entity_text': 'ace', 'entity_type': 'Medication'}]


In [22]:
entity_rows = []

for _, row in chunks_df.iterrows():
    raw_ents = ner(row["chunk_text"])
    clean_ents = merge_subword_entities(raw_ents)

    for ent in clean_ents:
        entity_rows.append({
            "chunk_id": row["chunk_id"],
            "pmid": row["pmid"],
            "entity_text": ent["entity_text"].lower(),
            "entity_type": ent["entity_type"]
        })

entities_df = pd.DataFrame(entity_rows)
entities_df.head()

Unnamed: 0,chunk_id,pmid,entity_text,entity_type
0,0,11409606,social functioning,Diagnostic_procedure
1,0,11409606,chronic,Detailed_description
2,0,11409606,aphas,Disease_disorder
3,0,11409606,social roles,Coreference
4,0,11409606,weeks,Duration


In [23]:
entities_df.to_csv("entities.csv", index=False)

In [24]:
import pandas as pd

chunks_df = pd.read_csv("chunks.csv")
entities_df = pd.read_csv("entities.csv")

print(chunks_df.shape)
print(entities_df.shape)
 

(4095, 3)
(138547, 4)


In [25]:
VALID_ENTITY_TYPES = [
    "Disease",
    "Sign_symptom",
    "Medication"
]

entities_df = entities_df[entities_df["entity_type"].isin(VALID_ENTITY_TYPES)]


In [26]:
RELATION_PATTERNS = [
    ("increases risk of", "INCREASES_RISK_OF"),
    ("risk factor for", "INCREASES_RISK_OF"),
    ("associated with", "ASSOCIATED_WITH"),
    ("causes", "CAUSES"),
    ("leads to", "CAUSES"),
    ("treated with", "TREATED_BY"),
    ("treatment with", "TREATED_BY"),
    ("managed with", "TREATED_BY")
]


In [27]:
chunk_entity_map = (
    entities_df
    .groupby("chunk_id")
    .apply(lambda x: list(zip(x["entity_text"], x["entity_type"])))
    .to_dict()
)


  .apply(lambda x: list(zip(x["entity_text"], x["entity_type"])))


In [28]:
triplets = []

for _, row in chunks_df.iterrows():
    chunk_id = row["chunk_id"]
    text = row["chunk_text"].lower()
    pmid = row["pmid"]

    if chunk_id not in chunk_entity_map:
        continue

    entities = chunk_entity_map[chunk_id]
    entity_texts = [e[0] for e in entities]

    for pattern, relation in RELATION_PATTERNS:
        if pattern in text and len(entity_texts) >= 2:
            head = entity_texts[0]
            tail = entity_texts[1]

            if head != tail:
                triplets.append({
                    "head": head,
                    "relation": relation,
                    "tail": tail,
                    "source_pmid": pmid
                })


In [29]:
triplets_df = pd.DataFrame(triplets).drop_duplicates()

print("Total triplets:", len(triplets_df))
triplets_df.head()


Total triplets: 1209


Unnamed: 0,head,relation,tail,source_pmid
0,hypertension,ASSOCIATED_WITH,hypertensive,11409644
1,essential hypertension,ASSOCIATED_WITH,gene polymorphism,11409653
2,genetic variation2 bradykinin,INCREASES_RISK_OF,hypertension,11409654
3,genetic variation2 bradykinin,ASSOCIATED_WITH,hypertension,11409654
4,-,TREATED_BY,hypertension,11409655


In [30]:
triplets_df.to_csv("triplets.csv", index=False)


In [31]:
triplets_df.sample(5)


Unnamed: 0,head,relation,tail,source_pmid
24,activityriuresis,ASSOCIATED_WITH,baroreflexes,11411750
171,medications,INCREASES_RISK_OF,bronchodilator medications,11434792
968,complications,TREATED_BY,morbidity,12455117
844,high,TREATED_BY,diuretics,12436155
1103,antiretl,ASSOCIATED_WITH,antiiral,30851231


In [32]:
def is_valid_entity(e):
    return (
        isinstance(e, str)
        and len(e) >= 4
        and e.isalpha()
    )

triplets_df = triplets_df[
    triplets_df["head"].apply(is_valid_entity) &
    triplets_df["tail"].apply(is_valid_entity)
]


In [33]:
GENERIC_TERMS = {"medications", "complications", "morbidity"}

triplets_df = triplets_df[
    ~triplets_df["head"].isin(GENERIC_TERMS) &
    ~triplets_df["tail"].isin(GENERIC_TERMS)
]


In [34]:
triplets_df.sample(5)


Unnamed: 0,head,relation,tail,source_pmid
941,lesions,ASSOCIATED_WITH,stenosis,12450241
925,tumor,ASSOCIATED_WITH,necrosis,12446992
63,hypertension,TREATED_BY,overweight,11416590
173,stroke,ASSOCIATED_WITH,abcixima,11434830
1118,bleeding,ASSOCIATED_WITH,varice,30852769


In [35]:
print("Total triplets:", len(triplets_df))
triplets_df.head()


Total triplets: 620


Unnamed: 0,head,relation,tail,source_pmid
0,hypertension,ASSOCIATED_WITH,hypertensive,11409644
7,overweight,ASSOCIATED_WITH,obesity,11410818
8,pulmonary,CAUSES,alve,11411115
9,mono,ASSOCIATED_WITH,pyrrolizidine,11411116
11,hyperphy,CAUSES,mutation,11411129
