In [2]:
pip install habanero

Defaulting to user installation because normal site-packages is not writeable
Collecting urllib3==2.2.0 (from habanero)
  Using cached urllib3-2.2.0-py3-none-any.whl.metadata (6.4 kB)
Using cached urllib3-2.2.0-py3-none-any.whl (120 kB)
[0mInstalling collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.16
    Uninstalling urllib3-1.26.16:
      Successfully uninstalled urllib3-1.26.16
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
botocore 1.37.10 requires urllib3!=2.2.0,<3,>=1.25.4; python_version >= "3.10", but you have urllib3 2.2.0 which is incompatible.
crossrefapi 1.6.0 requires urllib3==1.26.16, but you have urllib3 2.2.0 which is incompatible.
evidently 0.4.40 requires cryptography>=43.0.1, but you have cryptography 41.0.7 which is incompatible.
evidently 0.4.40 requires requests>=2.32.0, but y

In [1]:
from habanero import Crossref

cr = Crossref()

# Lookup metadata from DOI
result = cr.works(ids="10.1016/j.jep.2020.112743")

print(result['message']['title'])
print(result['message']['published-print']['date-parts'][0][0])  # Year
print(result['message']['author'][0]['family'])  # First author last name


["Neuroprotective potential of Ayahuasca and untargeted metabolomics analyses: applicability to Parkinson's disease"]
2020
Katchborian-Neto


In [2]:
pip install crossrefapi pyvis PyMuPDF scispacy spacy networkx matplotlib

Defaulting to user installation because normal site-packages is not writeable
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# lineage_lit_spider.py

import os
import re
import requests
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from scholarly import scholarly
from pyvis.network import Network
from crossrefapi import Crossref
from PyPDF2 import PdfReader


ModuleNotFoundError: No module named 'crossrefapi'

In [2]:
# lineage_spider.ipynb

# Cell 1: Imports
import os
import requests
import pandas as pd
import json
from habanero import Crossref
from PyPDF2 import PdfReader
from collections import defaultdict
from pyvis.network import Network

# Cell 2: Initialization
cr = Crossref(mailto="nick.laskowski@sensorium.bio")

base_folder = 'literature_lineage_results'
os.makedirs(base_folder, exist_ok=True)

# Cell 3: Extract PDF Text
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    pages = [page.extract_text() for page in reader.pages]
    return pages

# Cell 4: Keyword Hit Extraction
def find_keyword_contexts(pages, keywords):
    hits = []
    for page_num, text in enumerate(pages):
        for keyword in keywords:
            if keyword.lower() in text.lower():
                hits.append({'keyword': keyword, 'page': page_num+1, 'evidence': text})
    return hits

# Cell 5: CrossRef Metadata
def get_metadata(doi):
    result = cr.works(ids=doi)
    if result:
        return {
            'title': result['message']['title'][0] if 'title' in result['message'] else 'N/A',
            'year': result['message']['issued']['date-parts'][0][0] if 'issued' in result['message'] else 'N/A',
            'references': result['message'].get('reference', [])
        }
    return {'title': 'N/A', 'year': 'N/A', 'references': []}

# Cell 6: Download PDF
def download_pdf(doi, layer):
    url = f'https://doi.org/{doi}'
    response = requests.get(url, allow_redirects=True)
    file_path = f'{base_folder}/layer_{layer}/{doi.replace("/", "_")}.pdf'
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(response.content)
    return file_path

# Cell 7: Main Spidering Function
def run_lineage_spider(start_doi, keywords, max_layers=2):
    lineage = []
    seen = set()

    def spider(doi, layer, source_doi='N/A'):
        if doi in seen or layer > max_layers:
            return
        seen.add(doi)

        meta = get_metadata(doi)
        try:
            pdf_path = download_pdf(doi, layer)
            pages = extract_text_from_pdf(pdf_path)
            hits = find_keyword_contexts(pages, keywords)
        except Exception as e:
            print(f'Failed for DOI {doi}: {e}')
            hits = []

        for hit in hits:
            lineage.append({
                'Layer': layer,
                'DOI': doi,
                'Title': meta['title'],
                'Year': meta['year'],
                'Source_DOI': source_doi,
                'Keyword': hit['keyword'],
                'Page': hit['page'],
                'Evidence': hit['evidence']
            })

        for ref in meta['references']:
            if 'DOI' in ref:
                spider(ref['DOI'], layer+1, doi)

    spider(start_doi, 1)

    df = pd.DataFrame(lineage)
    df.to_csv(f'{base_folder}/lineage_output.csv', index=False)

    return df

# Cell 8: Visualization
def build_network(df):
    net = Network(height='800px', width='100%')

    for idx, row in df.iterrows():
        net.add_node(row['DOI'], label=row['Title'][:50], title=row['Title'], shape='box')
        if row['Source_DOI'] != 'N/A':
            net.add_edge(row['Source_DOI'], row['DOI'])
        entity_id = f"{row['Keyword']}-{idx}"
        net.add_node(entity_id, label=row['Keyword'], shape='ellipse', color='orange')
        net.add_edge(row['DOI'], entity_id, title=row['Evidence'][:200])

    net.show(f'{base_folder}/lineage_network.html')

# Cell 9: Execute Example
start_doi = '10.3389/fphar.2023.1240295'  # Replace with your starting paper
df = run_lineage_spider(start_doi, keywords=['kava', 'pfeiffer', 'epilepsy'], max_layers=2)
build_network(df)


Failed for DOI 10.3389/fphar.2023.1240295: EOF marker not found
Failed for DOI 10.1007/s41465-019-00151-6: EOF marker not found
Failed for DOI 10.1016/j.cell.2021.03.022: EOF marker not found
Failed for DOI 10.1016/S2468-1253(19)30333-4: EOF marker not found
Failed for DOI 10.3389/fmed.2019.00334: EOF marker not found
Failed for DOI 10.1111/bph.13714: EOF marker not found
Failed for DOI 10.1080/03344355.2020.1732046: EOF marker not found
Failed for DOI 10.1186/1746-4269-10-26: EOF marker not found
Failed for DOI 10.1007/s00109-011-0752-4: EOF marker not found
Failed for DOI 10.1080/02791072.2019.1593560: EOF marker not found
Failed for DOI 10.1136/jnnp-2019-320912: EOF marker not found
Failed for DOI 10.1038/srep30550: EOF marker not found
Failed for DOI 10.1007/s00213-007-0963-0: EOF marker not found
Failed for DOI 10.4103/0253-7176.183086: EOF marker not found
Failed for DOI 10.1016/j.paid.2017.06.004: EOF marker not found
Failed for DOI 10.5152/eurjrheum.2017.17025: EOF marker not f

KeyboardInterrupt: 

v2

# Install required libraries (only once per environment)

In [3]:
!pip install habanero requests pandas PyMuPDF tqdm

Defaulting to user installation because normal site-packages is not writeable
[0m

In [5]:
# --- Imports ---
import os
import re
import requests
import pandas as pd
import fitz  # PyMuPDF
from habanero import Crossref
from tqdm import tqdm

# Set your email for Unpaywall API (required)
UNPAYWALL_EMAIL = "info@sensorium.bio"

# Step 2: Extract Metadata and References from a DOI using habanero

This will help us:

Retrieve the title, publication year, and references from a source paper.

Prepare for citation traversal in future steps.

In [6]:
# Initialize Crossref client
cr = Crossref()

def get_metadata_from_doi(doi):
    """
    Get metadata including title, year, and references from a DOI.
    """
    try:
        record = cr.works(ids=doi)
        title = record['message'].get('title', [''])[0]
        year = record['message']['issued']['date-parts'][0][0]
        references = record['message'].get('reference', [])
        
        # Extract DOIs from references if available
        cited_dois = []
        for ref in references:
            if 'DOI' in ref:
                cited_dois.append(ref['DOI'])

        return {
            'doi': doi,
            'title': title,
            'year': year,
            'cited_dois': cited_dois
        }

    except Exception as e:
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None


In [7]:
# Example DOI (replace with your own)
test_doi = "10.1007/BF01711971"

meta = get_metadata_from_doi(test_doi)
print("Title:", meta['title'])
print("Year:", meta['year'])
print("Cited DOIs:", meta['cited_dois'][:5])  # Show first 5


Title: Kawa-Pyrone — eine neuartige Substanzgruppe zentraler Muskelrelaxantien vom Typ des Mephenesins
Year: 1966
Cited DOIs: ['10.1111/j.1749-6632.1960.tb42792.x', '10.1111/j.1749-6632.1956.tb36842.x', '10.1007/BF00420104']


# Step 3: Query Unpaywall API to Retrieve Open Access PDF Links

This step will:

Query the Unpaywall API for the best open-access link.

Attempt to download the PDF (if available).

Save it to your directory structure.

In [8]:
pip install unpywall

Defaulting to user installation because normal site-packages is not writeable
Collecting unpywall
  Downloading unpywall-0.2.3.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: unpywall
  Building wheel for unpywall (setup.py) ... [?25ldone
[?25h  Created wheel for unpywall: filename=unpywall-0.2.3-py3-none-any.whl size=12318 sha256=11add7f9235f92787e588d208fecebe77d1bcea2f638f85f7410c14eb92b7713
  Stored in directory: /home/ubuntu/.cache/pip/wheels/90/6b/7f/3a1299f56cc32bdd6d386c4fbf8d1ef8c7fcab39606ed63395
Successfully built unpywall
[0mInstalling collected packages: unpywall
Successfully installed unpywall-0.2.3
Note: you may need to restart the kernel to use updated packages.


In [14]:
from unpywall import Unpywall

# Set your email
Unpywall.email = "nick.laskowski@sensorium.bio"
unpaywall = Unpywall()

# Correct method for Unpaywall API lookup
result = unpaywall.lookup(doi)


AttributeError: 'Unpywall' object has no attribute 'lookup'

In [15]:
from habanero import cn
import requests
import os
import time

def get_unpaywall_pdf_url(doi):
    try:
        result = cn.content_negotiation(ids=doi, format="bibentry", url="https://api.unpaywall.org/v2/" + doi,
                                        headers={"Accept": "application/json"},
                                        mailto="nick.laskowski@sensorium.bio")
        
        if result and 'best_oa_location' in result and result['best_oa_location']:
            return result['best_oa_location']['url_for_pdf']
        else:
            return None
    except Exception as e:
        print(f"Unpaywall lookup failed for {doi}: {e}")
        return None

def download_pdf(doi, save_dir="literature_spider_results/layer_1/"):
    url = get_unpaywall_pdf_url(doi)
    if url:
        os.makedirs(save_dir, exist_ok=True)
        filename = doi.replace("/", "_") + ".pdf"
        save_path = os.path.join(save_dir, filename)
        
        print(f"Attempting to download: {url}")
        response = requests.get(url, stream=True)
        
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {doi} to {save_path}")
            return save_path
        else:
            print(f"Failed to download {doi} (HTTP {response.status_code})")
    else:
        print(f"No PDF found for {doi}")
    
    return None


In [16]:
doi = "10.3389/fphar.2023.1240295"
download_pdf(doi)


Unpaywall lookup failed for 10.3389/fphar.2023.1240295: httpx.get() got multiple values for keyword argument 'headers'
No PDF found for 10.3389/fphar.2023.1240295


In [17]:
import requests
import os

UNPAYWALL_API = "https://api.unpaywall.org/v2/"
EMAIL = "nick.laskowski@sensorium.bio"

def get_unpaywall_metadata(doi):
    url = f"{UNPAYWALL_API}{doi}?email={EMAIL}"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed Unpaywall lookup for {doi} (HTTP {response.status_code})")
    except Exception as e:
        print(f"Exception during Unpaywall request for {doi}: {e}")
    return None

def get_best_pdf_url(metadata):
    if not metadata:
        return None
    location = metadata.get("best_oa_location", {})
    return location.get("url_for_pdf")

def download_pdf_from_doi(doi, save_dir="literature_spider_results/layer_1/"):
    metadata = get_unpaywall_metadata(doi)
    pdf_url = get_best_pdf_url(metadata)
    
    if pdf_url:
        os.makedirs(save_dir, exist_ok=True)
        filename = doi.replace("/", "_") + ".pdf"
        filepath = os.path.join(save_dir, filename)
        
        try:
            response = requests.get(pdf_url, stream=True, timeout=15)
            if response.status_code == 200:
                with open(filepath, 'wb') as f:
                    f.write(response.content)
                print(f"✅ Downloaded: {filepath}")
                return filepath
            else:
                print(f"❌ PDF download failed (HTTP {response.status_code})")
        except Exception as e:
            print(f"❌ Exception downloading PDF: {e}")
    else:
        print(f"❌ No open-access PDF URL found for {doi}")
    
    return None


In [18]:
download_pdf_from_doi("10.3389/fphar.2023.1240295")


✅ Downloaded: literature_spider_results/layer_1/10.3389_fphar.2023.1240295.pdf


'literature_spider_results/layer_1/10.3389_fphar.2023.1240295.pdf'

In [20]:
# Step: Lightweight Entity & Evidence Extraction

import spacy
import re
import pandas as pd
from pathlib import Path

# Load lightweight SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Example custom dictionary (expand as needed)
CUSTOM_TERMS = ["kava", "psilocybin", "ayahuasca", "TNF-alpha", "Parkinson", "epilepsy"]

# Compile regex for fast matching
custom_pattern = re.compile(r'|'.join([re.escape(term) for term in CUSTOM_TERMS]), re.IGNORECASE)


def extract_entities_and_evidence(pages, source_doi, layer):
    extracted_data = []

    for page_number, page_text in enumerate(pages, start=1):
        doc = nlp(page_text)
        
        # Named Entities
        for ent in doc.ents:
            if ent.label_ in ["ORG", "GPE", "PERSON", "NORP", "PRODUCT", "DISEASE"]:
                evidence = ent.sent.text
                extracted_data.append({
                    "Layer": layer,
                    "DOI": source_doi,
                    "Entity": ent.text,
                    "Evidence_Statement": evidence,
                    "Page_Number": page_number
                })

        # Custom Terms
        for match in custom_pattern.finditer(page_text):
            term = match.group(0)
            start_idx = match.start()

            # Get surrounding sentence as evidence
            sentences = list(doc.sents)
            for sent in sentences:
                if sent.start_char <= start_idx <= sent.end_char:
                    extracted_data.append({
                        "Layer": layer,
                        "DOI": source_doi,
                        "Entity": term,
                        "Evidence_Statement": sent.text,
                        "Page_Number": page_number
                    })
                    break

    return extracted_data


# Example usage:
# pages = extract_text_from_pdf("literature_spider_results/layer_1/10.3389_fphar.2023.1240295.pdf")
# data = extract_entities_and_evidence(pages, source_doi="10.3389/fphar.2023.1240295", layer=1)
# df = pd.DataFrame(data)
# df.to_csv("entities_and_evidence_layer1.csv", index=False)

print("Entity and Evidence Extraction Step Ready.")




Entity and Evidence Extraction Step Ready.


In [24]:
# Step 3: Lightweight Entity and Evidence Extraction

import re
import spacy
from PyPDF2 import PdfReader

# Load small spaCy model
nlp = spacy.load("en_core_web_sm")

# Define patterns (add more as needed)
chemical_pattern = r'\b[A-Z][a-z]{2,}\b'
disease_keywords = ["cancer", "epilepsy", "parkinson", "schizophrenia", "alzheimer", "inflammation"]
protein_keywords = ["receptor", "enzyme", "cytokine", "protein", "gene"]

def extract_entities_and_evidence(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = " ".join(page.extract_text() for page in reader.pages if page.extract_text())

    evidence_results = []

    for sent in re.split(r'(?<=[.!?]) +', full_text):
        doc = nlp(sent)
        entities = set()

        # Regex entity capture
        entities.update(re.findall(chemical_pattern, sent))

        # Keyword matches
        for keyword in disease_keywords + protein_keywords:
            if keyword.lower() in sent.lower():
                entities.add(keyword)

        if entities:
            evidence_results.append({
                "sentence": sent,
                "entities": list(entities)
            })

    return evidence_results

# Example Usage
pdf_path = 'fphar-14-1240295.pdf'
evidence_data = extract_entities_and_evidence(pdf_path)

# Display top evidence statements
import pandas as pd
pd.DataFrame(evidence_data).head(10)




Unnamed: 0,sentence,entities
0,Bedside to bench: the outlook for\npsychedelic...,"[Victor, Bedside]"
1,"Acero1,2,3,4, Emily S.",[Emily]
2,"Cribas4,5, Kevin D.",[Kevin]
3,"Browne1,2,\nOlivia Rivellini1,2,4, Justin C.","[Olivia, Justin]"
4,"Burrell1,2,3, John C.",[John]
5,"O ’Donnell1,2,4,\nSuradip Das1,2and D.",[Suradip]
6,"Kacy Cullen1,2,3*\n1Center for Brain Injury an...","[Injury, Repair, Corporal, Kacy, Philadelphia,..."
7,"Crescenz Veterans Affairs Medical Center, Phil...","[Crescenz, Applied, Microbiology, Engineering,..."
8,Despite promising ef ﬁcacy observed in some cl...,[Despite]
9,"Indeed, most studies to date have focused on a...",[Indeed]


In [25]:
df = pd.DataFrame(evidence_data)

In [28]:
df.to_csv("text.csv")