In [1]:
from pypdf import PdfReader
import re

# Local copy of the Dondi (2002) paper that the following cells read from.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
pdf_path = r"E:\GitHub\molass-library\study\2002, Francesco Dondi.pdf"

# Open the document once; later cells reuse `reader` and `num_pages`.
reader = PdfReader(pdf_path)
num_pages = len(reader.pages)
print(f"PDF loaded: {num_pages} pages")

In [2]:
# Preview the opening of page 1, where the title and abstract are expected.
first_page = reader.pages[0].extract_text()
print(
    "=" * 80,
    "FIRST PAGE (Title and Abstract)",
    "=" * 80,
    first_page[:1500],  # cap the preview at 1500 characters
    sep="\n",
)

In [3]:
# Search for GEC mentions
def search_keyword(keyword, max_results=5, pdf_reader=None):
    """Find pages whose extracted text contains ``keyword`` (case-insensitive).

    Only the first occurrence on each page is reported.

    Args:
        keyword: Substring to look for; compared after lower-casing both sides.
        max_results: Maximum number of matching pages to return. Pass ``None``
            to collect every matching page.
        pdf_reader: Object exposing ``.pages``, each page providing
            ``extract_text()``. Defaults to the notebook-level ``reader``.

    Returns:
        A list of ``(page_number, context)`` tuples in page order, where
        ``page_number`` is 1-based and ``context`` is the page text from up to
        200 characters before the match to up to 400 characters after its start.
    """
    source = reader if pdf_reader is None else pdf_reader
    needle = keyword.lower()  # lower-case once instead of once per page
    results = []
    for page_number, page in enumerate(source.pages, start=1):
        # Stop as soon as the requested number of pages has been collected.
        if max_results is not None and len(results) >= max_results:
            break
        text = page.extract_text() or ""  # tolerate pages that yield no text
        idx = text.lower().find(needle)
        if idx < 0:
            continue
        context_start = max(0, idx - 200)
        context_end = min(len(text), idx + 400)
        results.append((page_number, text[context_start:context_end]))
    return results

# Collect the pages that mention "GEC" and preview up to three of them.
gec_results = search_keyword("GEC")
print(f"Found 'GEC' on {len(gec_results)} pages", "\nFirst few occurrences:", sep="\n")
for page_num, context in gec_results[:3]:
    # One banner-framed excerpt per page.
    print(f"\n{'='*80}", f"Page {page_num}:", f"{'='*80}", context, sep="\n")

In [4]:
# Collect the pages that mention "monopore" and preview up to five of them.
monopore_results = search_keyword("monopore")
print(f"Found 'monopore' on {len(monopore_results)} pages", "\nOccurrences:", sep="\n")
for page_num, context in monopore_results[:5]:
    # One banner-framed excerpt per page.
    print(f"\n{'='*80}", f"Page {page_num}:", f"{'='*80}", context, sep="\n")

In [5]:
# Preview pages 2-5 (0-based indices 1-4), where the theory/methods section
# with the GEC equations is expected. The range is clipped to the document length.
print("Extracting pages 2-5 (likely theory section):")
for i in range(1, min(5, num_pages)):
    text = reader.pages[i].extract_text()
    print(f"\n{'='*80}")
    print(f"PAGE {i+1}")
    print(f"{'='*80}")
    print(text[:2000])  # First 2000 chars
    # Only announce truncation when text was actually cut off.
    if len(text) > 2000:
        print("\n[... truncated ...]")

In [6]:
# Terms expected to appear near the equations of interest.
equation_keywords = [
    "characteristic function", "CF", "moment", "variance",
    "residence time", "pore", "stochastic",
]

# One summary line per term: the page numbers returned by search_keyword.
for keyword in equation_keywords:
    results = search_keyword(keyword)
    print(f"\n'{keyword}': found on pages {[r[0] for r in results]}")

In [7]:
# Extract specific sections about SDM
# Look for equations and derivations

def extract_page_section(page_num, start_keyword, num_chars=3000):
    """Return the text of one page beginning at the first match of a keyword.

    ``page_num`` is 1-based and indexes the notebook-level ``reader``. The
    keyword is matched case-insensitively. Up to ``num_chars`` characters are
    returned starting at the match; ``None`` is returned when the page does
    not contain the keyword.
    """
    page_text = reader.pages[page_num - 1].extract_text()
    start = page_text.lower().find(start_keyword.lower())
    if start < 0:
        return None
    return page_text[start:start + num_chars]

# Phrases that commonly head theory sections in chromatography papers.
section_keywords = [
    "dispersive model", "brownian motion", "gaussian component",
    "plate height", "band broadening",
]

print("\nSearching for theoretical sections:")
for keyword in section_keywords:
    for page_num in range(1, num_pages+1):
        section = extract_page_section(page_num, keyword, 1500)
        if not section:
            continue
        print(f"\n{'='*80}", f"Found '{keyword}' on page {page_num}", f"{'='*80}", section, sep="\n")
        break  # report only the first page that matches this phrase

In [8]:
# Terms associated with the Stochastic Dispersive Model (SDM).
sdm_keywords = [
    "SDM", "dispersive", "Stochastic Dispersive Model",
    "mobile phase dispersion", "plate number", "N0", "Brownian",
]

print("=" * 80, "SEARCHING FOR SDM (STOCHASTIC DISPERSIVE MODEL)", "=" * 80, sep="\n")

for keyword in sdm_keywords:
    results = search_keyword(keyword, max_results=3)
    if not results:
        continue  # nothing to report for this term

    print(f"\n\n{'='*80}")
    print(f"Keyword: '{keyword}' - Found on {len(results)} pages: {[r[0] for r in results]}")
    print(f"{'='*80}")

    # Show the excerpt from the first matching page only.
    page_num, context = results[0]
    print(f"\n[First occurrence - Page {page_num}]", context, sep="\n")

# Step 3: SDM (Stochastic Dispersive Model) Analysis

This step searches the paper for information on the SDM monopore model.

# Step 5: Lognormal Pore Distribution (Sepsey 2014a)

Reading and analyzing the lognormal pore distribution model from Annamária Sepsey's 2014 paper.

In [9]:
# Local copy of the Sepsey 2014a paper that the following cells read from.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
sepsey_pdf_path = r"E:\GitHub\molass-library\study\2014a, Annamária Sepsey.pdf"

# Open the document once; later cells reuse `sepsey_reader` and `sepsey_num_pages`.
sepsey_reader = PdfReader(sepsey_pdf_path)
sepsey_num_pages = len(sepsey_reader.pages)
print(f"Sepsey 2014a PDF loaded: {sepsey_num_pages} pages")

# Preview the first 2000 characters of page 1 (title and abstract).
sepsey_first_page = sepsey_reader.pages[0].extract_text()
print("=" * 80, "TITLE AND ABSTRACT", "=" * 80, sepsey_first_page[:2000], sep="\n")

In [10]:
# Search for lognormal distribution mentions
def search_sepsey(keyword, max_results=5, pdf_reader=None):
    """Find pages of the Sepsey PDF whose text contains ``keyword``.

    Matching is case-insensitive and only the first occurrence on each page
    is reported.

    Args:
        keyword: Substring to look for; compared after lower-casing both sides.
        max_results: Maximum number of matching pages to return. Pass ``None``
            to collect every matching page.
        pdf_reader: Object exposing ``.pages``, each page providing
            ``extract_text()``. Defaults to the notebook-level ``sepsey_reader``.

    Returns:
        A list of ``(page_number, context)`` tuples in page order, where
        ``page_number`` is 1-based and ``context`` is the page text from up to
        250 characters before the match to up to 500 characters after its start.
    """
    source = sepsey_reader if pdf_reader is None else pdf_reader
    needle = keyword.lower()  # lower-case once instead of once per page
    results = []
    for page_number, page in enumerate(source.pages, start=1):
        # Stop as soon as the requested number of pages has been collected.
        if max_results is not None and len(results) >= max_results:
            break
        text = page.extract_text() or ""  # tolerate pages that yield no text
        idx = text.lower().find(needle)
        if idx < 0:
            continue
        context_start = max(0, idx - 250)
        context_end = min(len(text), idx + 500)
        results.append((page_number, text[context_start:context_end]))
    return results

# Spellings and phrasings under which a lognormal pore-size distribution may appear.
lognormal_keywords = [
    "lognormal", "log-normal", "pore size distribution",
    "pore distribution", "polydisperse", "heterogeneous pore",
]

print("=" * 80, "SEARCHING FOR LOGNORMAL PORE DISTRIBUTION", "=" * 80, sep="\n")

for keyword in lognormal_keywords:
    results = search_sepsey(keyword, max_results=3)
    if not results:
        continue  # nothing to report for this term

    print(f"\n\n{'='*80}")
    print(f"Keyword: '{keyword}' - Found on {len(results)} pages: {[r[0] for r in results]}")
    print(f"{'='*80}")

    # Show the excerpt from the first matching page only.
    page_num, context = results[0]
    print(f"\n[First occurrence - Page {page_num}]", context, sep="\n")

In [11]:
# Vocabulary and symbols that tend to accompany the mathematical formulation.
math_keywords = [
    "characteristic function", "PDF", "probability density",
    "distribution function", "moment", "variance", "mean",
    "standard deviation", "μ", "σ",
]

print("\n" + "=" * 80, "MATHEMATICAL FORMULATION", "=" * 80, sep="\n")

# Only the first five entries are searched; the rest of the list is not used here.
for keyword in math_keywords[:5]:
    results = search_sepsey(keyword, max_results=2)
    if not results:
        continue
    print(f"\n'{keyword}': found on pages {[r[0] for r in results]}")

In [12]:
# Extract key theory pages
# Typically theoretical development is in early-mid sections
print("\n" + "=" * 80)
print("EXTRACTING KEY THEORY PAGES")
print("=" * 80)

# Check pages 2-7 for theory. `page_num` is the 0-based page index (1-6),
# clipped to the document length; the printed page number is page_num + 1.
for page_num in range(1, min(7, sepsey_num_pages)):
    text = sepsey_reader.pages[page_num].extract_text()
    lowered = text.lower()  # lower-case once for all keyword checks

    # A page qualifies when it mentions both a lognormal spelling and "pore".
    has_lognormal = 'lognormal' in lowered or 'log-normal' in lowered
    has_pore = 'pore' in lowered

    if has_lognormal and has_pore:
        print(f"\n{'='*80}")
        print(f"PAGE {page_num + 1} - Contains lognormal pore theory")
        print(f"{'='*80}")
        print(text[:2500])  # First 2500 chars
        # Only announce a continuation when text was actually cut off.
        if len(text) > 2500:
            print("\n[... continued ...]")

In [13]:
# Generic section vocabulary; a hit is reported only if its excerpt also mentions lognormal.
equation_sections = [
    "theory", "model", "equation", "stochastic",
    "chromatography", "SEC", "size exclusion",
]

print("\n" + "=" * 80, "SEARCHING FOR THEORETICAL SECTIONS", "=" * 80, sep="\n")

for keyword in equation_sections:
    results = search_sepsey(keyword, max_results=1)
    if not results:
        continue
    page_num, context = results[0]
    # Keep the hit only when the surrounding excerpt itself mentions lognormal.
    if not ('lognormal' in context.lower() or 'log-normal' in context.lower()):
        continue
    print(
        f"\n{'='*80}",
        f"Found '{keyword}' with lognormal context on page {page_num}",
        f"{'='*80}",
        context,
        sep="\n",
    )

In [14]:
# Dump the complete text of the first pages that mention lognormal.
print("\n" + "=" * 80, "FULL TEXT FROM LOGNORMAL PAGES", "=" * 80, sep="\n")

# Record the 1-based number of every page containing either spelling.
lognormal_pages = []
for i in range(sepsey_num_pages):
    text = sepsey_reader.pages[i].extract_text()
    if not ('lognormal' in text.lower() or 'log-normal' in text.lower()):
        continue
    lognormal_pages.append(i+1)

print(f"\nPages containing 'lognormal': {lognormal_pages}", "\nExtracting full text from these pages...", sep="\n")

# Print at most the first three matching pages in full, each closed by a banner.
for page_num in lognormal_pages[:3]:
    text = sepsey_reader.pages[page_num-1].extract_text()
    print(
        f"\n{'='*80}",
        f"FULL PAGE {page_num}",
        f"{'='*80}",
        text,
        "\n" + "="*80 + "\n",
        sep="\n",
    )

In [15]:
# Look for references to Equation 21 under several common notations.
print("=" * 80, "SEARCHING SPECIFICALLY FOR EQUATION 21", "=" * 80, sep="\n")

eq21_keywords = ["equation 21", "eq. 21", "eq 21", "(21)", "Eq. (21)"]

for keyword in eq21_keywords:
    results = search_sepsey(keyword, max_results=2)
    if not results:
        continue
    print(f"\n\nFound '{keyword}' on {len(results)} page(s)")
    # Show the excerpt from every page that was returned.
    for page_num, context in results:
        print(f"\n{'='*80}", f"Page {page_num}:", f"{'='*80}", context, sep="\n")

In [16]:
# Look for characteristic-function notation (phi symbols) and related wording.
print("\n" + "=" * 80, "SEARCHING FOR CHARACTERISTIC FUNCTION FORMULAS", "=" * 80, sep="\n")

cf_keywords = ["φ(", "Φ(", "phi(", "characteristic function", "∫", "integral"]

# Only the first four entries are searched; "∫" and "integral" are not used here.
for keyword in cf_keywords[:4]:
    results = search_sepsey(keyword, max_results=2)
    if not results:
        continue
    print(f"\n\n'{keyword}': Found on pages {[r[0] for r in results]}")
    # Show at most 600 characters of the first matching excerpt.
    page_num, context = results[0]
    print(f"\nFirst occurrence (Page {page_num}):", context[:600], sep="\n")

In [17]:
# Flag pages whose extracted text contains any formula-like token.
print("\n" + "=" * 80, "SEARCHING FOR PAGES WITH MATHEMATICAL FORMULAS", "=" * 80, sep="\n")

# Matched case-sensitively, as plain substrings of the page text.
formula_patterns = ["exp[", "exp(", "∑", "τ", "ω", "σ²"]

pages_with_formulas = set()
for i in range(sepsey_num_pages):
    text = sepsey_reader.pages[i].extract_text()
    # One matching token is enough to flag the page.
    if any(pattern in text for pattern in formula_patterns):
        pages_with_formulas.add(i+1)

print(f"\nPages with mathematical formulas: {sorted(pages_with_formulas)}")

# Print the start of pages 3-5, where the main model is expected to be.
print("\n" + "=" * 80, "EXTRACTING PAGES 3-5 (typical location for theory)", "=" * 80, sep="\n")

for page_num in (3, 4, 5):
    if page_num > sepsey_num_pages:
        continue  # the document is shorter than this page number
    text = sepsey_reader.pages[page_num-1].extract_text()
    # A generous 3000-character window so equations are less likely to be cut off.
    print(f"\n{'='*80}", f"PAGE {page_num}", f"{'='*80}", text[:3000], sep="\n")
    if len(text) > 3000:
        print("\n[... truncated ...]")