In [None]:

import xml.etree.ElementTree as ET
from collections import defaultdict

import pandas as pd
import voyageai
from scipy.spatial.distance import cosine
from tqdm.notebook import tqdm

# We have some helper functions that let us save results every x rows and recombine them later.
# This lets us run large jobs without losing all progress in the case of a crash.
from python_code.helpers import apply_in_chunks, recombine_chunks, embed_w_token_manager, safe_load_env_variable

from python_code.analyzing import analyze_section_data_with_details, print_analysis_report, compare_text_lengths, analyze_section_content_vs_fulltext_deviation


# Enable tqdm progress_apply for pandas
tqdm.pandas()

# Load the API key through the project's helper instead of hardcoding it.
# Per the original author's note, the helper checks whether the key comes from
# a system env variable and sends a warning if you might be using your
# personal api key.
VOYAGE_API_KEY = safe_load_env_variable("VOYAGE_API_KEY")
# Shared VoyageAI client; the embedding and verification cells below use `vo`.
vo = voyageai.Client(api_key=VOYAGE_API_KEY)

## Loading data and filtering

In [8]:

# Loads the data and filters it
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted location.
unfiltered_df = pd.read_pickle("../article_splitting_shared/prper_prstper_metadata_texts_20250227.pkl")

# Adds a year column to the dataframe
unfiltered_df["year"] = unfiltered_df["date"].dt.year

# One boolean mask with every row filter, so the selection happens in a single step.
keep_mask = (
    # Only include completed years for consistency.
    (unfiltered_df["year"] != 2025)
    # The types of entries we are interested in.
    & unfiltered_df["articleType"].isin(["article", "brief"])
    # This article has some errors/weirdness in its sections and title structure, filtering it out for now
    & (unfiltered_df["id"] != "10.1103/PhysRevSTPER.6.020117")
)

# .copy() gives `df` its own data. Without it `df` is a slice of `unfiltered_df`, and
# adding a column later (df["sections"] = ...) triggers pandas' SettingWithCopyWarning.
df = unfiltered_df[keep_mask].copy()

## Parsing the XML data

In [17]:
def _normalized_text(element):
    """Return all text inside ``element`` with runs of whitespace collapsed.

    ``Element.text`` stops at the first child element, so a title such as
    ``<title>Results for <italic>N</italic>=3</title>`` would be cut to
    "Results for". ``itertext`` walks the inline markup as well.
    """
    return ' '.join(''.join(element.itertext()).split())


def _get_full_text(element):
    """Recursively collect the text (including tails) of ``element`` and its children.

    Each text fragment is stripped and the non-empty fragments are joined with
    single spaces. ``xref`` children that carry both ``ref-type`` and ``rid``
    are replaced by a ``[ref-type:rid]`` marker; any other child is processed
    like its parent.
    """
    texts = []
    if element.text:
        texts.append(element.text.strip())
    for child in element:
        ref = child.get('rid')
        ref_type = child.get('ref-type')
        if child.tag == 'xref' and ref and ref_type:
            texts.append(f"[{ref_type}:{ref}]")
        else:
            # Also reached by an xref that lacks rid/ref-type: keep its visible
            # text instead of silently dropping it.
            texts.append(_get_full_text(child))
        if child.tail:
            texts.append(child.tail.strip())
    return ' '.join(t for t in texts if t)


def parse_xml_sections(xml_content):
    """
    Parse the XML content and extract section information.

    Only ``<sec>`` elements that are direct children of a ``<body>`` element are
    returned; nested subsections are not listed separately, their text is part
    of the parent section's ``content``.

    Args:
        xml_content (str): The XML content as a string. ``None``, NaN and blank
            strings are treated as "no sections".

    Returns:
        list: One dictionary per section with the keys
            ``relative_position`` (1-based position within the body),
            ``label`` and ``title`` (only when the element exists),
            ``content`` (all text of the section, label and title included) and
            ``id`` (only when the section has a non-empty ``id`` attribute).

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not well-formed XML.
    """
    # Missing articles (None / NaN) and blank strings have nothing to parse.
    if not isinstance(xml_content, (str, bytes)) or not xml_content.strip():
        return []

    root = ET.fromstring(xml_content)

    sections = []
    for position, sec in enumerate(root.findall('.//body/sec'), start=1):
        section_data = {"relative_position": position}

        label = sec.find('label')
        if label is not None:
            section_data['label'] = _normalized_text(label)

        title = sec.find('title')
        if title is not None:
            section_data['title'] = _normalized_text(title)

        section_data['content'] = _get_full_text(sec)

        sec_id = sec.get('id')
        if sec_id:
            section_data['id'] = sec_id

        sections.append(section_data)

    return sections

In [18]:
# Parse each article's XML into a list of section dicts (one list per row), with a tqdm progress bar.
df["sections"] = df["full_text_xml"].progress_apply(parse_xml_sections)

  0%|          | 0/1222 [00:00<?, ?it/s]

In [19]:
# Column overview of the article-level frame (now including `sections`).
df.columns

Index(['abstract', 'articleType', 'authors', 'affiliations', 'date', 'type',
       'metadata_last_modified_at', 'last_modified_at', 'id', 'identifiers',
       'issue', 'pageStart', 'hasArticleId', 'numPages', 'publisher', 'rights',
       'journal', 'title', 'volume', 'notes', 'tocSection', 'fundings',
       'classificationSchemes', 'doi', 'full_text_xml', 'full_text', 'year',
       'sections'],
      dtype='object')

### Explode data for better access

Currently, the extracted data is saved in a dataframe with one row per article. Since we are interested in analysing the relation between different sections, it will be easier for us if our data is organized with one row per section. Therefore, we explode the dataframe to get our desired format here.

In [None]:
def explode_sections(df, section_col="sections"):
    """
    Turn a one-row-per-article frame into a one-row-per-section frame.

    Args:
        df (pd.DataFrame): Article-level frame. ``section_col`` must hold a list
            of dicts per row.
        section_col (str): Name of the column holding the section dicts.

    Returns:
        pd.DataFrame: One row per section with a fresh 0..n-1 index. Every
        original column except ``section_col`` is prefixed with ``article_``;
        every key of the section dicts becomes a ``section_``-prefixed column.
        An article whose section list is empty (or missing) is kept as a single
        row whose ``section_*`` values are NaN.
    """
    # Prefix every article-level column so it cannot clash with the section_* columns.
    article_col_map = {col: f"article_{col}" for col in df.columns if col != section_col}
    df_exploded = df.rename(columns=article_col_map).explode(section_col, ignore_index=True)

    # An empty section list explodes to a NaN placeholder. Depending on the pandas
    # version, json_normalize either raises on such non-dict entries or tolerates
    # them, so replace them with an empty dict to get an all-NaN section row.
    records = [sec if isinstance(sec, dict) else {} for sec in df_exploded[section_col]]
    section_df = pd.json_normalize(records).add_prefix("section_")
    # Make the row alignment explicit instead of relying on both frames happening
    # to carry the same default index.
    section_df.index = df_exploded.index

    return pd.concat([df_exploded.drop(columns=[section_col]), section_df], axis=1)

# Build the one-row-per-section frame; the validation, embedding and merge cells below read `exploded_df`.
exploded_df = explode_sections(df, section_col="sections")


In [23]:
# Column overview of the section-level frame (article_* and section_* columns).
exploded_df.columns

Index(['article_abstract', 'article_articleType', 'article_authors',
       'article_affiliations', 'article_date', 'article_type',
       'article_metadata_last_modified_at', 'article_last_modified_at',
       'article_id', 'article_identifiers', 'article_issue',
       'article_pageStart', 'article_hasArticleId', 'article_numPages',
       'article_publisher', 'article_rights', 'article_journal',
       'article_title', 'article_volume', 'article_notes',
       'article_tocSection', 'article_fundings',
       'article_classificationSchemes', 'article_doi', 'article_full_text_xml',
       'article_full_text', 'article_year', 'section_relative_position',
       'section_label', 'section_title', 'section_content', 'section_id'],
      dtype='object')

### Extraction validation

To ensure that all the text has been extracted properly, we run some basic validation comparing the text length of the full text and the extracted sections, as well as some checks on missing data.

In [None]:
# Structural checks on the parsed sections (missing sections / titles / labels).
results = analyze_section_data_with_details(df["sections"])
print_analysis_report(results)

# Banner that separates the structural report from the length comparison.
banner = "-" * 50
print("\n\n", banner, "Comparing text lengths", banner, sep="\n")

# Compare the extracted section text lengths against the original full text.
compare_text_lengths(exploded_df, df)
print("\n")
res = analyze_section_content_vs_fulltext_deviation(df)

Detailed Section Data Analysis Report
Total documents analyzed: 1222

Documents without any sections:
  Count: 0
  Percentage: 0.00%
  Document indices: []

Total sections across all documents: 7313

Sections without titles:
  Count: 0
  Percentage: 0.00%

Sections without labels:
  Count: 5
  Percentage: 0.07%
  Example sections without labels:
    Document 126, Section 4: V. DISCUSSION AND CONCLUSION
    Document 586, Section 0: Introduction.—
    Document 586, Section 1: Methods.—



--------------------------------------------------
Comparing text lengths
--------------------------------------------------
Total length of section_title: 146,307
Total length of section_content: 72,932,071
Sum of section_content and section_title: 73,078,378
Original full text length: 72,167,341
Difference (original - combined): -911,037


Deviation between original full text and sum of parsed section contents:
  Mean deviation:   -625.80 characters
  Median deviation: -576.50 characters
  Min deviati

## Embedding

Now that we have successfully extracted the sections, let us generate embeddings that we can use for further analysis. We have chosen to generate two forms of embeddings. First, we embed the whole section as well as the titles. Second, we chunk the section text body into sentence-based chunks of roughly 300 characters. This second embedding will be useful to explore the movement of articles/sections through the embedding space.

For the embedding we use the `voyage-3-large` model provided through the VoyageAI API; it is currently the state-of-the-art embedding model.

### Embedding whole sections

In [None]:
def embed_sections(rows):
    """
    Embed the title and the full content of every section in ``rows``.

    Args:
        rows (pd.DataFrame): Section-level rows with ``section_title`` and
            ``section_content`` columns.

    Returns:
        pd.DataFrame: A copy of ``rows`` with two added columns,
        ``title_embedding`` and ``content_embedding`` (one embedding per row).
        ``rows`` itself is not modified.
    """
    results_df = rows.copy()

    # NOTE(review): sections without a <title> may carry NaN here — confirm how
    # embed_w_token_manager handles non-string entries.
    titles = results_df['section_title'].tolist()
    contents = results_df['section_content'].tolist()

    # The embed_w_token_manager function reduces the number of
    # requests to the API by finding the max amount of data that can be passed per request.
    # The model is pinned explicitly so this cell agrees with the chunk-embedding
    # cell and with the verification cell, which both name "voyage-3-large".
    results_df["title_embedding"] = embed_w_token_manager(titles, vo, model="voyage-3-large")
    results_df["content_embedding"] = embed_w_token_manager(contents, vo, model="voyage-3-large")

    return results_df


# Embed 100 rows at a time via the project helper, with "chunks_whole_sections" as its output directory
# (per the import comment, the helper saves results every x rows so a crash does not lose all progress).
apply_in_chunks(exploded_df, embed_sections, chunk_size=100,
                output_dir="chunks_whole_sections")


Processing 100 rows in chunks of 50 for a total of 2 chunks...




Starting embedding process for 50 texts with max_tokens=120000
Position 0: Trying chunk size 50, estimated tokens: 266
✓ Using maximum chunk size: 50
Processing chunk 1: texts 0 to 49
Completed: 1 chunks processed, 50 embeddings generated
Starting embedding process for 50 texts with max_tokens=120000
Position 0: Trying chunk size 50, estimated tokens: 88534
✓ Using maximum chunk size: 50
Processing chunk 1: texts 0 to 49




Completed: 1 chunks processed, 50 embeddings generated
Starting embedding process for 50 texts with max_tokens=120000
Position 0: Trying chunk size 50, estimated tokens: 259
✓ Using maximum chunk size: 50
Processing chunk 1: texts 0 to 49
Completed: 1 chunks processed, 50 embeddings generated
Starting embedding process for 50 texts with max_tokens=120000
Position 0: Trying chunk size 50, estimated tokens: 93903
✓ Using maximum chunk size: 50
Processing chunk 1: texts 0 to 49


Processing chunks: 100%|██████████| 2/2 [00:17<00:00,  8.58s/chunk, ETA: 0.0s]

Completed: 1 chunks processed, 50 embeddings generated
Completed processing in 17.2 seconds





Total tokens used: 0


In [49]:
# Recombine the chunks saved under "chunks_whole_sections" into a single frame.
# NOTE(review): chunk files left over from an earlier run appear to be reused, so
# the columns can differ from what embed_sections currently returns — confirm.
df_with_embeddings = recombine_chunks("chunks_whole_sections")

In [44]:
# Column overview of the recombined whole-section embeddings.
df_with_embeddings.columns

Index(['section_label', 'article_title', 'section_title_raw', 'article_id',
       'year', 'content_text', 'relative_position', 'title_embedding',
       'content_embedding'],
      dtype='object')

In [50]:
# Bring the recombined frame onto the exploded_df naming scheme. Renaming BEFORE
# selecting lets the cell work both with chunk files that use the older
# `section_title_raw` name and with frames that already carry `section_title`
# (rename simply ignores names that are absent).
merge_keys = ["article_id", "section_title"]
embedding_cols = ["section_title_embedding", "section_content_embedding"]
embeddings_to_merge = df_with_embeddings.rename(
    columns={
        "title_embedding": "section_title_embedding",
        "content_embedding": "section_content_embedding",
        "section_title_raw": "section_title"
    }
)[merge_keys + embedding_cols].copy()

# (article_id, section_title) is not guaranteed to be unique: an article can repeat
# a title. A plain merge would then multiply rows, so number repeated keys on both
# sides and pair the n-th occurrence with the n-th embedding.
# NOTE(review): this assumes both frames list an article's sections in the same order — confirm.
embeddings_to_merge["_dup_rank"] = embeddings_to_merge.groupby(merge_keys, dropna=False).cumcount()
exploded_dup_rank = exploded_df.groupby(merge_keys, dropna=False).cumcount()

# Merge into exploded_df, keeping all columns from exploded_df and only the two embedding columns from embeddings_to_merge
df_with_embeddings = (
    exploded_df.assign(_dup_rank=exploded_dup_rank)
    .merge(
        embeddings_to_merge,
        on=merge_keys + ["_dup_rank"],
        how="left",
        validate="one_to_one",  # the row count of exploded_df must be preserved
    )
    .drop(columns="_dup_rank")
)
df_with_embeddings.keys()

Index(['article_abstract', 'article_articleType', 'article_authors',
       'article_affiliations', 'article_date', 'article_type',
       'article_metadata_last_modified_at', 'article_last_modified_at',
       'article_id', 'article_identifiers', 'article_issue',
       'article_pageStart', 'article_hasArticleId', 'article_numPages',
       'article_publisher', 'article_rights', 'article_journal',
       'article_title', 'article_volume', 'article_notes',
       'article_tocSection', 'article_fundings',
       'article_classificationSchemes', 'article_doi', 'article_full_text_xml',
       'article_full_text', 'article_year', 'section_relative_position',
       'section_label', 'section_title', 'section_content', 'section_id',
       'section_title_embedding', 'section_content_embedding'],
      dtype='object')

#### Verify embeddings

To verify that the embeddings were processed correctly, we run a semantic similarity test on a random sample of rows.

In [None]:

def verify_semantic_consistency(text, embedding, vo, similarity_threshold=0.7, print_similarity=False):
    """
    Verify that the embedding semantically represents the original text
    by comparing it with a fresh embedding of the same text.
    """
    
    # Get a fresh embedding for comparison
    verification_embedding = vo.embed([text], model="voyage-3-large").embeddings[0]
    
    # Calculate cosine similarity
    similarity = 1 - cosine(embedding, verification_embedding)
    
    # Print similarity if requested
    if print_similarity:
        print(f"Similarity: {similarity:.3f}")
    
    if similarity < similarity_threshold:
        print(f"❌ Low semantic similarity ({similarity:.3f}) for chunk: {text[:100]}...")
        return False
    
    return True

In [53]:
# Sample a few rows for verification
sample_size = 10
# Fixed seed so the same rows are re-checked on every run of the notebook.
SAMPLE_SEED = 42
sample_df = df_with_embeddings.sample(
    # Capped so a frame with fewer rows than sample_size does not make .sample() raise.
    n=min(sample_size, len(df_with_embeddings)),
    random_state=SAMPLE_SEED,
)

print(f"Verifying semantic consistency for {len(sample_df)} random samples...")
print("=" * 60)

for idx, row in sample_df.iterrows():
    print(f"\nSample {idx}:")
    print(f"Text: {row['section_content'][:100]}...")
    
    # Verify title embedding
    title_consistent = verify_semantic_consistency(
        row['section_title'], 
        row['section_title_embedding'], 
        vo
    )
    print(f"Title embedding consistent: {'✅' if title_consistent else '❌'}")
    
    # Verify content embedding
    content_consistent = verify_semantic_consistency(
        row['section_content'], 
        row['section_content_embedding'], 
        vo
    )
    print(f"Content embedding consistent: {'✅' if content_consistent else '❌'}")
    
    print("-" * 40)

print("\nVerification complete!")


Verifying semantic consistency for 10 random samples...

Sample 6380:
Text: II. THEORETICAL FRAMEWORK A. Relationships and relationship building As individuals engage with each...
Title embedding consistent: ✅
Content embedding consistent: ✅
----------------------------------------

Sample 5966:
Text: III. RESEARCH QUESTION Our study wanted to understand instructors’ views around cultural relevance, ...
Title embedding consistent: ✅
Content embedding consistent: ✅
----------------------------------------

Sample 858:
Text: III. RESEARCH A. Aims and research questions This study is based on quantitative and qualitative res...
Title embedding consistent: ✅
Content embedding consistent: ✅
----------------------------------------

Sample 4585:
Text: II. DESIGN-BASED RESEARCH: INTEGRATING EVIDENCE-BASED, THEORY-DRIVEN, AND PRACTICE-BASED ASPECTS INT...
Title embedding consistent: ✅
Content embedding consistent: ✅
----------------------------------------

Sample 2086:
Text: IV. RESULTS Our f

#### Save embeddings

Finally, we save the fruits of our hard labour. We save this first result, containing only the embeddings for the whole sections, as a "light" version; this will be useful later on when running analysis only on the whole sections. Once we add the chunked sections and their embeddings, the saved `.pkl` file will balloon to approx. 3GB.

In [1]:
from python_code.helpers import save_processed_embeddings

In [None]:
# Save the whole-section embeddings as the "light" dataset.
# Per the cell output, the helper writes a timestamped .pkl and returns its path.
save_processed_embeddings(df_with_embeddings, "processed_embeddings_light")

Saved processed_embeddings to ./processed_embeddings_light_20250715_132106.pkl


'./processed_embeddings_light_20250715_132106.pkl'

### Embedding chunked sections

In [None]:
# Optionally load the df generated above to run this section of the notebook in isolation

# df_with_embeddings = pd.read_pickle("processed_embeddings_light_20250715_132106.pkl")
# df_with_embeddings.keys()

Index(['article_abstract', 'article_articleType', 'article_authors',
       'article_affiliations', 'article_date', 'article_type',
       'article_metadata_last_modified_at', 'article_last_modified_at',
       'article_id', 'article_identifiers', 'article_issue',
       'article_pageStart', 'article_hasArticleId', 'article_numPages',
       'article_publisher', 'article_rights', 'article_journal',
       'article_title', 'article_volume', 'article_notes',
       'article_tocSection', 'article_fundings',
       'article_classificationSchemes', 'article_doi', 'article_full_text_xml',
       'article_full_text', 'article_year', 'section_relative_position',
       'section_label', 'section_title', 'section_content', 'section_id',
       'section_title_embedding', 'section_content_embedding'],
      dtype='object')

In [None]:
# Download Punkt tokenizer (first time only)
# NOTE(review): `download` and `sent_tokenize` are not imported in any visible cell.
# Presumably `from nltk import download` and `from nltk.tokenize import sent_tokenize`
# are needed in the import cell for a fresh-kernel run — confirm.
download('punkt')

def nltk_sentence_splitter(text, min_chunk=50, max_chunk=300):
    """Group the sentences of ``text`` into chunks of roughly ``max_chunk`` characters.

    Sentences are never split. The current chunk is closed when adding the next
    sentence would push it past ``max_chunk`` and it already holds at least
    ``min_chunk`` characters. A chunk can therefore exceed ``max_chunk`` while it
    is still below ``min_chunk``, or when a single sentence is longer than
    ``max_chunk``. Lengths are the summed sentence lengths; the single spaces
    used to join sentences are not counted.

    Args:
        text (str): Text to split. Non-string or blank input yields ``[]``.
        min_chunk (int): Minimum accumulated length before a chunk may be closed.
        max_chunk (int): Soft upper bound on the accumulated chunk length.

    Returns:
        list[str]: The chunks in document order, each made of whole sentences
        joined by a single space. Never contains an empty string.
    """
    # Missing or blank section text has no sentences to group.
    if not isinstance(text, str) or not text.strip():
        return []

    chunks = []
    current_chunk = []
    current_length = 0

    for sent in sent_tokenize(text):
        sent_len = len(sent)
        would_overflow = current_length + sent_len > max_chunk
        # `current_chunk` must be non-empty before it is flushed; otherwise
        # min_chunk <= 0 combined with an over-long first sentence would emit
        # an empty-string chunk.
        if current_chunk and would_overflow and current_length >= min_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sent]
            current_length = sent_len
        else:
            current_chunk.append(sent)
            current_length += sent_len

    # Add the last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def chunk_section(section_text):
    """Split one section's text into sentence-based chunks using the splitter's default sizes."""
    section_chunks = nltk_sentence_splitter(section_text)
    return section_chunks

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sfgar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

def embed_section(df_chunk):
    """
    Chunk every section in ``df_chunk`` and embed all chunks in one batch.

    Args:
        df_chunk (pd.DataFrame): Section-level rows with a ``section_content``
            column. Index labels are used to map chunks back to their section,
            so they are assumed to be unique.

    Returns:
        pd.DataFrame: A copy of ``df_chunk`` with two added columns holding one
        list per section: ``section_chunked_embeddings`` and
        ``section_chunked_texts`` (same order, same length).

    Raises:
        AssertionError: If a section's text is 20 characters or shorter.
        ValueError: If the number of returned embeddings differs from the
            number of chunks that were sent.
    """
    all_chunks = []
    chunk_mapping = []  # chunk_mapping[i] is the index label of the section that owns all_chunks[i]

    for idx, section in df_chunk.iterrows():
        section_text = section["section_content"]
        assert len(section_text) > 20, "Section text is too short"

        chunked_contents = chunk_section(section_text)
        all_chunks.extend(chunked_contents)
        chunk_mapping.extend([idx] * len(chunked_contents))

    # Presumably a rough token estimate for the batch (~4 characters per token).
    print(sum(len(chunk) for chunk in all_chunks)//4)

    # Embed all chunks using the custom token manager function; skip the call
    # entirely when there is nothing to embed.
    if all_chunks:
        embedded_chunks = embed_w_token_manager(all_chunks, vo, max_tokens=120000, model="voyage-3-large")
    else:
        embedded_chunks = []

    # Chunks and embeddings are paired purely by position, so a count mismatch
    # would attach embeddings to the wrong text. Fail loudly instead.
    if len(embedded_chunks) != len(all_chunks):
        raise ValueError(
            f"Expected {len(all_chunks)} embeddings but received {len(embedded_chunks)}"
        )

    # Regroup texts and embeddings per section; sections without chunks keep an empty list.
    section_embeddings_dict = {section_idx: [] for section_idx in df_chunk.index}
    section_text_chunks_dict = {section_idx: [] for section_idx in df_chunk.index}
    for section_idx, chunk_text, chunk_embedding in zip(chunk_mapping, all_chunks, embedded_chunks):
        section_embeddings_dict[section_idx].append(chunk_embedding)
        section_text_chunks_dict[section_idx].append(chunk_text)

    # Work on a copy so the caller's frame is left untouched.
    result_df = df_chunk.copy()
    result_df['section_chunked_embeddings'] = result_df.index.map(section_embeddings_dict)
    result_df['section_chunked_texts'] = result_df.index.map(section_text_chunks_dict)

    return result_df

# Chunk and embed 200 sections at a time; per the log below, chunks that already exist on disk are skipped.
apply_in_chunks(df_with_embeddings, embed_section, chunk_size=200, output_dir="chunks_chunked_sections")

Processing 7313 rows in chunks of 200 for a total of 37 chunks...
Skipping chunk 0-199, already exists.
Skipping chunk 200-399, already exists.
Processing chunk 400-599...
505909
Starting embedding process for 8562 texts with max_tokens=120000
Position 0: Trying chunk size 1000, estimated tokens: 47924
✓ Using maximum chunk size: 1000
Processing chunk 1: texts 0 to 999
Position 1000: Trying chunk size 1000, estimated tokens: 47010
✓ Using maximum chunk size: 1000
Processing chunk 2: texts 1000 to 1999
Position 2000: Trying chunk size 1000, estimated tokens: 48650
✓ Using maximum chunk size: 1000
Processing chunk 3: texts 2000 to 2999
Position 3000: Trying chunk size 1000, estimated tokens: 51845
✓ Using maximum chunk size: 1000
Processing chunk 4: texts 3000 to 3999
Position 4000: Trying chunk size 1000, estimated tokens: 46354
✓ Using maximum chunk size: 1000
Processing chunk 5: texts 4000 to 4999
Position 5000: Trying chunk size 1000, estimated tokens: 51894
✓ Using maximum chunk siz

In [58]:
# Recombine the chunks saved under "chunks_chunked_sections" into one frame with the per-chunk embeddings.
df_all_embeddings = recombine_chunks("chunks_chunked_sections")

Processing chunks:   0%|          | 0/2 [18:25<?, ?chunk/s, Processing chunk 0-49]


In [63]:
# Column overview of the frame holding whole-section and chunk-level embeddings.
df_all_embeddings.columns

Index(['section_label', 'article_title', 'section_title_raw', 'article_id',
       'year', 'content_text', 'relative_position', 'title_embedding',
       'content_embedding', 'chunked_section_embeddings',
       'chunked_section_texts'],
      dtype='object')

### Verify the chunks by similarity

We check the similarity of the chunked embeddings for a sample of sections, using logic similar to the verification above. This makes sure that the embedding was carried out correctly, and that there was no mismatch between text and embedding.

In [None]:
def verify_some_chunks(df, vo, sample_size=10, random_state=None, print_similarity=False):
    """
    Sample sections from dataframe and verify semantic consistency of their chunks.

    Parameters:
        df: DataFrame with 'section_chunked_texts' and 'section_chunked_embeddings' columns
        vo: VoyageAI client instance
        sample_size: Number of sections to sample for verification (capped at len(df))
        random_state: Random seed for reproducible sampling
        print_similarity: Forwarded to verify_semantic_consistency to print each similarity

    Returns:
        bool: True when no verified chunk failed (also when there was nothing to verify).
    """
    # Report the number of sections actually sampled, not the number requested.
    n_sections = min(sample_size, len(df))
    print(f"Sampling {n_sections} sections for semantic consistency verification...")

    # Sample sections
    sampled_sections = df.sample(n=n_sections, random_state=random_state)

    total_chunks_verified = 0
    failed_chunks = 0

    for idx, row in sampled_sections.iterrows():
        section_texts = row['section_chunked_texts']
        section_embeddings = row['section_chunked_embeddings']

        print(f"\nVerifying section {idx} with {len(section_texts)} chunks...")

        for i, (chunk_text, chunk_embedding) in enumerate(zip(section_texts, section_embeddings)):
            total_chunks_verified += 1
            if verify_semantic_consistency(chunk_text, chunk_embedding, vo, print_similarity=print_similarity):
                print(f"  ✓ Chunk {i} verified")
            else:
                failed_chunks += 1
                print(f"  ❌ Chunk {i} failed verification")

    print("\n=== Verification Summary ===")
    print(f"Total chunks verified: {total_chunks_verified}")
    print(f"Failed chunks: {failed_chunks}")
    if total_chunks_verified:
        print(f"Success rate: {((total_chunks_verified - failed_chunks) / total_chunks_verified * 100):.1f}%")
    else:
        # Nothing was checked (empty frame or sections without chunks): avoid dividing by zero.
        print("Success rate: n/a (no chunks verified)")

    return failed_chunks == 0

# Spot-check up to 10 sections; `res` is True only when no verified chunk failed.
res = verify_some_chunks(df_all_embeddings, vo, sample_size=10)

Sampling 3 sections for semantic consistency verification...

Verifying section 2584 with 7 chunks...
  ✓ Chunk 0 verified
  ✓ Chunk 1 verified
  ✓ Chunk 2 verified
  ✓ Chunk 3 verified
  ✓ Chunk 4 verified
  ✓ Chunk 5 verified
  ✓ Chunk 6 verified

Verifying section 3454 with 36 chunks...
  ✓ Chunk 0 verified
  ✓ Chunk 1 verified
  ✓ Chunk 2 verified
  ✓ Chunk 3 verified
  ✓ Chunk 4 verified
  ✓ Chunk 5 verified
  ✓ Chunk 6 verified
  ✓ Chunk 7 verified
  ✓ Chunk 8 verified
  ✓ Chunk 9 verified
  ✓ Chunk 10 verified
  ✓ Chunk 11 verified
  ✓ Chunk 12 verified
  ✓ Chunk 13 verified
  ✓ Chunk 14 verified
  ✓ Chunk 15 verified
  ✓ Chunk 16 verified
  ✓ Chunk 17 verified
  ✓ Chunk 18 verified
  ✓ Chunk 19 verified
  ✓ Chunk 20 verified
  ✓ Chunk 21 verified
  ✓ Chunk 22 verified
  ✓ Chunk 23 verified
  ✓ Chunk 24 verified
  ✓ Chunk 25 verified
  ✓ Chunk 26 verified
  ✓ Chunk 27 verified
  ✓ Chunk 28 verified
  ✓ Chunk 29 verified
  ✓ Chunk 30 verified
  ✓ Chunk 31 verified
  ✓ Chunk 32 ve

True

In [71]:
# Save the full result (whole-section plus chunk-level embeddings) under the helper's default name.
save_processed_embeddings(df_all_embeddings)