In [1]:
import pandas as pd
import re

In [9]:
# Main execution
ref_df = pd.read_excel(r'..\acts_and_miracles\acts_and_miracles_preprep.xlsx', sheet_name='ref_to_text')
scripture_df = pd.read_csv(r'..\scripture\asv.csv')

In [10]:
scripture_df

Unnamed: 0,Verse ID,Book Name,Book Number,Chapter,Verse,Text
0,1,Genesis,1,1,1,In the beginning God created the heavens and t...
1,2,Genesis,1,1,2,And the earth was waste and void; and darkness...
2,3,Genesis,1,1,3,"And God said, Let there be light: and there wa..."
3,4,Genesis,1,1,4,"And God saw the light, that it was good: and G..."
4,5,Genesis,1,1,5,"And God called the light Day, and the darkness..."
...,...,...,...,...,...,...
31097,31098,Revelation,66,22,17,"And the Spirit and the bride say, Come. And he..."
31098,31099,Revelation,66,22,18,I testify unto every man that heareth the word...
31099,31100,Revelation,66,22,19,and if any man shall take away from the words ...
31100,31101,Revelation,66,22,20,"He who testifieth these things saith, Yea: I c..."


In [38]:
import pandas as pd
import re

def expand_reference(ref_str):
    """
    Parse and expand scripture references into individual verses.
    
    Handles formats like:
    - Genesis 1:3 (single verse)
    - Genesis 1:6-8 (verse range in same chapter)
    - Genesis 1:1-2:3 (cross-chapter range)
    - Genesis 1:24-25, 2:19 (multiple references)
    - Genesis 1:24, 26, 34, 2:19 (mixed: ch1v24, ch1v26, ch1v34, ch2v19)
    """
    results = []
    parts = ref_str.split(',')
    
    current_book = None
    current_chapter = None
    
    for part in parts:
        part = part.strip()
        
        # Check if this is a full reference (Book Chapter:Verse)
        full_match = re.match(r'([A-Za-z0-9\s]+)\s+(\d+):(\d+)(?:-(?:(\d+):)?(\d+))?', part)
        
        if full_match:
            # Full reference with book name
            current_book = full_match.group(1).strip()
            current_chapter = int(full_match.group(2))
            start_v = int(full_match.group(3))
            
            if full_match.group(5):  # Range exists
                if full_match.group(4):  # Cross-chapter range (Ch:V-Ch:V)
                    end_ch = int(full_match.group(4))
                    end_v = int(full_match.group(5))
                    # Generate all verses in range
                    for ch in range(current_chapter, end_ch + 1):
                        v_start = start_v if ch == current_chapter else 1
                        v_end = end_v if ch == end_ch else 999  # Max verse assumption
                        for v in range(v_start, v_end + 1):
                            results.append((current_book, ch, v))
                else:  # Same chapter range (Ch:V-V)
                    end_v = int(full_match.group(5))
                    for v in range(start_v, end_v + 1):
                        results.append((current_book, current_chapter, v))
            else:  # Single verse
                results.append((current_book, current_chapter, start_v))
        
        else:
            # Check for chapter:verse without book name (uses current book)
            ch_v_match = re.match(r'^(\d+):(\d+)(?:-(\d+))?$', part)
            if ch_v_match and current_book:
                current_chapter = int(ch_v_match.group(1))
                start_v = int(ch_v_match.group(2))
                
                if ch_v_match.group(3):  # Range in same chapter
                    end_v = int(ch_v_match.group(3))
                    for v in range(start_v, end_v + 1):
                        results.append((current_book, current_chapter, v))
                else:  # Single verse
                    results.append((current_book, current_chapter, start_v))
            
            # Check for just verse or verse range (uses current book and chapter)
            elif re.match(r'^\d+(?:-\d+)?$', part) and current_book and current_chapter:
                verse_match = re.match(r'^(\d+)(?:-(\d+))?$', part)
                if verse_match:
                    start_v = int(verse_match.group(1))
                    if verse_match.group(2):  # Verse range
                        end_v = int(verse_match.group(2))
                        for v in range(start_v, end_v + 1):
                            results.append((current_book, current_chapter, v))
                    else:  # Single verse
                        results.append((current_book, current_chapter, start_v))
    
    return results

def main():
    # Read reference data from Excel worksheet
    ref_df = pd.read_excel(r'..\acts_and_miracles\acts_and_miracles_preprep.xlsx', sheet_name='ref_to_text')
    
    # Read scripture CSV
    scripture_df = pd.read_csv(r'..\scripture\asv.csv')
    
    # Create lookup dictionary for scripture text
    scripture_dict = {
        (row['Book Name'], row['Chapter'], row['Verse']): row['Text']
        for _, row in scripture_df.iterrows()
    }
    
    # Process references and build result
    rows = []
    missing_count = 0
    
    for _, ref_row in ref_df.iterrows():
        event_id = ref_row['EventID']
        reference = ref_row['Reference']
        
        if pd.isna(reference):
            continue
            
        # Expand reference to individual verses
        verses = expand_reference(reference)
        
        # Look up text for each verse
        for book, chapter, verse in verses:
            key = (book, chapter, verse)
            if key in scripture_dict:
                rows.append({
                    'EventID': event_id,
                    'Reference': reference,
                    'Reference_Code': f"{book} {chapter}:{verse}",
                    'Reference_Text': scripture_dict[key]
                })
            else:
                missing_count += 1
    
    # Create result dataframe and save
    result_df = pd.DataFrame(rows)
    result_df.to_csv('expanded_references.csv', index=False)
    
    print(f"Created {len(result_df)} rows from {ref_df['EventID'].nunique()} events")
    if missing_count > 0:
        print(f"Note: {missing_count} verse references could not be found in scripture file")
    print("\nFirst 5 rows:")
    print(result_df.head())
    
    # Example of handling complex references
    print("\nExample parsing:")
    test_ref = "Genesis 1:24, 26, 34, 2:19"
    print(f"'{test_ref}' expands to: {expand_reference(test_ref)}")
    
    return result_df

def create_concatenated_references(input_csv='expanded_references.csv', output_csv='concatenated_references.csv', input_df=None):
    """
    Create a new table where each reference has all its verses concatenated into a single text field.
    Format: [Book Name] [ch 1] [verse 1] text [verse 2] text [ch 2] [verse 1] text...
    
    Parameters:
    - input_csv: Path to the expanded references CSV (used if input_df is None)
    - output_csv: Path for the output concatenated CSV
    - input_df: Optional DataFrame to use instead of reading from CSV
    """
    # Use provided dataframe or read from CSV
    if input_df is not None:
        df = input_df
    else:
        df = pd.read_csv(input_csv)
    
    # Group by EventID and Reference
    grouped = df.groupby(['EventID', 'Reference'])
    
    result_rows = []
    
    for (event_id, reference), group in grouped:
        # Sort by Reference_Code to ensure proper verse order
        group = group.sort_values('Reference_Code')
        
        # Build concatenated text with formatting
        concatenated_text = ""
        current_book = None
        current_chapter = None
        
        for _, row in group.iterrows():
            ref_code = row['Reference_Code']
            text = row['Reference_Text']
            
            # Parse the reference code to get book, chapter, verse
            match = re.match(r'([A-Za-z0-9\s]+)\s+(\d+):(\d+)', ref_code)
            if match:
                book = match.group(1).strip()
                chapter = int(match.group(2))
                verse = int(match.group(3))
                
                # Add book marker if it's a new book
                if book != current_book:
                    concatenated_text += f"[{book}] "
                    current_book = book
                    current_chapter = None  # Reset chapter when book changes
                
                # Add chapter marker if it's a new chapter
                if chapter != current_chapter:
                    concatenated_text += f"[ch {chapter}] "
                    current_chapter = chapter
                
                # Add verse marker and text
                concatenated_text += f"[verse {verse}] {text} "
        
        result_rows.append({
            'EventID': event_id,
            'Reference': reference,
            'Concatenated_Text': concatenated_text.strip()
        })
    
    # Create result dataframe and save
    result_df = pd.DataFrame(result_rows)
    result_df.to_csv(output_csv, index=False)
    
    print(f"\nCreated concatenated references with {len(result_df)} rows")
    print(f"Saved to: {output_csv}")
    print("\nFirst example:")
    if len(result_df) > 0:
        first_row = result_df.iloc[0]
        print(f"EventID: {first_row['EventID']}")
        print(f"Reference: {first_row['Reference']}")
        print(f"Text preview (first 500 chars): {first_row['Concatenated_Text'][:500]}...")
    
    return result_df

def create_concatenated_references_alt(input_df=None, input_csv='expanded_references.csv', 
                                      output_csv='concatenated_references_alt.csv', 
                                      separator=' | ', include_markers=True):
    """
    Alternative version with customizable formatting.
    
    Parameters:
    - input_df: Optional DataFrame to use instead of reading from CSV
    - input_csv: Path to the expanded references CSV (used if input_df is None)
    - output_csv: Path for the output concatenated CSV
    - separator: String to separate verses (default: ' | ')
    - include_markers: Whether to include [book], [ch], [verse] markers
    """
    # Use provided dataframe or read from CSV
    df = input_df if input_df is not None else pd.read_csv(input_csv)
    
    # Group by EventID and Reference
    grouped = df.groupby(['EventID', 'Reference'])
    
    result_rows = []
    
    for (event_id, reference), group in grouped:
        # Sort to ensure proper order
        group = group.sort_values('Reference_Code')
        
        if include_markers:
            # Build with markers as before
            concatenated_text = ""
            current_book = None
            current_chapter = None
            
            for _, row in group.iterrows():
                ref_code = row['Reference_Code']
                text = row['Reference_Text']
                
                match = re.match(r'([A-Za-z0-9\s]+)\s+(\d+):(\d+)', ref_code)
                if match:
                    book = match.group(1).strip()
                    chapter = int(match.group(2))
                    verse = int(match.group(3))
                    
                    if book != current_book:
                        concatenated_text += f"[{book}] "
                        current_book = book
                        current_chapter = None
                    
                    if chapter != current_chapter:
                        concatenated_text += f"[ch {chapter}] "
                        current_chapter = chapter
                    
                    concatenated_text += f"[v{verse}] {text}{separator}"
        else:
            # Simple concatenation without markers
            texts = []
            for _, row in group.iterrows():
                ref_code = row['Reference_Code']
                text = row['Reference_Text']
                texts.append(f"{ref_code}: {text}")
            concatenated_text = separator.join(texts)
        
        result_rows.append({
            'EventID': event_id,
            'Reference': reference,
            'Concatenated_Text': concatenated_text.rstrip(separator).strip()
        })
    
    result_df = pd.DataFrame(result_rows)
    result_df.to_csv(output_csv, index=False)
    
    print(f"\nCreated alternative concatenated references with {len(result_df)} rows")
    print(f"Saved to: {output_csv}")
    
    return result_df

if __name__ == "__main__":
    main()

Created 12803 rows from 2926 events
Note: 53402 verse references could not be found in scripture file

First 5 rows:
   EventID        Reference Reference_Code  \
0        1  Genesis 1:1-2:3    Genesis 1:1   
1        1  Genesis 1:1-2:3    Genesis 1:2   
2        1  Genesis 1:1-2:3    Genesis 1:3   
3        1  Genesis 1:1-2:3    Genesis 1:4   
4        1  Genesis 1:1-2:3    Genesis 1:5   

                                      Reference_Text  
0  In the beginning God created the heavens and t...  
1  And the earth was waste and void; and darkness...  
2  And God said, Let there be light: and there wa...  
3  And God saw the light, that it was good: and G...  
4  And God called the light Day, and the darkness...  

Example parsing:
'Genesis 1:24, 26, 34, 2:19' expands to: [('Genesis', 1, 24), ('Genesis', 1, 26), ('Genesis', 1, 34), ('Genesis', 2, 19)]


In [39]:
expanded_df = main()

Created 12803 rows from 2926 events
Note: 53402 verse references could not be found in scripture file

First 5 rows:
   EventID        Reference Reference_Code  \
0        1  Genesis 1:1-2:3    Genesis 1:1   
1        1  Genesis 1:1-2:3    Genesis 1:2   
2        1  Genesis 1:1-2:3    Genesis 1:3   
3        1  Genesis 1:1-2:3    Genesis 1:4   
4        1  Genesis 1:1-2:3    Genesis 1:5   

                                      Reference_Text  
0  In the beginning God created the heavens and t...  
1  And the earth was waste and void; and darkness...  
2  And God said, Let there be light: and there wa...  
3  And God saw the light, that it was good: and G...  
4  And God called the light Day, and the darkness...  

Example parsing:
'Genesis 1:24, 26, 34, 2:19' expands to: [('Genesis', 1, 24), ('Genesis', 1, 26), ('Genesis', 1, 34), ('Genesis', 2, 19)]


In [40]:
concatenated_df = create_concatenated_references(input_df=expanded_df)


Created concatenated references with 2925 rows
Saved to: concatenated_references.csv

First example:
EventID: 1
Reference: Genesis 1:1-2:3
Text preview (first 500 chars): [Genesis] [ch 1] [verse 1] In the beginning God created the heavens and the earth. [verse 10] And God called the dry land Earth; and the gathering together of the waters called he Seas: and God saw that it was good. [verse 11] And God said, Let the earth put forth grass, herbs yielding seed, [and] fruit-trees bearing fruit after their kind, wherein is the seed thereof, upon the earth: and it was so. [verse 12] And the earth brought forth grass, herbs yielding seed after their kind, and trees bea...


In [52]:
concatenated_df.head()

Unnamed: 0,EventID,Reference,Concatenated_Text
0,1,Genesis 1:1-2:3,[Genesis] [ch 1] [verse 1] In the beginning Go...
1,2,Genesis 1:3,"[Genesis] [ch 1] [verse 3] And God said, Let t..."
2,3,Genesis 1:6-8,"[Genesis] [ch 1] [verse 6] And God said, Let t..."
3,4,Genesis 1:9-10,[Genesis] [ch 1] [verse 10] And God called the...
4,5,Genesis 1:11-13,"[Genesis] [ch 1] [verse 11] And God said, Let ..."


In [45]:
result_df.head(60)

Unnamed: 0,EventID,Reference,Reference_Code,Reference_Text
0,1,Genesis 1:1-2:3,Genesis 1:1,In the beginning God created the heavens and t...
1,1,Genesis 1:1-2:3,Genesis 1:2,And the earth was waste and void; and darkness...


In [32]:
result_df.to_csv(r'..\acts_and_miracles\parsed_scripture.csv')