In [2]:
import csv
import os
import re

import fitz
import PyPDF2

In [3]:
#Load path to PDFs
#The cells below list this folder with os.listdir and only pick up names ending in ".pdf"

#Journal of Parasitology
# NOTE(review): absolute, user-specific path -- change it before running on another machine
all_pdfs = "/Users/averyszewczak/Desktop/Anoplura_traitextraction/JP/"

In [4]:
#Read the character-trait terms from the CSV into a list

traits = "/Users/averyszewczak/Desktop/Anoplura_traitextraction/anoplura_terms.csv"

#The terms sit in the second column, named 'pattern' | The file was pulled from Ali Zeltzin Lira-Olguin's documents (Anoplura shared google drive)
# NOTE: column_index has to be updated if the layout of anoplura_terms.csv changes
column_index = 1

#filled with one term per non-empty data row
trait_data = []

with open(traits, newline="") as traitfile:
    trait_reader = csv.reader(traitfile)
    next(trait_reader)  #the first row is the header, not a term
    #csv.reader yields an empty list for a blank line; those rows are left out
    trait_data.extend(term_row[column_index] for term_row in trait_reader if term_row)
print(trait_data)

['allotype', 'allotype', 'allotypes', 'anmhs', 'anterior marginal head seta', 'anterior marginal head setae', 'aphs', 'apical head seta', 'apical head setae', 'base pairs', 'bp', 'chaeta', 'cheta', 'dachs', 'danchs', 'danhs', 'dcas', 'dlas', 'dmas', 'dmhs', 'dmss', 'dorsal accessory head seta', 'dorsal accessory head setae', 'dorsal anterior central head seta', 'dorsal anterior central head setae', 'dorsal anterior head seta', 'dorsal anterior head setae', 'dorsal central abdominal seta', 'dorsal central abdominal setae', 'dorsal lateral abdominal seta', 'dorsal lateral abdominal setae', 'dorsal marginal abdominal seta', 'dorsal marginal abdominal setae', 'dorsal marginal head seta', 'dorsal marginal head setae', 'dorsal mesothoracic seta', 'dorsal mesothoracic setae', 'dorsal posterior central head seta', 'dorsal posterior central head setae', 'dorsal posterior head seta', 'dorsal posterior head setae', 'dorsal preantennal head seta', 'dorsal preantennal head setae', 'dorsal principal

In [6]:
#Function to extract species names. The function aims to pull out the name of the species being described and the correct number of species
#being described if more than 1

# Number words accepted in front of "new species" on the first page, with the count they stand for
_SPECIES_COUNT_WORDS = {"a": 1, "the": 1, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10}

# "<count> new species". The leading \b keeps "a"/"the"/"ten" from matching the tail of a longer
# word (e.g. "often new species"); \s+ lets the phrase wrap over a line break in the extracted text.
_SPECIES_COUNT_RE = re.compile(
    r"\b(\d+|" + "|".join(_SPECIES_COUNT_WORDS) + r")\s+new\s+species", re.IGNORECASE)


def extract_species_context(pdf_path):
    """Guess the names of the species newly described in a PDF.

    The text of every page is pulled with PyPDF2 and split into whitespace-separated words.
    The first page is searched for "<count> new species" to decide how many names to collect
    (1 when the phrase is absent). A name is the pair of words standing directly in front of
    one of the markers "n.", "nova" or "nov.", accepted only when the first word starts with
    an upper-case character, the second with a lower-case character, and both are purely
    alphanumeric.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: Distinct "Genus species" strings in order of first appearance, stopping
        once the species count has been reached. Empty when the PDF has no pages or when
        no marker with a well-formed name in front of it was found.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join below working should a page yield no text object
        page_texts = [page.extract_text() or "" for page in reader.pages]

    if not page_texts:
        # a PDF without pages has no first page to inspect and nothing to search
        print("Species described:", [])
        return []

    #pages are joined with a space so words at a page break do not run together
    text = " ".join(page_texts)
    text = text.replace("\n", " ").replace("\r", " ")  #removing line breaks and carriage returns
    words = text.split()  # Splitting the text up into words

    #search for the number of new species described in the first page (text already extracted above)
    species_count_match = _SPECIES_COUNT_RE.search(page_texts[0])
    if species_count_match:
        count_token = species_count_match.group(1).lower()
        # the regex only lets digits or one of the mapped words through, so this always yields an int
        if count_token in _SPECIES_COUNT_WORDS:
            species_count = _SPECIES_COUNT_WORDS[count_token]
        else:
            species_count = int(count_token)
    else:
        species_count = 1  # Default to 1 species if not specified

    # Markers that follow a newly described name. The two-word phrase "new species" is not
    # listed because the text is compared one whitespace-separated word at a time, so a
    # phrase containing a space can never equal a single word.
    keywords = {"n.", "nova", "nov."}
    species_names = []
    for i in range(2, len(words)):
        if words[i].lower() not in keywords:
            continue
        # Grabbing the 2 words (hopefully genus and species) before the keyword
        genus, epithet = words[i - 2], words[i - 1]
        # Double checking format
        if not (genus[0].isupper() and epithet[0].islower() and (genus + epithet).isalnum()):
            continue
        name = f"{genus} {epithet}"
        if name in species_names:
            # the same name usually shows up several times (title, abstract, description);
            # repeats must not use up the slots meant for the other species
            continue
        species_names.append(name)
        if len(species_names) >= species_count:
            break  # Stop after finding the specified number of species
    print("Species described:", species_names)

    return species_names

#Go through the PDF folder and run the species extraction on each PDF,
#printing the file name first so the names printed after it can be traced back to their source
for ano_pdf in os.listdir(all_pdfs):
    if not ano_pdf.endswith(".pdf"):
        continue  #anything that is not a PDF is ignored
    pdf_path = os.path.join(all_pdfs, ano_pdf)
    print(f"Extracted from: {ano_pdf}")
    extract_species_context(pdf_path)

Extracted from: Durden, LA., et al. 2020. A new species of sucking louse from the mandrill from Gabon.pdf
Species described: ['Pedicinus gabonensis']
Extracted from: Durden and Rausch. 2007. Haemodipsus brachylagy n. sp., Polyplacidae, a new sucking louse from the pygmy rabbit in Nevada.pdf
Species described: ['Haemodipsis brachylagi']
Extracted from: Durden, LA., et al. 2020. Two new species of sucking lice, Hoplopleuridae and Polyplacidae from South Africa.pdf
Species described: ['Hoplopleura granti', 'Polyplax megacephalus']
Extracted from: Durden, LA., et al. 2015. A new species of sucking louse from Kenya.pdf
Species described: ['Linognathus samburi']
Extracted from: Durden, LA., et al. 2022. Sucking lice parasitizing mongolian rodents with the description of a new species of Hoplopleura from mountain voles.pdf
Species described: ['Hoplopleura altaiensis']
Extracted from: Durden and Timm. 2001. Hoplopleura janzeni n. sp., a new sucking louse from a central american swimming mouse.

In [7]:
# Initialize the description-header regex, the dictionary for pdfs_data and the subsection markers

#Finding the header to the description section. This was made around articles from Journal of Parasitology; could be different for other journals
#  group(1) = the DESCRIPTION header line together with the whitespace around it
#  group(2) = the two words that follow, expected to read "Genus species"
#Written as a raw string. Genus and species are separated by plain \s: inside [...] a "|" is a
#literal pipe character, not an "or", and \s already covers the newline.
desc_spec = re.compile(r"(\s*\n\s*DESCRIPTION\S?\s*\n\s*)([A-Z][a-z]+\s[a-z]+)")

#to store extracted text in later on
pdfs_data = {}

#description sections in journal of parasitology
#Includes duplicate names for article with space before term
desc_sections = ["\nHead", "\nThorax", "\nAbdomen", "\nParatergal plates", "\nGenitalia", "\n Head", "\n Thorax", "\n Abdomen", "\n Paratergal plates", "\n Genitalia"]


#this function breaks the description text up into subsections
def extract_section_text(text, sections):
    """Cut a description into its subsections.

    Args:
        text: The description text to split.
        sections: Header markers to look for, each written exactly as it appears in the
            text (the module-level list uses a newline followed by the header word, with
            and without a space in between).

    Returns:
        dict: Maps each marker that occurs in ``text`` (whitespace stripped) to the text
        running from its first occurrence up to the nearest following marker of any kind,
        or to the end of ``text`` when none follows; values are stripped. Markers that do
        not occur are left out. When two markers strip to the same key, the one listed
        later in ``sections`` overwrites the earlier one.
    """
    section_data = {}
    for section in sections:
        start_pos = text.find(section)
        if start_pos == -1:
            continue
        search_from = start_pos + len(section)
        # The section stops at whichever header comes next in the text. Every marker is
        # considered -- not only those listed after this one -- so a header in the other
        # spelling variant, or one appearing out of the usual order, still ends the section.
        following = [pos for pos in (text.find(other, search_from) for other in sections) if pos != -1]
        end_pos = min(following) if following else len(text)
        section_data[section.strip()] = text[start_pos:end_pos].strip()
    return section_data


#this function extracts the text in the description section that isn't captured in any of the subsections (called 'Free Text')
def extract_free_text(full_text, match, desc_sections):
    """Return the text between the description header and the first subsection header.

    Args:
        full_text: Text of the whole document.
        match: Regex match of the description header; the free text starts at ``match.end()``.
        desc_sections: Subsection header markers; the earliest one found after the header
            ends the free text. Without any, the free text runs to the end of ``full_text``.

    Returns:
        str: The free text on a single line (line breaks turned into spaces), without the
        lines that contain "Downloaded from".
    """
    start_pos = match.end()
    header_positions = [pos for pos in (full_text.find(section, start_pos) for section in desc_sections) if pos != -1]
    first_section_pos = min(header_positions) if header_positions else len(full_text)
    free_text = full_text[start_pos:first_section_pos].strip()

    # Drop the "Downloaded from" lines while the text still has its line breaks, and only
    # then flatten it. Flattening first would leave one single line, so a "Downloaded from"
    # anywhere in it would throw the whole free text away.
    kept_lines = [line for line in free_text.split("\n") if "Downloaded from" not in line]
    return " ".join(kept_lines)



#Running through functions to extract text and save to output file

def _find_first_marker(haystack, markers):
    """Return the index in haystack of the first marker (tried in the order given) that occurs, or -1."""
    for marker in markers:
        pos = haystack.find(marker)
        if pos != -1:
            return pos
    return -1


def _drop_download_lines(text):
    """Return text without the lines that contain "Downloaded from"."""
    return "\n".join(line for line in text.split("\n") if "Downloaded from" not in line)


def _pair_species_with_descriptions(species_list, description_matches):
    """Decide which DESCRIPTION header belongs to which species.

    A header whose captured name (whitespace collapsed) equals the species name is preferred.
    Species left over afterwards take the remaining headers in document order. A species for
    which no header is left is absent from the result.

    Returns:
        dict: species name -> regex match of its description header.
    """
    unclaimed = list(description_matches)
    paired = {}
    for species in species_list:
        if species in paired:
            continue
        for candidate in unclaimed:
            if " ".join(candidate.group(2).split()) == species:
                paired[species] = candidate
                unclaimed.remove(candidate)
                break
    for species in species_list:
        if species not in paired and unclaimed:
            paired[species] = unclaimed.pop(0)
    return paired


#Pattern for the end of the female free text: the first subsection header at the start of a line.
#Built once here because it does not depend on the PDF or the species.
search_sections = ["Head", "Thorax", "Abdomen", "Paratergal plates", "Genitalia"]
fem_free_text_end = "|".join([rf"\n\s*{term}" for term in search_sections])

for pdf_name in os.listdir(all_pdfs):
    if not pdf_name.endswith(".pdf"):
        continue
    pdf_path = os.path.join(all_pdfs, pdf_name)
    species_list = extract_species_context(pdf_path)
    print(f"Species from {pdf_name}: {species_list}") ##getting species names

    #collect the text of every page, then close the document so its file handle is released
    louse_doc = fitz.open(pdf_path)
    try:
        full_text = ""      #creating empty string to add pdf text into
        for page_num in range(len(louse_doc)):
            full_text += louse_doc.load_page(page_num).get_text()
    finally:
        louse_doc.close()

    #PDF name into dictionary
    pdfs_data[pdf_name] = {}

    #Every DESCRIPTION header in the document, each handed to one species. Searching afresh
    #for every species would return the first header each time, so all species of a
    #multi-species paper would end up with the first species' description.
    description_matches = list(desc_spec.finditer(full_text))
    paired = _pair_species_with_descriptions(species_list, description_matches)

    for species in species_list:
        match = paired.get(species)
        if match is None:
            print(f"No description found for species {species} in {pdf_name}")
            continue

        species_name_desc = match.group(2)
        print(f"Species Name: {species_name_desc}")   ###makes sure the description text is for the correct species

        # Capturing the text in between 'DESCRIPTION' and the first subsection
        free_text = extract_free_text(full_text, match, desc_sections)

        #This description runs from its header to the next DESCRIPTION header (or the end of
        #the document), so it cannot spill into the description of the following species
        later_headers = [m.start() for m in description_matches if m.start() > match.start()]
        description_end = min(later_headers) if later_headers else len(full_text)
        post_match_text = full_text[match.end():description_end]

        #index where male description ends and female begins; second spelling is for a PDF that has extra spaces
        end_pos_female = _find_first_marker(post_match_text.lower(), ["\nfemale", "\n female"])

        if end_pos_female != -1:
            #male text: from the start of the description up to the female section
            male_description_text = post_match_text[:end_pos_female].strip()
            #female text: from 'female' up to 'taxonomic summary'
            post_female_text = post_match_text[end_pos_female:]
            end_pos_taxonomic_summary = _find_first_marker(
                post_female_text.lower(), ["\ntaxonomic summary", "\n taxonomic summary"])
            if end_pos_taxonomic_summary != -1:
                female_description_text = post_female_text[:end_pos_taxonomic_summary].strip()
            else:
                #keeps the female text even if the next section is not titled 'taxonomic summary'
                female_description_text = post_female_text.strip()
        else:
            male_description_text = post_match_text.strip()
            female_description_text = "Not found"    #placeholder when no female text is found

        #removing the downloaded lines from pdf to make text cleaner
        male_description_text = _drop_download_lines(male_description_text)
        female_description_text = _drop_download_lines(female_description_text)

        #female free text: from the beginning of the female description up to the first match from search_sections
        fem_free_end_match = re.search(fem_free_text_end, female_description_text)
        if fem_free_end_match:
            fem_free_text = female_description_text[:fem_free_end_match.start()]
        else:
            fem_free_text = "No female free text"
            print("no female free text")

        # Keep each section in dictionary with species name as key
        pdfs_data[pdf_name][species] = {
            "free_text": free_text,
            "male": extract_section_text(male_description_text, desc_sections),
            "female": extract_section_text(female_description_text, desc_sections),
            "female_free_text": fem_free_text
        }

        print(f"Species: {species}")
        print(f"Free Text:\n{free_text}")
        print(f"Male Description Text:\n{male_description_text}")
        print(f"Female Description Text:\n{female_description_text}")

#Write everything collected in pdfs_data to a CSV file
#File to be used later to parse and organize
with open("extracted_traits_aug.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["PDF Name", "Species", "Sex", "Category", "Text"])

    for pdf_name in pdfs_data:
        for species, sections in pdfs_data[pdf_name].items():
            #the free text of the description is written first, filed under "Male"
            writer.writerow([pdf_name, species, "Male", "Free Text", sections.get("free_text", "N/A")])
            for sex in ("male", "female"):
                if sex not in sections:
                    continue
                sex_label = sex.capitalize()
                #one row per subsection, with line breaks flattened
                for section, text in sections[sex].items():
                    writer.writerow([pdf_name, species, sex_label, section.replace("\n", ""), text.replace("\n", " ")])
                #the female free text goes after the female subsections
                if sex == "female" and "female_free_text" in sections:
                    writer.writerow([pdf_name, species, "Female", "Fem Free Text", sections["female_free_text"].replace("\n", " ")])


  desc_spec = re.compile('(\s*\\n\s*DESCRIPTION\S?\s*\\n\s*)([A-Z][a-z]+[\s|\n][a-z]+)')


Species described: ['Pedicinus gabonensis']
Species from Durden, LA., et al. 2020. A new species of sucking louse from the mandrill from Gabon.pdf: ['Pedicinus gabonensis']
Species Name: Pedicinus gabonensis
Species: Pedicinus gabonensis
Free Text:
Durden, Kessler, and Greiman n. sp. Male (Fig. 1A–C; n ¼ 1) Total body length of holotype, 1.675 mm. Head, thorax, and abdomen moderately sclerotized.
Male Description Text:
Durden, Kessler, and Greiman n. sp.
Male (Fig. 1A–C; n ¼ 1)
Total body length of holotype, 1.675 mm. Head, thorax, and
abdomen moderately sclerotized.
Head (Fig. 1A): Maximum width, 0.255 mm. Much longer
than wide with pair of distinct but nonbulging eyes situated about
halfway along lateral sides of head. Head widest centrally,
narrowest posteriorly, and broadly rounded anteriorly. One long
dorsal principal head seta situated posteriorly, 2 dorsal posterior
head setae, 1 dorsal anterior head seta, 3 dorsal marginal head
setae, 4–5 apical head setae, 1 ventral preantenna

In [46]:

# Input: the CSV with the extracted description text. Output: the CSV with one row per trait/number pair.
# NOTE(review): the cell that writes "extracted_traits_aug.csv" uses a path relative to the working
# directory; this absolute path is only the same file when the notebook runs from that folder -- confirm.
mf_descriptions = "/Users/averyszewczak/Desktop/Anoplura_traitextraction/extracted_traits_aug.csv"
output_file_path = "/Users/averyszewczak/Desktop/Anoplura_traitextraction/parsed_extracted_traits_final.csv"


# Unit names; not referenced by the parsing loop of this cell
unit_measurements = ["millimeter", "mm"]

#regex to find and capture numbers
# finding_nums: any single digit -- used by the parsing loop to decide whether a sentence holds a number
finding_nums = re.compile(r"\d")
# capturing_nums: an integer, optionally followed by "-"/"–" plus an integer and/or a decimal part;
# not referenced by the parsing loop of this cell
capturing_nums = re.compile(r"\d+(?:[-–]\d+)?(?:\.\d+)?")

#sorting trait data by length to ensure longest capture first
# relies on trait_data from the cell that loads anoplura_terms.csv having been run
trait_data_sorted = sorted(trait_data, key=len, reverse=True)

#regex to split sentences based on periods followed by an uppercase letter
# Splits at a whitespace character that has "." right before it and an upper-case A-Z right after it.
# NOTE(review): at that spot the preceding character is always ".", so the two negative lookbehinds
# (which would need a whitespace character, respectively a digit, in that place) can never reject a
# split -- confirm what they were meant to exclude (single-letter abbreviations? decimals?).
sentence_endings = re.compile(r"(?<!\b\w\.\s)(?<!\.\d)(?<=\.)\s(?=[A-Z])")


# Regex pattern to capture both trait and nearby numbers
# This looks for numbers or ranges immediately before or after the trait
#capturing_trait_num = re.compile(r'(\d+(?:[-–]\d+)?(?:\.\d+)?)\s+([a-zA-Z\s]+(?:setae|seta))|([a-zA-Z\s]+(?:setae|seta))\s+(\d+(?:[-–]\d+)?(?:\.\d+)?)', re.IGNORECASE)

# Dictionary mapping number words to numeric digits
number_words = {
    "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6", "seven": "7",
    "eight": "8", "nine": "9", "ten": "10", "eleven": "11", "twelve": "12", "thirteen": "13",
    "fourteen": "14", "fifteen": "15", "sixteen": "16", "seventeen": "17", "eighteen": "18", "nineteen": "19",
    "twenty": "20", "thirty": "30", "forty": "40", "fifty": "50", "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90"
}

# A number: digits (optionally a decimal and/or a "-"/"–" range) or one of the number words.
# The number words carry \b on both sides so they only match as whole words: without it "ten"
# is found inside "extending" and "seven" inside "seventeen".
_number_token = (r"(?:\d+(?:\.\d+)?(?:[-–]\d+(?:\.\d+)?)?)|(?:\b(?:" +
                 "|".join(number_words.keys()) + r")\b)")

# A trait phrase: letters/whitespace ending in one of the anchor words
_trait_phrase = r"[a-zA-Z\s]+(?:setae|seta|body|head|holotype)"

# Regex pattern to capture both traits and numbers (including flexibility for extra text)
#   number first -> group 1 = number, group 2 = trait phrase
#   trait first  -> group 3 = trait phrase, group 4 = number
# Any non-letter characters may sit between the two parts.
capturing_trait_num = re.compile(
    "(" + _number_token + r")\s*(?:[^a-zA-Z]*?)\s*(" + _trait_phrase + ")|(" +
    _trait_phrase + r")\s*(?:[^a-zA-Z]*?)\s*(" + _number_token + ")", re.IGNORECASE)

# Read the extracted description text row by row and write one output row per trait/number pair.
# Both files are opened as UTF-8, the encoding the extraction cell used when it wrote the input CSV.
with open(mf_descriptions, newline="", encoding="utf-8") as mf_data, \
        open(output_file_path, mode="w", newline="", encoding="utf-8") as output_file:
    reader = csv.reader(mf_data)
    writer = csv.writer(output_file)

    header = next(reader)  # Header row: used to locate the columns by name
    category_index = header.index("Category")
    species_index = header.index("Species")
    text_index = header.index("Text")
    pdf_index = header.index("PDF Name")
    sex_index = header.index("Sex")
    # a row needs at least this many cells for all the lookups below to work
    needed_columns = max(category_index, species_index, text_index, pdf_index, sex_index) + 1

    # Header for output file
    writer.writerow(["PDF_Name", "Species", "Sex", "Category", "Full_Data", "Section_Data", "Characteristic", "Num_Data", "Unit"])

    # Running through each row of input file
    for row in reader:
        if len(row) < needed_columns:
            continue  # blank or truncated row: nothing to parse

        # Extracting columns
        category = row[category_index].strip()
        pdf_name = row[pdf_index].strip()
        species = row[species_index].strip()
        sex = row[sex_index].strip()
        species_info = row[text_index].strip()  # Text column

        # Cleaning up text a bit: line breaks to spaces, commas after width/mean/range dropped,
        # and "- " removed (re-joins words hyphenated at a line break).
        # NOTE(review): "‚Äì" and "¬º" look like mis-decoded forms of "–" and "¼"; with the file read
        # as UTF-8 these two replacements probably never fire -- confirm whether the real characters
        # should be normalised here instead.
        species_info_ns = (species_info
                           .replace("\n", " ")
                           .replace("‚Äì", "-")
                           .replace("¬º", "=")
                           .replace("width,", "width")
                           .replace("mean,", "mean")
                           .replace("range,", "range")
                           .replace("- ", ""))

        # Split text into sentences
        species_info_list = sentence_endings.split(species_info_ns)

        # Iterate through each sentence in the split text
        for sentence in species_info_list:
            # Only sentences with a digit or a written number are of interest
            has_number = finding_nums.search(sentence) or any(word in sentence.lower() for word in number_words)
            if not has_number:
                continue
            # Skip fully uppercase sentences
            if sentence.isupper():
                continue

            # Find all traits with their corresponding numbers (digits or written numbers) in the sentence
            for match in capturing_trait_num.findall(sentence):
                # findall returns the four groups; which pair is filled tells the order
                if match[0]:  # If the number comes first (before the trait)
                    num_data = match[0].lower()
                    characteristic = match[1].strip()
                else:  # If the trait comes first (before the number)
                    characteristic = match[2].strip()
                    num_data = match[3].lower()

                # Convert word-based numbers to digits if necessary
                if num_data in number_words:
                    num_data = number_words[num_data]

                # Write the parsed information to the output file together with the sentence it came from
                writer.writerow([pdf_name, species, sex, category, species_info_ns, sentence, characteristic, num_data, "NA"])  # Unit would be 'NA' unless you want to adjust for units here


####Some text gets converted when writing out. It was cleaned above but will need to be fixed again, e.g. a dash (-) changes to ‚Äì

In [36]:
#### Getting all descriptive text into sentences. Could be used for something else; different code to parse??


##Parsing apart the text that was selected by the extraction cell and stored in extracted_traits_aug.csv


#Male and female description output file. Reading in to pull out character traits
mf_descriptions = "/Users/averyszewczak/Desktop/Anoplura_traitextraction/extracted_traits_aug.csv"


#Creating 'species_info' to be used in next cell to extract traits
#some lines are redundant with next cell
#breaking up sections into sentences

#Sentence splitter: a whitespace character with "." before it and an upper-case letter after it.
#Compiled once here instead of once per row.
sentence_endings = re.compile(r"(?<!\b\w\.\s)(?<!\.\d)(?<=\.)\s(?=[A-Z])")

#opened as UTF-8, the encoding the extraction cell used when it wrote this file
with open(mf_descriptions, newline="", encoding="utf-8") as mf_data:
    mfdata_reader = csv.reader(mf_data)
    next(mfdata_reader) #skips header row

    all_species_info = []

    for row in mfdata_reader:    #iterating through each row
        species_info = row[4]   #extracting text from column 5 (index 4) which contains the extracted text
        #same clean-up as in the parsing cell: line breaks to spaces, commas after width/mean/range dropped, "- " removed
        species_info_ns = species_info.replace("\n", " ").replace("‚Äì", "-").replace("¬º", "=").replace("width,", "width").replace("mean,", "mean").replace("range,", "range").replace("- ", "")

        # Split the string into sentences
        species_info_list = sentence_endings.split(species_info_ns)
        #print(species_info_list)


        #sentence -> its pieces; only read by the commented-out debug print below
        species_dict = {}
        for item in species_info_list:
            # Split the sentence at apostrophes
            split_bits = re.split(r"'", item)
            # Remove leading/trailing whitespace from each bit
            split_bits = [bit.strip() for bit in split_bits if bit.strip()]  # Exclude empty strings
            species_dict[item] = split_bits

        all_species_info.append(species_info_list) #saving info to use later
        # Print each key with its corresponding list of values
        #for key, value in species_dict.items():
         #   print(f"{key}: {value}")
print(all_species_info)

#every entry is the list of sentences of one CSV row: print the list, then each sentence on its own line
for all_lines in all_species_info:
    print(all_lines)
    sentences = all_lines
    for single_sentence in sentences:
        print(single_sentence)

['Durden, Kessler, and Greiman n. sp.', 'Male (Fig. 1A–C; n ¼ 1) Total body length of holotype, 1.675 mm.', 'Head, thorax, and abdomen moderately sclerotized.']
Durden, Kessler, and Greiman n. sp.
Male (Fig. 1A–C; n ¼ 1) Total body length of holotype, 1.675 mm.
Head, thorax, and abdomen moderately sclerotized.
['Head (Fig. 1A): Maximum width 0.255 mm.', 'Much longer than wide with pair of distinct but nonbulging eyes situated about halfway along lateral sides of head.', 'Head widest centrally, narrowest posteriorly, and broadly rounded anteriorly.', 'One long dorsal principal head seta situated posteriorly, 2 dorsal posterior head setae, 1 dorsal anterior head seta, 3 dorsal marginal head setae, 4–5 apical head setae, 1 ventral preantennal head seta, 1 supra-antennal head seta, and 2 ventral lateral head setae on each side.', 'Antennae 5-segmented; ﬁrst segment very large, slightly wider than long; second and third segments elongate, much longer than wide; fourth and ﬁfth segments smal