In [9]:
import docx, pandas as pd
 

def extract_data_from_docx(file_path):
    """Extract the text of a .docx file, paragraph by paragraph.

    Body paragraphs and table-cell paragraphs are collected in the order in
    which the elements are returned by the document-wide XPath search. When a
    table-cell paragraph holds a legacy drop-down form field (``w:ddList``),
    the selected entry is appended to that paragraph's text, separated by a
    single space.

    Args:
        file_path (str): Path of the .docx file, passed to ``docx.Document``.

    Returns:
        docx_para_list (list): Extracted paragraph strings. Body paragraphs
            without any ``w:t`` element are skipped; table-cell paragraphs
            are always included, so empty cells contribute ``''``.
        pd.DataFrame: Single-column frame ("extracted_data") holding the
            same strings.
    """
    doc = docx.Document(file_path)
    docx_para_list = []

    # NOTE(review): a table nested inside a cell is reached through the outer
    # table's descendant searches *and* again as its own ``tbl`` element, so
    # its text is emitted more than once. Left as-is to keep the traversal
    # identical for ordinary (non-nested) documents.
    for element in doc.element.xpath('.//*'):
        if element.tag.endswith('}p'):
            # Paragraphs that sit directly in a table cell are emitted by the
            # table branch below; skipping them here avoids duplicates.
            if element.getparent().tag.endswith('}tc'):
                continue
            text_elements = element.xpath('.//w:t')
            if text_elements:
                docx_para_list.append(_join_text(text_elements))
        elif element.tag.endswith('}tbl'):
            for row in element.xpath('.//w:tr'):
                for cell in row.xpath('.//w:tc'):
                    for paragraph in cell.xpath('.//w:p'):
                        docx_para_list.append(_cell_paragraph_text(paragraph))

    output_data = pd.DataFrame({"extracted_data": docx_para_list})
    return docx_para_list, output_data


def _join_text(text_elements):
    """Concatenate the text of ``w:t`` elements; an empty element counts as ''."""
    # ``.text`` is None for an element without content (e.g. ``<w:t/>``).
    return ''.join(text_element.text or '' for text_element in text_elements)


def _cell_paragraph_text(paragraph):
    """Return a table paragraph's text plus the value of each drop-down it holds."""
    text = _join_text(paragraph.xpath('.//w:t'))
    selections = []
    # Resolve every drop-down against its OWN entry list, inside the paragraph
    # that actually contains it.
    for dd_list in paragraph.xpath('.//w:ffData/w:ddList'):
        value = _selected_dropdown_value(dd_list)
        if value is not None:
            selections.append(value)
    if selections:
        return ' '.join([text, *selections])
    return text


def _selected_dropdown_value(dd_list):
    """Return the selected entry of one ``w:ddList`` element, or None if unresolvable."""
    # Children and the ``val`` attribute share the namespace of the list
    # element itself; derive it from the tag ("{namespace}ddList").
    namespace, brace, _ = dd_list.tag.rpartition('}')
    val_attr = f"{namespace}{brace}val"

    entries = []
    indexes = {}
    for child in dd_list:
        if not isinstance(child.tag, str):
            continue  # comments / processing instructions carry no tag name
        local_name = child.tag.rpartition('}')[2]
        if local_name == 'listEntry':
            entries.append(child.get(val_attr, ''))
        elif local_name in ('result', 'default'):
            indexes[local_name] = child.get(val_attr)

    # Assumes a missing ``w:result`` means the default entry (``w:default``,
    # else the first one) is the one shown -- TODO confirm against ECMA-376.
    raw_index = indexes.get('result', indexes.get('default', '0'))
    try:
        index = int(raw_index)
    except (TypeError, ValueError):
        return None
    if 0 <= index < len(entries):
        return entries[index]
    return None

# Location of the brief to parse; point this one name at another document to re-run.
PM_BRIEF_PATH = "/Users/senthil/Desktop/Senthil/myTesting/Project_Creation/PM Brief Form 4.docx"

# Only the paragraph list is used further down; the DataFrame half is discarded.
text_corpus, _ = extract_data_from_docx(file_path=PM_BRIEF_PATH)
text_corpus

['Project Management Brief – Academic Production',
 'Basic Details',
 'Title: Applied History and Contemporary Policymaking',
 'Subtitle: School of Statecraft',
 'Series: N/A',
 'Bloomsbury Production contact:  Faye Robinson',
 'Publication:   November',
 'Requested finals date:   29 June 2022',
 'Contact Details',
 'Name:',
 'Role:',
 'Email:',
 'Dr. Robert Crowcroft',
 'Editor',
 'R.G.Crowcroft@ed.ac.uk',
 '',
 '',
 '',
 '',
 '',
 '',
 '(User: add/remove rows as needed)',
 'Copy-edit queries sent to: Authors/Editors listed above',
 '',
 'Proofs to go to: Authors/Editors listed above and Bloomsbury Production contact',
 '',
 'Any known unavailability: N/A',
 '',
 'Confidential author/editor notes: N/A',
 'Text Specifications',
 'TPS:  234 x 156mm',
 'Other: N/A',
 'Text colour(s): ',
 'Further details (colours, etc.): N/A',
 '',
 'Text design template:    Monograph C',
 'If ‘Custom’, or any changes to text design specs: N/A',
 '',
 'Any further text design details/additional notes for

In [11]:
import re


def find_text_with_regexes(regex_dict, text, flags=re.MULTILINE):
    """Search a text corpus with labelled regexes and collect the captured values.

    Args:
        regex_dict (dict): Maps a label to a regex pattern. The value of
            interest must be captured by the pattern's second group.
        text (str | Iterable[str] | None): Either one string, or a sequence
            of lines that is joined with newlines before searching. ``None``
            items are treated as empty lines; ``None`` itself as empty text.
        flags (int, optional): Flags handed to ``re.search``. Defaults to
            ``re.MULTILINE`` so that a ``^`` anchor in a pattern matches at
            the start of every line of the joined text rather than only at
            its very beginning. Pass ``0`` to disable.

    Returns:
        dict: label -> stripped text of group 2, for every pattern whose
        first match captured a second group. Labels without such a match
        are omitted.
    """
    if text is None:
        corpus = ''
    elif isinstance(text, str):
        # Joining a plain string would put a newline between every character.
        corpus = text
    else:
        corpus = '\n'.join('' if line is None else str(line) for line in text)

    results = {}
    for label, regex in regex_dict.items():
        match = re.search(regex, corpus, flags)
        if match is None:
            continue
        value = match.group(2)
        # Group 2 is None when it sits in an optional branch that did not
        # take part in the match; there is nothing to report in that case.
        if value is None:
            continue
        results[label] = value.strip()
    return results


# Label -> pattern. In every pattern group 1 is the field label found in the
# document and group 2 is the value to extract.
#
# The separator class is written ``[:,\-–]`` (colon, comma, hyphen, en dash).
# With an unescaped hyphen, ``[:,-–]`` is a RANGE from "," to "–" that also
# matches digits and letters, which e.g. swallowed the first digit of
# "ISBN 9781350177024".
#
# Raw strings keep ``\s``, ``\(`` ... as regex escapes without relying on
# Python passing unknown string escapes through.
#
# NOTE(review): in "author", ``contact\sauthor`` and ``Contributor`` require a
# literal "(s)" suffix (no trailing ``?``), unlike ``Author``/``Editor`` --
# confirm whether that is intended.
regex_dict = {
    "book_title": r"(?i)(book\s+title|title)(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)",
    "subtitle": r"(?i)(book\ssub\s?title|sub\s?title)(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)",
    "author": r"(?i)(Name:,\s?Role:,\s?Email:,|Author(?:s?)(?:\(s\))?|contact\sauthor(?:s?)(?:\(s\))|Editor(?:s)?(?:\(s\))?|Contributor(?:s?)(?:\(s\))|Author\/Editor)(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)(?:\@.+)?",

    "isbn": r"(?i)(ISBN)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?([0-9-]+)()",
    "hardback_isbn": r"(?i)(ISBN\s\(hardback\)|ISBN\s\(hard\)|hardback)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?([0-9-]+)",
    "paperback_isbn": r"(?i)(ISBN\s\(paperback\)|ISBN\s\(paper\)|paperback)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?([0-9-]+)",
    "epub_isbn": r"(?i)(ePub)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?([0-9-]+)",
    "ebook_master": r"(?i)(Web\sPDF|ebook|PDF)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?([0-9-]+)",

    # This key used to appear twice in the literal; the later "Web PDF"
    # pattern silently replaced an earlier catch-all one. Only the effective
    # pattern is kept, at the key's original position in the dict.
    "ebook_master_opt": r"(?i)(\s)?(.+)\((Web\sPDF)\)",
    "isbn_opt": r"(?i)(\s)?(.+)\((ISBN(?:\s?))\)",
    "hardback_isbn_opt": r"(?i)(\s)?(.+)\((Hardback)\)",
    "paperback_isbn_opt": r"(?i)(\s)?(.+)\((paperback)\)",
    "epub_isbn_opt": r"(?i)(\s)?(.+)\((ePub)\)",

    "po_number": r"(?i)(MUP\skeycode|Purchase\sOrder\sNumber|P\.O\.\s\#)(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)",
    "tps": r"(?i)(Estimated\sprint\sextent|Estimated\spage\sextent|Est\.\sfinal\spp\scount)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)(?:pp\.)",
    "project_startDate": r"(?i)(Handover\sdate|start\sdate|^date)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)",
    "project_endDate": r"(?i)(Requested\sfinals\sdate|Target\ssend-to-print\sdate|Schedule:\sReturn\smanuscript\sby|Final\sPDF\s\+\sPOD\sfiles|Printer\sDate|Final\sPDF\sdue)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?(?:late\s|spring)?(.+)",
    # No (?i) here in the original either: this pattern is case-sensitive.
    "production_editor": r"(Bloomsbury\sProduction\scontact|Anthem\sEditor|Production\sEditor|From)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)(,\s?Rowman\s\&\sLittlefield)?",
    "project_manager": r"(?i)(To)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)(,\s?Deanta Global)",
    "text_design": r"(?i)(Text\sdesign\stemplate|Text\sdesign|^template|Design\stemplate|Layout\sstyle.+?X)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)",
    "trim_size": r"(?i)(TPS|Page\strim|^Format|Trim\ssize)\s?(?:[:,\-–])?(?:[:,\-–]?\s+)?(.+)",
}

# Apply every labelled pattern to the extracted paragraphs and show what matched.
results = find_text_with_regexes(regex_dict=regex_dict, text=text_corpus)
results


{'book_title': 'Applied History and Contemporary Policymaking',
 'subtitle': 'School of Statecraft',
 'author': 'R.G.Crowcroft@ed.ac.uk',
 'hardback_isbn': '9781350177024',
 'epub_isbn': '9781350177048',
 'ebook_master': '9781350177031',
 'project_endDate': '29 June 2022',
 'production_editor': 'Faye Robinson',
 'text_design': 'Monograph C',
 'trim_size': '234 x 156mm'}