In [31]:
import PyPDF2
import pdfplumber
import pandas as pd
import re
import os

def parse_summary(pdf_path):
    """Collect the findings rows of a report into a single text blob.

    The table of contents on the third page (index 2) is searched for the
    page number of the findings section ("Réserves et propositions" or
    "Liste des observations relevées / commentaires") and for the page number
    of the following section ("Tableau de conformité" or "Annexe"). Every
    table found on the pages in between is read; the first two rows of each
    table are treated as headers and skipped.

    Args:
        pdf_path: Path of the PDF report, passed to ``pdfplumber.open``.

    Returns:
        A string that starts with a single space, followed by one
        ``"<serial>. <content>;\\n"`` entry per table row. ``content`` is the
        second cell, cut at the first "Préconisation"/"Proposition(s)" keyword.

    Raises:
        ValueError: If the findings section is not listed in the table of
            contents, so no page range can be determined.
        IndexError: If the document has fewer than three pages.
    """
    split_pattern = r'(Préconisation|Propositions|Proposition)[.,;:\s]*'

    with pdfplumber.open(pdf_path) as pdf:
        summary_page = pdf.pages[2]  # Table of contents: third page (index 2)
        # extract_text() may yield nothing for an image-only page.
        text = summary_page.extract_text() or ""

        # Search for the start of the section
        start_of_section_match = re.search(r"\s*(Réserves\s*et\s*propositions\s*:?|Liste des observations relevées / commentaires)\s*\.*\s*(\d+)", text, re.I)
        if start_of_section_match is None:
            # Without a start page there is nothing to iterate over; fail with
            # an explicit message instead of a TypeError from range(None, ...).
            raise ValueError(
                f"Findings section not found in the table of contents of {pdf_path}"
            )
        # Printed page numbers are 1-based, pdf.pages is 0-based.
        start_of_section = int(start_of_section_match.group(2).strip()) - 1

        # Search for the end of the section
        end_of_section_match = re.search(r"\s*(Tableau\s*de\s*conformité|Annexe)\s*\.*\s*(\d+)", text, re.I)
        if end_of_section_match:
            end_of_section = int(end_of_section_match.group(2).strip()) - 1
        else:
            # NOTE(review): this fallback stops before the last page of the
            # document; confirm that skipping the last page is intended.
            end_of_section = len(pdf.pages) - 1
        # A page number larger than the document must not index past the end.
        end_of_section = min(end_of_section, len(pdf.pages))

        entries = []
        for page_index in range(start_of_section, end_of_section):
            table = pdf.pages[page_index].extract_table()
            if not table:
                continue

            for row in table[2:]:  # Assuming the first two rows are headers
                if len(row) < 2:
                    continue
                serial_number = row[0]
                # Empty cells come back as None; treat them as empty text.
                cell_text = row[1] or ""
                # Split at 'Préconisation' or the other keywords at most once
                # and keep only the part before the keyword.
                content = re.split(split_pattern, cell_text, maxsplit=1, flags=re.IGNORECASE)[0]
                entries.append(f"{serial_number}. {content};\n")

    return " " + "".join(entries)
      

def _search_group(pattern, text, group=1, flags=0):
    """Return the stripped ``group`` of the first match of ``pattern`` in ``text``, or None."""
    match = re.search(pattern, text, flags)
    return match.group(group).strip() if match else None


def extract_data_from_pdf(pdf_path):
    """Extract the header fields and the findings text of one report PDF.

    The cover page (index 0) and the fourth page (index 3) are read with
    PyPDF2 and searched with regular expressions; the findings text comes from
    ``parse_summary``. A field whose label is not found is reported as None.

    Args:
        pdf_path: Path of the PDF report.

    Returns:
        dict with the keys FileName, RegistrationNumber, VerifiedInstallation,
        PlaceVerification, NatureAudit, DateInspection, Intervenant,
        DateReport, PhaseReport, Producer, UserOfMachine,
        DateInMarketNewCondtion, DateInMarketEstablishment and TextFindings.
        PlaceVerification is always None: no extraction is implemented for it.

    Raises:
        IndexError: If the document has fewer than four pages.
        Exception: Anything raised by ``open``, PyPDF2 or ``parse_summary``.
    """
    # Last path component, accepting both '/' and '\\' as separators.
    file_name_match = re.search(r"[^/\\]+$", pdf_path)
    file_name = file_name_match.group() if file_name_match else None

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text0 = reader.pages[0].extract_text() or ""
        text3 = reader.pages[3].extract_text() or ""

    # Join a line break that is followed by an ASCII letter onto the previous
    # line; other line breaks (e.g. the first of a blank-line pair) are kept.
    text0 = re.sub(r'\n(?=[a-zA-Z])', ' ', text0)

    # --- cover page -------------------------------------------------------
    registration_number = _search_group(
        r"E\d+\s*[A-Z]?\d*\s*(Ind\.[01])?", text0, group=0, flags=re.I
    )
    verified_installation = _search_group(
        r"Installation\s+vérifiée\s+([^\n]+)", text0, flags=re.I
    )
    place_of_verification = None  # not extracted for this template
    nature_of_audit = _search_group(
        r"\s*Nature\s+de\s+la\s+vérification\s+([^\n]*)", text0, flags=re.I
    )
    dates = _search_group(r"\s*Dates\s+de\s+vérification\s+([^\n]*)", text0, flags=re.I)

    intervenants = None
    intervenants_full = _search_group(r"\s*Intervenant\(s\)([^\n]*)", text0)
    if intervenants_full is not None:
        # Keep only the last two words of the line.
        intervenants = ' '.join(intervenants_full.split()[-2:])

    # 'date of report' (published on)
    published_on = _search_group(r"\s*été\s+édité\s*([^\n]*)", text0)

    # --- fourth page ------------------------------------------------------
    producer = _search_group(r"Marque([^\n]*)", text3, flags=re.I)
    user_of_machine = _search_group(r"Établissement([^\n]*)", text3)
    date_of_placing_condition = _search_group(
        r"\s*à\s+l’état\s+neuf\s*([^\n]*)", text3
    )
    # Read from its own match; previously this reused the "état neuf" match,
    # which returned the wrong date or crashed when that match was missing.
    date_of_placing_establishment = _search_group(
        r"\s*dans\s+l’établissement\s*([^\n]*)", text3
    )
    # Second token of the line that follows the "Cumul..." line.
    phase_report = _search_group(r"Cumul.*\n(\S+)\s+(\S+)", text3, group=2)

    text_findings = parse_summary(pdf_path)

    data = {
        "FileName": file_name,
        'RegistrationNumber': registration_number,
        'VerifiedInstallation': verified_installation,
        'PlaceVerification': place_of_verification,
        'NatureAudit': nature_of_audit,
        'DateInspection': dates,
        'Intervenant': intervenants,
        'DateReport': published_on,
        "PhaseReport": phase_report,
        'Producer': producer,
        "UserOfMachine": user_of_machine,
        "DateInMarketNewCondtion": date_of_placing_condition,
        "DateInMarketEstablishment": date_of_placing_establishment,
        "TextFindings": text_findings,
    }
    return data

        


def process_directory(directory_path):
    """Run ``extract_data_from_pdf`` on every ``.pdf`` file of a directory.

    A file that raises during extraction is reported on stdout and recorded
    by name instead of aborting the whole run.

    Args:
        directory_path: Directory whose direct entries are scanned.

    Returns:
        Tuple ``(frame, failures)``: a DataFrame with one row per successfully
        processed file, and the list of file names that raised.
    """
    records = []   # one dict per successfully parsed report
    failures = []  # names of the files that raised

    pdf_names = [name for name in os.listdir(directory_path) if name.endswith('.pdf')]
    for filename in pdf_names:
        try:
            records.append(extract_data_from_pdf(os.path.join(directory_path, filename)))
        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            failures.append(filename)

    return pd.DataFrame(records), failures




In [32]:
# Specify the directory path containing the PDF files
directory_path = '/Users/rongwang/Documents/Dekra/reports-anuary2024/reports France/template1'

df_France, failed_files = process_directory(directory_path)

print(df_France.columns)

# to_xml/to_csv do not create missing folders, so make sure the export
# directory exists before writing.
output_dir = "./reports France"
os.makedirs(output_dir, exist_ok=True)

df_France.to_xml(os.path.join(output_dir, "normal_files.xml"), index=False)
# utf_8_sig writes a BOM at the start of the CSV file.
df_France.to_csv(os.path.join(output_dir, "normal_files.csv"), index=False, encoding='utf_8_sig')

Index(['FileName', 'RegistrationNumber', 'VerifiedInstallation',
       'PlaceVerification', 'NatureAudit', 'DateInspection', 'Intervenant',
       'DateReport', 'PhaseReport', 'Producer', 'UserOfMachine',
       'DateInMarketNewCondtion', 'DateInMarketEstablishment', 'TextFindings'],
      dtype='object')


In [4]:
# Row count of the extracted reports frame
num_rows = len(df_France)
print("Number of rows:", num_rows)

Number of rows: 29


In [None]:
import pandas as pd
import docx

# Load the document
word_file_path = '/Users/rongwang/Documents/French/test.docx'
# `doc` is read by the extraction call further down in this cell; it was never
# assigned, so the cell failed with a NameError on a fresh kernel.
doc = docx.Document(word_file_path)

# Reads the first table of the document; change the table index below if the
# data of interest lives in another table.
def extract_data_from_table(doc):
    """Turn the first table of ``doc`` into a list of row dictionaries.

    The cells of the first table row provide the dictionary keys; every
    following row becomes one dict that maps each key to the stripped text of
    the cell in the same column position.

    Args:
        doc: Object exposing ``tables``; each table exposes ``rows``, each
            row ``cells`` and each cell ``text``.

    Returns:
        List of dicts, one per data row (empty if the table has only a header).
    """
    first_table = doc.tables[0]

    # Header texts, in column order
    column_names = [header_cell.text.strip() for header_cell in first_table.rows[0].cells]

    records = []
    for body_row in first_table.rows[1:]:
        record = {}
        for position, body_cell in enumerate(body_row.cells):
            record[column_names[position]] = body_cell.text.strip()
        records.append(record)

    return records

# Extract the rows of the first table as a list of dicts keyed by column header.
# `doc` is expected to hold the loaded Word document.
table_data = extract_data_from_table(doc)

# Convert the list of dictionaries to a DataFrame (one row per table row)
df = pd.DataFrame(table_data)

# Print the DataFrame as plain text
print(df)


In [None]:

# Single-file check: run the extraction on one report and show the result.
pdf_path = 'E19546002301RK1.pdf'
result = extract_data_from_pdf(pdf_path)
print(result)

# Dump the raw text of the page at index 2 (the third page), where the table
# of contents is assumed to be, to inspect what the regexes have to match.
with pdfplumber.open(pdf_path) as pdf:
    summary_page = pdf.pages[2]
    text = summary_page.extract_text()
print(text)

In [None]:
import re

def clean_text(text):
    """Join wrapped lines: a newline followed by a lowercase ASCII letter or a
    hyphen becomes a single space; every other newline is left untouched.

    Args:
        text: The extracted text to clean.

    Returns:
        The text with the matching line breaks replaced by spaces.
    """
    return re.sub(r'\n(?=[a-z\-])', ' ', text)

# Example: a label and its value that text extraction wrapped over several
# lines. The trailing spaces inside the literal are part of the sample.
extracted_text = """
Nature of the 
audit Documentary assessment of work 
equipment 
"""

# Line breaks followed by a lowercase letter are replaced by a space; the
# break before the capitalised "Nature" and the final one are kept.
cleaned_text = clean_text(extracted_text)
print(cleaned_text)
