In [None]:
import os
import logging
import csv
import xml.etree.ElementTree as ET
from docx_extraction import extract_docx_properties
from docx import Document

def process_folder(folder_path, output_folder):
    """Convert every ``.docx`` file under *folder_path* to a properties CSV.

    Walks *folder_path* recursively and, for each file whose name ends in
    ``.docx``, calls ``extract_and_save_properties`` with a CSV path of
    ``<output_folder>/<file stem>.csv``.

    Args:
        folder_path: Root directory searched recursively for ``.docx`` files.
        output_folder: Directory that receives the CSV files. It is created,
            including missing parents, when it does not exist yet.

    Note:
        The CSV name is derived from the file name alone, so two documents
        with the same name in different sub-folders map to the same CSV path
        and the later one overwrites the earlier one.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith('.docx'):
                continue
            # "~$name.docx" is the owner/lock file Word leaves beside an open
            # document. It is not a real .docx package, so trying to open it
            # would abort the whole batch.
            if file.startswith('~$'):
                continue
            input_file_path = os.path.join(root, file)
            output_csv_path = os.path.join(output_folder, os.path.splitext(file)[0] + '.csv')
            extract_and_save_properties(input_file_path, output_csv_path)

def extract_and_save_properties(input_file_path, output_csv_path):
    """Clean one ``.docx`` file in place and export its properties to CSV.

    The document is opened, stripped of empty paragraphs via
    ``remove_empty_paragraphs`` and saved back over *input_file_path*, so the
    source file is modified. The cleaned file is then passed to
    ``extract_docx_properties``; a truthy result is handed to ``xml_to_csv``,
    otherwise a warning is logged and no CSV is written.

    Args:
        input_file_path: Path of the ``.docx`` file to clean and read.
        output_csv_path: Path of the CSV file to create.
    """
    logging.info(f"Started to extract the docx properties from {input_file_path}")

    # Clean first: extraction runs against the saved, cleaned file.
    document = Document(input_file_path)
    remove_empty_paragraphs(document)
    document.save(input_file_path)  # overwrites the original document

    properties_xml = extract_docx_properties(input_file_path)
    if not properties_xml:
        logging.warning(f"No properties extracted from {input_file_path}")
        return

    logging.info(f"Converting properties XML to CSV: {output_csv_path}")
    xml_to_csv(properties_xml, output_csv_path)

def xml_to_csv(para_properties_xml, output_csv_path):
    """Write the paragraph properties contained in an XML string to a CSV file.

    Every ``ParagraphProperties`` element that is a direct child of the XML
    root becomes one CSV row. Each child element of it contributes a column
    named after the child's tag and holding the child's text. If a tag occurs
    more than once in one paragraph, the last occurrence wins.

    Args:
        para_properties_xml: XML document as a string.
        output_csv_path: Destination file, overwritten if present. Written
            with the ``utf-8-sig`` encoding (UTF-8 with a BOM).

    Raises:
        xml.etree.ElementTree.ParseError: If the XML cannot be parsed.
        OSError: If the output file cannot be opened for writing.
    """
    root = ET.fromstring(para_properties_xml)
    properties_list = [
        {prop.tag: prop.text for prop in para}
        for para in root.findall('ParagraphProperties')
    ]

    # Use the union of all tags (first-seen order) as the header. Taking only
    # the first row's keys makes DictWriter raise ValueError as soon as a
    # later paragraph carries a tag the first one lacks.
    fieldnames = list(dict.fromkeys(tag for row in properties_list for tag in row))

    with open(output_csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Rows missing a column get DictWriter's default empty-string filler.
        writer.writerows(properties_list)

def remove_empty_paragraphs(doc):
    """Remove paragraphs with no visible text from *doc*, in place.

    A paragraph is dropped when its ``text`` is empty or whitespace-only; its
    underlying element is removed from the document body element.

    Args:
        doc: Document object exposing ``paragraphs`` and ``_element.body``.
    """
    # NOTE(review): a paragraph holding only non-text content (e.g. a picture)
    # presumably also reports empty text and would be removed -- confirm.
    # Collect the blank paragraphs' elements first, then detach them.
    for element in [p._element for p in doc.paragraphs if not p.text.strip()]:
        doc._element.body.remove(element)

if __name__ == "__main__":
    # Hard-coded, machine-specific paths; edit them before running elsewhere.
    # The output folder is a sub-folder of the input folder, so the recursive
    # walk in process_folder also visits it (it only holds the generated CSVs).
    folder_path = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/dataset prep/updated_docx'
    output_folder = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/dataset prep/updated_docx/dataset'
    process_folder(folder_path, output_folder)


In [None]:
import os
import logging
import csv
import xml.etree.ElementTree as ET
from docx_extraction import extract_docx_properties
from docx import Document

def process_folder(folder_path, output_folder):
    """Convert every ``.docx`` file under *folder_path* to a properties CSV.

    Walks *folder_path* recursively and, for each file whose name ends in
    ``.docx``, calls ``extract_and_save_properties`` with a CSV path of
    ``<output_folder>/<file stem>.csv``. A failure on one file is logged as a
    warning and does not stop the remaining files from being processed.

    Args:
        folder_path: Root directory searched recursively for ``.docx`` files.
        output_folder: Directory that receives the CSV files. It is created,
            including missing parents, when it does not exist yet.

    Note:
        The CSV name is derived from the file name alone, so two documents
        with the same name in different sub-folders map to the same CSV path
        and the later one overwrites the earlier one.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith('.docx'):
                continue
            # "~$name.docx" is the owner/lock file Word leaves beside an open
            # document. It is not a real .docx package, so skip it instead of
            # logging a spurious failure for it.
            if file.startswith('~$'):
                continue
            input_file_path = os.path.join(root, file)
            output_csv_path = os.path.join(output_folder, os.path.splitext(file)[0] + '.csv')
            try:
                extract_and_save_properties(input_file_path, output_csv_path)
            except Exception as e:
                # Batch boundary: keep going, but record which file failed.
                logging.warning(f"Error processing file {input_file_path}: {str(e)}")

def extract_and_save_properties(input_file_path, output_csv_path):
    """Clean one ``.docx`` file in place and export its properties to CSV.

    The document is opened, stripped of empty paragraphs via
    ``remove_empty_paragraphs`` and saved back over *input_file_path*, so the
    source file is modified. Errors in that cleaning step propagate to the
    caller. Extraction and CSV conversion are best-effort: any exception they
    raise is logged as a warning and swallowed.

    Args:
        input_file_path: Path of the ``.docx`` file to clean and read.
        output_csv_path: Path of the CSV file to create.
    """
    logging.info(f"Started to extract the docx properties from {input_file_path}")

    # Clean first: extraction runs against the saved, cleaned file.
    document = Document(input_file_path)
    remove_empty_paragraphs(document)
    document.save(input_file_path)  # overwrites the original document

    try:
        properties_xml = extract_docx_properties(input_file_path)
        if not properties_xml:
            logging.warning(f"No properties extracted from {input_file_path}")
            return
        logging.info(f"Converting properties XML to CSV: {output_csv_path}")
        xml_to_csv(properties_xml, output_csv_path)
    except Exception as exc:
        logging.warning(f"Error extracting properties from {input_file_path}: {exc!s}")

def xml_to_csv(para_properties_xml, output_csv_path):
    """Write the paragraph properties contained in an XML string to a CSV file.

    Every ``ParagraphProperties`` element that is a direct child of the XML
    root becomes one CSV row. Each child element of it contributes a column
    named after the child's tag and holding the child's text. If a tag occurs
    more than once in one paragraph, the last occurrence wins.

    The conversion is best-effort: any exception (malformed XML, unwritable
    path, ...) is logged as a warning and not re-raised.

    Args:
        para_properties_xml: XML document as a string.
        output_csv_path: Destination file, overwritten if present. Written
            with the ``utf-8-sig`` encoding (UTF-8 with a BOM).
    """
    try:
        root = ET.fromstring(para_properties_xml)
        properties_list = [
            {prop.tag: prop.text for prop in para}
            for para in root.findall('ParagraphProperties')
        ]

        # Use the union of all tags (first-seen order) as the header. Taking
        # only the first row's keys makes DictWriter raise ValueError on a
        # later paragraph with an extra tag; that error was swallowed below
        # and left a silently truncated CSV.
        fieldnames = list(dict.fromkeys(tag for row in properties_list for tag in row))

        with open(output_csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            # Rows missing a column get DictWriter's default empty-string filler.
            writer.writerows(properties_list)
    except Exception as e:
        logging.warning(f"Error converting properties XML to CSV: {str(e)}")

def remove_empty_paragraphs(doc):
    """Strip paragraphs without visible text from *doc*, modifying it in place.

    Paragraphs whose ``text`` is empty or consists only of whitespace have
    their underlying element removed from the document body element.

    Args:
        doc: Document object exposing ``paragraphs`` and ``_element.body``.
    """
    # NOTE(review): a paragraph holding only non-text content (e.g. a picture)
    # presumably also reports empty text and would be removed -- confirm.
    blank_elements = [
        paragraph._element
        for paragraph in doc.paragraphs
        if not paragraph.text.strip()
    ]
    for blank in blank_elements:
        doc._element.body.remove(blank)

if __name__ == "__main__":
    # Hard-coded, machine-specific paths; edit them before running elsewhere.
    # The output folder is a sub-folder of the input folder, so the recursive
    # walk in process_folder also visits it (it only holds the generated CSVs).
    folder_path = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/dataset prep/updated_docx'
    output_folder = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/dataset prep/updated_docx/dataset'
    process_folder(folder_path, output_folder)
