In [2]:
import os
import re
import pandas as pd

# Function to extract bold text and associated content
def extract_content(text):
    """Split article text into ``[bold heading, following text]`` pairs.

    Lines containing any of a fixed set of boilerplate phrases are dropped
    first. Every ``**bold**`` span in the remaining text is then treated as
    a heading whose content runs up to the next bold span (or the end of
    the text). Parsing stops at the first heading containing
    "Further Reading".

    Args:
        text: Raw article text using ``**...**`` for bold markup.

    Returns:
        A list of two-element lists ``[heading, content]``. The heading keeps
        its surrounding asterisks. The list is empty when no bold span is
        found or when the first bold span is the "Further Reading" marker.
    """
    skip_markers = (
        "Bike Tested", "Price OTR",
        "Alternatives", "Road Test No", "Test Location",
        "Riders", "Picture Editing",
    )

    # Drop whole lines that mention any boilerplate phrase.
    kept_lines = []
    for raw_line in text.splitlines():
        if any(marker in raw_line for marker in skip_markers):
            continue
        kept_lines.append(raw_line)
    text = '\n'.join(kept_lines)

    # Locate every bold span; each one opens a new section.
    headings = list(re.finditer(r"(\*\*(.+?)\*\*)", text))
    total = len(headings)

    # Scraper noise that may appear at the start of a content line.
    noise_prefixes = ("Processing additional page:", "Home » Bike News")

    rows = []
    for idx, heading in enumerate(headings):
        label = heading.group(0)

        # Everything from "Further Reading" onward is discarded.
        if "Further Reading" in label:
            break

        # A section ends where the next heading begins, or at end of text.
        if idx + 1 < total:
            stop = headings[idx + 1].start()
        else:
            stop = len(text)
        section = text[heading.end():stop].strip()

        section = '\n'.join(
            piece for piece in section.splitlines()
            if not piece.startswith(noise_prefixes)
        )

        rows.append([label, section])

    return rows

# Process all .txt files in the current directory
def process_txt_files():
    """Convert every ``.txt`` file in the current working directory to a CSV.

    Each file is read as UTF-8 and passed to ``extract_content``. The
    returned pairs become a two-column DataFrame ("Bold Text", "Content");
    when nothing is extracted, the whole raw file text is stored as a single
    "Full Article" row. Each DataFrame is printed and then written to
    ``output_files/<name>_extracted.csv`` under the current working
    directory.

    Returns:
        None. Prints a message and returns early when no ``.txt`` files
        are found.
    """
    current_folder = os.getcwd()
    # Keep regular files only: a directory whose name ends in ".txt" would
    # otherwise make open() fail below.
    txt_files = [
        name for name in os.listdir(current_folder)
        if name.endswith('.txt')
        and os.path.isfile(os.path.join(current_folder, name))
    ]

    if not txt_files:
        print("No .txt files found in the current folder.")
        return

    output_folder = os.path.join(current_folder, 'output_files')
    # exist_ok avoids the check-then-create race and is a no-op on re-runs.
    os.makedirs(output_folder, exist_ok=True)

    for txt_file in txt_files:
        print(f"\nProcessing file: {txt_file}")
        # Open via the explicit folder path so input and output paths are
        # built the same way.
        input_path = os.path.join(current_folder, txt_file)
        with open(input_path, 'r', encoding='utf-8') as file:
            content = file.read()

        extracted_data = extract_content(content)

        # Create a DataFrame from the extracted data
        if extracted_data:
            df = pd.DataFrame(extracted_data, columns=["Bold Text", "Content"])
        else:
            # Treat the entire (unfiltered) article as a single entry if
            # no bold text was extracted.
            df = pd.DataFrame([["Full Article", content]], columns=["Bold Text", "Content"])

        print(df)

        # Save the DataFrame to a CSV file for each input file
        output_file = os.path.join(output_folder, f"{os.path.splitext(txt_file)[0]}_extracted.csv")
        df.to_csv(output_file, index=False)

# Run the batch conversion only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    process_txt_files()



Processing file: bajaj-pulsar-rs-200-track-test-review.txt
                        Bold Text  \
0  **Bajaj Pulsar RS 200 Review**   

                                             Content  
0  The Pulsar RS 200 offers commendable handling ...  

Processing file: bajaj-pulsar-as-150-test-ride-review.txt
                                 Bold Text  \
0           **Bajaj Pulsar AS 150 Review**   
1                              **Styling**   
2    **Instrument Cluster and Switchgear**   
3                           **Ergonomics**   
4                          **Performance**   
5                      **Riding Dynamics**   
6                              **Verdict**   
7                          **What’s Cool**   
8                   **What’s Not So Cool**   
9   **Bajaj Pulsar AS 150 Specifications**   
10      **Bajaj Pulsar AS 150 Dimensions**   

                                              Content  
0   The Pulsar AS 150 brings top notch hardware an...  
1   – Bajaj Auto have come a lo