In [1]:
import os
import re
import pandas as pd
from pathlib import Path

# Configuration comes from environment variables; each value is None when
# the corresponding variable is unset.
# TARGET_DIR: directory containing the Obsidian notes.
notes_dir = os.environ.get('TARGET_DIR')
# PROCESSED_NOTES_CSV / PROCESSED_NOTES_JSON: output paths for the results.
processed_notes_csv = os.environ.get('PROCESSED_NOTES_CSV')
processed_notes_json = os.environ.get('PROCESSED_NOTES_JSON')

# Function to clean markdown text
def clean_text(text):
    """Return *text* with markdown images, links and heading lines removed.

    Images (``![alt](url)``) and links (``[label](url)``) are dropped
    entirely, including their alt text / label. ATX heading lines (up to
    three leading spaces, one to six ``#`` characters, then whitespace or
    end of line) are removed as whole lines. Finally the text is stripped
    and every run of whitespace is collapsed to a single space.

    Args:
        text: Raw markdown content.

    Returns:
        The cleaned text on a single line.
    """
    # Remove images first: the link pattern would otherwise match the
    # ``[alt](url)`` part and leave a stray ``!`` behind.
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # Remove markdown headers. The pattern is anchored to the start of a
    # line so that a '#' inside ordinary text (tags, "C#", "issue #5") no
    # longer truncates the rest of that line.
    text = re.sub(r'^ {0,3}#{1,6}(?:[ \t].*)?$', '', text, flags=re.MULTILINE)
    # Remove extra spaces and newlines
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return text

# Validate the configuration before doing any work. os.getenv returns None
# for an unset variable: Path(None) then fails with an opaque TypeError, and
# DataFrame.to_csv(None) / to_json(None) return a string instead of writing
# a file, which would make the run look successful while saving nothing.
if not notes_dir:
    raise ValueError("TARGET_DIR is not set; cannot locate the notes directory.")
if not (processed_notes_csv or processed_notes_json):
    raise ValueError(
        "Neither PROCESSED_NOTES_CSV nor PROCESSED_NOTES_JSON is set; "
        "nothing would be written."
    )

# Extract content from markdown files
data = []
# glob order is not guaranteed, so sort the paths to keep the row order
# reproducible between runs.
for file in sorted(Path(notes_dir).glob('**/*.md')):
    # Skip anything that merely has a .md name but is not a regular file.
    if not file.is_file():
        continue
    content = file.read_text(encoding='utf-8')
    cleaned_content = clean_text(content)
    data.append({'filename': file.name, 'content': cleaned_content})

# Convert to DataFrame; explicit columns keep the header when no notes exist
df = pd.DataFrame(data, columns=['filename', 'content'])

# Save to CSV and/or JSON, whichever output path is configured
if processed_notes_csv:
    df.to_csv(processed_notes_csv, index=False)
if processed_notes_json:
    df.to_json(processed_notes_json, orient='records', lines=True)

print("Data preprocessing complete.")


Data preprocessing complete.
