In [2]:
import pandas as pd
from pathlib import Path

def collect_poems_from_directory(root_dir, label_type):
    """Collect poems stored as ``<root_dir>/<label>/<title>.txt`` files.

    Every immediate subdirectory of ``root_dir`` is treated as one label and
    every ``*.txt`` file directly inside it as one poem. Non-directory
    entries of ``root_dir`` are ignored.

    Args:
        root_dir: Directory whose subdirectories are named after labels.
        label_type: ``"topic"`` or ``"form"``; selects which record field
            receives the subdirectory name. The other field (or both, for
            any other value) is set to ``None``.

    Returns:
        A list of dicts with the keys ``title``, ``poem``, ``topic``,
        ``form``, ``author`` and ``emotion``, ordered by label directory and
        then by file path.

    Raises:
        OSError: If ``root_dir`` itself cannot be listed (for example, it
            does not exist).
    """
    poems = []
    # iterdir()/glob() yield entries in filesystem-dependent order; sorting
    # keeps the resulting dataset reproducible between runs and machines.
    for category_dir in sorted(Path(root_dir).iterdir()):
        if not category_dir.is_dir():
            continue
        label = category_dir.name
        for file in sorted(category_dir.glob("*.txt")):
            # Only the read can legitimately fail per file; keep the try
            # narrow so genuine programming errors are not swallowed.
            try:
                content = file.read_text(encoding="utf-8").strip()
            except (OSError, UnicodeDecodeError):
                continue  # skip unreadable or non-UTF-8 files
            if len(content.split()) < 10:
                continue  # skip extremely short pieces
            poems.append({
                "title": file.stem[:100],  # truncate long titles
                "poem": content,
                "topic": label if label_type == "topic" else None,
                "form": label if label_type == "form" else None,
                # Placeholders: no author/emotion information is available
                # from the directory layout.
                "author": "Unknown",
                "emotion": "unknown",
            })
    return poems

# Collect from both topic and form folders. Each call walks one directory
# tree whose subfolder names become the "topic" or "form" label respectively.
topic_poems = collect_poems_from_directory("../../data/topics", "topic")
form_poems = collect_poems_from_directory("../../data/forms", "form")

# Combine and normalize into a DataFrame (one row per collected poem).
all_poems_df = pd.DataFrame(topic_poems + form_poems)

# Build the path with pathlib rather than a literal containing "\d": that is
# an invalid escape sequence, and on non-Windows systems the backslash would
# become part of the directory name instead of acting as a separator.
output_path = Path("data_cleaned") / "full_labeled_poetry_dataset.csv"
# to_csv does not create missing parent directories on its own.
output_path.parent.mkdir(parents=True, exist_ok=True)
all_poems_df.to_csv(output_path, index=False)
print(f"Saved cleaned dataset to {output_path}")

Saved cleaned dataset to .\data_cleaned/full_labeled_poetry_dataset.csv
