In [None]:
import os
import pandas as pd
from pathlib import Path

# Combine the per-idiom example CSVs into one table with globally unique example ids.
#
# Every "<idiom_id>.csv" in INPUT_DIR (';'-separated) is read, its columns are
# renamed to the output schema, and the rows of all files are concatenated and
# written to OUTPUT_FILE. Input columns that a file does not have are filled
# with empty strings.
#
# Raises:
#     RuntimeError: if INPUT_DIR contains no CSV files.
#     ValueError: if a CSV file name is not an integer idiom id.

INPUT_DIR = "idiom_examples"  # Here should be the path to the folder with csv examples (named as idiom ids each)
OUTPUT_FILE = "examples_all.csv"
SOURCE_NAME = "RNC"  # stored in the "source" column of every output row

input_path = Path(INPUT_DIR)
print("Working directory:", os.getcwd())
print("Looking for CSVs in:", input_path.resolve())

files = sorted(input_path.glob("*.csv"))
print(f"Found {len(files)} CSV files:", [f.name for f in files][:10], "...")
# only prints first 10 names

if not files:
    raise RuntimeError(
        f"No CSV files found in {input_path.resolve()}.\n"
        f"Check that INPUT_DIR is correct and that files are named like 1.csv, 2.csv, ..."
    )

# Parse every idiom id up front so a badly named file is reported before any
# work is done, instead of failing half-way through the loop.
idiom_ids = {}
invalid_names = []
for file in files:
    try:
        idiom_ids[file] = int(file.stem)
    except ValueError:
        invalid_names.append(file.name)

if invalid_names:
    raise ValueError(
        f"CSV file names must be integer idiom ids (like 1.csv, 2.csv, ...); "
        f"cannot parse: {invalid_names}"
    )

# Path sorting is lexicographic ("10.csv" < "2.csv"); re-sort numerically so that
# example ids are assigned in idiom-id order. The sort is stable, so files with
# the same numeric id (e.g. "1.csv" and "01.csv") keep their name order.
files.sort(key=idiom_ids.__getitem__)

all_rows = []
current_example_id = 1  # running id; continues across files so ids stay unique

for file in files:
    idiom_id = idiom_ids[file]

    print(f"[INFO] Processing idiom_id={idiom_id} from file: {file.name}")

    # Read everything as text and turn missing cells into empty strings.
    df = pd.read_csv(file, sep=";", quotechar='"', dtype=str).fillna("")

    # df.get(column, "") falls back to an empty string when the file lacks the column.
    examples = pd.DataFrame({
        "example_id": range(current_example_id, current_example_id + len(df)),
        "idiom_id": idiom_id,
        "ru_example": df.get("Full context", ""),
        "en_example": df.get("Para context 1", ""),
        "author": df.get("Author", ""),
        "title": df.get("Title", ""),
        "birthday": df.get("Birthday", ""),
        "publication": df.get("Publication", ""),
        "header": df.get("Header", ""),
        "sphere": df.get("Sphere", ""),
        "topic": df.get("Topic", ""),
        "medium": df.get("Medium", ""),
        "publ_year": df.get("Publ_year", ""),
        "translator": df.get("Translator", ""),
        "lang_orig": df.get("Lang", ""),
        "lang_trans": df.get("Lang_trans", ""),
        "source": SOURCE_NAME,
        "example_source_id": df.get("Example source", ""),
    })

    all_rows.append(examples)
    current_example_id += len(df)

final_df = pd.concat(all_rows, ignore_index=True)
final_df.to_csv(OUTPUT_FILE, index=False)
print(f"Saved {len(final_df)} examples → {OUTPUT_FILE}")

## Merging existing files

In [None]:
# Attach the idiom metadata from the idiom table to every example row and write
# the combined table to MERGED_FILE.
#
# Raises:
#     pandas.errors.MergeError: if the idiom table contains duplicate ids.
#     KeyError: if an expected output column is missing after the merge.

EN_FILE = "EN.csv"  # idiom table, ';'-separated, with an "id" column
# Combined examples table. Relative to the working directory rather than a
# user-specific absolute path, so the notebook also runs on other machines.
EXAMPLES_FILE = "examples_all.csv"
MERGED_FILE = "examples_with_idioms.csv"

en = pd.read_csv(EN_FILE, sep=";", dtype=str)
ex = pd.read_csv(EXAMPLES_FILE, dtype=str)  # example table

# Everything was read as text; the join keys must share one dtype to match.
en["id"] = en["id"].astype(int)
ex["idiom_id"] = ex["idiom_id"].astype(int)

# many_to_one: many examples may point at one idiom, but an idiom id must be
# unique in the idiom table - otherwise the join would silently duplicate
# example rows. The indicator column records which examples found no idiom.
merged = ex.merge(
    en,
    left_on="idiom_id",
    right_on="id",
    how="left",
    validate="many_to_one",
    indicator=True,
)

unmatched_ids = sorted(
    merged.loc[merged["_merge"] == "left_only", "idiom_id"].unique().tolist()
)
if unmatched_ids:
    print(
        f"[WARN] {len(unmatched_ids)} idiom_id value(s) have no entry in {EN_FILE}; "
        f"their idiom columns will be empty: {unmatched_ids}"
    )

merged = merged.drop(columns=["id", "_merge"])

# Final column order of the output file.
cols = [
    "example_id", "idiom_id",
    "head_ru", "transliteration", "literal", "meaning", "equivalents", "transparency",
    "ru_example", "en_example",
    "author", "title", "birthday", "publication", "header",
    "sphere", "topic", "medium", "publ_year",
    "translator", "lang_orig", "lang_trans",
    "source", "example_source_id"
]

missing_cols = [col for col in cols if col not in merged.columns]
if missing_cols:
    raise KeyError(
        f"Columns missing after merging {EXAMPLES_FILE} with {EN_FILE}: {missing_cols}"
    )

merged = merged[cols]

merged.to_csv(MERGED_FILE, index=False)

print("Rows in merged file:", len(merged))