In [1]:
import os
import pandas as pd
import re

# Directory that is scanned for the ".txt" corpus files parsed further down.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
folder_path = "C:\\Users\\Dell\\Desktop\\merlin\\german"
# Accumulates one {"text": ..., "CEFR": ...} record per parsed file; it is
# converted into a DataFrame once the whole folder has been read.
data = []

def clean_text(text):
    """Normalise one learner text into a single cleaned line.

    Processing order:
      1. line breaks are flattened to spaces,
      2. stand-alone five-digit numbers (postal codes) and dotted/dashed
         digit groups (dates, phone-like numbers) are removed,
      3. the salutation "Sehr geehrte Damen und Herren" is removed,
      4. the closing formula "Mit freundlichen Grüßen" and everything after
         it (the signature) is removed,
      5. whitespace runs are collapsed and the result is stripped.

    Args:
        text: Raw learner text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``""`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ""

    # Flatten line breaks first: the closing-formula pattern below relies on
    # '.*' reaching the end of the text, which only works on a single line.
    text = re.sub(r'\n+', ' ', text)

    # Postal codes: exactly five digits. The look-arounds keep the pattern
    # from chewing five digits out of a longer number ("1234567" -> "67").
    text = re.sub(r'(?<!\d)\d{5}(?!\d)', '', text)
    # Dotted / dashed digit groups such as dates or phone-like numbers.
    text = re.sub(r'\d{1,3}[.-]\d{1,3}[.-]?\d*', '', text)

    # Salutation.
    text = re.sub(r'Sehr geehrte Damen und Herren', '', text, flags=re.IGNORECASE)

    # Closing formula plus trailing signature. Accepted spellings of "Grüßen":
    #   vowel: ü (\u00fc), "ue", "u", or ü mis-decoded as two latin-1 chars
    #   sharp s: ß (\u00df), "ss", or ß mis-decoded as two latin-1 chars
    # Escapes are used instead of literal umlauts so the pattern survives a
    # re-encoding of this source file.
    text = re.sub(
        r'Mit freundlichen Gr(?:\u00fc|\u00c3\u00bc|ue|u)(?:\u00df|\u00c3\u009f|ss)en.*',
        '',
        text,
        flags=re.IGNORECASE,
    )

    # Collapse the gaps left behind by the removals above.
    return re.sub(r'\s+', ' ', text).strip()

# Parse every ".txt" file in folder_path into one {"text", "CEFR"} record.
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# make the row order of the exported CSV reproducible across runs and machines.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, file)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this fallback always decodes.
        with open(file_path, "r", encoding="latin-1") as f:
            content = f.read()

    # Extract CEFR overall rating: the value after the last ':' on the first
    # line that carries the marker; None when no such line exists.
    cefr = next(
        (
            line.split(":")[-1].strip()
            for line in content.splitlines()
            if "Overall CEFR rating:" in line
        ),
        None,
    )

    # Extract learner text: everything after the last "Learner text:" marker,
    # or an empty string when the marker is absent.
    _, marker, tail = content.rpartition("Learner text:")
    text = tail.strip() if marker else ""

    # Clean text
    text = clean_text(text)

    data.append({
        "text": text,
        "CEFR": cefr
    })

# Convert to CSV ("utf-8-sig" writes a BOM at the start of the file)
df = pd.DataFrame(data)
df.to_csv("merlin_meta_german_clean.csv", index=False, encoding="utf-8-sig")
print("CSV created with", len(df), "rows")


CSV created with 1033 rows


In [None]:
import pandas as pd
from googletrans import Translator
import time

# Load your cleaned CSV
df = pd.read_csv("merlin_meta_german_clean.csv")

# Only minority classes. Rows whose text is missing (an empty text comes back
# as NaN after the CSV round trip) are dropped: they cannot be translated and
# would only produce a failed request each.
minority_df = df[df['CEFR'].isin(['A1', 'C1', 'C2'])].dropna(subset=['text'])

# NOTE(review): newer googletrans releases appear to expose translate() as a
# coroutine; confirm the installed version still supports this blocking call.
translator = Translator()
augmented_texts = []

# Number of augmented versions per text
n_augment = 2  # You can change to 1,2,3

# Pivot languages used for the round trip, one per augmented version.
# Repeating the same de -> en -> de round trip would be expected to return the
# same text every time, i.e. exact duplicate rows, so each version goes
# through a different pivot language instead.
pivot_langs = ['en', 'fr', 'es']

for idx, row in minority_df.iterrows():
    text = row['text']
    label = row['CEFR']
    # Texts already present for this row (the original plus accepted versions),
    # used to keep exact duplicates out of the augmented set.
    seen = {text}

    for i in range(n_augment):
        pivot = pivot_langs[i % len(pivot_langs)]
        try:
            # German -> pivot language
            pivot_text = translator.translate(text, src='de', dest=pivot).text
            # Pivot language -> German
            de_text = translator.translate(pivot_text, src=pivot, dest='de').text
            if de_text not in seen:
                seen.add(de_text)
                augmented_texts.append({'text': de_text, 'CEFR': label})
        except Exception as e:
            # Best effort: report the failure and continue with the next version.
            print(f"Translation failed for index {idx}: {e}")
        finally:
            # Throttle after every attempt, including failed ones, so a run of
            # errors does not turn into a burst of back-to-back requests.
            time.sleep(1)  # avoid hitting Google too fast

# Create DataFrame of augmented texts
aug_df = pd.DataFrame(augmented_texts)

# Combine with original data
df_augmented = pd.concat([df, aug_df], ignore_index=True)

# Save to CSV
df_augmented.to_csv("cefr_augmented.csv", index=False, encoding='utf-8')
print("Augmentation done! Total rows:", len(df_augmented))
