# Morphological Generation for ግድል

## Import Libraries

In [25]:
import itertools
import pandas as pd

## Raw Word List

In [26]:
import json

# Load the raw pattern rules from the 'patterns' entry of the JSON file.
# The encoding is given explicitly: without it, open() falls back to the
# platform default, which cannot be relied on to decode non-ASCII rule text
# (this notebook generates Ethiopic-script words).
with open('../data/raw.json', 'r', encoding='utf-8') as f:
    raw_patterns_list = json.load(f)['patterns']

# Newline-joined copy of the rules, one rule per line.
raw_patterns = "\n".join(raw_patterns_list)

## Conversion to RegEx

In [27]:
import re

def convert_rule(rule: str) -> str:
    """Translate one raw pattern rule into a regular-expression fragment.

    The rule notation is handled as follows:

    * square brackets and space characters are dropped;
    * ``+`` concatenation markers are dropped;
    * a stem group ``{a,b,...}`` and an affix group ``(a,b,...)`` each become
      the non-capturing alternation ``(?:a|b|...)``;
    * all remaining text is copied through unchanged (it is not
      regex-escaped).

    Args:
        rule: A single rule string, e.g. ``"[(x,y) + {a,b} + z]"``.

    Returns:
        The regex fragment, e.g. ``"(?:x|y)(?:a|b)z"``.
    """
    # Strip brackets, spaces and '+' markers up front. Removing '+' before
    # the substitution is equivalent to removing it afterwards, because the
    # substitution never introduces a '+'.
    text = (
        rule.replace('[', '')
        .replace(']', '')
        .replace(' ', '')
        .replace('+', '')
    )

    def group_repl(m: re.Match) -> str:
        # Exactly one alternative matched; its capture is never empty
        # because both use a one-or-more quantifier.
        body = m.group(1) or m.group(2)
        return f"(?:{'|'.join(body.split(','))})"

    # Rewrite both group kinds in ONE pass. Running the affix substitution
    # as a second pass would re-match the "(?:...)" text produced for stem
    # groups and corrupt it into "(?:?:...)", which is not a valid regex.
    return re.sub(r"\{([^}]+)\}|\(([^)]+)\)", group_repl, text)


# Convert every raw rule, then join the results as alternatives inside a
# single group anchored at both ends.
patterns = list(map(convert_rule, raw_patterns_list))
combined = f"^({'|'.join(patterns)})$"

## Tabulated Word List

In [28]:
# Tokenizer for one rule: a {...} group, a (...) group, or a run of literal
# characters. Brackets, commas and '+' outside groups act only as separators.
SEG_RE = re.compile(r'(\{[^}]*\}|\([^)]*\)|[^(){}\[\],+]+)')

generated = []  # one dict per generated word: prefixes, stem, suffixes, full_word

# Expand every rule line into all of its prefix/stem/suffix combinations
for line in raw_patterns.splitlines():
    txt = line.strip()
    if not txt or txt.startswith('#'):
        continue  # blank line or comment line
    if txt.startswith('[') and txt.endswith(']'):
        txt = txt[1:-1]

    options = []     # per kept segment: its list of alternatives
    stem_idx = None  # position of the last {...} group within `options`
    for seg in SEG_RE.findall(txt):
        seg = seg.strip()
        if not seg:
            # Whitespace-only literal (e.g. the spaces around a '+')
            continue
        if seg.startswith('{') and seg.endswith('}'):
            opts = [s.strip() for s in seg[1:-1].split(',') if s.strip()]
            # Record the position in `options`, not in the raw segment list:
            # skipped whitespace segments would otherwise shift the index
            # and mis-split (or overrun) the combination tuples below.
            stem_idx = len(options)
        elif seg.startswith('(') and seg.endswith(')'):
            opts = [s.strip() for s in seg[1:-1].split(',') if s.strip()]
        else:
            opts = [seg]
        options.append(opts)

    if not options:
        # Nothing usable on this line (e.g. "[]"); product() would yield a
        # single empty tuple, which has no stem to pick.
        continue

    for combo in itertools.product(*options):
        if stem_idx is not None:
            prefixes = list(combo[:stem_idx])
            stem = combo[stem_idx]
            suffixes = list(combo[stem_idx + 1:])
        else:
            # No {...} group: treat the final segment as the stem
            prefixes = list(combo[:-1])
            stem = combo[-1]
            suffixes = []
        generated.append({
            'prefixes': prefixes,
            'stem': stem,
            'suffixes': suffixes,
            'full_word': ''.join(combo),
        })

# The longest prefix list and the longest suffix list fix the column counts
max_pre = max((len(entry['prefixes']) for entry in generated), default=0)
max_suf = max((len(entry['suffixes']) for entry in generated), default=0)

prefix_cols = [f'prefix_{n}' for n in range(1, max_pre + 1)]
suffix_cols = [f'suffix_{n}' for n in range(1, max_suf + 1)]

# One row per generated word; unused affix slots hold empty strings
rows = []
for entry in generated:
    # Pad prefixes on the left so the last prefix sits in the last prefix
    # column; pad suffixes on the right so the first suffix sits in suffix_1.
    padded_pre = [''] * (max_pre - len(entry['prefixes'])) + entry['prefixes']
    padded_suf = entry['suffixes'] + [''] * (max_suf - len(entry['suffixes']))

    row = dict(zip(prefix_cols, padded_pre))
    row['stem'] = entry['stem']
    row.update(zip(suffix_cols, padded_suf))
    row['full_word'] = entry['full_word']
    rows.append(row)

# Column order: prefixes, stem, suffixes, then the assembled word
cols = prefix_cols + ['stem'] + suffix_cols + ['full_word']
df = pd.DataFrame(rows, columns=cols)

# Show the first ten rows
df.head(10)

Unnamed: 0,prefix_1,prefix_2,prefix_3,stem,suffix_1,suffix_2,suffix_3,full_word
0,,,የ,ግድያ,,,,የግድያ
1,,,በ,ግድያ,,,,በግድያ
2,,,ከ,ግድያ,,,,ከግድያ
3,,,ስለ,ግድያ,,,,ስለግድያ
4,,,የ,ግድያ,ን,,,የግድያን
5,,,የ,ግድያ,ም,,,የግድያም
6,,,የ,ግድያ,ስ,,,የግድያስ
7,,,በ,ግድያ,ም,,,በግድያም
8,,,በ,ግድያ,ስ,,,በግድያስ
9,,,ከ,ግድያ,ም,,,ከግድያም


## Total Word Count

In [29]:
# Report the number of rows in the table. NOTE: rows are not de-duplicated
# anywhere before this point, so the figure equals the number of unique
# words only if no two pattern combinations produce the same full word.
print(f"Total number of words generated: {len(df)} unique words")

Total number of words generated: 7326 unique words


## Export Words to CSV

In [30]:
from pathlib import Path

# Write the table without the index column. The parent directory is created
# first because to_csv() does not create missing directories and would
# otherwise fail on a fresh checkout.
output_path = Path('../output/word_list.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)