In [None]:
# DFA for simplified English words

def dfa_validate(word):
    """Simulate a two-state DFA that accepts non-empty strings of lowercase a-z.

    State 0 is the start state (nothing consumed yet) and state 1 is the only
    accepting state (at least one lowercase letter consumed). Any character
    outside a-z rejects immediately.

    Args:
        word: The string to check; it is consumed one character at a time.

    Returns:
        True if the automaton halts in the accepting state, otherwise False.
        The empty string is rejected because it never leaves the start state.
    """
    START, ACCEPTING = 0, 1
    state = START
    for symbol in word:
        # Both states have the same outgoing edges: a-z leads to ACCEPTING,
        # anything else has no transition and rejects on the spot.
        if not ('a' <= symbol <= 'z'):
            return False
        state = ACCEPTING
    return state == ACCEPTING


# Sample inputs: the first row is expected to be accepted, the second rejected
test_words = [
    "cat", "dog", "a", "zebra",
    "dog1", "1dog", "DogHouse", "Dog_house", " cats",
]

# Feed every sample through the DFA and report its verdict
for word in test_words:
    verdict = "Accepted" if dfa_validate(word) else "Not Accepted"
    print(f"{word} -> {verdict}")


cat -> Accepted
dog -> Accepted
a -> Accepted
zebra -> Accepted
dog1 -> Not Accepted
1dog -> Not Accepted
DogHouse -> Not Accepted
Dog_house -> Not Accepted
 cats -> Not Accepted


In [None]:
# Install automathon and graphviz
!pip install automathon graphviz

from automathon import DFA
from IPython.display import Image

# Lowercase letters a-z, built once and reused for the alphabet and for the
# two letter-consuming states below.
lowercase_letters = [chr(c) for c in range(ord('a'), ord('z') + 1)]

# States: q0 = start, q1 = accepting, q_dead = trap (only loops to itself)
Q = {'q0', 'q1', 'q_dead'}

# Alphabet: every lowercase letter plus the single symbol 'other'
# (presumably a stand-in for any character outside a-z; the diagram shows it
# as one edge rather than one edge per rejected character).
sigma = set(lowercase_letters + ['other'])

# Transition function: a letter always leads to q1, 'other' always leads to
# q_dead, and q_dead never leaves itself.
delta = {
    'q0': {**{ch: 'q1' for ch in lowercase_letters}, 'other': 'q_dead'},
    'q1': {**{ch: 'q1' for ch in lowercase_letters}, 'other': 'q_dead'},
    'q_dead': {ch: 'q_dead' for ch in sigma},
}

# Start state
q0 = 'q0'

# Final states
F = {'q1'}

# Create DFA
dfa = DFA(Q, sigma, delta, q0, F)

# View DFA (this will create "dfa.gv.png")
dfa.view("dfa")

# Display DFA diagram. Jupyter only auto-renders the LAST expression of a
# cell, and more code follows this line, so a bare Image(...) expression here
# is silently discarded. display() renders it regardless of position.
from IPython.display import display
display(Image(filename="dfa.gv.png"))

# Function to classify words
def classify(word):
    """Label ``word`` as "Accepted" or "Not Accepted".

    A word is accepted only when it is non-empty and every character lies in
    the lowercase range a-z; this covers the first character as well, so no
    separate first-character check is needed.

    Args:
        word: The string to classify.

    Returns:
        The string "Accepted" or "Not Accepted".
    """
    is_lowercase_word = bool(word) and all('a' <= ch <= 'z' for ch in word)
    return "Accepted" if is_lowercase_word else "Not Accepted"

# Test words: four that should be accepted, then five that should not
test_words = [
    "cat", "dog", "a", "zebra",
    "dog1", "1dog", "DogHouse", "Dog_house", " cats",
]
for candidate in test_words:
    verdict = classify(candidate)
    print(f"{candidate} -> {verdict}")


cat -> Accepted
dog -> Accepted
a -> Accepted
zebra -> Accepted
dog1 -> Not Accepted
1dog -> Not Accepted
DogHouse -> Not Accepted
Dog_house -> Not Accepted
 cats -> Not Accepted


In [None]:
# Colab-ready cell: read brown_nouns.pdf (or brown_nouns.txt), produce morphological outputs
# Save this cell and run it in one go in Colab.

# Install pdf extraction lib (only if PDF is used)
!pip install -q pdfplumber

import os
import re
import pdfplumber

# Input/output locations (absolute paths). The loading step further down
# checks TXT_PATH first and falls back to PDF_PATH only when the text file
# does not exist; it raises FileNotFoundError when neither is present.
PDF_PATH = "/content/brown_nouns.pdf"   # PDF copy of the noun list (read with pdfplumber)
TXT_PATH = "/mnt/data/brown_nouns.txt"   # optional plain-text copy; preferred when it exists
OUT_PATH = "/content/output.txt"  # file the generated analysis lines are written to

# Helper: extract tokens (words) from PDF (if present) or from TXT
def extract_words_from_pdf(path):
    """Collect word tokens from every page of the PDF at ``path``.

    A token is a run of ASCII letters, optionally followed by one apostrophe
    and another run of letters. Pages for which pdfplumber returns no text
    are skipped.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        A list of tokens in page order (original casing kept), or an empty
        list when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return []

    collected = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                # Letters-only tokens; digits and punctuation act as separators
                collected.extend(re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?", page_text))
    return collected

def extract_words_from_txt(path):
    """Collect word tokens from the UTF-8 text file at ``path``.

    A token is a run of ASCII letters, optionally followed by one apostrophe
    and another run of letters.

    Args:
        path: Filesystem path of the text file.

    Returns:
        A list of tokens in file order (original casing kept), or an empty
        list when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?", contents)

# Load nouns: the plain-text file wins when both sources are present
tokens = []
if os.path.exists(TXT_PATH):
    print("Found text file, reading", TXT_PATH)
    tokens = extract_words_from_txt(TXT_PATH)
elif os.path.exists(PDF_PATH):
    print("Found PDF file, reading", PDF_PATH)
    tokens = extract_words_from_pdf(PDF_PATH)
else:
    raise FileNotFoundError("Neither brown_nouns.txt nor brown_nouns.pdf were found at expected paths. "
                            "Upload the file to /mnt/data or change the paths in the script.")

# Lowercase every token, drop empties, and keep only the first occurrence of
# each word. dict preserves insertion order, so corpus order is retained.
lowered_tokens = (raw_token.strip().lower() for raw_token in tokens)
nouns = list(dict.fromkeys(token for token in lowered_tokens if token))
seen = set(nouns)

print(f"Extracted {len(nouns):,} unique candidate tokens (lowercased).")

# FST-like rule engine for plural generation
# Rules (order matters):
# 1) E insertion: if word ends with s, z, x, ch, sh -> add "es"  (examples: fox -> foxes, watch -> watches)
# 2) Y replacement: if word ends with consonant + y -> replace y with "ies" (try -> tries)
# 3) S addition (fallback): add "s"
#
# Validation: a token is valid only if it consists solely of lowercase letters a-z.
# Tokens with an internal apostrophe (which the tokenizer above keeps, e.g. "dog's")
# therefore count as Invalid Word. Empty tokens are rejected as well.
#
# Output format:
# Accepted forms:
#   plural_form = root+N+PL
#   root = root+N+SG
# If invalid: both "root = Invalid Word" and "(plural of root) = Invalid Word" are written, to be explicit.

def is_valid_root(word):
    """Tell whether ``word`` is one or more lowercase letters a-z and nothing else.

    Args:
        word: Candidate string (callers lowercase it beforehand).

    Returns:
        True for a non-empty all-lowercase a-z string, otherwise False.
    """
    whole_word_match = re.fullmatch(r"[a-z]+", word)
    return whole_word_match is not None

def generate_plural(word):
    """Build the regular English plural of ``word`` using three ordered rules.

    1. E-insertion: endings s, z, x, ch, sh take "es".
    2. Y-replacement: a final y preceded by a non-vowel becomes "ies".
    3. Otherwise "s" is appended.

    Args:
        word: A lowercase, purely alphabetic singular noun.

    Returns:
        The plural form as a string.
    """
    sibilant_ending = re.search(r"(s|z|x)$", word) or re.search(r"(ch|sh)$", word)
    consonant_y_ending = re.search(r"[^aeiou]y$", word)

    if sibilant_ending:
        pluralized = word + "es"
    elif consonant_y_ending:
        # Drop the final y before attaching the suffix
        pluralized = word[:-1] + "ies"
    else:
        pluralized = word + "s"
    return pluralized

# Build outputs
#
# The corpus list mixes singular and plural surface forms. Treating every
# token as a singular root pluralizes words that are already plural
# ("reports" -> "reportses"), so first map each rule-generated plural back to
# the listed noun that produces it. When two nouns generate the same plural,
# the one that appears first in the list wins.
#
# This is a lexicon-based heuristic: a plural whose singular is absent from
# the list is still treated as a root.
plural_to_root = {}
for noun in nouns:
    if is_valid_root(noun):
        plural_to_root.setdefault(generate_plural(noun), noun)

lines = []
for root in nouns:
    if not is_valid_root(root):
        # generate explicit Invalid Word output for both forms
        lines.append(f"{root} = Invalid Word")
        lines.append(f"(plural of {root}) = Invalid Word")
        continue

    if root in plural_to_root:
        # Already the regular plural of another listed noun: analyse it as
        # that noun's plural instead of stacking a second suffix on top.
        lines.append(f"{root} = {plural_to_root[root]}+N+PL")
        continue

    # generate_plural() only ever adds letters to an all-letter root, so the
    # result never needs re-validating.
    lines.append(f"{generate_plural(root)} = {root}+N+PL")
    lines.append(f"{root} = {root}+N+SG")

# A noun and its listed plural both produce "<plural> = <noun>+N+PL"; keep
# only the first copy while preserving order.
lines = list(dict.fromkeys(lines))

# Save to file
with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print(f"Done — morphological outputs written to: {OUT_PATH}")
print("\nSample (first 40 output lines):\n")
# The lines are still in memory, so there is no need to re-read the file.
for ln in lines[:40]:
    print(ln)

# If you want to download / inspect the file in Colab:
print("\nYou can download the file from Colab file browser or read it with:")
print(f"!sed -n '1,200p' {OUT_PATH}")


Found PDF file, reading /content/brown_nouns.pdf
Extracted 17,462 unique candidate tokens (lowercased).
Done — morphological outputs written to: /content/output.txt

Sample (first 40 output lines):

investigations = investigation+N+PL
investigation = investigation+N+SG
primaries = primary+N+PL
primary = primary+N+SG
elections = election+N+PL
election = election+N+SG
evidences = evidence+N+PL
evidence = evidence+N+SG
irregularitieses = irregularities+N+PL
irregularities = irregularities+N+SG
places = place+N+PL
place = place+N+SG
juries = jury+N+PL
jury = jury+N+SG
presentmentses = presentments+N+PL
presentments = presentments+N+SG
charges = charge+N+PL
charge = charge+N+SG
praises = praise+N+PL
praise = praise+N+SG
thankses = thanks+N+PL
thanks = thanks+N+SG
manners = manner+N+PL
manner = manner+N+SG
terms = term+N+PL
term = term+N+SG
reportses = reports+N+PL
reports = reports+N+SG
handfuls = handful+N+PL
handful = handful+N+SG
interests = interest+N+PL
interest = interest+N+SG
numbers

In [None]:
# Colab-ready script with accuracy metric

!pip install -q pdfplumber

import os
import re
import pdfplumber

# Input/output locations (absolute paths). The loading step below tries
# TXT_PATH first and falls back to PDF_PATH only when the text file is absent.
PDF_PATH = "/content/brown_nouns.pdf"
TXT_PATH = "/mnt/data/brown_nouns.txt"
OUT_PATH = "/content/output3"  # destination of the generated lines (note: no file extension)

# Extract from PDF
def extract_words_from_pdf(path):
    """Return the word tokens found on all pages of the PDF at ``path``.

    Tokens are runs of ASCII letters with an optional apostrophe-plus-letters
    tail. Pages without extractable text contribute nothing. A missing file
    yields an empty list.
    """
    words = []
    if os.path.exists(path):
        with pdfplumber.open(path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            for text in page_texts:
                if text:
                    words += re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?", text)
    return words

# Extract from TXT
def extract_words_from_txt(path):
    """Return the word tokens found in the UTF-8 text file at ``path``.

    Tokens are runs of ASCII letters with an optional apostrophe-plus-letters
    tail. A missing file yields an empty list.
    """
    words = []
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as source:
            words += re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?", source.read())
    return words

# Load nouns: try each source in priority order and stop at the first that exists
tokens = []
for source_path, reader in (
    (TXT_PATH, extract_words_from_txt),
    (PDF_PATH, extract_words_from_pdf),
):
    if os.path.exists(source_path):
        tokens = reader(source_path)
        break
else:
    # Reached only when the loop finished without a break, i.e. no source exists
    raise FileNotFoundError("Upload brown_nouns.txt or brown_nouns.pdf to /mnt/data in Colab.")

# Normalize & unique: lowercase, drop empties, keep first occurrences in order
seen = set()
nouns = []
for raw_token in tokens:
    candidate = raw_token.strip().lower()
    if candidate and candidate not in seen:
        seen.add(candidate)
        nouns.append(candidate)

print(f"Extracted {len(nouns):,} unique candidate tokens.")

# Validation
def is_valid_root(word):
    """Return True only when ``word`` is a non-empty string of lowercase a-z letters."""
    if re.fullmatch(r"[a-z]+", word):
        return True
    return False

# Rule-based plural generator with classification
def generate_plural(word):
    """Pluralize ``word`` and report which rule was applied.

    Rules are tried in order: endings s/z/x/ch/sh take "es" ("E-insertion");
    a final y after a non-vowel becomes "ies" ("Y-replacement"); everything
    else takes "s" ("S-addition").

    Args:
        word: A lowercase, purely alphabetic noun.

    Returns:
        A ``(plural, rule_name)`` tuple.
    """
    ends_in_sibilant = re.search(r"(s|z|x)$", word) or re.search(r"(ch|sh)$", word)
    if ends_in_sibilant:
        return word + "es", "E-insertion"
    if re.search(r"[^aeiou]y$", word) is None:
        return word + "s", "S-addition"
    return word[:-1] + "ies", "Y-replacement"

# Counters for the summary. These are shares of valid tokens and of applied
# rules; nothing here is compared against a gold standard.
invalid_count = 0
rule_counts = {"E-insertion": 0, "Y-replacement": 0, "S-addition": 0}

# The corpus list mixes singular and plural surface forms. Treating every
# token as a singular root pluralizes words that are already plural
# ("reports" -> "reportses") and inflates the E-insertion count, so first map
# each rule-generated plural back to the listed noun (and rule) that produces
# it. When two nouns generate the same plural, the first one listed wins.
plural_to_root = {}
for noun in nouns:
    if is_valid_root(noun):
        plural, rule_type = generate_plural(noun)
        plural_to_root.setdefault(plural, (noun, rule_type))

lines = []
for root in nouns:
    if not is_valid_root(root):
        invalid_count += 1
        lines.append(f"{root} = Invalid Word")
        lines.append(f"(plural of {root}) = Invalid Word")
        continue

    if root in plural_to_root:
        # Already the regular plural of another listed noun: analyse it as
        # that noun's plural and count it under the rule that formed it.
        base, rule_type = plural_to_root[root]
        rule_counts[rule_type] += 1
        lines.append(f"{root} = {base}+N+PL")
        continue

    plural, rule_type = generate_plural(root)
    rule_counts[rule_type] += 1

    lines.append(f"{plural} = {root}+N+PL")
    lines.append(f"{root} = {root}+N+SG")

# A noun and its listed plural both produce "<plural> = <noun>+N+PL"; keep
# only the first copy while preserving order.
lines = list(dict.fromkeys(lines))

# Save output file
with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

# Summary. Every token is counted exactly once, as invalid or under one rule.
total = len(nouns)
valid_count = total - invalid_count
# Guard the denominators: an input with no extractable tokens gives total == 0.
valid_pct = (valid_count / total * 100) if total else 0
invalid_pct = (invalid_count / total * 100) if total else 0

print("\n--- Accuracy Summary ---")
print(f"Total words processed: {total}")
print(f"Valid words: {valid_count} ({valid_pct:.2f}%)")
print(f"Invalid words: {invalid_count} ({invalid_pct:.2f}%)")

print("\n--- Valid word distribution by rule ---")
for rule, count in rule_counts.items():
    pct = (count / valid_count * 100) if valid_count else 0
    print(f"{rule}: {count} words ({pct:.2f}%)")

print(f"\nDetailed output saved to: {OUT_PATH}")


Extracted 17,462 unique candidate tokens.

--- Accuracy Summary ---
Total words processed: 17462
Valid words: 17429 (99.81%)
Invalid words: 33 (0.19%)

--- Valid word distribution by rule ---
E-insertion: 6622 words (37.99%)
Y-replacement: 1008 words (5.78%)
S-addition: 9799 words (56.22%)

Detailed output saved to: /content/output3
