In [32]:
import ast, itertools
import json
import random
import pandas as pd
from collections import Counter

In [3]:
# Load core structures
with open('/Users/rafaelfelix/Projects/demos/benky-fy/tmp/core.json', 'r', encoding='utf-8') as f:
    core_data = {key: pd.DataFrame(item) for key, item in json.load(f).items()}
    structures = core_data['structures']
    vocab = core_data['vocab'] = pd.read_json('/Users/rafaelfelix/Projects/demos/benky-fy/tmp/vocab.json')
    verbs = pd.read_json('/Users/rafaelfelix/Projects/demos/benky-fy/tmp/verbs.json')
vocab_interests = json.load(open('/Users/rafaelfelix/Projects/demos/benky-fy/tmp/vocab_interests.json'))


In [None]:
structure = structures.sample(1).iloc[0]
# vocab intrests:
#  politeness, people_identity, daily_life_lifestyle, places_travel_events, 
#  knowledge_communication, nature_description, actions_time, ai_professional
vocab_criterias = [
    ("priority_group", ["p0", "p1", "p2"]),
    ("tags", vocab_interests['places_travel_events'])
]

tvocab = vocab.copy()
for key, value in vocab_criterias:
    tvocab = tvocab[tvocab[key].apply(lambda x: bool(set(x) & set(value)) if isinstance(x, list) else x in value)]

words = {}
for key, value in structure.slots.items():
    if key == 'clause':
        print(NotImplementedError("Clause is not implemented yet"))
    elif key == "Verb":
        if value != "any" and value != ["any"]:
            pool = verbs[verbs.tags.apply(
                lambda x: bool(set(x['semantic'] if isinstance(x['semantic'], list) else [x['semantic']]) & set(value))
            )]
        else:
            pool = verbs
    elif key == "Adj":
        pool = vocab[vocab.category.isin(["adjective"])]
    else:
        pool = tvocab[tvocab.category.isin(value)]
    words[key] = pool.sample().iloc[0].english
    
print("structure: ", structure.structure)
print("theme: ", random.choice(structure.theme))
for key, value in words.items():
    print(f'{key}: {value}')


structure:  A を Verb
theme:  action with object
A: sandwich
Verb: to know


In [57]:
import random

# -------------------------------
# Theme constraints
# -------------------------------
theme_constraints = {
    "action with object": {
        "verb_semantics": ["action", "transaction", "communication", "physiological", "artistic", "learning"],
        "noun_tags": ["food", "object", "shopping", "daily", "social"]
    },
    "motion / destination": {
        "verb_semantics": ["motion"],
        "noun_tags": ["place", "location", "travel"]
    },
    "existence / subject focus": {
        "verb_semantics": ["existence", "stative", "cognitive"],
        "noun_tags": ["people", "object", "place"]
    },
    "description": {
        "verb_semantics": [],   # adj-driven
        "adj_tags": ["adjective"],
        "noun_tags": ["object", "people"]
    },
    "possession / relation": {
        "verb_semantics": [],   # noun-driven
        "noun_tags": ["people", "object"]
    },
    "inquiry": {
        "verb_semantics": ["any"],
        "noun_tags": ["any"]
    },
    "negation": {
        "verb_semantics": ["any"],
        "noun_tags": ["any"]
    }
}

# -------------------------------
# Sentence generation
# -------------------------------
def generate_sentence(structures, verbs, vocab):
# if True:
    # 1. Pick structure
    # structure = structures.sample(1).iloc[0]
    theme = structure.theme if isinstance(structure.theme, str) else structure.theme[0]
    constraints = theme_constraints.get(theme, {"verb_semantics": ["any"], "noun_tags": ["any"]})

    words = {}
    for key, value in structure.slots.items():
        # Clause (not implemented yet)
        if key == "clause":
            continue

        # Verb selection
        elif key == "Verb":
            if constraints["verb_semantics"] and "any" not in constraints["verb_semantics"]:
                pool = verbs[verbs.tags.apply(
                    lambda t: bool(set(t.get("semantic", [])) & set(constraints["verb_semantics"]))
                )]
            else:
                pool = verbs

        # Adjective selection
        elif key == "Adj":
            pool = vocab[vocab.category.isin(["adjective"])]

        # Noun / other slots
        else:
            if constraints["noun_tags"] and "any" not in constraints["noun_tags"]:
                pool = vocab[vocab.tags.apply(
                    lambda t: bool(set(t) & set(constraints["noun_tags"]))
                )]
            else:
                pool = vocab[vocab.category.isin(value)]

        # Fallback if empty
        if pool.empty:
            pool = vocab if key != "Verb" else verbs

        # Pick one word
        # print(key, value, pool.shape)
        words[key] = {
            'english':pool.sample().iloc[0].english,
            'possibilities': value,
            'hiragana': pool.sample().iloc[0].hiragana if 'hiragana' in pool.columns else pool.sample().iloc[0].furigana,
        }

    # -------------------------------
    # Print result
    # -------------------------------
    print("structure:", structure.structure)
    print("theme:", theme)
    for k, v in words.items():
        print(f"{k}: {v}")

    return words

_ = generate_sentence(structures, verbs, vocab)

structure: A は B です
theme: identity
A: {'english': 'pork cutlet', 'possibilities': ['noun', 'pronoun'], 'hiragana': 'ビジネスシューズ'}
B: {'english': 'beer', 'possibilities': ['noun', 'na-adj', 'i-adj'], 'hiragana': 'ところ'}


In [63]:
verbs.tags.apply(lambda x: x['usage']).explode().unique()

array(['daily', 'general', 'conversation', 'business', 'home', 'work',
       'routine', 'travel', 'health', 'social', 'school', 'shopping',
       'market', 'learning', 'entertainment', 'nature', 'weather',
       'emotion', 'personal', 'life', 'cooking', 'craft'], dtype=object)

In [75]:
structures.slots.apply(lambda x: x.keys()).explode().unique()

array(['A', 'B', 'Verb', 'Adj', 'Noun', 'clause'], dtype=object)

In [81]:
context = vocab.english.tolist()

In [84]:
context += verbs.english.tolist()

In [85]:
with open('/Users/rafaelfelix/Projects/demos/benky-fy/tmp/context-words.txt', 'w') as f:
    f.write('\n'.join(context))

In [87]:
structures.iloc[0]

structure                                              A は B です
theme                                [identity, classification]
slots         {'A': ['noun', 'pronoun'], 'B': ['noun', 'na-a...
particles                                               [は, です]
extensions                                            [か, ではない]
example                                             わたしは せんせいです
Name: 0, dtype: object