<a href="https://colab.research.google.com/github/ryosanhin/NeoUniCorpus/blob/main/NeoUniCorpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/ryosanhin/NeoUniTextCollection.git

In [None]:
# @title
import os
import glob
import re
import itertools
from dataclasses import dataclass, astuple
import pandas as pd
from natsort import natsorted

@dataclass
class ContextExample:
    """One search hit together with the text surrounding it.

    Field order matters: instances are flattened with ``astuple`` and mapped
    positionally onto the result table's columns.
    """

    # Who speaks the matched line (story: column 0 of the matching row;
    # voice data: a fixed character name).
    speaker: str
    # Where the hit comes from (story: second cell of the file's first row;
    # voice data: a fixed label).
    source: str
    # Text shown before the hit (story: preceding rows' text; voice data:
    # column 0 of the matching row).
    beforeContext: str
    # The cell that matched the search word.
    context: str
    # Text shown after the hit (story: following rows' text; voice data: "").
    afterContext: str

def get_sorted_csv_files(root_dir_path):
    """Return every ``.csv`` path under *root_dir_path* in natural sort order.

    The directory tree is searched recursively; natural ordering means that
    embedded numbers compare numerically (``2.csv`` before ``10.csv``).

    Args:
        root_dir_path: Directory whose subtree is scanned.

    Returns:
        A naturally sorted list of matching file paths (empty when nothing
        matches).
    """
    csv_pattern = os.path.join(root_dir_path, "**/*.csv")
    return natsorted(glob.glob(csv_pattern, recursive=True))

# search examples from story data
def search_example_from_story(csv_path, word, count):
    context_column = 1
    try:
        # read csv as str data
        df = pd.read_csv(csv_path, header=None, comment='#', dtype=str, encoding="utf-8", na_filter=False)
    except pd.errors.EmptyDataError:
        return []

    # delete "\\n"
    df = df.map(lambda elem: re.sub(r"\\n", "", elem))

    # create mask with the word looking for
    mask = df[context_column].str.contains(word, na=False)

    # quit if there is no cell looking for
    if not any(mask):
        return []

    return [
        ContextExample(
            speaker = df.iloc[index][0],
            source = df.iloc[0][1],

            beforeContext = next(
                itertools.dropwhile(lambda contexts: len(contexts)<count, itertools.accumulate(reversed(df.iloc[:index][1]), func = lambda x, y: y+x)),
                "".join(df.iloc[:index][1])
            )[-count:],

            context = df.iloc[index][1],

            afterContext = next(
                itertools.dropwhile(lambda contexts: len(contexts)<count, itertools.accumulate(df.iloc[index+1:][1])),
                "".join(df.iloc[index+1:][1])
            )[:count]
        ) for index in df[mask].index
    ]

# search examples from voice data
def search_example_from_voice(csv_path, word):
    context_column = 1
    try:
        # read csv as str data
        df = pd.read_csv(csv_path, header=None, comment='#', dtype=str, encoding="utf-8", na_filter=False)
    except pd.errors.EmptyDataError:
        return []

    # delete "\\n"
    df = df.map(lambda elem: re.sub(r"\\n", "", elem))

    # create mask with the word looking for
    mask = df[context_column].str.contains(word, na=False)

    # quit if there is no cell looking for
    if not any(mask):
        return []

    return [
        ContextExample(
            speaker = "ネオユニヴァース",
            source = "ボイス",
            beforeContext = df.iloc[index][0],
            context = df.iloc[index][1],
            afterContext = ""
        ) for index in df[mask].index
    ]

def search_examples(word, length, dir_path="/content/NeoUniTextCollection"):
    """Search the whole text collection for *word* and tabulate the hits.

    Every immediate subdirectory of *dir_path* is treated as one character
    directory; its ``VoiceText`` and ``StoryText`` subtrees are scanned for
    CSV files. All voice hits come first, followed by all story hits.

    Args:
        word: Pattern to search for (forwarded unchanged to the per-file
            search functions).
        length: Maximum number of context characters kept before and after
            each story hit.
        dir_path: Root directory of the text collection. Defaults to the
            location the repository is cloned to in Colab.

    Returns:
        A ``pandas.DataFrame`` with the columns ``speaker``, ``source``,
        ``before context``, ``target context`` and ``after context``; it has
        no rows when nothing matches or *dir_path* has no subdirectories.
    """
    # Without recursive=True, "**/" behaves like "*/": immediate subdirectories
    # only. glob returns them in arbitrary order, so sort for reproducible output.
    character_dirs = sorted(glob.glob(os.path.join(dir_path, "**/")))

    voice_examples = [
        result
        for character_dir in character_dirs
        for csv_path in get_sorted_csv_files(os.path.join(character_dir, "VoiceText"))
        for result in search_example_from_voice(csv_path, word)
    ]
    story_examples = [
        result
        for character_dir in character_dirs
        for csv_path in get_sorted_csv_files(os.path.join(character_dir, "StoryText"))
        for result in search_example_from_story(csv_path, word, length)
    ]
    examples = voice_examples + story_examples

    return pd.DataFrame([astuple(elem) for elem in examples], columns=["speaker", "source", "before context", "target context", "after context"])

In [None]:
# Enter the word to search for here. It is matched with pandas
# `str.contains`, so regular-expression metacharacters are interpreted as a pattern.
WORD_LOOKING_FOR = "neouniverse"

# Maximum number of characters of surrounding text to show before and after
# each story hit.
CONTEXT_LENGTH = 100

# Run the search over the cloned text collection.
output = search_examples(WORD_LOOKING_FOR, CONTEXT_LENGTH)

# Left-align every column and keep whitespace/line wrapping inside cells.
# The Styler is left as the cell's final expression so the notebook renders it.
output.style.set_properties(
  subset=["speaker", "source", "before context", "target context", "after context"],
  **{"white-space": "pre-wrap", "text-align": "left"}
)