# Preprocess Lexemes

Processes the Lexemes YAML file into a standard `lexemes.csv` file, adding a Part of Speech column looked up from the `morphgnt.csv` file.

## Define File Names

In [1]:
# File paths used by this notebook. All are relative, so they resolve
# against the directory the notebook is run from.

# Lexemes YAML file that is parsed into DF_LEXEMES.
INPUT_FILE_NAME = "../BibleCore/Resources/lexemes.yaml"
# CSV file that DF_LEXEMES is written to at the end of the notebook.
OUTPUT_FILE_NAME = "lexemes.csv"
# CSV file read to look up the Part of Speech for each lemma.
MORPHGNT_CSV = "morphgnt.csv"

## Define Column Names

In [2]:
# Column labels shared by the DataFrames in this notebook, kept in
# alphabetical order. Unless noted otherwise, each label is the renamed
# form of a per-lexeme key in the lexemes YAML file.
BDAG_ENTRY = "BDAG Entry"
COUNT = "Count"  # size column of the part-of-speech combination utility
DANKER_ENTRY = "Danker Entry"
DODSON_ENTRY = "Dodson Entry"
DODSON_PART_OF_SPEECH_CODE = "Dodson Part of Speech Code"
FULL_CITATION_FORM = "Full Citation Form"
GK = "GK"
GLOSS = "Gloss"
INDEX = "Index"  # column of morphgnt.csv used as its DataFrame index
LEMMA = "Lemma"  # index name of DF_LEXEMES; also a column of morphgnt.csv
MOUNCE_ENTRY = "Mounce Entry"
MOUNCE_MORPHCAT = "Mounce MorphCat"
PART_OF_SPEECH = "Part of Speech"  # read from morphgnt.csv, added to DF_LEXEMES
PART_OF_SPEECH_CODE = "Part of Speech Code"
STRONGS = "Strongs"

## Parse Lexemes File into DataFrame (DF_LEXEMES)

Columns:

* Part of Speech Code (pos)
* Full Citation Form (full-citation-form)
* BDAG Entry (bdag-headword)
* Danker Entry (danker-entry)
* Dodson Entry (dodson-entry)
* Mounce Entry (mounce-headword)
* Strongs (strongs)
* GK (gk)
* Dodson Part of Speech Code (dodson-pos)
* Gloss (gloss)
* Mounce MorphCat (mounce-morphcat)

In [3]:
import pandas as pd
import yaml
from pprint import pprint

# Parse the lexemes YAML file into a plain Python mapping.
with open(INPUT_FILE_NAME, mode="r", encoding="utf-8") as file:
    yaml_data = yaml.safe_load(file)

# Per-lexeme YAML key -> column label used in the resulting DataFrame.
_YAML_KEY_TO_COLUMN = {
    "pos": PART_OF_SPEECH_CODE,
    "full-citation-form": FULL_CITATION_FORM,
    "bdag-headword": BDAG_ENTRY,
    "danker-entry": DANKER_ENTRY,
    "dodson-entry": DODSON_ENTRY,
    "mounce-headword": MOUNCE_ENTRY,
    "strongs": STRONGS,
    "gk": GK,
    "dodson-pos": DODSON_PART_OF_SPEECH_CODE,
    "gloss": GLOSS,
    "mounce-morphcat": MOUNCE_MORPHCAT,
}

# orient="index" turns each top-level YAML key into a row label; that
# index is then named LEMMA and the YAML keys are swapped for the
# standard column labels.
DF_LEXEMES = (
    pd.DataFrame.from_dict(yaml_data, orient="index")
    .rename(columns=_YAML_KEY_TO_COLUMN)
    .rename_axis(index=LEMMA)
)

# print("===== DF_LEXEMES")
# print(DF_LEXEMES.__class__.__name__)
# print("-----")
# pprint(vars(DF_LEXEMES))
# print("-----")
# pprint(DF_LEXEMES)

## Add Part of Speech

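The Part of Speech column is looked up per lemma from the `morphgnt.csv` file. When a lemma appears there with more than one Part of Speech value, the first value encountered is used.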

In [4]:
# Load morphgnt.csv, using its INDEX column as the DataFrame index.
DF_MORPHGNT = pd.read_csv(MORPHGNT_CSV, index_col=INDEX)

# Left exactly as originally defined in case later cells use it. It is
# deliberately NOT iterated below: with a one-element list key, the group
# name is a 1-tuple on pandas >= 2.0 but a bare string on older versions,
# so indexing it with [0] would yield the lemma's first character there.
GB_LEMMA = DF_MORPHGNT.groupby([LEMMA])

# Map each lemma to the first Part of Speech value recorded for it.
# Grouping by the scalar key makes every group name the lemma itself on
# all pandas versions. If a lemma has several distinct values, only the
# first one encountered is kept.
morphgnt_parts_of_speech = {
    lemma: parts_of_speech.unique()[0]
    for lemma, parts_of_speech in DF_MORPHGNT.groupby(LEMMA)[PART_OF_SPEECH]
}

# Assignment aligns on the index (lemma); lexemes that do not occur in
# morphgnt.csv get NaN in the new column.
DF_LEXEMES[PART_OF_SPEECH] = pd.Series(morphgnt_parts_of_speech)

# print("===== DF_LEXEMES")
# print(DF_LEXEMES.__class__.__name__)
# print("-----")
# pprint(vars(DF_LEXEMES))
# print("-----")
# pprint(DF_LEXEMES)

## Write lexemes.csv File

In [5]:
# Write the combined lexeme table to OUTPUT_FILE_NAME; the index column
# (the lemmas) is written under the LEMMA header.
DF_LEXEMES.to_csv(OUTPUT_FILE_NAME, index_label=LEMMA)

## Utility - Obtain all unique combinations of Part of Speech Code and Dodson Part of Speech Code

In [6]:
# Tally the lexemes for every (Part of Speech Code, Dodson Part of Speech
# Code) pairing present in DF_LEXEMES. The group sizes are flattened back
# into an ordinary DataFrame whose size column is labelled COUNT.
GB_POS = (
    DF_LEXEMES.groupby([PART_OF_SPEECH_CODE, DODSON_PART_OF_SPEECH_CODE])
    .size()
    .reset_index(name=COUNT)
)

# print("===== GB_POS")
# print(GB_POS.__class__.__name__)
# print("-----")
# pprint(vars(GB_POS))
# print("-----")
# pprint(GB_POS)

GB_POS

Unnamed: 0,Part of Speech Code,Dodson Part of Speech Code,Count
0,A,A,718
1,A,"A,A-NUI",1
2,A,"A,ADV",4
3,A,"A,ADV-C",2
4,A,"A,N:F,N:M",1
...,...,...,...
106,X/INJ,INJ,3
107,X/INJ,"INJ,N-OI",1
108,X/PRT-I,"PRT-I,PRT-N",1
109,X/V,INJ,1
