# Preprocess MorphGNT

Processes all MorphGNT text files into a standard `morphgnt.csv` file.

## Parse MorphGNT Files into DataFrame (DF_WORDS)

Loads all MorphGNT files into `DF_WORDS`.

In [1]:
import pandas as pd
from glob import glob
from os import path
from pprint import pprint

# Directory that is searched for the MorphGNT ``*.txt`` source files.
morphgnt_path = "../BibleCore/Resources/MorphGnt"

# glob() makes no ordering guarantee, so sort the matches: this keeps the row
# order of DF_WORDS (and therefore its integer index) identical on every run
# and on every platform.
all_files = sorted(glob(path.join(morphgnt_path, "*.txt")))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the path is wrong or the directory is empty.
if not all_files:
    raise ValueError(f"No MorphGNT *.txt files found in {morphgnt_path!r}")

# Read every file with the same seven column names and stack the rows into a
# single frame with a fresh 0..n-1 index.
DF_WORDS = pd.concat(
    (
        pd.read_csv(
            f,
            names=[
                "Scripture Reference",
                "Part of Speech Code",
                "Inflection Codes",
                "Text",
                "Word",
                "Normalized Word",
                "Lemma",
            ],
            # Keep the reference as text so it can be sliced by character
            # position; a numeric dtype would drop any leading zeros.
            dtype={"Scripture Reference": "str"},
            # Fields are separated by runs of whitespace.
            sep="\\s+",
            # Never promote the first field to the index.
            index_col=False,
        )
        for f in all_files
    ),
    ignore_index=True,
)

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

## Parse Scripture Reference

Parses the `Scripture Reference` column into separate `Book`, `Chapter`, and `Verse` columns.

In [2]:
# The reference is sliced as three two-character fields -- book, chapter and
# verse, in that order -- and each slice is converted to an integer column.
scripture_reference = DF_WORDS["Scripture Reference"]

DF_WORDS["Book"] = scripture_reference.str.slice(0, 2).astype(int)
DF_WORDS["Chapter"] = scripture_reference.str.slice(2, 4).astype(int)
DF_WORDS["Verse"] = scripture_reference.str.slice(4, 6).astype(int)

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

## Parse Part of Speech Code

Maps the values in `Part of Speech Code` into `Part of Speech`.

In [3]:
# Lookup table: two-character part-of-speech code -> readable label.
parts_of_speech = {
    "A-": "Adjective",
    "C-": "Conjunction",
    "D-": "Adverb",
    "I-": "Interjection",
    "N-": "Noun",
    "P-": "Preposition",
    # Codes starting with "R": the article and the pronoun kinds.
    "RA": "Definite Article",
    "RD": "Demonstrative Pronoun",
    "RI": "Indefinite Pronoun",
    "RP": "Personal Pronoun",
    "RR": "Relative Pronoun",
    "V-": "Verb",
    "X-": "Particle",
}

# Translate each code through the table; a code that is not a key of the
# table becomes NaN in the new column.
part_of_speech_codes = DF_WORDS["Part of Speech Code"]
DF_WORDS["Part of Speech"] = part_of_speech_codes.map(parts_of_speech)

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

## Parse Inflection Codes

In [4]:
# Lookup tables, one per character position of the inflection code. Any
# character that is not a key of its table maps to NaN in the output column.
inflection_person = {"1": "First", "2": "Second", "3": "Third"}

# Backward-compatible alias for the original, misspelled name.
infection_person = inflection_person

inflection_tense = {
    "P": "Present",
    "I": "Imperfect",
    "F": "Future",
    "A": "Aorist",
    "X": "Perfect",
    "Y": "Pluperfect",
}

inflection_voice = {
    "A": "Active",
    "M": "Middle",
    "P": "Passive",
}

inflection_mood = {
    "I": "Indicative",
    "D": "Imperative",
    "S": "Subjunctive",
    "O": "Optative",
    "N": "Infinitive",
    "P": "Participle",
}

inflection_case = {
    "N": "Nominative",
    "G": "Genitive",
    "D": "Dative",
    "A": "Accusative",
    "V": "Vocative",
}

inflection_number = {
    "S": "Singular",
    "P": "Plural",
}

inflection_gender = {"M": "Masculine", "F": "Feminine", "N": "Neuter"}

inflection_degree = {"C": "Comparative", "S": "Superlative"}

# (output column, lookup table) pairs. The position of a pair in this tuple
# is also the character position it decodes in "Inflection Codes", and the
# order in which the new columns are appended to DF_WORDS.
inflection_fields = (
    ("Person", inflection_person),
    ("Tense", inflection_tense),
    ("Voice", inflection_voice),
    ("Mood", inflection_mood),
    ("Case", inflection_case),
    ("Number", inflection_number),
    ("Gender", inflection_gender),
    ("Degree", inflection_degree),
)

for position, (column, labels) in enumerate(inflection_fields):
    DF_WORDS[column] = DF_WORDS["Inflection Codes"].str[position].map(labels)

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

## Write morphgnt.csv File

In [5]:
# Write the full frame to morphgnt.csv in the working directory; the row
# index is written as the first column under the header "Index".
DF_WORDS.to_csv(path_or_buf="morphgnt.csv", index_label="Index")