# Preprocess MorphGNT

Processes all MorphGNT text files into a standard `morphgnt.csv` file.

## Define File Names

In [1]:
# Directory that is searched for the MorphGNT source `*.txt` files.
# The path is relative, so it is resolved against the current working directory.
INPUT_PATH_NAME: str = "../BibleCore/Resources/MorphGnt"
# Name of the CSV file that the final cell writes the combined table to.
OUTPUT_FILE_NAME: str = "morphgnt.csv"

## Define Column Names

In [2]:
# Column labels used throughout the notebook, grouped by where each column
# comes from. Only the grouping differs from an alphabetical listing; the
# label strings are what end up as headers in the written CSV.

# -- Columns read directly from the MorphGNT files (in file order) --
SCRIPTURE_REFERENCE = "Scripture Reference"
PART_OF_SPEECH_CODE = "Part of Speech Code"
INFLECTION_CODES = "Inflection Codes"
TEXT = "Text"
WORD = "Word"
NORMALIZED_WORD = "Normalized Word"
LEMMA = "Lemma"

# -- Columns derived from the scripture reference --
BOOK = "Book"
CHAPTER = "Chapter"
VERSE = "Verse"

# -- Column derived from the part-of-speech code --
PART_OF_SPEECH = "Part of Speech"

# -- Columns derived from the inflection codes (in code-position order) --
PERSON = "Person"
TENSE = "Tense"
VOICE = "Voice"
MOOD = "Mood"
CASE = "Case"
NUMBER = "Number"
GENDER = "Gender"
DEGREE = "Degree"

# -- Label given to the row index when the CSV is written --
INDEX = "Index"

## Parse MorphGNT Files into DataFrame (DF_WORDS)

Loads every MorphGNT `*.txt` file found in `INPUT_PATH_NAME` and concatenates them into a single DataFrame, `DF_WORDS`.

In [3]:
import pandas as pd
from glob import glob
from os import path
from pprint import pprint

# Collect the input files in sorted (lexicographic file-name) order.
# glob() makes no ordering guarantee, and the concatenation order below
# decides the row order -- and therefore the index -- of the final table,
# so sorting keeps the output reproducible across machines and runs.
all_files = sorted(glob(path.join(INPUT_PATH_NAME, "*.txt")))

if not all_files:
    # Fail early with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        f"No MorphGNT '*.txt' files found in {INPUT_PATH_NAME!r}"
    )

# One label per whitespace-separated field of an input line, in file order.
morphgnt_columns = [
    SCRIPTURE_REFERENCE,
    PART_OF_SPEECH_CODE,
    INFLECTION_CODES,
    TEXT,
    WORD,
    NORMALIZED_WORD,
    LEMMA,
]

# Read each file into its own frame and stack them into one table with a
# fresh 0..n-1 index.
DF_WORDS = pd.concat(
    (
        pd.read_csv(
            f,
            names=morphgnt_columns,
            # Keep the reference as text: it is sliced by character position
            # later, and a numeric dtype would drop any leading zeros.
            dtype={SCRIPTURE_REFERENCE: "str"},
            # Fields are separated by runs of whitespace (regex separator).
            sep=r"\s+",
            # Never treat the first field as the row index.
            index_col=False,
        )
        for f in all_files
    ),
    ignore_index=True,
)

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

## Parse Scripture Reference

Parses the `Scripture Reference` column into separate `Book`, `Chapter`, and `Verse` columns.

In [4]:
# The scripture reference is treated as three consecutive two-character
# fields: book (chars 0-1), chapter (chars 2-3) and verse (chars 4-5).
# Each field is cut out by position and converted to an integer column.
reference_fields = (
    (BOOK, 0, 2),
    (CHAPTER, 2, 4),
    (VERSE, 4, 6),
)

for field_column, field_start, field_stop in reference_fields:
    field_text = DF_WORDS[SCRIPTURE_REFERENCE].str.slice(field_start, field_stop)
    DF_WORDS[field_column] = field_text.astype(int)

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

## Parse Part of Speech Code

Maps the values in `Part of Speech Code` into `Part of Speech`.

In [5]:
# Lookup table from the two-character part-of-speech code to a readable label.
parts_of_speech = {
    # "R?" codes: the article and the pronoun sub-types.
    "RA": "Definite Article",
    "RD": "Pronoun - Demonstrative",
    # NOTE(review): the upstream MorphGNT documentation appears to describe
    # "RI" as interrogative/indefinite -- confirm whether this label should
    # mention interrogatives as well.
    "RI": "Pronoun - Indefinite",
    "RP": "Pronoun - Personal",
    "RR": "Pronoun - Relative",
    # Single-letter codes padded with "-".
    "A-": "Adjective",
    "C-": "Conjunction",
    "D-": "Adverb",
    "I-": "Interjection",
    "N-": "Noun",
    "P-": "Preposition",
    "V-": "Verb",
    "X-": "Particle",
}

# Translate every code through the table; a code that is not a key of the
# table yields a missing value (NaN) in the new column.
part_of_speech_codes = DF_WORDS[PART_OF_SPEECH_CODE]
DF_WORDS[PART_OF_SPEECH] = part_of_speech_codes.map(parts_of_speech)

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

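## Parse Inflection Codes

Maps each character position of `Inflection Codes` into its own column: `Person`, `Tense`, `Voice`, `Mood`, `Case`, `Number`, `Gender`, and `Degree`.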

In [6]:
# Lookup tables, one per character position of the inflection code string.
# Each maps the single-character code found at that position to a label.

inflection_person = {"1": "First", "2": "Second", "3": "Third"}
# Backward-compatible alias: this table was originally (mis)spelled
# `infection_person`; keep the old name bound in case anything still uses it.
infection_person = inflection_person

inflection_tense = {
    "P": "Present",
    "I": "Imperfect",
    "F": "Future",
    "A": "Aorist",
    "X": "Perfect",
    "Y": "Pluperfect",
}

inflection_voice = {
    "A": "Active",
    "M": "Middle",
    "P": "Passive",
}

inflection_mood = {
    "I": "Indicative",
    "D": "Imperative",
    "S": "Subjunctive",
    "O": "Optative",
    "N": "Infinitive",
    "P": "Participle",
}

inflection_case = {
    "N": "Nominative",
    "G": "Genitive",
    "D": "Dative",
    "A": "Accusative",
    "V": "Vocative",
}

inflection_number = {
    "S": "Singular",
    "P": "Plural",
}

inflection_gender = {"M": "Masculine", "F": "Feminine", "N": "Neuter"}

inflection_degree = {"C": "Comparative", "S": "Superlative"}

# Output column and lookup table for each position of the inflection code.
# The tuple order IS the character position (0 = person ... 7 = degree), so
# the entries must stay in this order.
inflection_fields = (
    (PERSON, inflection_person),
    (TENSE, inflection_tense),
    (VOICE, inflection_voice),
    (MOOD, inflection_mood),
    (CASE, inflection_case),
    (NUMBER, inflection_number),
    (GENDER, inflection_gender),
    (DEGREE, inflection_degree),
)

# Take the character at each position and translate it through that
# position's table. Characters that are not keys of the table (and codes
# too short to have that position) yield a missing value (NaN).
for code_position, (field_column, field_labels) in enumerate(inflection_fields):
    DF_WORDS[field_column] = (
        DF_WORDS[INFLECTION_CODES].str[code_position].map(field_labels)
    )

# print("===== DF_WORDS")
# print(DF_WORDS.__class__.__name__)
# print("-----")
# pprint(vars(DF_WORDS))
# print("-----")
# pprint(DF_WORDS)

## Write morphgnt.csv File

In [7]:
# Write the complete table to the output file; the row index is written as
# the first column under the INDEX label.
DF_WORDS.to_csv(
    path_or_buf=OUTPUT_FILE_NAME,
    index_label=INDEX,
)