# Word Usage

Determines New Testament word usage.

## Define File Names

In [1]:
# Input CSV files. The paths are relative, so they are resolved against the
# current working directory when Analyzer.load_data() reads them.
LEXEMES_CSV = "lexemes.csv"  # loaded with the "Lemma" column as its index
MORPHGNT_CSV = "morphgnt.csv"  # loaded with the "Index" column as its index

## Define Column Names

In [2]:
# Column names shared by the input CSV files and the generated reports.
# Columns marked "computed" are created by Analyzer. The unmarked lexical
# columns are presumably supplied by lexemes.csv through the lemma join --
# verify against the data files.
BDAG_ENTRY = "BDAG Entry"
BOOK = "Book"  # morphgnt.csv column filtered on by get_book_report
CHAPTER = "Chapter"  # morphgnt.csv column filtered on by get_book_report
DODSON_ENTRY = "Dodson Entry"
GK = "GK"
GLOSS = "Gloss"
INDEX = "Index"  # index column of morphgnt.csv
LEMMA = "Lemma"  # grouping key in morphgnt.csv and index column of lexemes.csv
NEW_TESTAMENT_WORD_INDEX = "New Testament Word Index"  # computed
PART_OF_SPEECH = "Part of Speech"
STRONGS = "Strongs"
WORD_COUNT = "Word Count"  # computed
WORD_INDEX = "Word Index"  # computed
WORD_PERCENTAGE = "Word Percentage"  # computed
WORD_PERCENTAGE_CUMULATIVE = "Word Percentage Cumulative"  # computed

## Create Analyzer Class

In [3]:
import pandas as pd
from pprint import pprint


class Analyzer:
    """Build lemma-frequency reports from the MorphGNT and lexeme CSV files.

    Call :meth:`load_data` before requesting a report; the report methods read
    the data frames that it stores on the instance.

    Attributes:
        enable_dump: When true, :meth:`_dump` prints every intermediate object
            it is handed. Defaults to ``False``.
        DF_MORPHGNT: Frame read from ``MORPHGNT_CSV`` (set by ``load_data``).
        DF_LEXEMES: Frame read from ``LEXEMES_CSV`` (set by ``load_data``).
        TOTAL_WORD_COUNT: Number of rows in ``DF_MORPHGNT``.
        TOTAL_LEXEME_COUNT: Number of rows in ``DF_LEXEMES``.
    """

    def __init__(self):
        """Create an analyzer with debug dumping switched off."""
        self.enable_dump = False

    def load_data(self):
        """Read both CSV files and record their row counts on the instance."""
        self.DF_MORPHGNT = pd.read_csv(MORPHGNT_CSV, index_col=INDEX)
        self.DF_LEXEMES = pd.read_csv(LEXEMES_CSV, index_col=LEMMA)
        self._dump(self.DF_MORPHGNT, "DF_MORPHGNT")
        self._dump(self.DF_LEXEMES, "DF_LEXEMES")

        self.TOTAL_WORD_COUNT = len(self.DF_MORPHGNT)
        self.TOTAL_LEXEME_COUNT = len(self.DF_LEXEMES)

    def get_new_testament_report(self):
        """Return the frequency report computed over every loaded word row."""
        return self._get_report(self.DF_MORPHGNT)

    def get_book_report(self, book, chapter=None, add_nt_word_index=None):
        """Return the frequency report for one book, optionally one chapter.

        Args:
            book: Value compared for equality against the ``BOOK`` column.
            chapter: Value compared for equality against the ``CHAPTER``
                column. Any falsy value (``None``, ``0``) keeps the whole book.
            add_nt_word_index: When truthy, insert a
                ``NEW_TESTAMENT_WORD_INDEX`` column (second position) holding
                each lemma's rank in the whole-New-Testament report.

        Returns:
            The report frame produced by :meth:`_get_report` for the selection.
        """
        df_morphgnt_book = self.DF_MORPHGNT[(self.DF_MORPHGNT[BOOK] == book)]
        if chapter:
            df_morphgnt_book = df_morphgnt_book[(df_morphgnt_book[CHAPTER] == chapter)]
        self._dump(df_morphgnt_book, "df_morphgnt_book")

        report = self._get_report(df_morphgnt_book)

        if add_nt_word_index:
            new_testament_report = self.get_new_testament_report()
            new_testament_word_index = new_testament_report[WORD_INDEX]
            # Both frames are indexed by lemma, so the inserted Series is
            # aligned to the book report's rows by lemma rather than position.
            report.insert(
                loc=1, column=NEW_TESTAMENT_WORD_INDEX, value=new_testament_word_index
            )

        return report

    def get_report_styler(self, df_report):
        """Return a pandas ``Styler`` that formats a report frame for display.

        The styler hides the index, renders both percentage columns with
        ``"{:.2%}"``, left-aligns the text columns and headers, right-aligns
        the count and percentage columns, and draws a light-blue bar in the
        cumulative-percentage column with its upper limit fixed at 1.

        When the frame has a ``NEW_TESTAMENT_WORD_INDEX`` column, cells in
        that column whose value is greater than 100 get a light-green
        background.

        Args:
            df_report: Report frame as returned by the report methods.

        Returns:
            The configured ``Styler``.
        """
        report_styler = (
            df_report.style.hide(axis="index")
            .format(
                {WORD_PERCENTAGE: "{:.2%}", WORD_PERCENTAGE_CUMULATIVE: "{:.2%}"},
                precision=2,
            )
            .set_properties(
                subset=[GLOSS, BDAG_ENTRY, DODSON_ENTRY, PART_OF_SPEECH],
                **{"text-align": "left"},
            )
            .set_properties(
                subset=[WORD_COUNT, WORD_PERCENTAGE, WORD_PERCENTAGE_CUMULATIVE],
                **{"text-align": "right"},
            )
            .set_table_styles([{"selector": "th", "props": [("text-align", "left")]}])
            .bar(subset=[WORD_PERCENTAGE_CUMULATIVE], color="LightBlue", vmax=1)
        )

        if NEW_TESTAMENT_WORD_INDEX in df_report:
            report_styler = report_styler.apply(
                lambda x: ["background: LightGreen" if v > 100 else "" for v in x],
                subset=[NEW_TESTAMENT_WORD_INDEX],
                axis="columns",
            )

        return report_styler

    def _get_report(self, df_morphgnt):
        """Build the frequency report for the given word rows.

        Rows are grouped by ``LEMMA`` and counted. Each lemma gets its share
        of the selection's total row count, a 1-based rank by descending
        share, and the running sum of shares in rank order. The result is then
        left-joined with ``DF_LEXEMES`` on the lemma index and reduced to the
        report columns.

        Args:
            df_morphgnt: Frame of word rows containing a ``LEMMA`` column.

        Returns:
            A frame indexed by lemma with the report columns in display order.
        """
        total_word_count = len(df_morphgnt)

        s_lemma_word_counts = df_morphgnt.groupby(LEMMA).size()
        self._dump(s_lemma_word_counts, "S_LEMMA_WORD_COUNTS")

        df_analysis = s_lemma_word_counts.to_frame(name=WORD_COUNT)
        df_analysis.index.name = LEMMA
        df_analysis[WORD_PERCENTAGE] = df_analysis[WORD_COUNT] / total_word_count
        self._dump(df_analysis, "DF_ANALYSIS")

        df_analysis_sorted = df_analysis.sort_values(WORD_PERCENTAGE, ascending=False)
        # Rank 1 is the most frequent lemma in the selection.
        df_analysis_sorted[WORD_INDEX] = range(1, len(df_analysis_sorted) + 1)
        df_analysis_sorted[WORD_PERCENTAGE_CUMULATIVE] = df_analysis_sorted[
            WORD_PERCENTAGE
        ].cumsum()
        self._dump(df_analysis_sorted, "DF_ANALYSIS_SORTED")

        df_merged = df_analysis_sorted.join(self.DF_LEXEMES)
        self._dump(df_merged, "DF_MERGED")

        # reindex both selects and orders the report columns.
        df_report = df_merged.reindex(
            columns=[
                WORD_INDEX,
                BDAG_ENTRY,
                DODSON_ENTRY,
                PART_OF_SPEECH,
                GLOSS,
                STRONGS,
                GK,
                WORD_COUNT,
                WORD_PERCENTAGE,
                WORD_PERCENTAGE_CUMULATIVE,
            ]
        )
        self._dump(df_report, "DF_REPORT")

        return df_report

    def _dump(self, object, name):
        """Print a debug view of *object* when ``enable_dump`` is true.

        Prints a header with *name*, the object's class name, its instance
        attributes, and finally the object itself. Does nothing when
        ``enable_dump`` is false.

        Args:
            object: Any value to inspect.
            name: Label printed in the header line.
        """
        if not self.enable_dump:
            return

        print(f"===== {name}")
        print(object.__class__.__name__)
        print("-----")
        # vars() raises TypeError for objects without a __dict__ (ints,
        # tuples, slotted instances); a debug helper must not abort the
        # analysis because of that.
        try:
            attributes = vars(object)
        except TypeError:
            print("(no instance attributes)")
        else:
            pprint(attributes)
        print("-----")
        pprint(object)

## Analyze New Testament

In [4]:
# Build an analyzer and read both CSV files.
ANALYZER = Analyzer()

ANALYZER.load_data()

# Frequency report over every word row in the loaded data.
new_testament_report = ANALYZER.get_new_testament_report()

# Style only the first 100 rows, i.e. the 100 highest-ranked lemmas.
new_testament_report_styler = ANALYZER.get_report_styler(new_testament_report.head(100))

report_html = new_testament_report_styler.to_html()

# Write the rendered table as UTF-8 so the Greek text is stored consistently
# regardless of the platform's default encoding.
with open("new_testament_report.html", "w", encoding="utf-8") as file:
    file.write(report_html)

## Analyze Book

In [5]:
# Build an analyzer and read both CSV files.
ANALYZER = Analyzer()

ANALYZER.load_data()

# Report for book 4, chapter 1, with each lemma's New Testament rank added.
book_report = ANALYZER.get_book_report(4, 1, add_nt_word_index=True)

# Keep lemmas ranked in the top 100 of either the selection or the whole New
# Testament. Ranks are 1-based, so the bound must be inclusive; "< 100" would
# silently drop the 100th lemma and disagree with the "> 100" highlight
# threshold used by get_report_styler.
book_top_100 = book_report[
    (book_report[WORD_INDEX] <= 100) | (book_report[NEW_TESTAMENT_WORD_INDEX] <= 100)
]
# Recompute the running total over the kept rows only, so the cumulative
# column reflects the filtered table rather than the full report.
book_top_100 = book_top_100.drop(columns=[WORD_PERCENTAGE_CUMULATIVE])
book_top_100[WORD_PERCENTAGE_CUMULATIVE] = book_top_100[WORD_PERCENTAGE].cumsum()

report_styler = ANALYZER.get_report_styler(book_top_100)
# Bare expression: the notebook renders the styled table as the cell output.
report_styler

Word Index,New Testament Word Index,BDAG Entry,Dodson Entry,Part of Speech,Gloss,Strongs,GK,Word Count,Word Percentage,Word Percentage Cumulative
1,1,ὁ,"ὁ, ἡ, τό",Definite Article,the,3588.0,3836,110,13.32%,13.32%
2,2,καί,καί,Conjunction,"and, even, also, namely",2532.0,2779,55,6.66%,19.98%
3,3,αὐτός,"αὐτός, αὐτή, αὐτό",Personal Pronoun,"he, she, it, they, them, same",846.0,899,49,5.93%,25.91%
4,8,εἰμί,εἰμί,Verb,"I am, exist",1510.0,1639,37,4.48%,30.39%
5,9,λέγω,λέγω,Verb,"I say, speak",3004.0,3306,30,3.63%,34.02%
6,7,ἐγώ,ἐγώ,Personal Pronoun,I,1473.0,1609,19,2.30%,36.32%
7,4,σύ,"σύ, σοῦ, σοί, σέ",Personal Pronoun,you,4771.0,5148,15,1.82%,38.14%
8,11,οὐ,οὐ,Adverb,"no, not",3756.0,4024,14,1.69%,39.83%
9,12,ὅς,"ὅς, ἥ, ὅ",Relative Pronoun,"who, which, what, that",3739.0,4005,14,1.69%,41.53%
10,6,ἐν,ἐν,Preposition,"in, on, among",1722.0,1877,14,1.69%,43.22%
