# Word Usage

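Analyzes how often each word (lemma) occurs in the New Testament, both across the whole corpus and within a single book, using the MorphGNT word list joined with a lexeme table.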

## Create Analyzer Class

In [35]:
import pandas as pd
from pprint import pprint


class Analyzer:
    """Build lemma-frequency reports from MorphGNT word data.

    Call ``load_data`` before requesting a report: the report methods read
    the data frames and totals that it stores on the instance.  Set
    ``enable_dump`` to ``True`` to print every intermediate object.
    """

    # Column order shared by every report.  ``reindex`` creates any column
    # that is missing from the joined data and leaves it empty.
    _REPORT_COLUMNS = [
        "Word Index",
        "BDAG Entry",
        "Dodson Entry",
        "Part of Speech",
        "Gloss",
        "Strongs",
        "GK",
        "Word Count",
        "Word Percentage",
        "Word Percentage Cumulative",
    ]

    def __init__(self):
        # When True, ``dump`` prints each intermediate object it is given.
        self.enable_dump = False

    def load_data(self) -> None:
        """Read ``morphgnt.csv`` and ``lexemes.csv`` and cache their sizes."""
        self.DF_MORPHGNT = pd.read_csv("morphgnt.csv", index_col="Index")
        self.DF_LEXEMES = pd.read_csv("lexemes.csv", index_col="Lemma")
        self.dump(self.DF_MORPHGNT, "DF_MORPHGNT")
        self.dump(self.DF_LEXEMES, "DF_LEXEMES")

        self.TOTAL_WORD_COUNT = len(self.DF_MORPHGNT)
        self.TOTAL_LEXEME_COUNT = len(self.DF_LEXEMES)

    def get_new_testament_report(self) -> pd.DataFrame:
        """Return the lemma-frequency report for every loaded word.

        Returns:
            A frame indexed by lemma, most frequent first, with the columns
            listed in ``_REPORT_COLUMNS``.
        """
        return self._build_report(self.DF_MORPHGNT, self.TOTAL_WORD_COUNT)

    def get_book_report(self, book) -> pd.DataFrame:
        """Return the lemma-frequency report for a single book.

        Prints the number of words found in the book before building the
        report.

        Args:
            book: Value compared for equality against the ``Book`` column.

        Returns:
            A frame indexed by lemma, most frequent first, with the columns
            listed in ``_REPORT_COLUMNS``; percentages are relative to the
            book's own word count.
        """
        df_morphgnt_book = self.DF_MORPHGNT[(self.DF_MORPHGNT.Book == book)]
        self.dump(df_morphgnt_book, "df_morphgnt_book")

        total_word_count = len(df_morphgnt_book)
        print(total_word_count)

        return self._build_report(df_morphgnt_book, total_word_count)

    def _build_report(
        self, df_words: pd.DataFrame, total_word_count: int
    ) -> pd.DataFrame:
        """Count lemmas in ``df_words``, rank them, and attach lexeme data."""
        s_lemma_word_counts = df_words.groupby("Lemma").size()
        self.dump(s_lemma_word_counts, "S_LEMMA_WORD_COUNTS")

        df_analysis = s_lemma_word_counts.to_frame(name="Word Count")
        df_analysis.index.name = "Lemma"
        df_analysis["Word Percentage"] = (
            df_analysis["Word Count"] / total_word_count
        ) * 100
        self.dump(df_analysis, "DF_ANALYSIS")

        # Rank from most to least frequent; "Word Index" is the zero-based
        # rank and the cumulative column is the running share of all words.
        df_analysis_sorted = df_analysis.sort_values(
            "Word Percentage", ascending=False
        )
        df_analysis_sorted["Word Index"] = range(len(df_analysis_sorted))
        df_analysis_sorted["Word Percentage Cumulative"] = df_analysis_sorted[
            "Word Percentage"
        ].cumsum()
        self.dump(df_analysis_sorted, "DF_ANALYSIS_SORTED")

        # Index-on-index join: both frames are indexed by lemma.
        df_merged = df_analysis_sorted.join(self.DF_LEXEMES)
        self.dump(df_merged, "DF_MERGED")

        df_report = df_merged.reindex(columns=self._REPORT_COLUMNS)
        self.dump(df_report, "DF_REPORT")

        return df_report

    def dump(self, object, name):
        """Print a labelled debug view of ``object`` when dumping is enabled.

        Args:
            object: Any value.  (The name shadows the builtin; it is kept so
                existing keyword callers continue to work.)
            name: Label printed in the header line.
        """
        if not self.enable_dump:
            return

        print(f"===== {name}")
        print(object.__class__.__name__)
        print("-----")
        # ``vars`` raises TypeError for objects without a ``__dict__``
        # (ints, strings, slotted classes); skip the attribute listing then.
        try:
            attributes = vars(object)
        except TypeError:
            attributes = None
        if attributes is not None:
            pprint(attributes)
        print("-----")
        pprint(object)

## Analyze New Testament

In [None]:
# Load the data and build the report covering every word.
ANALYZER = Analyzer()
ANALYZER.load_data()
new_testament_report = ANALYZER.get_new_testament_report()

# Style the 100 most frequent lemmas: text columns flush left, numeric
# columns flush right, and a bar showing the cumulative percentage (0-100).
left_aligned_columns = ["Gloss", "BDAG Entry", "Dodson Entry", "Part of Speech"]
right_aligned_columns = ["Word Count", "Word Percentage", "Word Percentage Cumulative"]
header_styles = [{"selector": "th", "props": [("text-align", "left")]}]

report_styler = new_testament_report.head(100).style
report_styler = report_styler.hide(axis="index")
report_styler = report_styler.set_properties(
    subset=left_aligned_columns, **{"text-align": "left"}
)
report_styler = report_styler.set_properties(
    subset=right_aligned_columns, **{"text-align": "right"}
)
report_styler = report_styler.set_table_styles(header_styles)
report_styler = report_styler.bar(subset=["Word Percentage Cumulative"], vmax=100)
report_html = report_styler.to_html()

# Save the rendered table next to the notebook.
with open("new_testament_report.html", "w", encoding="utf-8") as file:
    file.write(report_html)

## Display Word Analysis

In [None]:
# Compare one book's most frequent lemmas with the whole New Testament.
ANALYZER = Analyzer()
ANALYZER.load_data()

new_testament_report = ANALYZER.get_new_testament_report()
book_report = ANALYZER.get_book_report(1)

# Column assignment aligns on the shared Lemma index, so every lemma in the
# book picks up its rank from the New Testament-wide report.
book_report["New Testament Word Index"] = new_testament_report["Word Index"]

# Keep lemmas ranked in the top 100 of the book or of the New Testament.
in_book_top_100 = book_report["Word Index"] < 100
in_new_testament_top_100 = book_report["New Testament Word Index"] < 100
book_top_100 = book_report[in_book_top_100 | in_new_testament_top_100]

book_top_100


# ANALYZER.get_new_testament_report().head(100).style.hide(axis="index").set_properties(
#     subset=["Gloss", "BDAG Entry", "Dodson Entry", "Part of Speech"],
#     **{"text-align": "left"}
# ).set_table_styles([{"selector": "th", "props": [("text-align", "left")]}]).bar(
#     subset=["Word Percentage Cumulative"], vmax=100

# )

# ANALYZER.get_book_report(2).head(100).style.hide(axis="index").set_properties(
#     subset=["Gloss", "BDAG Entry", "Dodson Entry", "Part of Speech"],
#     **{"text-align": "left"}
# ).set_table_styles([{"selector": "th", "props": [("text-align", "left")]}]).bar(
#     subset=["Word Percentage Cumulative"], vmax=100
# )

18329


Unnamed: 0_level_0,Word Index,BDAG Entry,Dodson Entry,Part of Speech,Gloss,Strongs,GK,Word Count,Word Percentage,Word Percentage Cumulative,New Testament Word Index
Lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ὁ,0,ὁ,"ὁ, ἡ, τό",Definite Article,the,3588,3836,2782,15.178133,15.178133,0
καί,1,καί,καί,Conjunction,"and, even, also, namely",2532,2779,1174,6.405150,21.583283,1
αὐτός,2,αὐτός,"αὐτός, αὐτή, αὐτό",Personal Pronoun,"he, she, it, they, them, same",846,899,918,5.008457,26.591740,2
λέγω,3,λέγω,λέγω,Verb,"I say, speak",3004,3306,505,2.755197,29.346937,8
δέ,4,δέ,δέ,Conjunction,"but, on the other hand, and",1161,1254,493,2.689727,32.036663,4
...,...,...,...,...,...,...,...,...,...,...,...
δόξα,343,δόξα,"δόξα, ης, ἡ",Noun,"honor, renown, glory splendor",1391,1518,7,0.038191,84.636369,94
ἔργον,392,ἔργον,"ἔργον, ου, τό",Noun,"work, labor, action, deed",2041,2240,6,0.032735,86.289487,93
Ἰουδαῖος,413,Ἰουδαῖος,"Ἰουδαῖος, αία, αῖον",Adjective,Jewish,2453,2681,5,0.027279,86.938731,79
τέ,588,τέ,τέ,Conjunction,"and, both",5037,5445,3,0.016368,90.954226,73
