# Word Usage

Determines New Testament word usage.

## Define File Names

In [1]:
# Input CSV files, both read by Analyzer.load_data.
LEXEMES_CSV = "lexemes.csv"  # loaded with the Lemma column as its index
MORPHGNT_CSV = "morphgnt.csv"  # loaded with the Index column as its index

## Define Column Names

In [2]:
# Column labels used when reading the CSV files and when building reports.
# Keeping them as constants avoids repeating the literal strings.
BDAG_ENTRY = "BDAG Entry"  # fallback source for the Lexical Entry column
BOOK = "Book"  # filtered on by Analyzer.get_book_report
CHAPTER = "Chapter"  # filtered on by Analyzer.get_book_report
DODSON_ENTRY = "Dodson Entry"  # preferred source for the Lexical Entry column
GK = "GK"
GLOSS = "Gloss"
INDEX = "Index"  # index column of the morphgnt CSV
LEMMA = "Lemma"  # index column of the lexemes CSV; also the grouping key for counts
LEXICAL_ENTRY = "Lexical Entry"  # derived: Dodson entry, else BDAG entry
NEW_TESTAMENT_WORD_INDEX = "New Testament Word Index"  # optional column in book reports
PART_OF_SPEECH = "Part of Speech"
STRONGS = "Strongs"
WORD_COUNT = "Word Count"  # derived: occurrences of a lemma
WORD_INDEX = "Word Index"  # derived: 1-based rank by descending frequency
WORD_PERCENTAGE = "Word Percentage"  # derived: word count / total words
WORD_PERCENTAGE_CUMULATIVE = "Word Percentage Cumulative"  # derived: running sum

## Define Book Names

In [3]:
# Maps the numeric book identifier used in the Book column (1-27) to the
# book's display name. The name appears in report headings and is embedded
# in the generated report file names.
BOOKS = {
    1: "Matthew",
    2: "Mark",
    3: "Luke",
    4: "John",
    5: "Acts",
    6: "Romans",
    7: "I Corinthians",
    8: "II Corinthians",
    9: "Galatians",
    10: "Ephesians",
    11: "Philippians",
    12: "Colossians",
    13: "I Thessalonians",
    14: "II Thessalonians",
    15: "I Timothy",
    16: "II Timothy",
    17: "Titus",
    18: "Philemon",
    19: "Hebrews",
    20: "James",
    21: "I Peter",
    22: "II Peter",
    23: "I John",
    24: "II John",
    25: "III John",
    26: "Jude",
    # The book's title is singular: "Revelation", not "Revelations".
    27: "Revelation",
}

## Create Report Class

In [4]:
class Report:
    """Holds a word-usage DataFrame and renders it as a styled table."""

    def __init__(self, df):
        """Store the report data.

        Args:
            df: DataFrame with the report columns (word index, lexical
                entry, counts, percentages, ...).
        """
        self.properties = {}
        self.df = df

    def get_styler(self, count=None):
        """Build a pandas Styler for the report.

        Args:
            count: Number of leading rows to include. A falsy value
                (``None`` or ``0``) includes every row.

        Returns:
            A Styler with the index hidden, percentage columns formatted
            as percentages, text columns left-aligned, numeric columns
            right-aligned, and a bar drawn on the cumulative percentage.
            When the New Testament word index column is present, values
            above 100 in it get a light green background.
        """
        if count:
            shown = self.df.head(count)
        else:
            shown = self.df

        percent_formats = {
            WORD_PERCENTAGE: "{:.2%}",
            WORD_PERCENTAGE_CUMULATIVE: "{:.2%}",
        }
        text_columns = [GLOSS, LEXICAL_ENTRY, PART_OF_SPEECH]
        numeric_columns = [WORD_COUNT, WORD_PERCENTAGE, WORD_PERCENTAGE_CUMULATIVE]
        header_style = {"selector": "th", "props": [("text-align", "left")]}

        styler = shown.style.hide(axis="index")
        styler = styler.format(percent_formats, precision=2, na_rep="")
        styler = styler.set_properties(subset=text_columns, **{"text-align": "left"})
        styler = styler.set_properties(subset=numeric_columns, **{"text-align": "right"})
        styler = styler.set_table_styles([header_style])
        styler = styler.bar(
            subset=[WORD_PERCENTAGE_CUMULATIVE], color="LightBlue", vmax=1
        )

        if NEW_TESTAMENT_WORD_INDEX in shown:

            def mark_beyond_100(row):
                # One CSS string per cell of the row handed in by Styler.apply.
                return [
                    "background: LightGreen" if value > 100 else "" for value in row
                ]

            styler = styler.apply(
                mark_beyond_100,
                subset=[NEW_TESTAMENT_WORD_INDEX],
                axis="columns",
            )

        return styler

## Create Analyzer Class

In [5]:
import pandas as pd
from pprint import pprint


class Analyzer:
    """Builds word-frequency reports from the word and lexeme CSV files.

    Call ``load_data`` first; the report methods read the DataFrames it
    stores on the instance.
    """

    # Highest 1-based word rank that still counts as a "top" word in a
    # book report.
    _TOP_WORD_COUNT = 100

    def __init__(self):
        # When True, _dump prints every intermediate object it is given.
        self.enable_dump = False

    def load_data(self):
        """Read both CSV files and record their row counts.

        Sets ``DF_MORPHGNT`` (indexed by the Index column), ``DF_LEXEMES``
        (indexed by the Lemma column), ``TOTAL_WORD_COUNT`` and
        ``TOTAL_LEXEME_COUNT``.
        """
        self.DF_MORPHGNT = pd.read_csv(MORPHGNT_CSV, index_col=INDEX)
        self.DF_LEXEMES = pd.read_csv(LEXEMES_CSV, index_col=LEMMA)
        self._dump(self.DF_MORPHGNT, "DF_MORPHGNT")
        self._dump(self.DF_LEXEMES, "DF_LEXEMES")

        self.TOTAL_WORD_COUNT = len(self.DF_MORPHGNT)
        self.TOTAL_LEXEME_COUNT = len(self.DF_LEXEMES)

    def get_new_testament_report(self):
        """Return a Report covering every row of the loaded word data."""
        report_df = self._create_report_df(self.DF_MORPHGNT)

        return Report(report_df)

    def get_book_report(self, book, chapter=None, add_nt_word_index=None):
        """Return a Report of the top words of one book (or one chapter).

        A word is kept when its rank within the selection is at most 100
        or, if ``add_nt_word_index`` is truthy, when its rank across all
        loaded word data is at most 100. The cumulative percentage is
        recomputed over the rows that are kept.

        Args:
            book: Value matched against the Book column.
            chapter: Optional value matched against the Chapter column.
                A falsy value selects the whole book.
            add_nt_word_index: When truthy, insert the New Testament Word
                Index column (rank over all loaded word data) as the
                second column.

        Returns:
            A Report wrapping the filtered DataFrame.
        """
        df_morphgnt_book = self.DF_MORPHGNT[self.DF_MORPHGNT[BOOK] == book]
        if chapter:
            df_morphgnt_book = df_morphgnt_book[df_morphgnt_book[CHAPTER] == chapter]
        self._dump(df_morphgnt_book, "df_morphgnt_book")

        report_df = self._create_report_df(df_morphgnt_book)

        # Word ranks are 1-based, so the top 100 are ranks 1..100 inclusive.
        is_top_word = report_df[WORD_INDEX] <= self._TOP_WORD_COUNT

        if add_nt_word_index:
            new_testament_report_df = self._create_report_df(self.DF_MORPHGNT)
            # Both frames are indexed by lemma, so insert aligns on lemma.
            report_df.insert(
                loc=1,
                column=NEW_TESTAMENT_WORD_INDEX,
                value=new_testament_report_df[WORD_INDEX],
            )
            # Only consult the New Testament rank when the column exists;
            # without this guard the default call raised KeyError.
            is_top_word = is_top_word | (
                report_df[NEW_TESTAMENT_WORD_INDEX] <= self._TOP_WORD_COUNT
            )

        # Copy so the cumulative column can be overwritten without writing
        # into a view of report_df.
        book_top_100 = report_df[is_top_word].copy()
        book_top_100[WORD_PERCENTAGE_CUMULATIVE] = book_top_100[WORD_PERCENTAGE].cumsum()

        return Report(book_top_100)

    def _create_report_df(self, df_morphgnt):
        """Count lemmas in ``df_morphgnt`` and join the lexeme details.

        Args:
            df_morphgnt: Word rows containing a Lemma column.

        Returns:
            A DataFrame indexed by lemma, sorted by descending frequency,
            with the columns Word Index, Lexical Entry, Part of Speech,
            Gloss, Strongs, GK, Word Count, Word Percentage and Word
            Percentage Cumulative.
        """
        total_word_count = len(df_morphgnt)

        s_lemma_word_counts = df_morphgnt.groupby(LEMMA).size()
        self._dump(s_lemma_word_counts, "S_LEMMA_WORD_COUNTS")

        df_analysis = s_lemma_word_counts.to_frame(name=WORD_COUNT)
        df_analysis.index.name = LEMMA
        df_analysis[WORD_PERCENTAGE] = df_analysis[WORD_COUNT] / total_word_count
        self._dump(df_analysis, "DF_ANALYSIS")

        # Rank lemmas from most to least frequent; the rank is 1-based.
        df_analysis_sorted = df_analysis.sort_values(WORD_PERCENTAGE, ascending=False)
        df_analysis_sorted[WORD_INDEX] = range(1, len(df_analysis_sorted) + 1)
        df_analysis_sorted[WORD_PERCENTAGE_CUMULATIVE] = df_analysis_sorted[
            WORD_PERCENTAGE
        ].cumsum()
        self._dump(df_analysis_sorted, "DF_ANALYSIS_SORTED")

        # Both frames are indexed by lemma, so join matches on lemma.
        df_merged = df_analysis_sorted.join(self.DF_LEXEMES)
        self._dump(df_merged, "DF_MERGED")

        # Prefer the Dodson entry; fall back to the BDAG entry where it is missing.
        df_merged[LEXICAL_ENTRY] = df_merged[DODSON_ENTRY].combine_first(df_merged[BDAG_ENTRY])

        # Select and order the report columns.
        df_report = df_merged.reindex(
            columns=[
                WORD_INDEX,
                LEXICAL_ENTRY,
                PART_OF_SPEECH,
                GLOSS,
                STRONGS,
                GK,
                WORD_COUNT,
                WORD_PERCENTAGE,
                WORD_PERCENTAGE_CUMULATIVE,
            ]
        )
        self._dump(df_report, "DF_REPORT")

        return df_report

    def _dump(self, object, name):
        """Print ``object`` for debugging when ``enable_dump`` is set.

        Args:
            object: Value to print; its class name, ``vars()`` and
                pretty-printed form are shown.
            name: Label printed in the header line.
        """
        if self.enable_dump:
            print(f"===== {name}")
            print(object.__class__.__name__)
            print("-----")
            pprint(vars(object))
            print("-----")
            pprint(object)

## Analyze New Testament

In [6]:
# Build the analyzer and read both CSV files.
ANALYZER = Analyzer()

ANALYZER.load_data()

# Report over all loaded word data.
new_testament_report = ANALYZER.get_new_testament_report()

# Limit the rendered table to the first 100 rows.
new_testament_report_styler = new_testament_report.get_styler(100)
# Bare expression: presumably rendered as the notebook cell's output.
new_testament_report_styler

Word Index,Lexical Entry,Part of Speech,Gloss,Strongs,GK,Word Count,Word Percentage,Word Percentage Cumulative
1,"ὁ, ἡ, τό",Definite Article,the,3588,3836,19769,14.37%,14.37%
2,καί,Conjunction,"and, even, also, namely",2532,2779,8973,6.52%,20.90%
3,"αὐτός, αὐτή, αὐτό",Personal Pronoun,"he, she, it, they, them, same",846,899,5546,4.03%,24.93%
4,"σύ, σοῦ, σοί, σέ",Personal Pronoun,you,4771,5148,2894,2.10%,27.03%
5,δέ,Conjunction,"but, on the other hand, and",1161,1254,2766,2.01%,29.04%
6,ἐν,Preposition,"in, on, among",1722,1877,2733,1.99%,31.03%
7,ἐγώ,Personal Pronoun,I,1473,1609,2572,1.87%,32.90%
8,εἰμί,Verb,"I am, exist",1510,1639,2456,1.79%,34.68%
9,λέγω,Verb,"I say, speak",3004,3306,2345,1.70%,36.39%
10,εἰς,Preposition,"into, in, among, till, for",1519,1650,1754,1.28%,37.66%


## Analyze Book

In [7]:
# Build a fresh analyzer and read both CSV files.
ANALYZER = Analyzer()

ANALYZER.load_data()

# Book 1 (Matthew in BOOKS), chapter 1, with the New Testament rank column added.
book_report = ANALYZER.get_book_report(1, 1, add_nt_word_index=True)

# No row limit: every row of the report is styled.
report_styler = book_report.get_styler()
# Bare expression: presumably rendered as the notebook cell's output.
report_styler

Word Index,New Testament Word Index,Lexical Entry,Part of Speech,Gloss,Strongs,GK,Word Count,Word Percentage,Word Percentage Cumulative
1,1,"ὁ, ἡ, τό",Definite Article,the,3588.0,3836,76,17.43%,17.43%
2,5,δέ,Conjunction,"but, on the other hand, and",1161.0,1254,44,10.09%,27.52%
3,173,γεννάω,Verb,"I beget, bring forth, give birth to",1080.0,1164,41,9.40%,36.93%
4,3,"αὐτός, αὐτή, αὐτό",Personal Pronoun,"he, she, it, they, them, same",846.0,899,19,4.36%,41.28%
5,2,καί,Conjunction,"and, even, also, namely",2532.0,2779,12,2.75%,44.04%
6,19,"ἐκ, ἐξ",Preposition,"from out, out from among, from",1537.0,1666,7,1.61%,45.64%
7,274,"Δαυίδ, ὁ",Noun,David,1138.0,1253,6,1.38%,47.02%
8,48,"υἱός, οῦ, ὁ",Noun,"a son, descendent",5207.0,5626,6,1.38%,48.39%
9,28,ἀπό,Preposition,"from, away from",575.0,608,5,1.15%,49.54%
10,411,"Ἰωσήφ, ὁ",Noun,Joseph,2501.0,2737,5,1.15%,50.69%


## Create Usage Reports

In [8]:
from pathlib import Path

ANALYZER = Analyzer()
ANALYZER.load_data()

# Every generated HTML file is written into this directory.
reports_dir = Path("reports")
reports_dir.mkdir(exist_ok=True)

# New Testament summary report, limited to the first 100 rows.
new_testament_report = ANALYZER.get_new_testament_report()
new_testament_report_styler = new_testament_report.get_styler(100)

report_html = "<h2>New Testament</h2>" + new_testament_report_styler.to_html()
(reports_dir / "words_00_new_testament.html").write_text(
    report_html, encoding="utf-8"
)

# One detail report per book; the file name carries the zero-padded book
# number followed by the book name.
for book_number, book_name in BOOKS.items():
    book_report = ANALYZER.get_book_report(book_number, add_nt_word_index=True)
    report_styler = book_report.get_styler()

    report_html = f"<h2>{book_name}</h2>" + report_styler.to_html()
    report_path = reports_dir / f"words_{book_number:02d}_{book_name}.html"
    report_path.write_text(report_html, encoding="utf-8")