# Word Usage

Determines New Testament word usage.

## Define File Names

In [25]:
# Input files read by Analyzer.load_data().
LEXEMES_CSV = "lexemes.csv"  # read with the "Lemma" column as index
MORPHGNT_CSV = "morphgnt.csv"  # read with the "Index" column as index
MOUNCE_TXT = "../BibleCore/resources/mounce.txt"  # tab-separated, read as (GK, Mounce) pairs

## Define Column Names

In [26]:
# Column labels shared by the loaded data frames and the generated reports.
# Some label columns that already exist in the input files; others are
# created by Analyzer while building a report (noted below where the code
# in this notebook creates them).
BDAG_ENTRY = "BDAG Entry"
BOOK = "Book"
CHAPTER = "Chapter"
DODSON_ENTRY = "Dodson Entry"
GK = "GK"
GLOSS = "Gloss"
INDEX = "Index"
LEMMA = "Lemma"
LEXICAL_ENTRY = "Lexical Entry"  # created: Dodson entry, falling back to BDAG entry
MOUNCE_CHAPTER = "Mounce"
NEW_TESTAMENT_WORD_INDEX = "NT Rank"  # created: rank of the lemma across the whole New Testament
PART_OF_SPEECH = "Part of Speech"
STRONGS = "Strongs"
WORD_COUNT = "Word Count"  # created: occurrences of the lemma in the analyzed text
WORD_INDEX = "Rank"  # created: 1-based rank by descending frequency
WORD_PERCENTAGE = "Word Percentage"  # created: word count / total words analyzed
WORD_PERCENTAGE_CUMULATIVE = "Word Percentage Cumulative"  # created: running sum of the above

## Define Book Names

In [27]:
# Maps the numeric book identifier used in the "Book" column to the book's
# display name. The name is used both in report headings and in the report
# file names.
BOOKS = {
    1: "Matthew",
    2: "Mark",
    3: "Luke",
    4: "John",
    5: "Acts",
    6: "Romans",
    7: "I Corinthians",
    8: "II Corinthians",
    9: "Galatians",
    10: "Ephesians",
    11: "Philippians",
    12: "Colossians",
    13: "I Thessalonians",
    14: "II Thessalonians",
    15: "I Timothy",
    16: "II Timothy",
    17: "Titus",
    18: "Philemon",
    19: "Hebrews",
    20: "James",
    21: "I Peter",
    22: "II Peter",
    23: "I John",
    24: "II John",
    25: "III John",
    26: "Jude",
    27: "Revelation",  # singular: the book's title is "Revelation", not "Revelations"
}

## Create Report Class

In [28]:
import pandas as pd


class Report:
    """A report table together with its summary properties.

    Attributes:
        df: The report data frame, stored as given (not copied).
        properties: A dict copy of the summary properties passed in.
    """

    def __init__(self, df, properties):
        """Store the report table and a shallow copy of its properties.

        Args:
            df: The data frame holding the report rows.
            properties: A mapping (or iterable of pairs) of summary values;
                copied with ``dict()`` so later changes to the caller's
                mapping do not affect this report.
        """
        self.df = df
        self.properties = dict(properties)

    def get_styler(self, highlight_nt_rank=True):
        """Return a pandas Styler that renders ``self.df`` as a report table.

        The index is hidden, the lexical-number and Mounce columns are
        formatted, text columns are left-aligned and numeric columns are
        right-aligned. The columns named in the ``format`` and
        ``set_properties`` subsets must be present in ``self.df``.

        Args:
            highlight_nt_rank: When truthy and the frame has an "NT Rank"
                column, apply ``_select_col`` to highlight rank cells.

        Returns:
            The configured Styler.
        """
        df_report = self.df

        report_styler = (
            df_report.style.hide(axis="index")
            .format(
                # {WORD_PERCENTAGE: "{:.2%}", WORD_PERCENTAGE_CUMULATIVE: "{:.2%}"},
                {
                    STRONGS: _format_lexical_number,
                    GK: _format_lexical_number,
                    MOUNCE_CHAPTER: _format_mounce,
                    WORD_COUNT: "{:,}",
                },
                precision=2,
                na_rep="",
            )
            .set_properties(
                subset=[
                    GLOSS,
                    LEXICAL_ENTRY,
                    PART_OF_SPEECH,
                ],
                **{"text-align": "left"},
            )
            .set_properties(
                subset=[
                    WORD_COUNT,
                    STRONGS,
                    GK,
                    MOUNCE_CHAPTER,
                ],  # , WORD_PERCENTAGE, WORD_PERCENTAGE_CUMULATIVE],
                **{"text-align": "right"},
            )
            .set_table_styles([{"selector": "th", "props": [("text-align", "left")]}])
            # .bar(subset=[WORD_PERCENTAGE_CUMULATIVE], color="LightBlue", vmax=1)
        )

        # Logical `and`, not bitwise `&`: with `&`, a truthy non-bool flag
        # (e.g. 2) would evaluate to 0 and silently disable highlighting.
        # `and` also skips the column lookup when highlighting is off.
        if highlight_nt_rank and NEW_TESTAMENT_WORD_INDEX in df_report:
            report_styler = report_styler.apply(_select_col, axis=None)

        return report_styler


def _format_lexical_number(value):
    try:
        int_value = int(value)
        return f"{int_value:04d}"
    except ValueError:
        return value


def _format_mounce(value):
    return ",".join(value)


def _select_col(df):
    """Build the cell-style frame used to highlight rank cells.

    Returns a frame with the same index and columns as ``df`` whose cells are
    empty strings, except the "Rank" cell of every row whose "NT Rank" is
    greater than 100, which gets a light-green background.
    """
    styles = pd.DataFrame("", index=df.index, columns=df.columns)
    outside_nt_top_100 = df[NEW_TESTAMENT_WORD_INDEX] > 100
    styles.loc[outside_nt_top_100, WORD_INDEX] = "background-color: LightGreen"
    return styles

## Create Analyzer Class

In [29]:
import pandas as pd
from pprint import pprint


class Analyzer:
    """Loads the word and lexeme data and builds word-usage reports."""

    def __init__(self):
        # When True, _dump() prints each intermediate object for debugging.
        self.enable_dump = False

    def load_data(self):
        """Read the input files into data-frame attributes.

        Sets ``DF_MORPHGNT``, ``DF_LEXEMES`` and ``DF_MOUNCE`` (one row per
        GK value holding the list of its Mounce entries), plus the
        ``TOTAL_WORD_COUNT`` and ``TOTAL_LEXEME_COUNT`` row counts.
        """
        self.DF_MORPHGNT = pd.read_csv(MORPHGNT_CSV, index_col=INDEX)
        self.DF_LEXEMES = pd.read_csv(LEXEMES_CSV, index_col=LEMMA)
        # The file may list the same GK more than once; collect all of its
        # Mounce values into a single list per GK.
        self.DF_MOUNCE = (
            pd.read_csv(
                MOUNCE_TXT,
                sep="\t",
                names=[GK, MOUNCE_CHAPTER],
                index_col=GK,
                dtype={GK: "object", MOUNCE_CHAPTER: "object"},
            )
            .groupby(GK)[MOUNCE_CHAPTER]
            .apply(list)
            .to_frame(MOUNCE_CHAPTER)
        )
        self._dump(self.DF_MORPHGNT, "DF_MORPHGNT")
        self._dump(self.DF_LEXEMES, "DF_LEXEMES")

        self.TOTAL_WORD_COUNT = len(self.DF_MORPHGNT)
        self.TOTAL_LEXEME_COUNT = len(self.DF_LEXEMES)

    def get_new_testament_report(self):
        """Build the report of the 100 most frequent lemmas overall.

        Returns:
            A Report whose table holds the top 100 rows (without the
            percentage columns) and whose properties give the total word
            count, the number of distinct lemmas, the number of rows in the
            table and the cumulative percentage those rows cover.
        """
        full_report_df = self._create_report_df(self.DF_MORPHGNT)
        report_df = full_report_df.head(100)

        percentage = report_df[WORD_PERCENTAGE_CUMULATIVE].max()

        df = report_df.drop(columns=[WORD_PERCENTAGE, WORD_PERCENTAGE_CUMULATIVE])

        return Report(
            df,
            {
                "Total Word Count": len(self.DF_MORPHGNT),
                # Count distinct lemmas on the untruncated frame; counting
                # after head(100) would always report at most 100.
                "Unique Word Count": len(full_report_df),
                "Vocabulary Word Count": len(report_df),
                "Vocabulary Percentage": f"{percentage:.2%}",
            },
        )

    def get_book_report(self, book, chapter=None, add_nt_word_index=None):
        """Build the word-usage report for one book, optionally one chapter.

        Args:
            book: Value matched against the "Book" column.
            chapter: Optional value matched against the "Chapter" column;
                ``None`` selects the whole book.
            add_nt_word_index: When truthy, add an "NT Rank" column with each
                lemma's rank across all loaded words, and also keep lemmas
                that are in the overall top 100 even if they are not in this
                selection's top 100.

        Returns:
            A Report whose table holds the selected rows (without the
            percentage columns). Its properties include
            "New Vocabulary Word Count" only when the NT rank was requested,
            because it cannot be computed without that column.
        """
        df_morphgnt_book = self.DF_MORPHGNT[self.DF_MORPHGNT[BOOK] == book]
        if chapter is not None:
            df_morphgnt_book = df_morphgnt_book[df_morphgnt_book[CHAPTER] == chapter]
        self._dump(df_morphgnt_book, "df_morphgnt_book")

        report_df = self._create_report_df(df_morphgnt_book)
        in_book_top_100 = report_df[WORD_INDEX] <= 100

        properties = {
            "Total Word Count": len(df_morphgnt_book),
            "Unique Word Count": len(report_df),
        }

        if add_nt_word_index:
            new_testament_report_df = self._create_report_df(self.DF_MORPHGNT)
            # insert() aligns the rank series on the shared lemma index.
            report_df.insert(
                loc=1,
                column=NEW_TESTAMENT_WORD_INDEX,
                value=new_testament_report_df[WORD_INDEX],
            )
            nt_rank = report_df[NEW_TESTAMENT_WORD_INDEX]
            book_top_100 = report_df[in_book_top_100 | (nt_rank <= 100)]
            new_words = report_df[in_book_top_100 & (nt_rank > 100)]
            properties["New Vocabulary Word Count"] = len(new_words)
        else:
            # Without the NT rank column only the selection's own ranking is
            # available; referencing the missing column would raise KeyError.
            book_top_100 = report_df[in_book_top_100]

        # Coverage of the retained rows only, so recompute the running total
        # instead of reusing the cumulative column of the full frame.
        vocabulary_percentage = book_top_100[WORD_PERCENTAGE].cumsum().max()
        properties["Total Vocabulary Percentage"] = f"{vocabulary_percentage:.2%}"

        df = book_top_100.drop(columns=[WORD_PERCENTAGE, WORD_PERCENTAGE_CUMULATIVE])

        return Report(df, properties)

    def _create_report_df(self, df_morphgnt):
        """Build the ranked per-lemma frame for the given word rows.

        Counts rows per lemma, converts the counts to a share of all given
        rows, sorts by that share in descending order, assigns a 1-based
        rank and a cumulative share, then joins the lexeme data (on the
        lemma index) and the Mounce data (on the GK column).

        Returns:
            A frame indexed by lemma with the report columns in display
            order, including the percentage columns.
        """
        total_word_count = len(df_morphgnt)

        s_lemma_word_counts = df_morphgnt.groupby(LEMMA).size()
        self._dump(s_lemma_word_counts, "S_LEMMA_WORD_COUNTS")

        df_analysis = s_lemma_word_counts.to_frame(name=WORD_COUNT)
        df_analysis.index.name = LEMMA
        df_analysis[WORD_PERCENTAGE] = df_analysis[WORD_COUNT] / total_word_count
        self._dump(df_analysis, "DF_ANALYSIS")

        df_analysis_sorted = df_analysis.sort_values(WORD_PERCENTAGE, ascending=False)
        df_analysis_sorted[WORD_INDEX] = range(1, len(df_analysis_sorted) + 1)
        df_analysis_sorted[WORD_PERCENTAGE_CUMULATIVE] = df_analysis_sorted[
            WORD_PERCENTAGE
        ].cumsum()
        self._dump(df_analysis_sorted, "DF_ANALYSIS_SORTED")

        # NOTE(review): DF_MOUNCE's GK index is read as strings, while the
        # dtype of the lexemes' GK column depends on the CSV contents —
        # confirm the join keys have matching dtypes.
        df_merged = df_analysis_sorted.join(self.DF_LEXEMES).join(self.DF_MOUNCE, on=GK)
        self._dump(df_merged, "DF_MERGED")

        # Prefer the Dodson entry; fall back to the BDAG entry where missing.
        df_merged[LEXICAL_ENTRY] = df_merged[DODSON_ENTRY].combine_first(
            df_merged[BDAG_ENTRY]
        )

        df_report = df_merged.reindex(
            columns=[
                WORD_INDEX,
                PART_OF_SPEECH,
                LEXICAL_ENTRY,
                GLOSS,
                STRONGS,
                GK,
                MOUNCE_CHAPTER,
                WORD_COUNT,
                WORD_PERCENTAGE,
                WORD_PERCENTAGE_CUMULATIVE,
            ]
        )
        self._dump(df_report, "DF_REPORT")

        return df_report

    def _dump(self, object, name):
        """Print a debug view of ``object`` when ``enable_dump`` is set.

        Prints the name, the class name, the instance attributes from
        ``vars()`` and the object itself. Does nothing when dumping is off.
        """
        if not self.enable_dump:
            return

        print(f"===== {name}")
        print(object.__class__.__name__)
        print("-----")
        pprint(vars(object))
        print("-----")
        pprint(object)

# Ad-hoc check: load the data and keep the raw word frame and the full
# (untruncated) report frame in module-level names for interactive inspection.
a = Analyzer()
a.load_data()
df = a.DF_MORPHGNT
r = a._create_report_df(a.DF_MORPHGNT)

## Analyze New Testament

In [30]:
# Load the source data and build the New Testament report.
ANALYZER = Analyzer()

ANALYZER.load_data()

new_testament_report = ANALYZER.get_new_testament_report()

# The bare expression on the last line makes the notebook render the styled table.
new_testament_report_styler = new_testament_report.get_styler()
new_testament_report_styler

Rank,Part of Speech,Lexical Entry,Gloss,Strongs,GK,Mounce,Word Count
1,Definite Article,"ὁ, ἡ, τό",the,3588,3836,6.0,19769
2,Conjunction,καί,"and, even, also, namely",2532,2779,4.0,8973
3,Pronoun - Personal,"αὐτός, αὐτή, αὐτό","he, she, it, they, them, same",846,899,6.0,5546
4,Pronoun - Personal,"σύ, σοῦ, σοί, σέ",you,4771,5148,7.0,2894
5,Conjunction,δέ,"but, on the other hand, and",1161,1254,6.0,2766
6,Preposition,ἐν,"in, on, among (dat)",1722,1877,6.0,2733
7,Pronoun - Personal,ἐγώ,I,1473,1609,4.0,2572
8,Verb,εἰμί,"I am, exist",1510,1639,8.0,2456
9,Verb,λέγω,"I say, speak",3004,3306,78816.0,2345
10,Preposition,εἰς,"into, in, among, till, for (acc)",1519,1650,7.0,1754


## Analyze Book

In [31]:
# Load the source data and build the report for book 1, chapter 1,
# including each lemma's rank across the whole New Testament.
ANALYZER = Analyzer()

ANALYZER.load_data()

book_report = ANALYZER.get_book_report(1, 1, add_nt_word_index=True)

# The bare expression on the last line makes the notebook render the styled table.
report_styler = book_report.get_styler()
report_styler

Rank,NT Rank,Part of Speech,Lexical Entry,Gloss,Strongs,GK,Mounce,Word Count
1,1,Definite Article,"ὁ, ἡ, τό",the,3588.0,3836,6.0,76
2,5,Conjunction,δέ,"but, on the other hand, and",1161.0,1254,6.0,44
3,173,Verb,γεννάω,"I beget, bring forth, give birth to",1080.0,1164,19.0,41
4,3,Pronoun - Personal,"αὐτός, αὐτή, αὐτό","he, she, it, they, them, same",846.0,899,6.0,19
5,2,Conjunction,καί,"and, even, also, namely",2532.0,2779,4.0,12
6,19,Preposition,"ἐκ, ἐξ","from out, out from among, from (gen)",1537.0,1666,8.0,7
7,274,Noun,"Δαυίδ, ὁ",David,1138.0,1253,4.0,6
8,48,Noun,"υἱός, οῦ, ὁ","a son, descendent",5207.0,5626,7.0,6
9,28,Preposition,ἀπό,"from, away from (gen)",575.0,608,8.0,5
10,411,Noun,"Ἰωσήφ, ὁ",Joseph,2501.0,2737,,5


## Create Usage Reports

In [32]:
from pathlib import Path
import unicodedata


def strip_diacritics(series):
    """Return a copy of ``series`` with diacritics removed from string values.

    Each string is decomposed with NFKD normalization and its combining
    marks (accents, breathings, and the like) are dropped, leaving the base
    letters. Intended as a ``key`` function for ``sort_values`` so that
    accented and unaccented forms sort together.

    Args:
        series: A pandas Series, typically of strings.

    Returns:
        A Series of the same length and index. Non-string values (such as
        NaN for missing cells) are passed through unchanged.
    """

    def _strip(value):
        if not isinstance(value, str):
            # Missing cells are floats (NaN); normalize() would raise on them.
            return value
        decomposed = unicodedata.normalize("NFKD", value)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    return series.apply(_strip)


# Build the HTML usage reports: one New Testament summary plus one file per
# book, all written to the "reports" directory (created if missing).
ANALYZER = Analyzer()
ANALYZER.load_data()
Path("reports").mkdir(exist_ok=True)

# Create New Testament summary report.
#
new_testament_report = ANALYZER.get_new_testament_report()

report_html = "<h2>New Testament</h2>"
for property_name, property_value in new_testament_report.properties.items():
    report_html += f"<p>{property_name}: {property_value}"

report_html += "<h3>By Ranking</h3>"
new_testament_report_styler = new_testament_report.get_styler()
report_html += new_testament_report_styler.to_html()

# Re-sort the report's frame in place, then render it again; the ranking
# table above was already rendered, so it keeps the original order.
report_html += "<h3>By Part of Speech</h3>"
new_testament_report.df.sort_values(
    [PART_OF_SPEECH, LEXICAL_ENTRY], inplace=True, key=strip_diacritics
)
new_testament_report_styler = new_testament_report.get_styler()
report_html += new_testament_report_styler.to_html()

with open("reports/words_00_new_testament.html", "w", encoding="utf-8") as file:
    file.write(report_html)

# Create book detail reports.
#
for book_number, book_name in BOOKS.items():
    book_report = ANALYZER.get_book_report(book_number, add_nt_word_index=True)

    report_html = f"<h2>{book_name}</h2>"
    for property_name, property_value in book_report.properties.items():
        report_html += f"<p>{property_name}: {property_value}"

    report_html += "<h3>By Ranking</h3>"
    report_styler = book_report.get_styler()
    report_html += report_styler.to_html()

    # Second table: only the words outside the New Testament top 100,
    # grouped by part of speech. Both steps modify the report's frame in place.
    report_html += "<h3>New Words</h3>"
    book_report.df.sort_values(
        [PART_OF_SPEECH, LEXICAL_ENTRY], inplace=True, key=strip_diacritics
    )
    book_report.df.drop(
        book_report.df[book_report.df[NEW_TESTAMENT_WORD_INDEX] <= 100].index,
        inplace=True,
    )
    report_styler = book_report.get_styler(highlight_nt_rank=False)
    report_html += report_styler.to_html()

    with open(
        f"reports/words_{book_number:02d}_{book_name}.html", "w", encoding="utf-8"
    ) as file:
        file.write(report_html)