# Word Usage

Determines New Testament word usage.

## Create Analyzer Class

In [79]:
import pandas as pd
from pprint import pprint


class Analyzer:

    def __init__(self):
        self.enable_dump = False

    def load_data(self):
        self.DF_MORPHGNT = pd.read_csv("morphgnt.csv", index_col="Index")
        self.DF_LEXEMES = pd.read_csv("lexemes.csv", index_col="Lemma")
        self.dump(self.DF_MORPHGNT, "DF_MORPHGNT")
        self.dump(self.DF_LEXEMES, "DF_LEXEMES")

        self.TOTAL_WORD_COUNT = len(self.DF_MORPHGNT)
        self.TOTAL_LEXEME_COUNT = len(self.DF_LEXEMES)

    def get_new_testament_report(self):

        s_lemma_word_counts = self.DF_MORPHGNT.groupby("Lemma").size()
        self.dump(s_lemma_word_counts, "S_LEMMA_WORD_COUNTS")

        df_analysis = s_lemma_word_counts.to_frame(name="Word Count")
        df_analysis.index.name = "Lemma"
        df_analysis["Word Percentage"] = (
            df_analysis["Word Count"] / self.TOTAL_WORD_COUNT
        ) * 100
        self.dump(df_analysis, "DF_ANALYSIS")

        df_analysis_sorted = df_analysis.sort_values("Word Percentage", ascending=False)
        df_analysis_sorted["Word Index"] = range(len(df_analysis_sorted))
        df_analysis_sorted["Word Percentage Cumulative"] = df_analysis_sorted[
            "Word Percentage"
        ].cumsum()
        self.dump(df_analysis_sorted, "DF_ANALYSIS_SORTED")

        df_merged = df_analysis_sorted.join(self.DF_LEXEMES)
        self.dump(df_merged, "DF_MERGED")

        df_report = df_merged.reindex(
            columns=[
                "Word Index",
                "BDAG Entry",
                "Dodson Entry",
                "Part of Speech",
                "Gloss",
                "Strongs",
                "GK",
                "Word Count",
                "Word Percentage",
                "Word Percentage Cumulative",
            ]
        )
        self.dump(df_report, "DF_REPORT")

        return df_report

    def get_book_report(self, book, chapter= None):

        df_morphgnt_book = self.DF_MORPHGNT[(self.DF_MORPHGNT.Book == book)]
        if chapter:
            df_morphgnt_book = df_morphgnt_book[(df_morphgnt_book.Chapter == chapter)]
        self.dump(df_morphgnt_book, "df_morphgnt_book")

        total_word_count = len(df_morphgnt_book)
        print(total_word_count)

        s_lemma_word_counts = df_morphgnt_book.groupby("Lemma").size()
        self.dump(s_lemma_word_counts, "S_LEMMA_WORD_COUNTS")

        df_analysis = s_lemma_word_counts.to_frame(name="Word Count")
        df_analysis.index.name = "Lemma"
        df_analysis["Word Percentage"] = (
            df_analysis["Word Count"] / total_word_count
        ) * 100
        self.dump(df_analysis, "DF_ANALYSIS")

        df_analysis_sorted = df_analysis.sort_values("Word Percentage", ascending=False)
        df_analysis_sorted["Word Index"] = range(len(df_analysis_sorted))
        df_analysis_sorted["Word Percentage Cumulative"] = df_analysis_sorted[
            "Word Percentage"
        ].cumsum()
        self.dump(df_analysis_sorted, "DF_ANALYSIS_SORTED")

        df_merged = df_analysis_sorted.join(self.DF_LEXEMES)
        self.dump(df_merged, "DF_MERGED")

        df_report = df_merged.reindex(
            columns=[
                "Word Index",
                "BDAG Entry",
                "Dodson Entry",
                "Part of Speech",
                "Gloss",
                "Strongs",
                "GK",
                "Word Count",
                "Word Percentage",
                "Word Percentage Cumulative",
            ]
        )
        self.dump(df_report, "DF_REPORT")

        return df_report

    def dump(self, object, name):

        if self.enable_dump:
            print(f"===== {name}")
            print(object.__class__.__name__)
            print("-----")
            pprint(vars(object))
            print("-----")
            pprint(object)

## Analyze New Testament

In [80]:
ANALYZER = Analyzer()

ANALYZER.load_data()

new_testament_report = ANALYZER.get_new_testament_report()

report_html = (
    new_testament_report.head(100)
    .style.hide(axis="index")
    .set_properties(
        subset=["Gloss", "BDAG Entry", "Dodson Entry", "Part of Speech"],
        **{"text-align": "left"}
    )
    .set_properties(
        subset=["Word Count", "Word Percentage", "Word Percentage Cumulative"],
        **{"text-align": "right"}
    )
    .set_table_styles([{"selector": "th", "props": [("text-align", "left")]}])
    .bar(subset=["Word Percentage Cumulative"], vmax=100)
    .to_html()
)

# report_html
with open("new_testament_report.html", "w", encoding="utf-8") as file:
    file.write(report_html)

## Analyze Book

In [82]:
ANALYZER = Analyzer()

ANALYZER.load_data()

new_testament_report = ANALYZER.get_new_testament_report()

book_report = ANALYZER.get_book_report(4,1)

new_testament_word_index = new_testament_report["Word Index"]

book_report.insert(
    loc=1, column="New Testament Word Index", value=new_testament_word_index
)

book_top_100 = book_report[
    (book_report["Word Index"] < 100) | (book_report["New Testament Word Index"] < 100)
]

book_top_100["Word Percentage Cumulative"] = book_top_100["Word Percentage"].cumsum()

# df_merged = pd.concat([book_report, new_testament_report], axis=1)
# df_merged


report_html = book_top_100.style.hide(axis="index").set_properties(
    subset=["Gloss", "BDAG Entry", "Dodson Entry", "Part of Speech"],
    **{"text-align": "left"}
).apply(
    lambda x: ["background: LightGreen" if v > 100 else "" for v in x],
    subset=["New Testament Word Index"],
    axis="columns",
).set_table_styles(
    [{"selector": "th", "props": [("text-align", "left")]}]
).bar( color = "LightBlue",
    subset=["Word Percentage Cumulative"], vmax=100
)

report_html

# ANALYZER.get_book_report(2).head(100).style.hide(axis="index").set_properties(
#     subset=["Gloss", "BDAG Entry", "Dodson Entry", "Part of Speech"],
#     **{"text-align": "left"}
# ).set_table_styles([{"selector": "th", "props": [("text-align", "left")]}]).bar(
#     subset=["Word Percentage Cumulative"], vmax=100
# )

826


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_top_100["Word Percentage Cumulative"] = book_top_100["Word Percentage"].cumsum()


Word Index,New Testament Word Index,BDAG Entry,Dodson Entry,Part of Speech,Gloss,Strongs,GK,Word Count,Word Percentage,Word Percentage Cumulative
0,0,ὁ,"ὁ, ἡ, τό",Definite Article,the,3588.0,3836,110,13.317191,13.317191
1,1,καί,καί,Conjunction,"and, even, also, namely",2532.0,2779,55,6.658596,19.975787
2,2,αὐτός,"αὐτός, αὐτή, αὐτό",Personal Pronoun,"he, she, it, they, them, same",846.0,899,49,5.932203,25.90799
3,7,εἰμί,εἰμί,Verb,"I am, exist",1510.0,1639,37,4.479419,30.387409
4,8,λέγω,λέγω,Verb,"I say, speak",3004.0,3306,30,3.631961,34.01937
5,6,ἐγώ,ἐγώ,Personal Pronoun,I,1473.0,1609,19,2.300242,36.319613
6,3,σύ,"σύ, σοῦ, σοί, σέ",Personal Pronoun,you,4771.0,5148,15,1.815981,38.135593
7,10,οὐ,οὐ,Adverb,"no, not",3756.0,4024,14,1.694915,39.830508
8,11,ὅς,"ὅς, ἥ, ὅ",Relative Pronoun,"who, which, what, that",3739.0,4005,14,1.694915,41.525424
9,5,ἐν,ἐν,Preposition,"in, on, among",1722.0,1877,14,1.694915,43.220339
