# Word Usage

Determines word usage in book (Matthew).

NOTE: Superceded by Word Usage.ipynb.

## Parse MorphGnt File into DataFrame (DF_WORDS)

Loads the MorphGnt file for Matthew and filters words used in chapter 1.

Saves the total word count as `TOTAL_WORD_COUNT`

In [15]:
import pandas as pd
from glob import glob
from os import path
from pprint import pprint

morphgnt_path = "../BibleCore/Resources/MorphGnt"

all_files = glob(path.join(morphgnt_path, "*.txt"))

DF_WORDS = pd.concat(
    (
        pd.read_csv(
            f,
            names=[
                "Scripture Reference",
                "Part of Speech",
                "Inflection",
                "Text",
                "Word",
                "Normalized Word",
                "Lemma",
            ],
            dtype={"Scripture Reference": "str"},
            sep="\\s+",
            index_col=False,
        )
        for f in all_files
    ),
    ignore_index=True,
)

DF_WORDS = DF_WORDS[(DF_WORDS['Scripture Reference'] >= '020101')&( DF_WORDS['Scripture Reference'] < '020201')]
TOTAL_WORD_COUNT = len(DF_WORDS)


print("===== DF_WORDS")
print(DF_WORDS.__class__.__name__)
print("-----")
pprint(vars(DF_WORDS))
print("-----")
pprint(DF_WORDS)

print("===== TOTAL_WORD_COUNT")
print(TOTAL_WORD_COUNT)

===== DF_WORDS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': <weakref at 0x00000204CCC565C0; dead>,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['Scripture Reference', 'Part of Speech', 'Inflection', 'Text', 'Word',
       'Normalized Word', 'Lemma'],
      dtype='object')
Axis 1: Index([18329, 18330, 18331, 18332, 18333, 18334, 18335, 18336, 18337, 18338,
       ...
       19020, 19021, 19022, 19023, 19024, 19025, 19026, 19027, 19028, 19029],
      dtype='int64', length=701)
NumpyBlock: slice(0, 7, 1), 7 x 701, dtype: object}
-----
      Scripture Reference Part of Speech Inflection        Text        Word  \
18329              020101             N-   ----NSF-        Ἀρχὴ        Ἀρχὴ   
18330              020101             RA   ----GSN-         τοῦ         τοῦ   
18331              020101             N-   ----GSN-  εὐαγγελίου  εὐαγγελίου   
18332              020101             N-   ----GSM-       Ἰησοῦ       Ἰησοῦ   
18333  

## Parse Lexemes File into DataFram (DF_LEXEMES)

In [16]:
import yaml

with open("../BibleCore/Resources/lexemes.yaml", "r", encoding="utf-8") as file:
    yaml_data = yaml.safe_load(file)

DF_LEXEMES = pd.DataFrame.from_dict(yaml_data, orient="index")
DF_LEXEMES.index.name = "Lemma"

print("===== DF_LEXEMES")
print(DF_LEXEMES.__class__.__name__)
print("-----")
pprint(vars(DF_LEXEMES))
print("-----")
pprint(DF_LEXEMES)

===== DF_LEXEMES
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['pos', 'full-citation-form', 'bdag-headword', 'danker-entry',
       'dodson-entry', 'mounce-headword', 'strongs', 'gk', 'dodson-pos',
       'gloss', 'mounce-morphcat'],
      dtype='object')
Axis 1: Index(['Ἀαρών', 'Ἀβαδδών', 'ἀβαρής', 'αββα', 'Ἅβελ', 'Ἀβιά', 'Ἀβιαθάρ',
       'Ἀβιληνή', 'Ἀβιούδ', 'Ἀβραάμ',
       ...
       'ὡσεί', 'Ὡσηέ', 'ὥσπερ', 'ὡσπερεί', 'ὥστε', 'ὠτάριον', 'ὠτίον',
       'ὠφέλεια', 'ὠφελέω', 'ὠφέλιμος'],
      dtype='object', name='Lemma', length=5464)
NumpyBlock: slice(0, 11, 1), 11 x 5464, dtype: object}
-----
         pos    full-citation-form bdag-headword          danker-entry  \
Lemma                                                                    
Ἀαρών      N              Ἀαρών, ὁ         Ἀαρών              Ἀαρών, ὁ   
Ἀβαδδών    N            Ἀβαδδών, ὁ       Ἀβαδδών            Ἀ

## Extract Lemma Series from DataFrame (S_LEMMA)

In [17]:
S_LEMMA = DF_WORDS["Lemma"]

print("===== S_LEMMA")
print(S_LEMMA.__class__.__name__)
print("-----")
pprint(vars(S_LEMMA))
print("-----")
pprint(S_LEMMA)

===== S_LEMMA
Series
-----
{'_attrs': {},
 '_cacher': ('Lemma',
             <weakref at 0x00000204C8B4B420; to 'pandas.core.frame.DataFrame' at 0x00000204CB0408A0>),
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': <weakref at 0x00000204CCC565C0; dead>,
 '_item_cache': {},
 '_mgr': SingleBlockManager
Items: Index([18329, 18330, 18331, 18332, 18333, 18334, 18335, 18336, 18337, 18338,
       ...
       19020, 19021, 19022, 19023, 19024, 19025, 19026, 19027, 19028, 19029],
      dtype='int64', length=701)
NumpyBlock: 701 dtype: object,
 '_name': 'Lemma'}
-----
18329          ἀρχή
18330             ὁ
18331    εὐαγγέλιον
18332        Ἰησοῦς
18333       Χριστός
            ...    
19025           καί
19026       ἔρχομαι
19027          πρός
19028         αὐτός
19029      πάντοθεν
Name: Lemma, Length: 701, dtype: object


## Group Lemmas by Value (GB_LEMMA_VALUES)

In [18]:
GB_LEMMA_VALUES = S_LEMMA.groupby(S_LEMMA.values)

print("===== GB_LEMMA_VALUES")
print(GB_LEMMA_VALUES.__class__.__name__)
print("-----")
pprint(vars(GB_LEMMA_VALUES))
print("-----")
pprint(GB_LEMMA_VALUES)

===== GB_LEMMA_VALUES
SeriesGroupBy
-----
{'_grouper': <pandas.core.groupby.ops.BaseGrouper object at 0x00000204CA5C9090>,
 '_selection': None,
 'as_index': True,
 'axis': 0,
 'dropna': True,
 'exclusions': frozenset(),
 'group_keys': True,
 'keys': array(['ἀρχή', 'ὁ', 'εὐαγγέλιον', 'Ἰησοῦς', 'Χριστός', 'καθώς', 'γράφω',
       'ἐν', 'ὁ', 'Ἠσαΐας', 'ὁ', 'προφήτης', 'ἰδού', 'ἀποστέλλω', 'ὁ',
       'ἄγγελος', 'ἐγώ', 'πρό', 'πρόσωπον', 'σύ', 'ὅς', 'κατασκευάζω',
       'ὁ', 'ὁδός', 'σύ', 'φωνή', 'βοάω', 'ἐν', 'ὁ', 'ἔρημος', 'ἑτοιμάζω',
       'ὁ', 'ὁδός', 'κύριος', 'εὐθύς', 'ποιέω', 'ὁ', 'τρίβος', 'αὐτός',
       'γίνομαι', 'Ἰωάννης', 'ὁ', 'βαπτίζω', 'ἐν', 'ὁ', 'ἔρημος',
       'κηρύσσω', 'βάπτισμα', 'μετάνοια', 'εἰς', 'ἄφεσις', 'ἁμαρτία',
       'καί', 'ἐκπορεύομαι', 'πρός', 'αὐτός', 'πᾶς', 'ὁ', 'Ἰουδαία',
       'χώρα', 'καί', 'ὁ', 'Ἱεροσολυμίτης', 'πᾶς', 'καί', 'βαπτίζω',
       'ὑπό', 'αὐτός', 'ἐν', 'ὁ', 'Ἰορδάνης', 'ποταμός', 'ἐξομολογέω',
       'ὁ', 'ἁμαρτία', 'αὐτός', 'καί', 'εἰμ

## Get Word Count for Each Lemma (S_LEMMA_VALUE_COUNTS)

In [19]:
S_LEMMA_VALUE_COUNTS = GB_LEMMA_VALUES.count()

print("===== S_LEMMA_VALUE_COUNTS")
print(S_LEMMA_VALUE_COUNTS.__class__.__name__)
print("-----")
pprint(vars(S_LEMMA_VALUE_COUNTS))
print("-----")
pprint(S_LEMMA_VALUE_COUNTS)

===== S_LEMMA_VALUE_COUNTS
Series
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': SingleBlockManager
Items: Index(['Γαλιλαία', 'Ζεβεδαῖος', 'Καφαρναούμ', 'Μωϋσῆς', 'Ναζαρέτ', 'Ναζαρηνός',
       'Σίμων', 'Σατανᾶς', 'Χριστός', 'αὐτός',
       ...
       'ὅς', 'ὅτε', 'ὅτι', 'ὑπάγω', 'ὑπακούω', 'ὑπό', 'ὑπόδημα', 'ὕδωρ', 'ὡς',
       'ὥστε'],
      dtype='object', length=239)
NumpyBlock: 239 dtype: int64,
 '_name': 'Lemma'}
-----
Γαλιλαία      5
Ζεβεδαῖος     2
Καφαρναούμ    1
Μωϋσῆς        1
Ναζαρέτ       1
             ..
ὑπό           3
ὑπόδημα       1
ὕδωρ          2
ὡς            3
ὥστε          2
Name: Lemma, Length: 239, dtype: int64


## Create DataFrame for Lemma Analysis (DF_ANALYSIS)

In [20]:
DF_ANALYSIS = S_LEMMA_VALUE_COUNTS.to_frame(name="Lemma Count")
DF_ANALYSIS.index.name = "Lemma"

print("===== DF_ANALYSIS")
print(DF_ANALYSIS.__class__.__name__)
print("-----")
pprint(vars(DF_ANALYSIS))
print("-----")
pprint(DF_ANALYSIS)

===== DF_ANALYSIS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['Lemma Count'], dtype='object')
Axis 1: Index(['Γαλιλαία', 'Ζεβεδαῖος', 'Καφαρναούμ', 'Μωϋσῆς', 'Ναζαρέτ', 'Ναζαρηνός',
       'Σίμων', 'Σατανᾶς', 'Χριστός', 'αὐτός',
       ...
       'ὅς', 'ὅτε', 'ὅτι', 'ὑπάγω', 'ὑπακούω', 'ὑπό', 'ὑπόδημα', 'ὕδωρ', 'ὡς',
       'ὥστε'],
      dtype='object', name='Lemma', length=239)
NumpyBlock: slice(0, 1, 1), 1 x 239, dtype: int64}
-----
            Lemma Count
Lemma                  
Γαλιλαία              5
Ζεβεδαῖος             2
Καφαρναούμ            1
Μωϋσῆς                1
Ναζαρέτ               1
...                 ...
ὑπό                   3
ὑπόδημα               1
ὕδωρ                  2
ὡς                    3
ὥστε                  2

[239 rows x 1 columns]


## Add Word Percentage to Analysis (DF_ANALYSIS)

In [21]:
DF_ANALYSIS["Lemma Percentage"] = (DF_ANALYSIS["Lemma Count"] / TOTAL_WORD_COUNT) * 100

print("===== DF_ANALYSIS")
print(DF_ANALYSIS.__class__.__name__)
print("-----")
pprint(vars(DF_ANALYSIS))
print("-----")
pprint(DF_ANALYSIS)

===== DF_ANALYSIS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {'Lemma Count': Lemma
Γαλιλαία      5
Ζεβεδαῖος     2
Καφαρναούμ    1
Μωϋσῆς        1
Ναζαρέτ       1
             ..
ὑπό           3
ὑπόδημα       1
ὕδωρ          2
ὡς            3
ὥστε          2
Name: Lemma Count, Length: 239, dtype: int64},
 '_mgr': BlockManager
Items: Index(['Lemma Count', 'Lemma Percentage'], dtype='object')
Axis 1: Index(['Γαλιλαία', 'Ζεβεδαῖος', 'Καφαρναούμ', 'Μωϋσῆς', 'Ναζαρέτ', 'Ναζαρηνός',
       'Σίμων', 'Σατανᾶς', 'Χριστός', 'αὐτός',
       ...
       'ὅς', 'ὅτε', 'ὅτι', 'ὑπάγω', 'ὑπακούω', 'ὑπό', 'ὑπόδημα', 'ὕδωρ', 'ὡς',
       'ὥστε'],
      dtype='object', name='Lemma', length=239)
NumpyBlock: slice(0, 1, 1), 1 x 239, dtype: int64
NumpyBlock: slice(1, 2, 1), 1 x 239, dtype: float64}
-----
            Lemma Count  Lemma Percentage
Lemma                                    
Γαλιλαία              5          0.713267
Ζεβεδαῖος

## Add Cumulative Percentage Column (DF_ANALYSYS)

In [22]:
DF_ANALYSIS = DF_ANALYSIS.sort_values("Lemma Percentage", ascending=False)
DF_ANALYSIS["Word Index"] = range(len(DF_ANALYSIS))
DF_ANALYSIS["Lemma Percentage Cumulative"] = DF_ANALYSIS["Lemma Percentage"].cumsum()

DF_ANALYSIS = DF_ANALYSIS[DF_ANALYSIS["Word Index"] < 100]

print("===== DF_ANALYSIS")
print(DF_ANALYSIS.__class__.__name__)
print("-----")
pprint(vars(DF_ANALYSIS))
print("-----")
pprint(DF_ANALYSIS)

===== DF_ANALYSIS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': <weakref at 0x00000204C45EFBA0; dead>,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['Lemma Count', 'Lemma Percentage', 'Word Index',
       'Lemma Percentage Cumulative'],
      dtype='object')
Axis 1: Index(['ὁ', 'καί', 'αὐτός', 'εἰς', 'ἐν', 'λέγω', 'εἰμί', 'εὐθύς', 'σύ', 'ἐγώ',
       'ἔρχομαι', 'ἐξέρχομαι', 'ἔρημος', 'πνεῦμα', 'κηρύσσω', 'πρός', 'Ἰησοῦς',
       'Ἰωάννης', 'βαπτίζω', 'γίνομαι', 'ἐκ', 'Σίμων', 'μετά', 'Γαλιλαία',
       'πᾶς', 'ἐκβάλλω', 'ἀφίημι', 'συναγωγή', 'ὁράω', 'ἔχω', 'ὅτι', 'δέ',
       'οὐ', 'καθαρίζω', 'δαιμόνιον', 'θεός', 'γάρ', 'περί', 'ἀκάθαρτος',
       'τίς', 'φωνή', 'πολύς', 'ὀπίσω', 'ὅλος', 'ὑπό', 'ἀπέρχομαι',
       'εὐαγγέλιον', 'ὅς', 'ὡς', 'ἀπό', 'ἀδελφός', 'ἀλλά', 'χείρ', 'οἶδα',
       'μηδείς', 'δύναμαι', 'θέλω', 'θάλασσα', 'δίκτυον', 'διακονέω', 'διδαχή',
       'Ζεβεδαῖος', 'διδάσκω', 'πόλις', 'τόπος', 'ἐξουσία', 'ὕδωρ', '

## Merge Analysis with Lexemes (DF_MERGED)

In [23]:
DF_MERGED = DF_ANALYSIS.join(DF_LEXEMES)

print("===== DF_MERGED")
print(DF_MERGED.__class__.__name__)
print("-----")
pprint(vars(DF_MERGED))
print("-----")
pprint(DF_MERGED)

===== DF_MERGED
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['Lemma Count', 'Lemma Percentage', 'Word Index',
       'Lemma Percentage Cumulative', 'pos', 'full-citation-form',
       'bdag-headword', 'danker-entry', 'dodson-entry', 'mounce-headword',
       'strongs', 'gk', 'dodson-pos', 'gloss', 'mounce-morphcat'],
      dtype='object')
Axis 1: Index(['ὁ', 'καί', 'αὐτός', 'εἰς', 'ἐν', 'λέγω', 'εἰμί', 'εὐθύς', 'σύ', 'ἐγώ',
       'ἔρχομαι', 'ἐξέρχομαι', 'ἔρημος', 'πνεῦμα', 'κηρύσσω', 'πρός', 'Ἰησοῦς',
       'Ἰωάννης', 'βαπτίζω', 'γίνομαι', 'ἐκ', 'Σίμων', 'μετά', 'Γαλιλαία',
       'πᾶς', 'ἐκβάλλω', 'ἀφίημι', 'συναγωγή', 'ὁράω', 'ἔχω', 'ὅτι', 'δέ',
       'οὐ', 'καθαρίζω', 'δαιμόνιον', 'θεός', 'γάρ', 'περί', 'ἀκάθαρτος',
       'τίς', 'φωνή', 'πολύς', 'ὀπίσω', 'ὅλος', 'ὑπό', 'ἀπέρχομαι',
       'εὐαγγέλιον', 'ὅς', 'ὡς', 'ἀπό', 'ἀδελφός', 'ἀλλά', 'χείρ', 'οἶδα',
       'μηδείς

## Display Word Analysis

In [24]:
DF_MERGED_REORDERED=DF_MERGED.reindex(columns=['Word Index','bdag-headword','dodson-entry','pos','gloss','strongs','gk','Lemma Count','Lemma Percentage','Lemma Percentage Cumulative'])
DF_MERGED_REORDERED.style.hide(axis="index").set_properties(
    subset=["gloss", "bdag-headword","dodson-entry", "pos"], **{"text-align": "left"}
).set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'left')]}
]).bar(
    subset=["Lemma Percentage Cumulative"], vmax=100
)

Word Index,bdag-headword,dodson-entry,pos,gloss,strongs,gk,Lemma Count,Lemma Percentage,Lemma Percentage Cumulative
0,ὁ,"ὁ, ἡ, τό",RA,the,3588,3836,95,13.552068,13.552068
1,καί,καί,C,"and, even, also, namely",2532,2779,79,11.269615,24.821683
2,αὐτός,"αὐτός, αὐτή, αὐτό",RP,"he, she, it, they, them, same",846,899,51,7.275321,32.097004
3,εἰς,εἰς,P,"into, in, among, till, for",1519,1650,16,2.282454,34.379458
4,ἐν,ἐν,P,"in, on, among",1722,1877,14,1.997147,36.376605
5,λέγω,λέγω,V,"I say, speak",3004,3306,13,1.854494,38.231098
6,εἰμί,εἰμί,V,"I am, exist",1510,1639,12,1.71184,39.942939
7,εὐθύς,"εὐθύς, εῖα, ύ",A,"straight, immediately",2117,2318,12,1.71184,41.654779
8,σύ,"σύ, σοῦ, σοί, σέ",RP2,you,4771,5148,11,1.569187,43.223966
9,ἐγώ,ἐγώ,RP1,I,1473,1609,9,1.28388,44.507846
