# Word Usage

Determines how often each word (lemma) is used in the book of Matthew.

## Parse MorphGnt File into DataFrame (DF_WORDS)

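Loads the MorphGnt file for Matthew into a DataFrame. An optional filter that restricts the data to the words used in chapter 1 is left disabled, so the entire book is analyzed.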

Saves the total word count as `TOTAL_WORD_COUNT`

In [9]:
import pandas as pd
import numpy as np
from pprint import pprint

# Parse the whitespace-delimited MorphGnt file for Matthew. Column names are
# supplied explicitly through `names`, one per field in each row.
DF_WORDS = pd.read_csv(
    "../BibleCore/Resources/MorphGnt/61-Mt-morphgnt.txt",
    sep=r"\s+",
    index_col=False,
    names=[
        "Scripture Reference",
        "Part of Speech",
        "Inflection",
        "Text",
        "Word",
        "Normalized Word",
        "Lemma",
    ],
    # Read references as text so values such as "010101" keep their leading zero.
    dtype={"Scripture Reference": "str"},
)

# Optional chapter-1 filter, currently disabled so the whole book is analyzed.
# To enable it, compare the reference strings against the prefix "0102":
#   DF_WORDS = DF_WORDS[DF_WORDS["Scripture Reference"] < "0102"]

# Number of rows parsed from the file, used later as the percentage denominator.
TOTAL_WORD_COUNT = DF_WORDS.shape[0]


# Inspect the loaded frame: its type name, its instance attributes, then its
# contents, followed by the total word count.
print("===== DF_WORDS", type(DF_WORDS).__name__, "-----", sep="\n")
pprint(vars(DF_WORDS))
print("-----")
pprint(DF_WORDS)

print("===== TOTAL_WORD_COUNT", TOTAL_WORD_COUNT, sep="\n")

===== DF_WORDS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['Scripture Reference', 'Part of Speech', 'Inflection', 'Text', 'Word',
       'Normalized Word', 'Lemma'],
      dtype='object')
Axis 1: RangeIndex(start=0, stop=18329, step=1)
NumpyBlock: slice(0, 7, 1), 7 x 18329, dtype: object}
-----
      Scripture Reference Part of Speech Inflection        Text        Word  \
0                  010101             N-   ----NSF-      Βίβλος      Βίβλος   
1                  010101             N-   ----GSF-    γενέσεως    γενέσεως   
2                  010101             N-   ----GSM-       Ἰησοῦ       Ἰησοῦ   
3                  010101             N-   ----GSM-     χριστοῦ     χριστοῦ   
4                  010101             N-   ----GSM-        υἱοῦ        υἱοῦ   
...                   ...            ...        ...         ...         ...   
18324              012820             P

## Extract Lemma Series from DataFrame (S_LEMMA)

In [10]:
# Select the "Lemma" column of the word table as a Series.
S_LEMMA = DF_WORDS["Lemma"]

# Inspect the Series: type name, instance attributes, then contents.
print("===== S_LEMMA", type(S_LEMMA).__name__, "-----", sep="\n")
pprint(vars(S_LEMMA))
print("-----")
pprint(S_LEMMA)

===== S_LEMMA
Series
-----
{'_attrs': {},
 '_cacher': ('Lemma',
             <weakref at 0x0000021565B57790; to 'pandas.core.frame.DataFrame' at 0x00000215654E5C50>),
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': SingleBlockManager
Items: RangeIndex(start=0, stop=18329, step=1)
NumpyBlock: 18329 dtype: object,
 '_name': 'Lemma'}
-----
0           βίβλος
1          γένεσις
2           Ἰησοῦς
3          Χριστός
4             υἱός
           ...    
18324          ἕως
18325            ὁ
18326    συντέλεια
18327            ὁ
18328         αἰών
Name: Lemma, Length: 18329, dtype: object


## Group Lemmas by Value (GB_LEMMA_VALUES)

In [11]:
# Group the lemma Series by its own values, so each distinct lemma forms one
# group containing every occurrence of that lemma.
GB_LEMMA_VALUES = S_LEMMA.groupby(by=S_LEMMA.values)

# Inspect the group-by object: type name, instance attributes, then its repr.
print("===== GB_LEMMA_VALUES", type(GB_LEMMA_VALUES).__name__, "-----", sep="\n")
pprint(vars(GB_LEMMA_VALUES))
print("-----")
pprint(GB_LEMMA_VALUES)

===== GB_LEMMA_VALUES
SeriesGroupBy
-----
{'_grouper': <pandas.core.groupby.ops.BaseGrouper object at 0x00000215651CBED0>,
 '_selection': None,
 'as_index': True,
 'axis': 0,
 'dropna': True,
 'exclusions': frozenset(),
 'group_keys': True,
 'keys': array(['βίβλος', 'γένεσις', 'Ἰησοῦς', ..., 'συντέλεια', 'ὁ', 'αἰών'],
      shape=(18329,), dtype=object),
 'level': None,
 'obj': 0           βίβλος
1          γένεσις
2           Ἰησοῦς
3          Χριστός
4             υἱός
           ...    
18324          ἕως
18325            ὁ
18326    συντέλεια
18327            ὁ
18328         αἰών
Name: Lemma, Length: 18329, dtype: object,
 'observed': False,
 'sort': True}
-----
<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000215612EF110>


## Get Word Count for Each Lemma (S_LEMMA_VALUE_COUNTS)

In [12]:
# Count the entries in each lemma group; the result is a Series indexed by
# lemma whose values are the occurrence counts.
S_LEMMA_VALUE_COUNTS = GB_LEMMA_VALUES.count()

# Inspect the counts: type name, instance attributes, then contents.
print(
    "===== S_LEMMA_VALUE_COUNTS",
    type(S_LEMMA_VALUE_COUNTS).__name__,
    "-----",
    sep="\n",
)
pprint(vars(S_LEMMA_VALUE_COUNTS))
print("-----")
pprint(S_LEMMA_VALUE_COUNTS)

===== S_LEMMA_VALUE_COUNTS
Series
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': SingleBlockManager
Items: Index(['Αἴγυπτος', 'Βαβυλών', 'Βαραββᾶς', 'Βαραχίας', 'Βαρθολομαῖος',
       'Βαριωνᾶ', 'Βεελζεβούλ', 'Βηθανία', 'Βηθλέεμ', 'Βηθσαϊδά',
       ...
       'ῥαπίζω', 'ῥαφίς', 'ῥύμη', 'ῥύομαι', 'ῥῆμα', 'Ῥαμά', 'Ῥαχάβ', 'Ῥαχήλ',
       'Ῥοβοάμ', 'Ῥούθ'],
      dtype='object', length=1680)
NumpyBlock: 1680 dtype: int64,
 '_name': 'Lemma'}
-----
Αἴγυπτος        4
Βαβυλών         4
Βαραββᾶς        5
Βαραχίας        1
Βαρθολομαῖος    1
               ..
Ῥαμά            1
Ῥαχάβ           1
Ῥαχήλ           1
Ῥοβοάμ          2
Ῥούθ            1
Name: Lemma, Length: 1680, dtype: int64


## Create DataFrame for Lemma Analysis (DF_ANALYSIS)

In [13]:
# Turn the per-lemma counts into a one-column DataFrame ("Lemma Count") whose
# index is labelled "Lemma".
#
# `rename_axis` returns a new frame carrying the index name, instead of
# assigning `DF_ANALYSIS.index.name` in place. The in-place assignment mutates
# the Index object itself, which the frame produced by `to_frame` may still
# share with S_LEMMA_VALUE_COUNTS, so it could rename the source Series' index
# as a side effect.
DF_ANALYSIS = S_LEMMA_VALUE_COUNTS.to_frame(name="Lemma Count").rename_axis("Lemma")

# Inspect the analysis frame: type name, instance attributes, then contents.
print("===== DF_ANALYSIS")
print(DF_ANALYSIS.__class__.__name__)
print("-----")
pprint(vars(DF_ANALYSIS))
print("-----")
pprint(DF_ANALYSIS)

===== DF_ANALYSIS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['Lemma Count'], dtype='object')
Axis 1: Index(['Αἴγυπτος', 'Βαβυλών', 'Βαραββᾶς', 'Βαραχίας', 'Βαρθολομαῖος',
       'Βαριωνᾶ', 'Βεελζεβούλ', 'Βηθανία', 'Βηθλέεμ', 'Βηθσαϊδά',
       ...
       'ῥαπίζω', 'ῥαφίς', 'ῥύμη', 'ῥύομαι', 'ῥῆμα', 'Ῥαμά', 'Ῥαχάβ', 'Ῥαχήλ',
       'Ῥοβοάμ', 'Ῥούθ'],
      dtype='object', name='Lemma', length=1680)
NumpyBlock: slice(0, 1, 1), 1 x 1680, dtype: int64}
-----
              Lemma Count
Lemma                    
Αἴγυπτος                4
Βαβυλών                 4
Βαραββᾶς                5
Βαραχίας                1
Βαρθολομαῖος            1
...                   ...
Ῥαμά                    1
Ῥαχάβ                   1
Ῥαχήλ                   1
Ῥοβοάμ                  2
Ῥούθ                    1

[1680 rows x 1 columns]


## Add Word Percentage to Analysis (DF_ANALYSIS)

In [14]:
# Express each lemma's count as a percentage of all words in the text:
# count / TOTAL_WORD_COUNT * 100.
DF_ANALYSIS["Lemma Percentage"] = (
    DF_ANALYSIS["Lemma Count"].div(TOTAL_WORD_COUNT).mul(100)
)

# Inspect the updated frame: type name, instance attributes, then contents.
print("===== DF_ANALYSIS", type(DF_ANALYSIS).__name__, "-----", sep="\n")
pprint(vars(DF_ANALYSIS))
print("-----")
pprint(DF_ANALYSIS)

===== DF_ANALYSIS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {'Lemma Count': Lemma
Αἴγυπτος        4
Βαβυλών         4
Βαραββᾶς        5
Βαραχίας        1
Βαρθολομαῖος    1
               ..
Ῥαμά            1
Ῥαχάβ           1
Ῥαχήλ           1
Ῥοβοάμ          2
Ῥούθ            1
Name: Lemma Count, Length: 1680, dtype: int64},
 '_mgr': BlockManager
Items: Index(['Lemma Count', 'Lemma Percentage'], dtype='object')
Axis 1: Index(['Αἴγυπτος', 'Βαβυλών', 'Βαραββᾶς', 'Βαραχίας', 'Βαρθολομαῖος',
       'Βαριωνᾶ', 'Βεελζεβούλ', 'Βηθανία', 'Βηθλέεμ', 'Βηθσαϊδά',
       ...
       'ῥαπίζω', 'ῥαφίς', 'ῥύμη', 'ῥύομαι', 'ῥῆμα', 'Ῥαμά', 'Ῥαχάβ', 'Ῥαχήλ',
       'Ῥοβοάμ', 'Ῥούθ'],
      dtype='object', name='Lemma', length=1680)
NumpyBlock: slice(0, 1, 1), 1 x 1680, dtype: int64
NumpyBlock: slice(1, 2, 1), 1 x 1680, dtype: float64}
-----
              Lemma Count  Lemma Percentage
Lemma                                      
Αἴγ

## Add Cumulative Percentage Column (DF_ANALYSIS)

In [15]:
# Order lemmas from most to least frequent, number them in that order, and
# accumulate their percentages so each row shows the share of the text covered
# by that lemma together with all more frequent ones.
DF_ANALYSIS = DF_ANALYSIS.sort_values(by="Lemma Percentage", ascending=False)
DF_ANALYSIS["Word Index"] = range(DF_ANALYSIS.shape[0])
DF_ANALYSIS["Lemma Percentage Cumulative"] = DF_ANALYSIS["Lemma Percentage"].cumsum()

# Keep only the first 100 rows of the ranking (Word Index 0 through 99).
DF_ANALYSIS = DF_ANALYSIS[DF_ANALYSIS["Word Index"].lt(100)]

# Inspect the trimmed frame: type name, instance attributes, then contents.
print("===== DF_ANALYSIS", type(DF_ANALYSIS).__name__, "-----", sep="\n")
pprint(vars(DF_ANALYSIS))
print("-----")
pprint(DF_ANALYSIS)

===== DF_ANALYSIS
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': <weakref at 0x00000215614D8D60; dead>,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['Lemma Count', 'Lemma Percentage', 'Word Index',
       'Lemma Percentage Cumulative'],
      dtype='object')
Axis 1: Index(['ὁ', 'καί', 'αὐτός', 'λέγω', 'δέ', 'σύ', 'ἐν', 'εἰμί', 'ἐγώ', 'εἰς',
       'οὐ', 'Ἰησοῦς', 'οὗτος', 'ὅτι', 'πᾶς', 'μή', 'ὅς', 'γάρ', 'ἐπί',
       'ἄνθρωπος', 'ἀπό', 'ἔρχομαι', 'τότε', 'υἱός', 'τίς', 'ποιέω', 'ἐκ',
       'οὐρανός', 'κύριος', 'γίνομαι', 'ἔχω', 'μαθητής', 'ὁράω', 'μετά', 'ἤ',
       'εἷς', 'ἀκούω', 'πατήρ', 'ἰδού', 'ἐάν', 'πολύς', 'διά', 'οὖν', 'δίδωμι',
       'ἀποκρίνομαι', 'εἰ', 'βασιλεία', 'λαμβάνω', 'ἐκεῖνος', 'προσέρχομαι',
       'θεός', 'ὄχλος', 'ἕως', 'ἀφίημι', 'ἡμέρα', 'γεννάω', 'ἄν', 'ἐξέρχομαι',
       'γῆ', 'πρός', 'θέλω', 'ὡς', 'δύο', 'ἵνα', 'ἀδελφός', 'προφήτης', 'κατά',
       'ἀλλά', 'εἰσέρχομαι', 'ἐγείρω', 'ἀπέρχομαι', 'βάλλω

## Display Word Analysis

In [16]:
# Render the analysis table with an in-cell bar for the cumulative percentage.
# The bar range is pinned to 0-100 so bar length is proportional to the
# percentage itself. Without `vmin`, the range starts at the smallest value in
# the column, so the first row would get an empty bar and the others would be
# stretched.
DF_ANALYSIS.style.bar(subset=["Lemma Percentage Cumulative"], vmin=0, vmax=100)

Unnamed: 0_level_0,Lemma Count,Lemma Percentage,Word Index,Lemma Percentage Cumulative
Lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ὁ,2782,15.178133,0,15.178133
καί,1174,6.40515,1,21.583283
αὐτός,918,5.008457,2,26.59174
λέγω,505,2.755197,3,29.346937
δέ,493,2.689727,4,32.036663
σύ,456,2.487861,5,34.524524
ἐν,292,1.593104,6,36.117628
εἰμί,289,1.576736,7,37.694364
ἐγώ,261,1.423973,8,39.118337
εἰς,218,1.189372,9,40.307709
