# Demo Notebook

## Data Frame

In [10]:
import pandas as pd
import numpy as np
from pprint import pprint

# Load the MorphGNT text file (61-Mt) into a DataFrame.
# The file is split on runs of whitespace and the seven column names are
# supplied explicitly via `names`. ScriptureReference is read as str so that
# values such as "011231" keep their leading zero.
print("===== df")
df = pd.read_csv(
    "../BibleCore/Resources/MorphGnt/61-Mt-morphgnt.txt",
    sep="\\s+",
    index_col=False,
    dtype={"ScriptureReference": "str"},
    names=[
        "ScriptureReference", "PartOfSpeech", "Inflection", "Text",
        "Word", "NormalizedWord", "Lemma",
    ],
)
print(type(df).__name__)
# Dump the instance attributes of the frame (private pandas attributes such as _mgr).
pprint(df.__dict__)

# Pull the "Lemma" column out of the frame; a single-column selection is a Series.
print("===== col_lemma")
col_lemma = df['Lemma']
print(type(col_lemma).__name__, '-----', sep='\n')
pprint(col_lemma.__dict__)  # instance attributes of the Series
print('-----')
pprint(col_lemma)           # the values themselves

# Compare every lemma against 'Χριστός'; the comparison is element-wise and
# produces one True/False per row, to be used as a row mask.
print("===== col_lemma_boolean")
col_lemma_boolean = (col_lemma == 'Χριστός')
print(type(col_lemma_boolean).__name__, '-----', sep='\n')
pprint(col_lemma_boolean.__dict__)  # instance attributes of the mask
print('-----')
pprint(col_lemma_boolean)           # the mask values

# Keep only the rows of df where the boolean mask is True.
print("===== rows")
rows = df.loc[col_lemma_boolean]
print(type(rows).__name__, '-----', sep='\n')
pprint(rows.__dict__)  # instance attributes of the filtered frame
print('-----')
pprint(rows)           # the matching rows

===== df
DataFrame
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['ScriptureReference', 'PartOfSpeech', 'Inflection', 'Text', 'Word',
       'NormalizedWord', 'Lemma'],
      dtype='object')
Axis 1: RangeIndex(start=0, stop=18329, step=1)
NumpyBlock: slice(0, 7, 1), 7 x 18329, dtype: object}
===== col_lemma
Series
-----
{'_attrs': {},
 '_cacher': ('Lemma',
             <weakref at 0x00000221D07A7D30; to 'pandas.core.frame.DataFrame' at 0x00000221D048C9D0>),
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': SingleBlockManager
Items: RangeIndex(start=0, stop=18329, step=1)
NumpyBlock: 18329 dtype: object,
 '_name': 'Lemma'}
-----
0           βίβλος
1          γένεσις
2           Ἰησοῦς
3          Χριστός
4             υἱός
           ...    
18324          ἕως
18325            ὁ
18326    συντέλεια
18327            ὁ
18328         αἰών
Name: Lemma, L

## Grouping

In [26]:
# Order the rows by PartOfSpeech. The result is bound to a new name, so df
# itself is not reassigned; each row keeps its original index label.
print('===== df_sorted = df.sort_values("PartOfSpeech")')
df_sorted = df.sort_values("PartOfSpeech")
print(type(df_sorted).__name__, '-----', sep='\n')
pprint(df_sorted.__dict__)  # instance attributes of the sorted frame
print('-----')
pprint(df_sorted)           # the sorted rows

# Group the rows of df by their Lemma value. groupby() hands back a GroupBy
# object, which the sections below aggregate.
print('===== gb_lemma = df.groupby("Lemma")')
gb_lemma = df.groupby("Lemma")
print(type(gb_lemma).__name__, '-----', sep='\n')
pprint(gb_lemma.__dict__)  # instance attributes of the GroupBy object
print('-----')
pprint(gb_lemma)           # repr of the GroupBy object

# Aggregate the groups: count() gives, per lemma, the number of non-null
# values in each remaining column.
print('===== gb_lemma_count = gb_lemma.count()')
gb_lemma_count = gb_lemma.count()
print(type(gb_lemma_count).__name__, '-----', sep='\n')
pprint(gb_lemma_count.__dict__)  # instance attributes of the aggregated result
print('-----')
pprint(gb_lemma_count)           # the per-lemma counts

# Narrow the grouping to the PartOfSpeech column only; the result is still a
# grouped object, not yet an aggregate.
print("===== gb_lemma_partofspeech = gb_lemma['PartOfSpeech']")
gb_lemma_partofspeech = gb_lemma['PartOfSpeech']
print(type(gb_lemma_partofspeech).__name__, '-----', sep='\n')
pprint(gb_lemma_partofspeech.__dict__)  # instance attributes of the column-level grouping
print('-----')
pprint(gb_lemma_partofspeech)           # repr of the grouped column

# Count the non-null PartOfSpeech values within each lemma group, giving one
# number per lemma.
print('===== gb_lemma_partofspeech_count = gb_lemma_partofspeech.count()')
gb_lemma_partofspeech_count = gb_lemma_partofspeech.count()
print(type(gb_lemma_partofspeech_count).__name__, '-----', sep='\n')
pprint(gb_lemma_partofspeech_count.__dict__)  # instance attributes of the counts
print('-----')
pprint(gb_lemma_partofspeech_count)           # the per-lemma counts

# Add up the per-lemma counts into a single grand total; it is the denominator
# for the percentage calculation that follows.
print('===== gb_lemma_partofspeech_count_sum = gb_lemma_partofspeech_count.sum()')
gb_lemma_partofspeech_count_sum = gb_lemma_partofspeech_count.sum()
print(type(gb_lemma_partofspeech_count_sum).__name__, '-----', sep='\n')
pprint(gb_lemma_partofspeech_count_sum)

# Express each lemma's count as a percentage of the grand total, keep only the
# lemmas that account for more than 0.5 percent, and order the result by lemma.
# The label now includes the "* 100.0" factor so it matches the computation.
print('===== gb_lemma_partofspeech_pct = (gb_lemma_partofspeech_count / gb_lemma_partofspeech_count_sum) * 100.0')
# Plain Series arithmetic is applied element-wise, so no per-element lambda is needed.
gb_lemma_partofspeech_pct = gb_lemma_partofspeech_count / gb_lemma_partofspeech_count_sum * 100.0
# 0.5 is a threshold in percent (the values were already multiplied by 100 above).
gb_lemma_partofspeech_pct = gb_lemma_partofspeech_pct[gb_lemma_partofspeech_pct > 0.5].sort_index()
print(gb_lemma_partofspeech_pct.__class__.__name__)
print('-----')
pprint(vars(gb_lemma_partofspeech_pct))
print('-----')
pprint(gb_lemma_partofspeech_pct)
print('-----')
# Sum of the percentages that survived the filter, i.e. the share of all counted
# rows covered by the retained lemmas (not 100, because rarer lemmas were dropped).
pprint(gb_lemma_partofspeech_pct.sum())




===== df_sorted = df.sort_values("PartOfSpeech")
DataFrame
-----
{'_attrs': {},
 '_flags': <Flags(allows_duplicate_labels=True)>,
 '_is_copy': None,
 '_item_cache': {},
 '_mgr': BlockManager
Items: Index(['ScriptureReference', 'PartOfSpeech', 'Inflection', 'Text', 'Word',
       'NormalizedWord', 'Lemma'],
      dtype='object')
Axis 1: Index([ 6567,  7819, 14231,  2233, 14243,  5824, 14260, 14273,  2210,  7796,
       ...
        6917,  7460,  6030,  2437, 12281, 10037, 14549, 15156,  8995, 14865],
      dtype='int64', length=18329)
NumpyBlock: slice(0, 7, 1), 7 x 18329, dtype: object}
-----
      ScriptureReference PartOfSpeech Inflection       Text      Word  \
6567              011231           A-   ----NSF-       πᾶσα      πᾶσα   
7819              011344           A-   ----APN-      πάντα     πάντα   
14231             012403           A-   ----ASF-      ἰδίαν     ἰδίαν   
2233              010533           A-   ----DPM-  ἀρχαίοις·  ἀρχαίοις   
14243             012403           A

## Misc

In [12]:
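# NOTE: scratch cell. Everything below is disabled exploratory code kept for reference; nothing in this cell executes.
# for index, row in df.iterrows():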
#     print(index)
#     print(row)

# grouped = df.groupby("Lemma")#.size()
# print(grouped.tail())

# print('=====')

# n = 25
# filtered_groups = grouped[grouped > n]
# print(filtered_groups)

# # print('=====')

# print(grouped)

#grouped["Lemma"]

# result_df = df[df['Lemma'].isin(filtered_groups)]
# print(result_df)

#g = grouped["ScriptureReference"]
# for name, group in g:
#     print(name)
#     print(group)
# gsum = g.count()
# gsum
# for name, group in gsum:
#     print(name)
#     print(group)