# Gathering Stats About Available Text Files

In [4]:
from librarian import SAMPLE_DATA_DIR
import pandas as pd
import os
import helpers

# Disable performance warnings from Panda, as they make the output messy and aren't really important in this case
import warnings
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [5]:
# Whether to force text to all uppercase before counting
UPPERCASE_ONLY = True

# How much of each report to show: the top N occuring characters
CARE_COUNT = 20

In [6]:
# Print statistics for text files (*.txt) in specified directory and all subidirectories, recursively
def check_stats(dir: str):

    print()
    print(f"Directory     : {dir}")

    # Collect all files and subdirectories, with relative paths
    contents = [os.path.join(dir, c) for c in os.listdir(dir)]    
    files = [c for c in contents if (os.path.isfile(c) and c.lower().endswith(".txt"))]
    subdirs = [c for c in contents if os.path.isdir(c)]

    print(f"Text Files    : {len(files)}")
    print(f"Subdirectories: {len(subdirs)}")

    # Build up character counts for all files in this directory
    counts_df = pd.DataFrame()
    for file in files:
        file_content = helpers.read_text_file(file)
        if UPPERCASE_ONLY:
            file_content = file_content.upper()
            
        this_file_df = pd.DataFrame(list(file_content))
        this_file_counts_s = pd.Series(this_file_df[0].value_counts())
        counts_df[file] = this_file_counts_s

    # Combine counts for all files into one total and percent
    counts_df["COUNT"] = counts_df[list(counts_df.columns)].sum(axis=1)
    counts_df["PERCENT"] = counts_df["COUNT"] / counts_df["COUNT"].sum()

    # Discard the file-specific columns and resort
    counts_df = counts_df[["COUNT", "PERCENT"]]    
    counts_df = counts_df.sort_values("COUNT", ascending=False)
    print(counts_df.head(20))

    # Recurse through subdirectories
    for subdir in subdirs:
        check_stats(subdir)
        
check_stats(SAMPLE_DATA_DIR)


Directory     : sample_data
Text Files    : 0
Subdirectories: 4
Empty DataFrame
Columns: [COUNT, PERCENT]
Index: []

Directory     : sample_data/encoded
Text Files    : 722
Subdirectories: 0
        COUNT   PERCENT
0                      
.   5565904.0  0.019901
,   5544981.0  0.019826
>   5534195.0  0.019788
?   5524803.0  0.019754
:   5376418.0  0.019223
[   5376343.0  0.019223
"   5365075.0  0.019183
\n  5358919.0  0.019161
]   5301791.0  0.018957
;   5259535.0  0.018806
(   5236940.0  0.018725
F   5205168.0  0.018611
'   5174792.0  0.018503
/   5135980.0  0.018364
D   5118232.0  0.018300
<   5084269.0  0.018179
N   5030437.0  0.017986
+   5026422.0  0.017972
!   5008914.0  0.017909
4   4968746.0  0.017766

Directory     : sample_data/intake
Text Files    : 19
Subdirectories: 0
        COUNT   PERCENT
0                      
    1685352.0  0.180441
E    866430.0  0.092764
T    656503.0  0.070288
A    560319.0  0.059990
O    530287.0  0.056775
N    505140.0  0.054082
I    498782.0  