# Gathering Stats About Available Text Files

In [1]:
from librarian import DATA_DIR
import pandas as pd
import os
import helpers

# Disable performance warnings from Panda, as they make the output messy and aren't really important in this case
import warnings
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

  id_matches = re.search('\[eBook #(.*)\]', content)


In [2]:
# Whether to force text to all uppercase before counting
UPPERCASE_ONLY = True

# How much of each report to show: the top N occuring characters
CARE_COUNT = 20

In [3]:
# Print statistics for text files (*.txt) in specified directory and all subidirectories, recursively
def check_stats(dir: str):

    print()
    print(f"Directory     : {dir}")

    # Collect all files and subdirectories, with relative paths
    contents = [os.path.join(dir, c) for c in os.listdir(dir)]    
    files = [c for c in contents if (os.path.isfile(c) and c.lower().endswith(".txt"))]
    subdirs = [c for c in contents if os.path.isdir(c)]

    print(f"Text Files    : {len(files)}")
    print(f"Subdirectories: {len(subdirs)}")

    # Build up character counts for all files in this directory
    counts_df = pd.DataFrame()
    for file in files:
        file_content = helpers.read_text_file(file)
        if UPPERCASE_ONLY:
            file_content = file_content.upper()
            
        this_file_df = pd.DataFrame(list(file_content))
        this_file_counts_s = pd.Series(this_file_df[0].value_counts())
        counts_df[file] = this_file_counts_s

    # Combine counts for all files into one total and percent
    counts_df["COUNT"] = counts_df[list(counts_df.columns)].sum(axis=1)
    counts_df["PERCENT"] = counts_df["COUNT"] / counts_df["COUNT"].sum()

    # Discard the file-specific columns and resort
    counts_df = counts_df[["COUNT", "PERCENT"]]    
    counts_df = counts_df.sort_values("COUNT", ascending=False)
    print(counts_df.head(CARE_COUNT))

    # Recurse through subdirectories
    for subdir in subdirs:
        check_stats(subdir)
        
check_stats(DATA_DIR)


Directory     : data
Text Files    : 0
Subdirectories: 4
Empty DataFrame
Columns: [COUNT, PERCENT]
Index: []

Directory     : data/encoded
Text Files    : 560
Subdirectories: 0
        COUNT   PERCENT
0                      
!   5683174.0  0.025415
)   5456835.0  0.024403
F   5454256.0  0.024392
S   5420091.0  0.024239
#   5349726.0  0.023924
M   5296445.0  0.023686
(   5255990.0  0.023505
K   5042721.0  0.022551
9   5041673.0  0.022547
%   5019182.0  0.022446
.   4976502.0  0.022255
,   4932772.0  0.022060
[   4926267.0  0.022030
*   4914866.0  0.021979
$   4873885.0  0.021796
\n  4850095.0  0.021690
:   4827538.0  0.021589
6   4779717.0  0.021375
>   4737165.0  0.021185
;   4707334.0  0.021051

Directory     : data/intake
Text Files    : 40
Subdirectories: 1
        COUNT   PERCENT
0                      
    2503911.0  0.174556
E   1347168.0  0.093916
T   1017481.0  0.070932
A    878215.0  0.061223
O    825642.0  0.057558
N    775741.0  0.054080
I    756971.0  0.052771
R    691653.