# Gathering Stats About Available Text Files

In [14]:
from librarian import DATA_DIR
import pandas as pd
import os
import helpers

# Disable performance warnings from Panda, as they make the output messy and aren't really important in this case
import warnings
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [15]:
# Whether to force text to all uppercase before counting
UPPERCASE_ONLY = True

# How much of each report to show: the top N occuring characters
CARE_COUNT = 20

In [16]:
# Print statistics for text files (*.txt) in specified directory and all subidirectories, recursively
def check_stats(dir: str):

    print()
    print(f"Directory     : {dir}")

    # Collect all files and subdirectories, with relative paths
    contents = [os.path.join(dir, c) for c in os.listdir(dir)]    
    files = [c for c in contents if (os.path.isfile(c) and c.lower().endswith(".txt"))]
    subdirs = [c for c in contents if os.path.isdir(c)]

    print(f"Text Files    : {len(files)}")
    print(f"Subdirectories: {len(subdirs)}")

    # Build up character counts for all files in this directory
    counts_df = pd.DataFrame()
    for file in files:
        file_content = helpers.read_text_file(file)
        if UPPERCASE_ONLY:
            file_content = file_content.upper()
            
        this_file_df = pd.DataFrame(list(file_content))
        this_file_counts_s = pd.Series(this_file_df[0].value_counts())
        counts_df[file] = this_file_counts_s

    # Combine counts for all files into one total and percent
    counts_df["COUNT"] = counts_df[list(counts_df.columns)].sum(axis=1)
    counts_df["PERCENT"] = counts_df["COUNT"] / counts_df["COUNT"].sum()

    # Discard the file-specific columns and resort
    counts_df = counts_df[["COUNT", "PERCENT"]]    
    counts_df = counts_df.sort_values("COUNT", ascending=False)
    print(counts_df.head(CARE_COUNT))

    # Recurse through subdirectories
    for subdir in subdirs:
        check_stats(subdir)
        
check_stats(DATA_DIR)


Directory     : data
Text Files    : 0
Subdirectories: 4
Empty DataFrame
Columns: [COUNT, PERCENT]
Index: []

Directory     : data/encoded
Text Files    : 480
Subdirectories: 0
      COUNT   PERCENT
0                    
R  11608081  0.106502
O  11311122  0.103777
S  11181393  0.102587
N  11113119  0.101960
H  11010226  0.101016
A  10964115  0.100593
T  10667094  0.097868
I  10610201  0.097346
E  10448545  0.095863
   10080592  0.092487

Directory     : data/intake
Text Files    : 40
Subdirectories: 0
        COUNT   PERCENT
0                      
    2503911.0  0.174556
E   1347168.0  0.093916
T   1017481.0  0.070932
A    878215.0  0.061223
O    825642.0  0.057558
N    775741.0  0.054080
I    756971.0  0.052771
R    691653.0  0.048218
S    651745.0  0.045435
H    562951.0  0.039245
D    453590.0  0.031621
L    434533.0  0.030293
C    331506.0  0.023110
\n   298680.0  0.020822
U    291889.0  0.020349
M    270509.0  0.018858
F    264047.0  0.018408
P    231407.0  0.016132
G    229045.