In [None]:
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# NLTK's English stopword list; the statistics code further down filters
# corpus words against it.
english_stop_words = stopwords.words('english')

# Basic Corpus Stats

This script provides some basic corpus stats for the Thomas T. Eckert collection telegrams. In addition to calculating these stats for the whole corpus, the script calculates the stats for each telegram ledger category.

### References

Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied text analysis with Python: Enabling language-aware data products with machine learning (First edition). O’Reilly Media, Inc.

Bird, S., Klein, E., & Loper, E. (2009). Natural language processing with Python (First edition). O’Reilly.


## 1) Load Corpus
This script uses NLTK's [CategorizedPlaintextCorpusReader](https://www.nltk.org/api/nltk.corpus.reader.html?highlight=categorizedplaintextcorpusreader#nltk.corpus.reader.CategorizedPlaintextCorpusReader) to create a corpus of telegrams. Based on folder structure, the script can consume:
- all the telegram ledgers
- telegram ledgers that contain only telegrams in the clear (i.e., 'clear_telegrams')
- telegram ledgers that contain only telegrams in code (i.e., 'coded_telegrams')
- telegram ledgers that contain both telegrams in the clear and telegrams in code (i.e., 'clear_and_coded_telegrams')


In [None]:
# Regular expression selecting the corpus documents: files whose name starts
# with "preprocessed_" inside some sub-directory. The dot before "txt" is
# escaped so that it matches a literal "." rather than any character.
doc_pattern = r'.*/preprocessed_.*\.txt'

# The first capture group (the "<name>_telegrams" directory component of a
# file id) is used as that file's category label.
category_pattern = r'.*?/(\w+_telegrams)/'

# The corpus location is read from the environment; fail early with a clear
# message instead of handing None to the corpus reader.
path_to_corpus = os.getenv('ECKERT_PAPERS_CORPUS_PATH')
if path_to_corpus is None:
    raise RuntimeError(
        "The ECKERT_PAPERS_CORPUS_PATH environment variable is not set; "
        "point it at the root folder of the telegram corpus."
    )

telegram_corpus = CategorizedPlaintextCorpusReader(
    path_to_corpus,
    doc_pattern,
    cat_pattern=category_pattern
)

To see all of the category labels, call the corpus reader's `categories()` method.

In [None]:
# Category labels known to the corpus reader; reused by later cells to
# compute per-category statistics.
categories = telegram_corpus.categories()
# Bare expression so the notebook displays the labels.
categories

## 2) Generate General Stats
After the corpus is set up, we can more easily describe the corpus in terms of general stats. For the whole corpus as well as each corpus category, this script calculates the number of files, words, non-stopwords, unique non-stopwords, lexical diversity, and average word use.

In [None]:
def _token_stats(words, stop_words):
    """Summarise one sequence of tokens in a single pass.

    Args:
        words: iterable of word tokens (e.g. a corpus view).
        stop_words: container of tokens to exclude from the non-stopword
            figures; a set keeps each membership test O(1).

    Returns:
        A tuple ``(total, non_stop, unique_non_stop, diversity, avg_use)``
        where ``diversity`` is ``unique_non_stop / non_stop`` and ``avg_use``
        is ``non_stop / unique_non_stop``. Both ratios are ``0.0`` when the
        sequence contains no non-stopwords, instead of raising
        ``ZeroDivisionError``.
    """
    total = 0
    non_stop = 0
    unique_non_stop = set()
    for word in words:
        total += 1
        # NOTE(review): this comparison is case-sensitive; it presumably
        # relies on the preprocessed telegrams already being lowercased.
        if word not in stop_words:
            non_stop += 1
            unique_non_stop.add(word)

    unique_count = len(unique_non_stop)
    if non_stop == 0:
        return total, non_stop, unique_count, 0.0, 0.0
    return total, non_stop, unique_count, unique_count / non_stop, non_stop / unique_count


# One entry per table row: the whole corpus first, then each category in the
# order given by ``categories``.
number_of_files = []
number_of_words = []
number_of_words_wo_stop_words = []
# Despite its name, this list holds the number of unique NON-stopwords; the
# name is kept because the table-building cell reads it.
number_of_unique_stopwords = []
# Never filled in this cell; kept in case other cells reference it.
frequency_distribution_of_corpus_no_stop_words = []
lexical_diversity = []
avg_use_of_word = []

# Build the lookup set once rather than scanning the stopword list per token.
stop_word_set = set(english_stop_words)

# create a list of all the words in a corpus
whole_corpus_words = telegram_corpus.words()

# (file ids, words) for the whole corpus followed by every category
corpus_slices = [(telegram_corpus.fileids(), whole_corpus_words)]
for category in categories:
    corpus_slices.append((
        telegram_corpus.fileids(categories=category),
        telegram_corpus.words(categories=category),
    ))

for slice_file_ids, slice_words in corpus_slices:
    (word_count,
     non_stop_count,
     unique_non_stop_count,
     diversity,
     average_use) = _token_stats(slice_words, stop_word_set)

    number_of_files.append(len(slice_file_ids))
    number_of_words.append(word_count)
    # for efficiency comparisons, stopwords are excluded from these figures
    number_of_words_wo_stop_words.append(non_stop_count)
    number_of_unique_stopwords.append(unique_non_stop_count)
    lexical_diversity.append(diversity)
    avg_use_of_word.append(average_use)

In [None]:
# Row labels: the whole-corpus row followed by one row per category, in the
# same order the statistics lists were filled. Deriving them from
# ``categories`` keeps the labels aligned with the data even if the corpus
# does not contain exactly the three expected ledger categories.
indices = ['whole_corpus'] + list(categories)

# Column heading -> per-row values collected by the statistics cell.
category_data = {
    'Number of Files': number_of_files,
    'Number of Words': number_of_words,
    'Number of Non-Stopwords': number_of_words_wo_stop_words,
    'Number of Unique Non-Stopwords': number_of_unique_stopwords,
    'Lexical Diversity': lexical_diversity,
    'Average Word Use': avg_use_of_word
}
caption_text = "Table 1. General Statistics for the Telegrams in The Thomas T. Eckert Papers"

# Each table style needs a 'selector' and a 'props' entry. The original second
# entry used the key 'td' instead of 'selector'.
# NOTE(review): the margin is applied to the caption here, on the assumption
# that spacing around the caption was intended - confirm.
caption_styles = [
    {'selector': 'caption', 'props': [('font-size','18px'),('color','black'), ('font-weight','bold')]},
    {'selector': 'caption', 'props': [('margin','1em')]}
]

# Note: despite its name, this variable holds the styled table (a pandas
# Styler), not a plain DataFrame.
category_comparison_data_frame = (
    pd.DataFrame(data=category_data, index=indices)
    .style
    .set_caption(caption_text)
    .set_table_styles(caption_styles)
)
category_comparison_data_frame