In [1]:
import csv
import json
import os
import re
import string
import unittest

# Preprocess Text

To simplify the text analysis of the U.S. Civil Telegrams, the telegrams need to be preprocessed by removing volunteer-generated metadata tags (e.g., `<deletion>`, `<insertion>`, and `<unclear>`) and punctuation, and by converting all characters to lower case.

As these changes are applied, they are recorded in a dictionary. The structure of this dictionary is:

```
telegram_changes = {
    removed_tags: [], # array of tuples (e.g., (starting index, text of the selection))
    removed_punctuation: [], # array of tuples (e.g., (starting index, ending index, text of the selection))
    lower_cased_characters: [], # array of tuples (e.g., (starting index, text of the selection))
    spaces_removed: []
}
```
This script runs on one ledger at a time.


## Inputs:
- path to unprocessed text files
- path to folder that will contain the outputs of this script

## Outputs:
This script generates the following file structure:

- folder of each telegram
    - txt file containing the unprocessed telegram
    - txt file containing the processed telegram
    - json file containing the dictionary of changes


In [2]:
class TestStringMethods(unittest.TestCase):
    def test_remove_annotation_tags(self):
        example_string = "<unclear>Ft Monroe</unclear> 4th 120 PM  Recd Jul 4 ' 62 \n<insertion>Norfolk</insertion> <deletion>July</deletion> Fourth twelve thirty PM"
        string_to_match = "Ft Monroe 4th 120 PM  Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        removed_tags_example_string = remove_annotation_tags(example_string, [])
        
        self.assertEqual(removed_tags_example_string[0], string_to_match)
        
    def test_remove_punctuation(self):
        example_string = "Ft Monroe 4th 120 PM  Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        string_to_match = "Ft Monroe 4th 120 PM  Recd Jul 4  62 \nNorfolk July Fourth twelve thirty PM"
        removed_punctuation_example_string = remove_punctuation(example_string, [])
        
        self.assertEqual(removed_punctuation_example_string[0], string_to_match)
    
    def test_lowercase_characters(self):
        example_string = "Ft Monroe 4th 120 PM  Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        string_to_match = "ft monroe 4th 120 pm  recd jul 4 ' 62 \nnorfolk july fourth twelve thirty pm"
        lower_case_example_string = lowercase_characters(example_string, [])
        
        self.assertEqual(lower_case_example_string[0], string_to_match)  
    
    def test_dehydrated_telegram(self):
        example_string = "<unclear>Ft Monroe</unclear> 4th 120 PM  Recd Jul 4 ' 62 \n<insertion>Norfolk</insertion> <deletion>July</deletion> Fourth twelve thirty PM"
        string_to_match = "ft monroe 4th 120 pm  recd jul 4  62 \nnorfolk july fourth twelve thirty pm"
        dehydrated_telegram_example = dehydrate(example_string)
        self.assertEqual(dehydrated_telegram_example[0], string_to_match)
    
    def test_dehydrated_telegram(self):
        example_string = "<unclear>Ft Monroe</unclear> 4th 120 PM  Recd Jul 4 ' 62 \n<insertion>Norfolk</insertion> <deletion>July</deletion> Fourth twelve thirty PM"
        dehydrated_telegram_example = dehydrate(example_string)
        rehydtrated_telegram_example = rehydrate(dehydrated_telegram_example[0], dehydrated_telegram_example[1])
        
        self.assertEqual(rehydtrated_telegram_example, example_string)
    
    def test_remove_extra_spaces__two_spaces(self):
        example_string = "Ft Monroe 4th 120 PM  Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        string_to_match = "Ft Monroe 4th 120 PM Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        example_string_removed_extra_spaces = remove_extra_spaces(example_string,[])
        
        self.assertEqual(example_string_removed_extra_spaces[0], string_to_match)
    
    def test_remove_extra_spaces__three_spaces(self):
        example_string = "Ft Monroe 4th 120 PM   Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        string_to_match = "Ft Monroe 4th 120 PM Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        example_string_removed_extra_spaces = remove_extra_spaces(example_string,[])
        
        self.assertEqual(example_string_removed_extra_spaces[0], string_to_match)
    
    def test_remove_extra_spaces__multiple_spaces(self):
        example_string = "Ft Monroe 4th 120 PM   Recd Jul 4 ' 62 \nNorfolk July  Fourth twelve thirty PM"
        string_to_match = "Ft Monroe 4th 120 PM Recd Jul 4 ' 62 \nNorfolk July Fourth twelve thirty PM"
        example_string_removed_extra_spaces = remove_extra_spaces(example_string,[])
    
# Run the test suite when this cell executes as the main module.
if __name__ == '__main__':
    # argv is overridden so unittest does not try to parse the command line of
    # the process hosting the notebook; the single placeholder entry stands in
    # for the program name. exit=False stops unittest from calling sys.exit()
    # once the run finishes.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

EEEEEEE
ERROR: test_dehydrated_telegram (__main__.TestStringMethods)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-2-79193fea33ae>", line 31, in test_dehydrated_telegram
    dehydrated_telegram_example = dehydrate(example_string)
NameError: name 'dehydrate' is not defined

ERROR: test_lowercase_characters (__main__.TestStringMethods)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-2-79193fea33ae>", line 19, in test_lowercase_characters
    lower_case_example_string = lowercase_characters(example_string, [])
NameError: name 'lowercase_characters' is not defined

ERROR: test_remove_annotation_tags (__main__.TestStringMethods)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-2-79193fea33ae>", line 5, in test_remove_annotation_tags
    removed_tags_e