In [None]:
import re
import os
import unittest
import nltk
import numpy as np
import pandas as pd
import csv
import pdb

# Identifying Metadata with Regex
This script leverages regular expressions (regex) to identify metadata within the Thomas T. Eckert telegram collection.

## Load Corpus

In [None]:
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

english_stop_words = stopwords.words('english')

# Regex selecting which files under the corpus root are loaded. The dot in
# front of "txt" is escaped so that it only matches a literal "." and not
# any character.
doc_pattern = r'.*/preprocessed_.*\.txt'
# Regex whose single group captures a "<something>_telegrams" path segment,
# passed to the reader as the category pattern.
category_pattern = r'.*?/(\w+_telegrams)/'
path_to_corpus = '/Volumes/data_work/dcw_text_mining/eckert_papers_corpus/'
telegram_corpus = CategorizedPlaintextCorpusReader(
    path_to_corpus,
    doc_pattern,
    cat_pattern=category_pattern
)

## Load Metadata Resources
First, the script will load resources we already have for identifying metadata. In this case, a csv of known people and locations.

In [None]:
# Locations of the CSV resources used to identify metadata.
path_to_people_file = "/Volumes/data_work/dcw_text_mining/metadata_id_resources/dcw_names.csv"
path_to_locations_file = "/Volumes/data_work/dcw_text_mining/metadata_id_resources/dcw_locations.csv"
path_to_state_abrrev_file = "/Volumes/data_work/dcw_text_mining/metadata_id_resources/postal_abbreviations_for_states_territories.csv"
path_to_title_file = "/Volumes/data_work/dcw_text_mining/metadata_id_resources/titles.csv"

# Table of known people; the code below reads its 'Surnames' and 'LC_number' columns.
people = pd.read_csv(path_to_people_file)

# The surnames field is slightly normalized, e.g., replace é with e
surnames = people['Surnames']

In [None]:
# Table of titles; the 'abbreviation' and 'definition' columns are used.
titles = pd.read_csv(path_to_title_file)
abbrevs = titles.abbreviation.to_numpy()
# Full titles are lower-cased; abbreviations are kept exactly as read.
full_titles = [definition.lower() for definition in titles.definition.to_numpy()]
# Every known title: abbreviations first, then the lower-cased full forms.
all_titles = np.append(abbrevs, full_titles)
# Each title split into its whitespace-separated words ...
title_soup = [entry.split() for entry in all_titles]
# ... and flattened into a single list of individual title words.
title_tokens = [word for words in title_soup for word in words]

In [None]:
# Table of known locations; the 'Term' column holds the location names.
locations = pd.read_csv(path_to_locations_file)
location_terms = locations['Term']

# https://about.usps.com/who-we-are/postal-history/state-abbreviations.pdf
state_abbreviations = pd.read_csv(path_to_state_abrrev_file)

# Collect the lower-cased values of the "State", "1831" and "1874" columns,
# in that order, into one list.
states = []
for column in ("State", "1831", "1874"):
    for state in state_abbreviations[column]:
        states.append(state.lower())

# Same values with duplicates removed.
state_soup = set(states)

In [None]:
# Single-word ordinals. state_found() rejects a state candidate when the token
# before it is one of these words.
simple_ordinals = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth', 'tenth', 'twentieth', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth']

## Regular Expression Library

In [None]:
# Hour (optionally followed by minutes) and an "am" / "a m" marker.
morning_time = r"\d{1,2}\W{0,1}\d{0,2} (am|a m)"
# "12" followed by "m" or "mid" as separate words.
strict_meridiem = r"\b12\W{0,1}\b (\bm\b|\bmid\b)"
# A month word, optionally followed by a day (with an optional ordinal
# suffix) and a two- or four-digit year in the range 50-79 / 1850-1879.
# NOTE: day and year are both optional, so a bare month word (e.g. "may",
# "march") also matches on its own.
# The full names january/february/november/december are included so that
# spelled-out months are matched as well as their abbreviations.
full_date_pattern = re.compile(r"""
    (\b
        (january|jany|jan|february|feby|feb|march|mch|mar|april|apl|apr|may|june|july|jul|august|aug|september|sept|october|oct|november|nov|december|dec)
    \b)
    \s{,1} # up to one whitespace deliminator
    (?P<day>\d{,2}
        (?P<day_suffix>st|d|th|nd|rd)
    *)
    \s{,1} # up to one whitespace deliminator
    (?P<year>\b(18){0,1}([5-7][0-9]){0,1}\b)
""", re.VERBOSE)

salutations = ['obt servt', 'signed']

## Methods for finding regular expression matches
The following methods identify patterns within a telegram based on the supplied regular expressions. Matches (i.e., metadata extractions) are stored in a telegram object until the match can be exported to a csv file.

### Helper methods

In [None]:
def create_temp_telegram_object():
    """Return a fresh container for the metadata extracted from one telegram.

    Returns:
        dict: keys "Dates", "Times", "People" and "Locations", each mapped
        to its own new empty list.
    """
    return {
        "Dates": [],
        "Times": [],
        "People": [],
        "Locations": [],
    }

In [None]:
def check_for_duplicates_by_location(key, temp_telegram_object, start, end):
    """Report whether an entry with the same character span is already stored.

    Every element stored under ``key`` is compared, not just the first one.

    Args:
        key: name of the list inside ``temp_telegram_object`` to inspect
            (e.g. "Dates", "Times", "People", "Locations").
        temp_telegram_object: dict mapping ``key`` to a list of dicts that
            each carry 'start' and 'end' values.
        start: start index of the candidate entry.
        end: end index of the candidate entry.

    Returns:
        bool: True when any stored element has the same start and end
        (compared as ints), False otherwise (including for an empty list).
    """
    candidate_start = int(start)
    candidate_end = int(end)
    return any(
        candidate_start == int(element['start']) and candidate_end == int(element['end'])
        for element in temp_telegram_object[key]
    )

In [None]:
def last_word(word_end_index, line):
    """Tell whether the word ending at ``word_end_index`` is the last one in ``line``.

    Args:
        word_end_index: index in ``line`` where the word ends.
        line: the telegram text.

    Returns:
        bool: True when ``word_end_index`` is the last index of ``line``, or
        when no token is found between ``word_end_index`` and the final
        character of ``line``. The final character itself is excluded from
        that check, so a single trailing one-character token is not seen.
    """
    if word_end_index == len(line) - 1:
        return True
    remainder = line[word_end_index:-1]
    return not remainder.split()

In [None]:
def next_token(line, end_prev):
    """Return the first token found after position ``end_prev`` in ``line``.

    The search starts at ``end_prev + 1``, and that position is also reported
    as the token's beginning; any extra leading whitespace is not added to
    the reported indices.

    Args:
        line: the telegram text.
        end_prev: end index of the preceding token.

    Returns:
        tuple | None: ``(token, beginning_index, end_index)``, or None when
        ``end_prev`` or ``end_prev + 1`` is the last index of ``line`` or no
        token follows.
    """
    final_index = len(line) - 1
    start = end_prev + 1
    if final_index in (end_prev, start):
        return None
    remaining_tokens = line[start:].split()
    if not remaining_tokens:
        return None
    token = remaining_tokens[0]
    return (token, start, start + len(token))

In [None]:
def previous_token(line, start_proceeding):
    """Return the token immediately before position ``start_proceeding``.

    Mirrors ``next_token``: a single separator character is assumed between
    the previous token and the token starting at ``start_proceeding``.

    Args:
        line: the telegram text.
        start_proceeding: start index of the token that follows.

    Returns:
        tuple | None: ``(token, beginning_index, end_index)`` where
        ``end_index`` is ``start_proceeding - 1``, or None when there is no
        token before ``start_proceeding``.
    """
    end_of_previous_token = start_proceeding - 1
    # A non-positive end would make the slice below wrap around or be empty.
    if end_of_previous_token <= 0:
        return None
    preceding_tokens = line[:end_of_previous_token].split()
    if not preceding_tokens:
        return None
    # The token closest to start_proceeding is the last one, not the first.
    token = preceding_tokens[-1]
    beginning_of_previous_token = end_of_previous_token - len(token)
    return (token, beginning_of_previous_token, end_of_previous_token)

### Methods for finding candidates for people, sender, and recipient tokens

In [None]:
def update_people_in_temp_telegram_object(temp_telegram_object, data_type, text, controlled, lc_number, start, recd, end):
    """Append a person entry to ``temp_telegram_object["People"]``.

    Nothing is appended when ``check_for_duplicates_by_location`` reports the
    (start, end) span as a duplicate.

    Args:
        temp_telegram_object: dict holding the "People" list to extend.
        data_type: label stored under "data_type".
        text: the matched text.
        controlled: value stored under "controlled".
        lc_number: value stored under "lc_number".
        start: start index of the match.
        recd: value stored under "recd".
        end: end index of the match.
    """
    if check_for_duplicates_by_location("People", temp_telegram_object, start, end):
        return
    person_entry = dict(
        data_type=data_type,
        text=text,
        controlled=controlled,
        lc_number=lc_number,
        start=start,
        recd=recd,
        end=end,
    )
    temp_telegram_object["People"].append(person_entry)

In [None]:
def check_pos_tag(line, token_end):
    """Return False when the token ending at ``token_end`` is tagged 'JJ'.

    The whole line is tagged with ``nltk.pos_tag``; the token of interest is
    located by counting the whitespace-separated tokens in
    ``line[0:token_end]``.

    Args:
        line: the telegram text.
        token_end: end index of the token to check.

    Returns:
        bool: False if the token's tag is 'JJ', True otherwise.
    """
    tokens_up_to_end = line[0:token_end].split()
    tagged_tokens = nltk.pos_tag(line.split())
    tagged_token = tagged_tokens[len(tokens_up_to_end) - 1]
    return tagged_token[1] != 'JJ'

In [None]:
def update_or_add_person(line, temp_telegram_object, start, end, token):
    """Mark the person at (start, end) as the sender, adding them if needed.

    If a stored person has the same span, its 'data_type' becomes 'Sender'.
    Otherwise a new 'Sender' entry is added, provided ``check_pos_tag``
    accepts the token.

    Args:
        line: the telegram text.
        temp_telegram_object: dict holding the "People" list.
        start: start index of the sender token.
        end: end index of the sender token.
        token: text of the sender token.

    Returns:
        dict: the same ``temp_telegram_object``.
    """
    for person in temp_telegram_object["People"]:
        same_span = int(start) == int(person['start']) and int(end) == int(person['end'])
        if same_span:
            person['data_type'] = 'Sender'
            break
    else:
        # No stored person occupies this span: add one unless the POS check rejects it.
        if check_pos_tag(line, end):
            update_people_in_temp_telegram_object(
                temp_telegram_object, 'Sender', token, '', '', start, False, end
            )
    return temp_telegram_object

In [None]:
# Purpose: filter initials
def find_surname_candidate(canidate, beginning_index, end_index, line):
    """Skip over short tokens (initials) until a longer token is found.

    Args:
        canidate: the token to start from.
        beginning_index: start index of ``canidate``.
        end_index: end index of ``canidate``.
        line: the telegram text.

    Returns:
        tuple | None: ``(token, beginning_index, end_index)`` of the first
        token longer than two characters, or None when ``next_token`` runs
        out of tokens first.
    """
    current = (canidate, beginning_index, end_index)
    while len(current[0]) <= 2:
        current = next_token(line, current[2])
        if current is None:
            return None
    return current
        

In [None]:
def find_titles(canidate, beginning_index, end_index, line):
    """Skip over title words until a token that is not a title is found.

    Args:
        canidate: the token to start from.
        beginning_index: start index of ``canidate``.
        end_index: end index of ``canidate``.
        line: the telegram text.

    Returns:
        tuple | None: ``(token, beginning_index, end_index)`` of the first
        token that is not in ``title_tokens``, or None when ``next_token``
        finds no further token (previously this case raised a TypeError).
    """
    current = (canidate, beginning_index, end_index)
    while current[0] in title_tokens:
        current = next_token(line, current[2])
        if current is None:
            # Only titles remain up to the end of the telegram.
            return None
    return current

In [None]:
def find_person_by_title(line, temp_telegram_object):
    """Find people by looking at the word(s) that follow a known title.

    For every occurrence of every title in ``all_titles``, the token after
    the title (skipping initials via ``find_surname_candidate``) is taken as
    a surname candidate and recorded in ``temp_telegram_object["People"]``
    unless ``check_pos_tag`` rejects it.

    The stored ``lc_number`` is:
      * the person's 'LC_number' when exactly one known surname matches,
      * 'multi-match' when several known surnames match,
      * '' otherwise.
    When the candidate matches no known surname but ends in "s" and matches
    one without it, the trailing "s" is dropped from the stored text and the
    end index is reduced by one; no LC number lookup is done for that form.

    A title occurrence that cannot yield a surname (the title or the word
    after it is the last word of the telegram) is skipped; the remaining
    occurrences and titles are still examined.

    Args:
        line: the telegram text.
        temp_telegram_object: dict holding the "People" list to extend.
    """
    data_type = "People"
    recd_status = False

    for title in all_titles:
        # Titles are literal text: escape them so characters such as "." are
        # not interpreted as regex operators.
        title_pattern = re.compile(r"\b{}\b".format(re.escape(str(title))))

        for match in title_pattern.finditer(line):
            # is the title the last word in the telegram?
            if last_word(match.end(), line):
                continue

            # tuple of (next_token, beginning index, end index), or None
            surname_canidate = next_token(line, match.end())
            if surname_canidate is None:
                continue

            # is the word after the title the last word in the telegram?
            if last_word(surname_canidate[2], line):
                continue

            # skip initials; None means the end of the telegram was reached
            surname_canidate = find_surname_candidate(
                surname_canidate[0], surname_canidate[1], surname_canidate[2], line
            )
            if surname_canidate is None:
                continue

            token, start, end = surname_canidate

            # Every outcome below is gated by the same POS check.
            if not check_pos_tag(line, end):
                continue

            text = token
            lc_number = ""
            surname_df = people[people.Surnames == token.title()]
            if len(surname_df) > 1:
                lc_number = 'multi-match'
            elif len(surname_df) == 1:
                lc_number = surname_df['LC_number'].values[0]
            elif token.endswith('s'):
                # if ends with an s, remove s and check
                token_wo_s = token[:-1]
                if len(people[people.Surnames == token_wo_s.title()]) > 0:
                    text = token_wo_s
                    end = end - 1

            update_people_in_temp_telegram_object(
                temp_telegram_object,
                data_type,
                text,
                '',
                lc_number,
                start,
                recd_status,
                end
            )

In [None]:
def assign_sender_by_salutation(line, temp_telegram_object):
    """Mark the person named after a salutation as the telegram's sender.

    For each occurrence of "signed", "obt servt" or "very respy", the tokens
    that follow are scanned past any titles (``find_titles``) and initials
    (``find_surname_candidate``); the resulting token is passed to
    ``update_or_add_person``. Occurrences with nothing usable after them are
    skipped.

    Args:
        line: the telegram text.
        temp_telegram_object: dict holding the "People" list to update.
    """
    salutation_pattern = re.compile(r"\b(signed|obt servt|very respy)\b")
    for match in salutation_pattern.finditer(line):
        # is the salutation the last word in the telegram?
        # to do: we don't account for this edge case
        if last_word(match.end(), line):
            continue

        # next_token is a tuple of (next_token, beginning index, end index),
        # or None when nothing follows
        canidate = next_token(line, match.end())
        if canidate is None:
            continue

        # filter out possible titles
        surname_canidate = find_titles(canidate[0], canidate[1], canidate[2], line)
        if surname_canidate is None:
            continue

        # filter out possible initials
        surname_canidate = find_surname_candidate(
            surname_canidate[0], surname_canidate[1], surname_canidate[2], line
        )
        if surname_canidate:
            update_or_add_person(
                line,
                temp_telegram_object,
                surname_canidate[1],
                surname_canidate[2],
                surname_canidate[0]
            )

In [None]:
def find_known_people(line, temp_telegram_object):
    """Record every telegram token that equals a known surname.

    A token is skipped when the token before it is one of 'camp', 'ft',
    'fort', 'qrs', 'hdqrs', or the token after it is one of 'city', 'road',
    'river' (it then looks like part of a place name), or when
    ``check_pos_tag`` rejects it. Skipping one occurrence does not stop the
    search for other occurrences or other surnames.

    The stored ``lc_number`` is the person's 'LC_number' for a single known
    match, 'multi-match' for several, and '' for none.

    Args:
        line: the telegram text.
        temp_telegram_object: dict holding the "People" list to extend.
    """
    data_type = "People"
    recd_status = False
    place_prefixes = ('camp', 'ft', 'fort', 'qrs', 'hdqrs')
    place_suffixes = ('city', 'road', 'river')

    telegram_tokens = set(line.split())
    # do any of the tokens match known surnames?
    found_surnames = {
        surname.lower()
        for surname in surnames
        if isinstance(surname, str) and surname.lower() in telegram_tokens
    }
    # TO DO: account for multi-word last names

    # sorted() keeps the order of the stored entries stable between runs
    for surname in sorted(found_surnames):
        # The LC number depends only on the surname, not on the occurrence.
        surname_df = people[people.Surnames.str.lower() == surname]
        if len(surname_df) == 1:
            lc_number = surname_df['LC_number'].values[0]
        elif len(surname_df) > 1:
            lc_number = 'multi-match'
        else:
            lc_number = ''

        # Match the surname only as a whole whitespace-delimited token (the
        # same tokens line.split() produces) and treat it as literal text,
        # so it is not found inside longer words.
        token_pattern = re.compile(r"(?<!\S){}(?!\S)".format(re.escape(surname)))

        for match in token_pattern.finditer(line):
            # filter for location indicators before the match
            preceding_tokens = line[0:match.start()].split()
            if preceding_tokens and preceding_tokens[-1] in place_prefixes:
                continue

            # check the next token doesn't indicate place
            following_tokens = line[match.end():].split()
            if following_tokens and following_tokens[0] in place_suffixes:
                continue

            if check_pos_tag(line, match.end()):
                update_people_in_temp_telegram_object(
                    temp_telegram_object,
                    data_type,
                    match.group(0).rstrip(),
                    '',
                    lc_number,
                    match.start(),
                    recd_status,
                    match.end()
                )

In [None]:
def identify_people(line, temp_telegram_object):
    """Run all people finders on ``line`` and then assign the sender.

    Known surnames and title-based matches are collected first; entries that
    share a (start, end) span with an earlier entry are then dropped, and
    finally ``assign_sender_by_salutation`` is applied.

    Args:
        line: the telegram text.
        temp_telegram_object: dict holding the "People" list to fill.
    """
    find_known_people(line, temp_telegram_object)
    find_person_by_title(line, temp_telegram_object)

    # Keep only the first entry for each (start, end) span.
    unique_people = []
    for person in temp_telegram_object["People"]:
        already_kept = any(
            kept['start'] == person['start'] and kept['end'] == person['end']
            for kept in unique_people
        )
        if not already_kept:
            unique_people.append(person)
    temp_telegram_object["People"] = unique_people

    assign_sender_by_salutation(line, temp_telegram_object)

### Check for locations

In [None]:
# updates to current entries should be handled in a different function
def update_locations_in_temp_telegram_object(temp_telegram_object, data_type, text, controlled, lc_number, start, recd, end):
    """Append a location entry to ``temp_telegram_object["Locations"]``.

    Nothing is appended when ``check_for_duplicates_by_location`` reports the
    (start, end) span as a duplicate. A falsy ``controlled`` is stored as ''.

    Args:
        temp_telegram_object: dict holding the "Locations" list to extend.
        data_type: label stored under "data_type".
        text: the matched text.
        controlled: value stored under "controlled".
        lc_number: value stored under "lc_number".
        start: start index of the match.
        recd: value stored under "recd".
        end: end index of the match.
    """
    if check_for_duplicates_by_location("Locations", temp_telegram_object, start, end):
        return
    location_entry = dict(
        data_type=data_type,
        text=text,
        controlled=controlled or '',
        lc_number=lc_number,
        start=start,
        recd=recd,
        end=end,
    )
    temp_telegram_object["Locations"].append(location_entry)

In [None]:
# process a token matching known state abbreviation/codes
# this method needs work before being included
def state_found(index, line):
    """Decide whether the token at ``index`` should be kept as a state.

    The candidate is rejected (None is returned) when the preceding token is
    in ``simple_ordinals``, when the candidate's POS tag is 'IN', 'PRP' or
    'CC', or when "<preceding token> <candidate>" is a known title.

    Args:
        index: position of the candidate in ``line.split()``.
        line: the telegram text.

    Returns:
        tuple | None: ``(text, start, end)`` of the first place in ``line``
        where the candidate, used as a regex, matches; None when rejected.
    """
    tokens = line.split()
    preceding = tokens[index - 1]
    tag = nltk.pos_tag(tokens)[index][1]

    if preceding in simple_ordinals:
        return None
    if tag in ['IN', 'PRP', 'CC']:
        return None

    candidate = tokens[index]
    if preceding + " " + candidate in all_titles:
        return None

    # NOTE: the candidate is used as a regex and the first hit in the line is
    # reported, which is not necessarily the token at ``index``.
    found = re.search(candidate, line)
    return (found[0], found.start(), found.end())

In [None]:
def lower_location_terms_found(index, line):
    """Locate the token at ``index`` in ``line`` unless it is part of a title.

    Args:
        index: position of the token among the whitespace-separated tokens
            of ``line``.
        line: the telegram text.

    Returns:
        tuple | None: ``(token, start, end)`` with the character span of that
        exact token, or None when "<previous token> <token>" is a known
        title (e.g. 'lt col').
    """
    # Spans of the whitespace-separated tokens; the token order is the same
    # as in line.split(), so ``index`` can be used directly.
    token_matches = list(re.finditer(r"\S+", line))
    token_match = token_matches[index]
    signal = token_match.group(0)

    # The first token has no predecessor, so it cannot complete a title.
    if index > 0:
        previous_token = token_matches[index - 1].group(0)
        full_location = previous_token + " " + signal
        # don't include false positives like 'lt col'
        if full_location in all_titles:
            return None

    return (signal, token_match.start(), token_match.end())
    

In [None]:
# certain words like fort and creek signal the presence of a location token in the dataset
def fort_signal_found(index, line):
    """Build a two-word location from a signal word and the token after it.

    Args:
        index: position of the signal word (e.g. 'fort') among the
            whitespace-separated tokens of ``line``.
        line: the telegram text.

    Returns:
        tuple | None: ``(text, start, end)`` covering the signal and the
        following token. None when the signal is the last token, when the
        pair is a known title, or when the two tokens are not separated by
        exactly one space (e.g. a line break sits between them).
    """
    # Spans of the whitespace-separated tokens, in the same order as
    # line.split().
    token_matches = list(re.finditer(r"\S+", line))
    if index + 1 >= len(token_matches):
        # Nothing follows the signal word.
        return None

    signal_match = token_matches[index]
    name_match = token_matches[index + 1]
    full_location = signal_match.group(0) + " " + name_match.group(0)
    if full_location in all_titles:
        return None

    # an issue with \n and not a continuation for the fort name
    if line[signal_match.end():name_match.start()] != " ":
        return None

    return (full_location, signal_match.start(), name_match.end())

In [None]:
# Words that find_locations treats as the start of a two-word location name.
location_signals = ['fort', 'ft']
# Lower-cased known location terms, compared against telegram tokens in find_locations.
lower_location_terms = [location.lower() for location in location_terms]

def find_locations(line, temp_telegram_object):
    """Record location tokens of ``line`` in ``temp_telegram_object["Locations"]``.

    Two kinds of tokens are handled:
      * signal words from ``location_signals`` ('fort', 'ft'), which are
        combined with the following token by ``fort_signal_found``;
      * tokens found in ``lower_location_terms``, for which an LC number is
        looked up in ``locations`` ('multi-match' when several rows match).

    A token whose helper yields no location is skipped and the scan goes on
    with the next token. The LC number is reset for every token, so a value
    found for one location is never attached to another.

    Args:
        line: the telegram text.
        temp_telegram_object: dict holding the "Locations" list to extend.
    """
    split_telegram = line.split()
    data_type = "location"
    controlled = ""
    recd = ""

    for index, word in enumerate(split_telegram):
        lc_number = ""

        if word in location_signals:
            # A signal word at the very end has no name after it.
            if index + 1 >= len(split_telegram):
                continue
            location_canidate = fort_signal_found(index, line)
            if location_canidate is None:
                continue
        elif word in lower_location_terms:
            location_canidate = lower_location_terms_found(index, line)
            if location_canidate is None:
                continue

            location_df = locations[locations.Term.str.lower() == location_canidate[0]]
            number_of_lc_matches = len(location_df)
            if number_of_lc_matches == 1:
                lc_number = location_df.LC_number.values[0]
            elif number_of_lc_matches > 1:
                lc_number = 'multi-match'
        else:
            continue

        update_locations_in_temp_telegram_object(
            temp_telegram_object,
            data_type,
            location_canidate[0],
            controlled,
            lc_number,
            location_canidate[1],
            recd,
            location_canidate[2]
        )
        

### Check for date and time tokens

In [None]:
# check for duplicate entries
# updates to current entries should be handled in a different function
def update_dates_in_temp_telegram_object(temp_telegram_object, data_type, text, controlled, start, recd, end):
    """Append a date entry to ``temp_telegram_object["Dates"]``.

    Nothing is appended when ``check_for_duplicates_by_location`` reports the
    (start, end) span as a duplicate. A falsy ``controlled`` is stored as '',
    and "lc_number" is always ''.

    Args:
        temp_telegram_object: dict holding the "Dates" list to extend.
        data_type: label stored under "data_type".
        text: the matched text.
        controlled: value stored under "controlled".
        start: start index of the match.
        recd: value stored under "recd".
        end: end index of the match.
    """
    if check_for_duplicates_by_location("Dates", temp_telegram_object, start, end):
        return
    date_entry = dict(
        data_type=data_type,
        text=text,
        controlled=controlled or '',
        lc_number='',
        start=start,
        recd=recd,
        end=end,
    )
    temp_telegram_object["Dates"].append(date_entry)

In [None]:
def update_times_in_temp_telegram_object(temp_telegram_object, data_type, text, controlled, start, recd, end):
    """Append a time entry to ``temp_telegram_object["Times"]``.

    Nothing is appended when ``check_for_duplicates_by_location`` reports the
    (start, end) span as a duplicate. A falsy ``controlled`` is stored as '',
    and "lc_number" is always ''.

    Args:
        temp_telegram_object: dict holding the "Times" list to extend.
        data_type: label stored under "data_type".
        text: the matched text.
        controlled: value stored under "controlled".
        start: start index of the match.
        recd: value stored under "recd".
        end: end index of the match.
    """
    if check_for_duplicates_by_location("Times", temp_telegram_object, start, end):
        return
    time_entry = dict(
        data_type=data_type,
        text=text,
        controlled=controlled or '',
        lc_number='',
        start=start,
        recd=recd,
        end=end,
    )
    temp_telegram_object["Times"].append(time_entry)

In [None]:
def detect_recd_prefix(lines, start_index):
    """Tell whether the text before ``start_index`` ends in a "received" marker.

    Args:
        lines: the telegram text.
        start_index: index where the date or time match begins.

    Returns:
        bool: True when ``lines[:start_index]``, with trailing whitespace
        removed, ends in "recd", "rcvd" or "revd".
    """
    preceding_text = lines[:start_index].rstrip()
    return re.search(r"(recd|rcvd|revd)$", preceding_text) is not None

In [None]:
def find_morning_time(lines, temp_telegram_object):
    """Record every "am" time found in ``lines``.

    A match is stored with data type "Recd Time" when ``detect_recd_prefix``
    finds a received marker right before it, and "Time" otherwise.

    The pattern requires that no word character follows the "am" marker, so
    words that merely start with "am" after a number (e.g. "5 among") are
    not taken for a time.

    Args:
        lines: the telegram text.
        temp_telegram_object: dict holding the "Times" list to extend.

    Returns:
        dict: the same ``temp_telegram_object``.
    """
    morning_pattern = r"\b(?P<hour>[0-1]{0,1}[0-9]|[2]{0,1}[0-4])\W{0,1}\d{0,2}\W?(a)\.?(m)\.?(?!\w)"
    for match in re.finditer(morning_pattern, lines):
        recd_time = detect_recd_prefix(lines, match.start())
        data_type = "Recd Time" if recd_time else "Time"
        update_times_in_temp_telegram_object(
            temp_telegram_object,
            data_type,
            match.group(0).rstrip(),
            '',
            match.start(),
            recd_time,
            match.end()
        )
    return temp_telegram_object

In [None]:
#see mssEC_01_043_p041_tel063.txt, matches "12 mid"
def find_meridiem_time(lines, temp_telegram_object):
    """Record every match of ``strict_meridiem`` ("12 m" / "12 mid") in ``lines``.

    A match is stored with data type "Recd Time" when ``detect_recd_prefix``
    finds a received marker right before it, and "Time" otherwise.

    Args:
        lines: the telegram text.
        temp_telegram_object: dict holding the "Times" list to extend.

    Returns:
        dict: the same ``temp_telegram_object``.
    """
    for meridiem_match in re.finditer(strict_meridiem, lines):
        follows_recd = detect_recd_prefix(lines, meridiem_match.start())
        update_times_in_temp_telegram_object(
            temp_telegram_object,
            "Recd Time" if follows_recd else "Time",
            meridiem_match.group(0).rstrip(),
            '',
            meridiem_match.start(),
            follows_recd,
            meridiem_match.end()
        )
    return temp_telegram_object

In [None]:
def find_evening_time(lines, temp_telegram_object):
    """Record every "pm" time found in ``lines``.

    A match is stored with data type "Recd Time" when ``detect_recd_prefix``
    finds a received marker right before it, and "Time" otherwise.

    The pattern requires that no word character follows the "pm" marker, so
    a number followed by a longer word starting with "pm" is not taken for a
    time.

    Args:
        lines: the telegram text.
        temp_telegram_object: dict holding the "Times" list to extend.

    Returns:
        dict: the same ``temp_telegram_object``.
    """
    evening_pattern = r"\b(?P<hour>[0-1]{0,1}[0-9]|[2]{0,1}[0-4])\W{0,1}\d{0,2}\W?(p)\.?(m)\.?(?!\w)"
    for match in re.finditer(evening_pattern, lines):
        recd_time = detect_recd_prefix(lines, match.start())
        data_type = "Recd Time" if recd_time else "Time"
        update_times_in_temp_telegram_object(
            temp_telegram_object,
            data_type,
            match.group(0).rstrip(),
            '',
            match.start(),
            recd_time,
            match.end()
        )
    return temp_telegram_object

In [None]:
def prev_day(line, start_index, following_date_recd_status, temp_telegram_object):
    """Look for a day-like token in the text before ``start_index``.

    The first regex hit in ``line[0:start_index]`` is used. When the token
    after the hit is 'am' or 'pm' the hit is a time, not a date, and nothing
    is stored. Otherwise the hit is stored as "Sent Date" when the following
    date was explicitly received, and as "Date" when it was not.

    Args:
        line: the telegram text.
        start_index: start index of the date that follows.
        following_date_recd_status: whether that following date was marked
            as received.
        temp_telegram_object: dict holding the "Dates" list to extend.

    Returns:
        bool | None: False when nothing matches, None when the hit turned
        out to be a time, True when a date was handed to
        ``update_dates_in_temp_telegram_object``.
    """
    preceding_text = line[0:start_index]
    day_match = re.search(r"\d{1,2}\w(st|d|th|nd|rd|[^AM]|[^PM])", preceding_text)
    if not day_match:
        return False

    # not a date, it is a time
    trailing_tokens = line[day_match.end():-1].split()
    if len(trailing_tokens) > 0 and trailing_tokens[0] in ('am', 'pm'):
        return

    # If the previous date is explicitly received,
    # then the current date is implicitly a sent date.
    data_type = "Sent Date" if following_date_recd_status else "Date"
    update_dates_in_temp_telegram_object(
        temp_telegram_object,
        data_type,
        day_match.group(0).rstrip(),
        '',
        day_match.start(),
        False,
        day_match.end()
    )
    return True

In [None]:
def find_date(line, temp_telegram_object):
    """Record every match of ``full_date_pattern`` in ``line``.

    A match is stored as "Full Recd Date" when ``detect_recd_prefix`` finds a
    received marker right before it, and as "Full Date" otherwise. After each
    stored match, ``prev_day`` is run on the text in front of it.

    A match directly followed by 'am' or 'pm' is a time rather than a date;
    it is skipped and the search continues with the next match.

    Args:
        line: the telegram text.
        temp_telegram_object: dict holding the "Dates" list to extend.

    Returns:
        dict: the same ``temp_telegram_object``.
    """
    for match in re.finditer(full_date_pattern, line):
        data_type = "Full Date"
        recd_status = detect_recd_prefix(line, match.start())
        if recd_status:
            data_type = "Full Recd Date"

        # Skip (rather than abort on) matches that are really times.
        following_tokens = line[match.end():-1].split()
        if len(following_tokens) > 0 and following_tokens[0] in ('am', 'pm'):
            continue

        update_dates_in_temp_telegram_object(
            temp_telegram_object,
            data_type,
            match.group(0).rstrip(),
            '',
            match.start(),
            recd_status,
            match.end()
        )
        prev_day(line, match.start(), recd_status, temp_telegram_object)

    return temp_telegram_object

In [None]:
# Relative paths of selected preprocessed telegram files.
# NOTE(review): presumably the hand-checked evaluation set; this list is not
# referenced by any other code visible in this part of the notebook - confirm.
gold_standard = ["telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_019_p017_tel016/preprocessed_mssEC_03_019_p017_tel016.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_031_p029_tel036/preprocessed_mssEC_03_031_p029_tel036.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_041_p039_tel050/preprocessed_mssEC_03_041_p039_tel050.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_042_p040_tel052/preprocessed_mssEC_03_042_p040_tel052.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_043_p041_tel054/preprocessed_mssEC_03_043_p041_tel054.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_050_p048_tel063/preprocessed_mssEC_03_050_p048_tel063.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_054_p052_tel069/preprocessed_mssEC_03_054_p052_tel069.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_063_p061_tel081/preprocessed_mssEC_03_063_p061_tel081.txt",
"telegrams/clear_and_coded_telegrams/mssEC_03/mssEC_03_065_p063_tel083/preprocessed_mssEC_03_065_p063_tel083.txt",
"telegrams/clear_telegrams/mssEC_05/mssEC_05_006_007_pp002_003_tel002/preprocessed_mssEC_05_006_007_pp002_003_tel002.txt",
"telegrams/clear_telegrams/mssEC_05/mssEC_05_025_p021_tel016/preprocessed_mssEC_05_025_p021_tel016.txt",
"telegrams/clear_telegrams/mssEC_05/mssEC_05_059_p055_tel058/preprocessed_mssEC_05_059_p055_tel058.txt",
"telegrams/clear_telegrams/mssEC_05/mssEC_05_176_p172_tel185/preprocessed_mssEC_05_176_p172_tel185.txt",
"telegrams/clear_telegrams/mssEC_05/mssEC_05_242_p238_tel262/preprocessed_mssEC_05_242_p238_tel262.txt",
"telegrams/clear_telegrams/mssEC_08/mssEC_08_030_031_pp024_025_tel030/preprocessed_mssEC_08_030_031_pp024_025_tel030.txt",
"telegrams/clear_telegrams/mssEC_08/mssEC_08_239_p233_tel275/preprocessed_mssEC_08_239_p233_tel275.txt",
"telegrams/clear_telegrams/mssEC_08/mssEC_08_243_244_245_pp237_238_239_tel281/preprocessed_mssEC_08_243_244_245_pp237_238_239_tel281.txt",
"telegrams/clear_telegrams/mssEC_08/mssEC_08_258_p252_tel290/preprocessed_mssEC_08_258_p252_tel290.txt",
"telegrams/clear_telegrams/mssEC_08/mssEC_08_268_p262_tel301/preprocessed_mssEC_08_268_p262_tel301.txt",
"telegrams/clear_telegrams/mssEC_11/mssEC_11_011_p005_tel009/preprocessed_mssEC_11_011_p005_tel009.txt",
"telegrams/clear_telegrams/mssEC_11/mssEC_11_065_p059_tel095/preprocessed_mssEC_11_065_p059_tel095.txt",
"telegrams/clear_telegrams/mssEC_11/mssEC_11_067_p061_tel100/preprocessed_mssEC_11_067_p061_tel100.txt",
"telegrams/clear_telegrams/mssEC_11/mssEC_11_113_p107_tel188/preprocessed_mssEC_11_113_p107_tel188.txt",
"telegrams/clear_telegrams/mssEC_11/mssEC_11_291_p285_tel459/preprocessed_mssEC_11_291_p285_tel459.txt",
"telegrams/clear_telegrams/mssEC_11/mssEC_11_376_p370_tel593/preprocessed_mssEC_11_376_p370_tel593.txt"]

In [None]:
def deduplication(key, temp_telegram_object):
    """Drop entries under ``key`` that repeat an earlier (start, end) span.

    The first entry for each span is kept and the original order is
    preserved; the list stored under ``key`` is replaced by the filtered one.

    Args:
        key: name of the list to clean (e.g. "Times", "Dates").
        temp_telegram_object: dict mapping ``key`` to a list of dicts with
            'start' and 'end' values.

    Returns:
        dict: the same ``temp_telegram_object``.
    """
    unique_elements = []
    for element in temp_telegram_object[key]:
        is_repeat = any(
            kept['start'] == element['start'] and kept['end'] == element['end']
            for kept in unique_elements
        )
        if not is_repeat:
            unique_elements.append(element)
    temp_telegram_object[key] = unique_elements
    return temp_telegram_object
    

In [None]:
def find_metadata(telegram_number):
    """Extract times, dates, locations and people from one telegram.

    The telegram text is printed, the finders are run on it, and all
    extractions are gathered into a DataFrame with the columns 'type',
    'text', 'controlled' (always ''), 'lc_number', 'start' and 'end'. Rows
    are ordered times, dates, people, locations.

    Args:
        telegram_number: position of the telegram in
            ``telegram_corpus.fileids()``.

    Returns:
        dict: ``{'file_id': <corpus file id>, 'df': <DataFrame>}``.
    """
    file_id = telegram_corpus.fileids()[telegram_number]

    # retrieve telegram text
    telegram_text = telegram_corpus.raw(file_id)

    # create telegram object to track metadata extractions
    telegram_object = create_temp_telegram_object()

    print(telegram_text)
    print("")

    # Each finder is followed by a de-duplication of the list it fills.
    # NOTE(review): find_meridiem_time is defined in this notebook but is not
    # part of this pipeline - confirm whether that is intended.
    extraction_steps = (
        (find_morning_time, "Times"),
        (find_evening_time, "Times"),
        (find_date, "Dates"),
        (find_locations, "Locations"),
    )
    for extract, key in extraction_steps:
        extract(telegram_text, telegram_object)
        telegram_object = deduplication(key, telegram_object)

    identify_people(telegram_text, telegram_object)

    # setup Pandas DataFrame
    entries = [
        entry
        for key in ('Times', 'Dates', 'People', 'Locations')
        for entry in telegram_object[key]
    ]
    data = {
        'type': [entry['data_type'] for entry in entries],
        'text': [entry['text'] for entry in entries],
        "controlled": '',
        "lc_number": [entry['lc_number'] for entry in entries],
        'start': [entry['start'] for entry in entries],
        'end': [entry['end'] for entry in entries],
    }

    df = pd.DataFrame(data=data)

    return {'file_id': file_id, 'df': df}

# Process every telegram in the corpus: print its extracted metadata sorted
# by start position and write the same table to a CSV named after the telegram.
telegrams = range(len(telegram_corpus.fileids()))

for telegram_number in telegrams:
    found_metadata = find_metadata(telegram_number) # dict with 'file_id' and 'df'
    # The fourth path segment of the file id is used as the telegram name.
    telegram_name = found_metadata['file_id'].split('/')[3]
    ledger_name = telegram_name[0:12]
    sorted_metadata = found_metadata['df'].sort_values(by=['start'])

    print("Telegram: ", telegram_name)
    print(sorted_metadata)
    sorted_metadata.to_csv('/Volumes/data_work/dcw_text_mining/example_metadata/' + telegram_name + '.csv')

    print("-------------------------------")
    print("")
    print("")

In [None]:
class TestStringMethods(unittest.TestCase):    
    # --------------------------------
    # METHOD find_evening_time()
    # --------------------------------
    def test_evening_time_without_puncuation(self):
        example_line = "louisville 9th 2 pm recd feb 9th 62 830 pm"
        self.assertEqual(len(find_evening_time(example_line, create_temp_telegram_object())["Times"]), 2)

    def test_evening_time_odd_puncuation(self):
        example_line = "mcclellans 11.15 pm june 26 62"
        self.assertEqual(len(find_evening_time(example_line, create_temp_telegram_object())["Times"]), 1)
        
    def test_evening_hour_single_number(self):
        example_line = "mcclellans 25 5 pm recd may 25 "
        self.assertEqual(len(find_evening_time(example_line, create_temp_telegram_object())["Times"]), 1)
    
    def test_evening_pm_lowercase(self):
        example_line = "mcclellans 25 5 pm  recd may 25 "
        self.assertEqual(len(find_evening_time(example_line, create_temp_telegram_object())["Times"]), 1)
    
    def test_evening_pm_mixcase_no_space(self):
        example_line = "ap chandler ny 3pm washn 12"
        self.assertEqual(len(find_evening_time(example_line, create_temp_telegram_object())["Times"]), 1)
    
    # --------------------------------
    # METHOD find_meridiem_time()
    # --------------------------------
    def test_meridem_time(self):
        example_line = "louisville 12 mid 15th"
        self.assertEqual(len(find_meridiem_time(example_line, create_temp_telegram_object())["Times"]), 1)
        example_line = "louisville 12 m 15th"
        self.assertEqual(len(find_meridiem_time(example_line, create_temp_telegram_object())["Times"]), 1)
    
    def test_meridem_time_doesn_detect_impossible_times(self):
        example_line = "from louisville 19th  recd feb 20th 62\n12 m"
        text_found = find_meridiem_time(example_line, create_temp_telegram_object())["Times"][0]["text"]
        self.assertEqual(text_found, "12 m")  
    
    # --------------------------------
    # METHOD find_evening_time()
    # --------------------------------
    def test_morning_time(self):
        example_line_1 = "frederick 9 am 22d recd feb 22 62"
        example_line_2 = "louisville 10 am 13th"
        example_line_3 = "louisville 1035 am 13th"
        example_line_4 = "louisville 1035am 13th"
        self.assertEqual(len(find_morning_time(example_line_1, create_temp_telegram_object())["Times"]), 1)
        self.assertEqual(len(find_morning_time(example_line_2, create_temp_telegram_object())["Times"]), 1)
        self.assertEqual(len(find_morning_time(example_line_3, create_temp_telegram_object())["Times"]), 1)
        self.assertEqual(len(find_morning_time(example_line_4, create_temp_telegram_object())["Times"]), 1)
    
    # --------------------------------
    # METHOD find_date()
    # --------------------------------
    def test_date_finder(self):
        example_line = 'nashville tenn march 21 62'
        example_line2 = 'st louis 18 12 m recd Feb 18 62'
        self.assertEqual(len(find_date(example_line, create_temp_telegram_object())["Dates"]), 1)
    
    @unittest.skip("Pending Test")
    def test_no_time_included_in_date(self):
        example_line = "pattersons creek va 10th 6 pm recd feb 10th 10 pm"
    
    # If two dates are supplied, but only one month and year, 
    # the first date is date sent, second is date recieved.
    def test_date_finder_lone_sent_date(self):
        example_line = 'st louis 18 12 m recd feb 18 62'
        example_line_2 = 'pittsburg tenn 1st 3 pm recd may 2nd'
        self.assertEqual(len(find_date(example_line, create_temp_telegram_object())["Dates"]), 2)
        self.assertEqual(len(find_date(example_line_2, create_temp_telegram_object())["Dates"]), 2)
    
    def test_month_do_not_select_words_that_contain_month(self):
        example_line = "hello maynadier example"
        self.assertEqual(len(find_date(example_line, create_temp_telegram_object())["Dates"]), 0)
    
    def test_find_two_dates_in_a_line(self):
        example_line = "louisville 19th recd feb 20 62"
        self.assertEqual(len(find_date(example_line, create_temp_telegram_object())["Dates"]), 2)
    
    def test_find_date__do_not_include_beginning_time_as_a_date(self):
        example_line = "ft monroe 1st 1130 read july 1st 1140 am"
        found_date = find_date(example_line, create_temp_telegram_object())["Dates"]
        self.assertEqual(found_date[0]['text'], "july 1st")
    
    def test_find_date__with_month_abbreviation_mark(self):
        example_line = "a h caldwell no washn oct 8th 1866"
        found_date = find_date(example_line, create_temp_telegram_object())["Dates"]
        self.assertEqual(found_date[0]['text'], "oct 8th 1866")
    
    #     is the June here coded? mssEC_20_038_p032_tel062
    def test_find_date__do_not_include_text_after_month(self):
        example_line = "june wedlock nancy for stephen yam"
        found_date = find_date(example_line, create_temp_telegram_object())["Dates"]
        self.assertEqual(found_date[0]['text'], "june")
    
    @unittest.skip("Pending Test")
    def test_do_not_include_extra_letters_from_next_line(self):
        example_line = "cairo march 27 1862  rcvd 27 march\nh a wise navy dept quiet"
        found_date = find_date(example_line, create_temp_telegram_object())["Dates"]
        self.assertEqual(found_date[1]['text'], "27 march")
        
    # --------------------------------    
    # METHOD detect_recd_prefix()
    # --------------------------------
    def test_detect_recd_prefix(self):
        # TO DO: What if the Recvd date is before the sent date?
        example_line_1 = "louisville 9th 2 pm recd feb 9th 62 830 pm"
        example_line_2 = "hampton march 31 62 rcvd apl 1"
        self.assertEqual(detect_recd_prefix(example_line_1, 25), True)
        self.assertEqual(detect_recd_prefix(example_line_2, 25), True)
    
    # Case: mssEC_01_180_181_pp178_179_tel251.txt
    def test_recd_prefix_should_only_apply_to_neighbor_date_objects(self):
        example_line_3 = "ft monroe 4th 120 pm recd jul 4 62 \nnorfolk july fourth twelve thirty pm"
        self.assertEqual(detect_recd_prefix(example_line_3, 49), False)
        
    # --------------------------------    
    # METHOD update_people()
    # --------------------------------
    def test_update_people_in_temp_telegram_object(self):
        telegram_obj = create_temp_telegram_object()
        update_people_in_temp_telegram_object(
            telegram_obj,
            "People",
            "halleck",
            "",
            "",
            "817",
            False,
            824
        ) 
        update_people_in_temp_telegram_object(
            telegram_obj,
            "People",
            "halleck",
            "",
            "",
            817,
            False,
            824
        ) 
        self.assertEqual(len(telegram_obj["People"]), 1)
        
    def test_next_token(self):
        line = "hello what is the weather"
        next_token_data = next_token(line, 5)
        self.assertEqual(next_token_data[0], "what")
        self.assertEqual(next_token_data[1], 6)
        self.assertEqual(next_token_data[2], 10)
    
    def test_find_surname_candidate(self):
        canidate = "r"
        beginning_index = 4
        end_index = 5
        line = "gen r b marcy"
        surname_canidate = find_surname_candidate(canidate, beginning_index, end_index, line)
        # (canidate, beginning_index, end_index)
        self.assertEqual(surname_canidate[0], "marcy")
        self.assertEqual(surname_canidate[1], 8)
        self.assertEqual(surname_canidate[2], 13)
        
    def test_find_person_by_title(self):
        line = "gen r b marcy do not send the regular infantry"
        telegram_obj = create_temp_telegram_object()
        person = find_person_by_title(line, telegram_obj)
        self.assertEqual(len(telegram_obj["People"]), 1)
        
if __name__ == '__main__':
    # Run the test case above in place: a placeholder argv is supplied so that
    # unittest does not read the process's real command line, and exit=False
    # stops unittest.main() from exiting the interpreter when the run finishes.
    unittest.main(exit=False, argv=['first-arg-is-ignored'])