## Formatting Gold Standard Data

This script reformats the gold standard data into individual files for each telegram.

In [None]:
import nltk
import numpy as np
import pandas as pd
import csv
import re

In [None]:
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# English stop-word list from NLTK. It is not referenced by any other cell
# visible in this notebook.
english_stop_words = stopwords.words('english')
# Regex selecting which files under the corpus root belong to the corpus.
# NOTE(review): the '.' before 'txt' is unescaped, so it matches any
# character rather than only a literal dot.
doc_pattern = r'.*/preprocessed_.*.txt'
# Regex whose single capture group (a directory name ending in '_telegrams')
# is used as the category of each file.
category_pattern = r'.*?/(\w+_telegrams)/'
# Root directory of the corpus.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
path_to_corpus = '/Volumes/data_work/dcw_text_mining/eckert_papers_corpus/'
# Corpus reader used later to fetch the raw text of individual telegrams.
telegram_corpus = CategorizedPlaintextCorpusReader(
    path_to_corpus,
    doc_pattern,
    cat_pattern=category_pattern
)

In [None]:
# Load the source table into a DataFrame (presumably the gold standard
# annotations, one row per telegram -- verify against the actual file).
# NOTE(review): the path is an empty placeholder and must be filled in
# before this cell can run.
clear_df = pd.read_csv('')
# Bare expression so the notebook displays the loaded frame.
clear_df

In [None]:
# Values of the 'text_file_name' column; the export loop iterates over these.
file_names = clear_df['text_file_name'].array

In [None]:
def _ledger_names(*numbers):
    """Return zero-padded ledger identifiers such as 'mssEC_04' for *numbers*."""
    return [f'mssEC_{number:02d}' for number in numbers]


# Ledger identifiers grouped into three lists: ledgers listed as coded,
# ledgers listed as clear, and ledgers listed as both.
coded = _ledger_names(4, *range(16, 26))
clear = _ledger_names(1, 2, *range(5, 15), *range(26, 34))
coded_and_clear = _ledger_names(3, 15, 34, 35)

In [None]:
def create_temp_telegram_object():
    """Return a fresh, empty accumulator for one telegram's annotations.

    The result is a dict mapping each of "type", "text", "start" and "end"
    (in that order) to its own new empty list. Only these four keys are
    created; no "controlled" or "lc_number" entries are produced.
    """
    return {column: [] for column in ("type", "text", "start", "end")}

In [None]:
# Lookup table from spreadsheet column headers to the type labels written to
# the exported files. Each header appears exactly once.
_EXPORT_TYPE_LABELS = {
    "Date Sent": "date_sent",
    "Recipient": "recipient",
    "Date Received": "date_received",
    "Place Received": "location_received",
    "Time Received": "time_received",
    "Receiving Telegrapher": "receiving_telegrapher",
    "Sender": "sender",
    "Place Sent From": "location_sent_from",
    "Time Sent": "time_sent",
    "Sending Telegrapher": "sending_telegrapher",
    "Full Text OCR": "transcribed_text",
}


def final_export_type_label(export_column_header):
    """Translate a spreadsheet column header into its export type label.

    Args:
        export_column_header: Column header string, e.g. "Date Sent".

    Returns:
        The matching label (e.g. "date_sent"), or None when the header is
        not one of the known columns.
    """
    return _EXPORT_TYPE_LABELS.get(export_column_header)

In [None]:
def find_indices(text, ocr_text):
    """Locate the first literal occurrence of *text* inside *ocr_text*.

    The search string is escaped before matching, so characters such as
    '.', '(', '?' or '[' are matched literally instead of being interpreted
    as regular-expression syntax.

    Args:
        text: The exact string to look for.
        ocr_text: The text to search in.

    Returns:
        A tuple ``(matched_text, start, end)`` for the first occurrence,
        where ``start``/``end`` are offsets into *ocr_text*, or None when
        *text* does not occur. Prints "More than one match" when *text*
        occurs more than once; the first occurrence is still returned.
    """
    # Escape so the value is treated as plain text, then scan only once.
    literal_pattern = re.compile(re.escape(text))
    occurrences = list(literal_pattern.finditer(ocr_text))
    if not occurrences:
        return None
    if len(occurrences) > 1:
        print("More than one match")
    first = occurrences[0]
    return (first[0], first.start(), first.end())

In [None]:
# Prefix prepended to every output file name.
# NOTE(review): empty placeholder -- with the empty string the CSV files are
# written relative to the current working directory.
gold_standard_path = ''
# Spreadsheet columns whose values are located in the OCR text and exported.
csv_headers = ['Recipient', 'Date Received', 'Place Received', 'Time Received', 'Receiving Telegrapher', 'Sender', 'Date Sent', 'Place Sent From', 'Time Sent', 'Sending Telegrapher']

for file in file_names:
    # The first eight characters of the file name identify the ledger.
    ledger_name = file[0:8]

    # evaluate the type of ledger to determine the file path
    if ledger_name in coded:
        file_id_prefix = 'telegrams/coded_telegrams/'
    elif ledger_name in clear:
        file_id_prefix = 'telegrams/clear_telegrams/'
    elif ledger_name in coded_and_clear:
        file_id_prefix = 'telegrams/clear_and_coded_telegrams/'
    else:
        # Fail loudly instead of reusing the previous iteration's prefix
        # (or hitting a NameError on the first iteration).
        raise ValueError(
            f"Unknown ledger {ledger_name!r} for file {file!r}"
        )

    # file[0:-4] drops the four-character extension to get the folder name.
    file_id = file_id_prefix + ledger_name + '/' + file[0:-4] + '/' + file

    # retrieve telegram text
    telegram_text = telegram_corpus.raw(file_id)
    print(file_id)
    print(telegram_text)

    # find pandas row
    file_df = clear_df[clear_df['text_file_name'] == file]
    temp_telegram = create_temp_telegram_object()
    # The OCR text is the same for every header, so look it up once.
    full_ocr_text = file_df['Full Text OCR'].values[0]

    for header in csv_headers:
        text_value = file_df[header].values[0]
        # Skip fields marked as absent. Missing (NaN) cells are treated the
        # same way; they would otherwise fail on .rstrip() below.
        if pd.isna(text_value) or text_value == "FALSE" or text_value == "None":
            continue

        match_object = find_indices(text_value.rstrip(), full_ocr_text)
        if match_object is None:
            # Stop for inspection: a field that cannot be located in the
            # OCR text has no start/end offsets to export.
            raise ValueError(
                f"Value {text_value!r} for column {header!r} was not found "
                f"in the OCR text of {file!r}"
            )

        temp_telegram["type"].append(final_export_type_label(header))
        temp_telegram["text"].append(match_object[0])
        temp_telegram["start"].append(match_object[1])
        temp_telegram["end"].append(match_object[2])

    gold_df = pd.DataFrame(data=temp_telegram)
    print(gold_df)
    # file[0:-3] keeps the trailing dot, so this swaps the extension for 'csv'.
    formatted_gold_data_path = gold_standard_path + file[0:-3] + 'csv'
    gold_df.to_csv(formatted_gold_data_path, index=False)
    print("")
    print("-----------------------")