# Text Parser Testing document

This notebook reads the OCR text file `../generated_data/Logs2020OCR_avepdf.com_horizontal.txt`, which was generated using the following website:


[avepdf.com](http://avepdf.com/)

In [None]:
import re # Regulra expresion library

In [None]:
filename='../generated_data/Logs2020OCR_avepdf.com_horizontal.txt'
#filename='../generated_data/Logs2019OCR_avepdf.com_horizontal.txt'

In [None]:
# read file in as a list of lines
def readfile(filename):
    """Return the lines of *filename* with surrounding whitespace removed.

    Every line of the file is kept, so blank lines come back as empty
    strings. Leading and trailing whitespace (including the newline) is
    stripped from each line.
    """
    with open(filename) as handle:
        return [raw.strip() for raw in handle.readlines()]

lines = readfile(filename)

In [None]:
# Split the lines into calls using a regular expression that matches call numbers such as 20-1.
# The result is a list of lists, where each inner list holds the lines of one call.

def txt2calls(lines):
    """Group the lines of a log into calls.

    A new call starts at every line whose first ten characters begin with a
    call number such as ``20-1`` followed by whitespace. Blank lines are
    dropped. Lines that precede the first call number are collected into a
    leading group of their own.

    Args:
        lines: iterable of text lines (already stripped of whitespace).

    Returns:
        A list of calls, each of which is the list of lines for that call.
    """
    # Raw string so that \s reaches the regex engine untouched; the first
    # two characters are the two-digit year prefix of the call number.
    header = re.compile(r"[12][90]-[0-9]+\s")

    calls = []
    call = []
    for line in lines:
        if line == '':
            continue
        # A header line closes the call collected so far (if any).
        if header.match(line[:10]) and call:
            calls.append(call)
            call = []
        call.append(line)

    # Flush the last call; without this the final call of the file is lost.
    if call:
        calls.append(call)
    return calls

calls = txt2calls(lines)

In [None]:
# #Preview call parsing
# from ipywidgets import interact

# def view_calls(index=0):
#     for line in calls[index]:
#         print(line)
    
# target = interact(view_calls, index=(0,len(calls)-1))

In [None]:
def parseHeader(line):
    '''Parse the first line of a call into its components.

    The header is read left to right: a call number such as "20-123", an
    optional four-digit time, then the call reason and the call action.
    Reason and action are separated by the right-most run of at least eight
    spaces; when there is no such run the whole remainder is the reason and
    the action is empty.

    Args:
        line: the header line of a call.

    Returns:
        [callNumber, callTime, callReason, callAction], where callTime is
        None when no four-digit time follows the call number, or None when
        the line contains no call number at all.
    '''
    # Two-digit year prefix, a dash, then the running number.
    callNumberMatch = re.search(r"[12][90]-\d+", line)
    if callNumberMatch is None:
        return None

    callNumber = callNumberMatch.group()
    index_past = callNumberMatch.end()

    timeMatch = re.search(r"\d{4}", line[index_past:])
    if timeMatch is not None:
        callTime = timeMatch.group()
        # The match offsets are relative to the slice; +1 skips the
        # separator that follows the time.
        index_past += timeMatch.end() + 1
    else:
        callTime = None

    remainder = line[index_past:]
    gap = " " * 8
    if gap in remainder:
        # rsplit cuts at the right-most gap, so any extra spaces of a longer
        # run stay on the reason side and are stripped below.
        reason_part, action_part = remainder.rsplit(gap, 1)
    else:
        reason_part, action_part = remainder, ""

    # Collapse internal whitespace runs and trim both ends.
    callReason = re.sub(r"\s+", " ", reason_part).strip()
    callAction = re.sub(r"\s+", " ", action_part).strip()

    return [callNumber, callTime, callReason, callAction]

def get_unit_times(unit_str):
    '''Turn a line of space-separated "name-value" tokens into a dictionary.

    Tokens without a dash are ignored. For a token with several dashes the
    text before the first dash is the key and the text after the last dash
    is the value. A later token with the same key overwrites an earlier one.
    '''
    unit_times = {}
    for token in re.split(' +', unit_str):
        pieces = token.split('-')
        if len(pieces) < 2:
            # No dash in this token, so it carries no name-value pair.
            continue
        unit_times[pieces[0]] = pieces[-1]
    return unit_times

def parse_call_list(call):
    '''Parse the lines of one call into a dictionary.

    The first line is handed to parseHeader; when that succeeds its four
    fields are stored under 'callNumber', 'callTime', 'callReason' and
    'callAction'. Every line (including the first) is then examined:

    * "tag: value" lines (exactly one colon) are stored as my_call[tag].
      - "Narrative" collects all remaining lines that contain no colon
        into one space-joined string and ends the parse of this call.
      - "Unit" consumes the following line as unit times and appends
        (value, times) to my_call['Units'].
      - "Operator" / "Owner" additionally set a prefix ("Operator_" or
        "Owner_") that is applied to tags found on multi-tag lines.
    * Lines with more than one colon are treated as several "tag: value"
      pairs and are split from right to left.
    * Lines without a colon are reported with a "Parse Error" message.

    Args:
        call: list of text lines belonging to one call.

    Returns:
        The dictionary of parsed fields, or None for an empty call.
    '''
    if len(call) == 0:
        return

    my_call = {}

    header = parseHeader(call[0])
    if header:
        my_call['callNumber'] = header[0]
        my_call['callTime'] = header[1]
        my_call['callReason'] = header[2]
        my_call['callAction'] = header[3]

    ind = 0
    individual = ''  # prefix for tags on multi-tag lines, e.g. "Operator_"
    while ind < len(call):
        line = call[ind]
        parts = line.split(':')
        if len(parts) == 2:
            tag = re.sub(' +', ' ', parts[0].strip())
            value = parts[1].strip()
            if tag == 'Narrative':
                narrative = my_call.get('Narrative', '')
                ind += 1
                # The narrative runs to the end of the call; lines that
                # contain a colon are skipped.
                while ind < len(call):
                    if ':' not in call[ind]:
                        narrative += re.sub(' +', ' ', call[ind]) + " "
                    ind += 1
                my_call['Narrative'] = narrative
            elif tag == "Unit":
                # The line after "Unit:" holds that unit's time stamps.
                ind += 1
                if ind < len(call):
                    unit_info = get_unit_times(call[ind])
                    my_call.setdefault('Units', []).append((value, unit_info))
            else:
                if tag == "Operator" or tag == "Owner":
                    individual = tag + "_"
                my_call[tag] = value
        elif len(parts) > 2:
            tags = re.findall(r'[\S]+:', line)

            # Work from the right so each value ends where the next tag
            # (already cut off) began.
            for tag in reversed(tags):
                start = line.rindex(tag)
                value = line[start + len(tag):].strip()

                if len(value) > 0:
                    my_call[individual + tag[:-1]] = value
                    # Cut at the tag's own position; searching again for
                    # the tag text without its colon could hit the value.
                    line = line[:start]
        else:
            print("Parse Error:  " + line)
            print("")
        ind += 1

    return my_call

In [None]:
def parse_all_calls(calls):
    '''Parse every call in *calls* with parse_call_list.

    Args:
        calls: list of calls, each a list of text lines.

    Returns:
        A list with one parse_call_list result per call, in the same order.
    '''
    return [parse_call_list(call) for call in calls]

call_dicts = parse_all_calls(calls)

In [None]:
# #Preview call parsing
# from ipywidgets import interact

# def view_call_dicts(index=0):
#     for key in call_dicts[index]:
#         print(f"{key}: {call_dicts[index][key]}")
    
# target = interact(view_call_dicts, index=(0,len(calls)-1))

In [None]:
# Review call parsing data
from ipywidgets import interact

def view_both(index=0):
    """Print the raw lines of call *index* followed by its parsed fields.

    Reads the notebook-level ``calls`` and ``call_dicts`` lists.
    """
    print("RAW CALL DATA\n")
    for raw_line in calls[index]:
        print(raw_line)
    print('========================================')
    print("PARSED DATA\n")
    parsed = call_dicts[index]
    for field in parsed:
        print(f"{field}: {parsed[field]}")
    
target = interact(view_both, index=(0,len(calls)-1))

In [None]:
#Combine everything into one call
def parsefile(filename):
    """Read *filename*, split it into calls and parse each call.

    Chains readfile, txt2calls and parse_all_calls and returns the list
    produced by parse_all_calls.
    """
    return parse_all_calls(txt2calls(readfile(filename)))

In [None]:
# Parse the 2019 and the 2020 log separately ...
filename='../generated_data/Logs2019OCR_avepdf.com_horizontal.txt'
dict2019 = parsefile(filename)
filename='../generated_data/Logs2020OCR_avepdf.com_horizontal.txt'
dict2020 = parsefile(filename)

# ... and combine them into one list, 2019 calls first.
call_dicts = dict2019+dict2020

In [None]:
# Count number of STOPS by "Craig" (call takers whose name contains "hammer")
total_stops = 0  # matched calls whose reason contains "STOP"
tot = 0          # all calls taken by the matched call taker
total_race = {}  # lower-cased Operator_Race -> number of stops
norace = 0       # stops without an Operator_Race entry
my_stops = []    # the parsed stop calls themselves
for call in call_dicts:
    if "hammer" not in call.get('Call Taker', '').lower():
        continue
    tot += 1
    # A call whose header could not be parsed has no 'callReason';
    # treat it as "not a stop" instead of raising KeyError.
    if "STOP" not in call.get('callReason', ''):
        continue
    my_stops.append(call)
    total_stops += 1
    if 'Operator_Race' in call:
        race = call['Operator_Race'].lower()
        total_race[race] = total_race.get(race, 0) + 1
    else:
        norace += 1

In [None]:
total_stops

In [None]:
tot

In [None]:
total_race

In [None]:
# Tally how often each tag occurs across all calls.  Low numbers indicate errors.
alltags = {}
# Calls that carry a tag starting with one of these prefixes are printed
# for inspection.
times = ['Disp', 'Arvd', 'Enrt', 'Clrd']

for call in call_dicts:
    for tag in call:
        alltags[tag] = alltags.get(tag, 0) + 1

        if len(tag) > 3 and tag[:4] in times:
            print(call)
            print("")

In [None]:
alltags

In [None]:
len(alltags)

In [None]:
# List the tags that start with a unit time-stamp prefix.  The prefixes are
# the same ones the tag-counting loop checks ('Arvd', not 'Ariv').
for tag in alltags:
    # A four-character slice can only equal a prefix when the tag has at
    # least four characters, so no separate length check is needed.
    if tag[:4] in ('Disp', 'Arvd', 'Enrt', 'Clrd'):
        print(tag)

In [None]:
len(alltags)