In [1]:
import glob
import re
import os
import spacy
import xlsxwriter

In [2]:
# Load the spaCy pipeline once at module level; sentenize() reads this global `nlp`.
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

In [4]:
# Path of a single transcript file and its bare file name.
# NOTE(review): looks like a sample kept for trying out the functions; the main loop
# reassigns both names -- confirm it is still needed.
# The path is assembled with os.path.join so the separator in front of the file name
# matches the host OS. A hard-coded backslash is not a separator on POSIX, where
# os.path.basename would then return '18\\1800-...txt' instead of the file name.
filename = os.path.join('./../Data/18.txt/18', '1800-01-15_t18000115-10-verdict52.txt')
basename = os.path.basename(filename)

In [28]:
def extract_text(filename):
    """Read a UTF-8 text file and return its lines without the trailing newline.

    Lines that consist of nothing but a newline are skipped; lines that contain
    other whitespace are kept.

    :param filename: path of the text file to read
    :return: list of line strings in file order
    """
    with open(filename, 'r', encoding='utf-8') as infile:
        # rstrip('\n') rather than line[:-1]: the last line of a file may have no
        # trailing newline, and slicing would then cut off a real character.
        return [line.rstrip('\n') for line in infile if line != '\n']

def extract_speech(text_list):
    """Collect speaker-name lines and spoken fragments from the lines of a text.

    Every line is run through three independent checks, in this order, so a single
    line can contribute more than one item:

    * a line containing "Q", later followed by a literal " A.", contributes the text
      after the last "A." (the answer);
    * a line that is entirely upper case and does not contain "GUILTY" is kept whole
      (treated as a name);
    * a line containing a literal "sworn." contributes the text after the last
      "sworn.".

    :param text_list: list of text lines
    :return: tuple ``(speech_list, speech_fragment_bool)``; ``speech_list`` holds the
        collected items in order, ``speech_fragment_bool`` is True when at least one
        answer or "sworn." fragment was found (name lines alone do not set it)
    """
    speech_list = []
    speech_fragment_bool = False
    for line in text_list:
        # Answers to questions. The dot after "A" is escaped so that only a literal
        # "A." matches; an unescaped dot also matched e.g. " ANN", and the split
        # below then returned the whole line because it found no "A." to split on.
        if re.search(r"Q.* A\.", line):
            speech_list.append(line.rsplit('A.', 1)[-1])
            speech_fragment_bool = True

        # Names: fully upper-case lines, excluding lines that mention GUILTY.
        if line.isupper() and "GUILTY" not in line:
            speech_list.append(line)

        # Text after "sworn.". A plain substring test keeps the check in step with
        # the literal split (the old pattern "sworn. *" also matched "sworn in").
        if 'sworn.' in line:
            speech_list.append(line.rsplit('sworn.', 1)[-1])
            speech_fragment_bool = True

    return speech_list, speech_fragment_bool

def match_speaker_speech(speech_list):
    """Group speech items under the speaker name that precedes them.

    An item that is entirely upper case is treated as part of a name. Two
    consecutive upper-case items form a speaker name (previous item + current item).
    The non-upper-case items that follow belong to that speaker until the next name
    pair is followed by speech. Speech that appears before any name pair is dropped.

    NOTE(review): the two name parts are concatenated without a separator, exactly
    as before; presumably the items carry their own spacing -- confirm.

    :param speech_list: list of strings mixing name items and speech items
    :return: list of ``(name, speech)`` tuples, one per speaker turn, where
        ``speech`` is that turn's speech items joined with single spaces
    """
    speaker_speech_tuples = []
    current_speaker = None    # speaker that owns the speech gathered so far
    upcoming_speaker = None   # name pair seen since the last speech item, if any
    gathered_speech = []

    for index, line in enumerate(speech_list):
        if line.isupper():
            # index > 0: without the guard, speech_list[-1] would wrap around and
            # pair the first item with the last item of the list.
            if index > 0 and speech_list[index - 1].isupper():
                upcoming_speaker = speech_list[index - 1] + line
            continue

        if upcoming_speaker is not None:
            # First speech item of a new turn: emit the previous turn under the
            # speaker it was gathered for, not under the name that was just read.
            if gathered_speech:
                speaker_speech_tuples.append(
                    (current_speaker, ' '.join(gathered_speech)))
            current_speaker = upcoming_speaker
            upcoming_speaker = None
            gathered_speech = [line]
        elif current_speaker is not None:
            gathered_speech.append(line)

    # Emit the final turn; nothing follows it that could trigger the flush above.
    if gathered_speech:
        speaker_speech_tuples.append((current_speaker, ' '.join(gathered_speech)))

    return speaker_speech_tuples


def sentenize(speaker_speech_tuples, basename=None):
    """Split each speaker's speech into sentences and number them consecutively.

    Sentences are taken from ``.sents`` of the document returned by the
    module-level ``nlp`` callable.

    :param speaker_speech_tuples: iterable of ``(name, speech)`` pairs
    :param basename: value stored as the first field of every row (default None)
    :return: list of ``(basename, sent_id, name, sentence_text)`` tuples; ``sent_id``
        starts at 0 and keeps counting across speakers
    """
    # Lazily pair every sentence with its speaker, preserving input order.
    spoken = (
        (speaker, sentence)
        for speaker, speech in speaker_speech_tuples
        for sentence in nlp(speech).sents
    )
    return [
        (basename, sent_id, speaker, str(sentence))
        for sent_id, (speaker, sentence) in enumerate(spoken)
    ]


def write_to_xlsx(header, data, filepath_out):
    """
    Write rows to a new Excel workbook as one table on a single worksheet.

    :param header: list of column names; one table column is created per name
    :param data: sequence of rows handed to the table as its data
    :param filepath_out: path of the workbook file that is created

    NOTE(review): the table range runs from row 0 to row len(data), so with empty
    data it covers row 0 only -- confirm xlsxwriter accepts such a range.
    """
    columns = []
    for column_name in header:
        columns.append({'header': column_name})

    with xlsxwriter.Workbook(filepath_out) as workbook:
        sheet = workbook.add_worksheet()

        table_options = {'data': data, 'columns': columns, 'autofilter': False}
        last_row = len(data)
        last_col = len(header) - 1
        sheet.add_table(0, 0, last_row, last_col, table_options)

In [30]:
# main
# Position in the sorted file list at which processing starts; earlier files are skipped.
# NOTE(review): presumably left over from resuming an interrupted run -- confirm, and
# set to 0 to process every file.
START_INDEX = 31697
INPUT_PATTERN = './../Data/18.txt/18/*'
OUTPUT_DIR = './../Data/cleaned_xlsx/'
# Rows produced by sentenize() carry four values; no value is supplied for 'label'.
HEADER = ['filename', 'sent_id', 'speaker', 'text', 'label']

# Create the output directory if it is missing, before any workbook is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# glob.glob gives no ordering guarantee, so sort before slicing; otherwise the
# start index would select a different set of files from one system to the next.
input_files = sorted(glob.glob(INPUT_PATTERN))

count = 0
for count, filename in enumerate(input_files[START_INDEX:], start=1):
    # Keep track of progress
    if count % 1000 == 0:
        print(str(count) + '\n' + filename)

    basename = os.path.basename(filename)
    text_list = extract_text(filename)
    # One extraction pass per file is enough; its result is reused below.
    speech_list, speech_fragment_bool = extract_speech(text_list)
    if not speech_fragment_bool:
        continue   # No speech in this file, so nothing to write

    # There is speech in the file: write it to an .xlsx workbook
    speaker_speech_tuples = match_speaker_speech(speech_list)
    speaker_speech_list = sentenize(speaker_speech_tuples, basename)

    filepath_out = OUTPUT_DIR + basename + '.xlsx'
    write_to_xlsx(HEADER, speaker_speech_list, filepath_out)


1000
./../Data/18.txt/18\1827-05-31_t18270531-143-punish748.txt
2000
./../Data/18.txt/18\1827-10-25_t18271025-17-punish113.txt
3000
./../Data/18.txt/18\1828-04-10_t18280410-115-punish632.txt
4000
./../Data/18.txt/18\1828-09-11_t18280911-323-punish1661.txt
5000
./../Data/18.txt/18\1829-02-19_t18290219-142-punish812.txt
6000
./../Data/18.txt/18\1829-09-10_t18290910-100-verdict570.txt
7000
./../Data/18.txt/18\1830-01-14_t18300114-33-punish196.txt
8000
./../Data/18.txt/18\1830-07-08_t18300708-40-verdict225.txt
9000
./../Data/18.txt/18\1831-01-06_t18310106-153-punish880.txt
10000
./../Data/18.txt/18\1831-06-30_t18310630-213-punish1121.txt
11000
./../Data/18.txt/18\1831-12-01_t18311201-221-verdict1186.txt
12000
./../Data/18.txt/18\1832-05-17_t18320517-116-punish689.txt
13000
./../Data/18.txt/18\1832-09-06_t18320906-58-punish344.txt
14000
./../Data/18.txt/18\1833-04-11_t18330411-103-punish576.txt
15000
./../Data/18.txt/18\1833-11-28_t18331128-36-punish217.txt
16000
./../Data/18.txt/18\1834-07