In [1]:
import os
import re
import pandas as pd
import hashlib  # for detecting duplicate
from tqdm.notebook import tqdm  # for progress bars


dir_name = "../data_preprocessed/"
dir_name_csv = "../data_preprocessed_csv/"

## known issues with preprocessing

1) Sometimes in a question, we have the questioner being identified:\
`Q.  (BY MR. SMITH) <question>`\
The `(BY MR. SMITH)` should be removed as it is not part of the question. Currently it is not being removed

2) Certain types of headers are not being detected and are considered part of a question/answer. For example:\
`I have no further questions. F U R T H E R E X A M I N A T I O N`

3) The page numbering does not always work correctly. One reason why is from the file `Transcript_for_sync_HG082219.txt`:\
`00010:01 Q. Okay. Did you look at the LLC agreement?`\
In most depositions, the page number has its own line at the top, whereas in this deposition, the page number and first line of text is part of top line of the page.

4) There are certain kinds of duplicates that are not detected. I have not spent any time to work out why. E.g.:\
`82819_Sicilia_Saimesier.csv`, `82819_Sicilia_Saimesier(1).csv`\
`19-0813_Kelly_Boyle_Volume_2.txt`, `19-0813_Kelly_Boyle_Volume_2(1).txt`, `v19-0813_Kelly_Boyle_Volume_2(2).txt`

5) Identifying questioner does not always work. E.g. in `082319.txt`:\
```9        capacity as Trustee.
10    EXAMINATION BY
11    MR. CROKE:
12        Q.    Good morning, Mr. Mathur.```\
Currently I look for the phrase `BY MR. CROKE` on one line, whereas here it is split on two lines

6) No attempt has been made to process all the preamable at the start. Dataframe only starts from 5 lines above the first question.

## functions for preprocessing

In [2]:
def replace_space_with_underscore():
    for f in os.listdir(dir_name):
        new_name = f.replace(" ", "_")
        os.rename(dir_name + f, dir_name + new_name)
    for f in os.listdir(dir_name):
        print(f)


def strip_lines(lines):
    """
    strip spaces, newlines, carriage returns from each of the lines
    """
    lines_new = []

    for line in lines:
        lines_new.append(line.strip(" \n\x0c"))

    return lines_new


def delete_blank_lines(lines):
    """
    given list of lines with \n removed, deletes those lines that are empty
    """
    lines_new = []

    for line in lines:
        if line == "":
            continue
        else:
            lines_new.append(line)

    return lines_new


def remove_time_tags(lines):
    """
    given list of lines, remove time tags at the start or end of the lines
    """
    # added requirement for 2 spaces in pat_end because it is possible a
    # time is mentioned in a question or answer
    # the ?: at the start of each group makes them non-capturing groups
    timestamp = r"(?:\d\d:\d\d(?::\d\d)?(?:[AP]M)?)"
    pat_start = "^" + timestamp + " +(.*)"
    pat_end = "(.*) {2,}" + timestamp + "$"

    lines_new = []

    for line in lines:
        match_start = re.match(pat_start, line)
        match_end = re.match(pat_end, line)

        if match_start:
            lines_new.append(match_start.groups()[0])
        elif match_end:
            lines_new.append(match_end.groups()[0])
        else:
            lines_new.append(line)

    return lines_new


def is_start_of_question(text):
    """
    determine if the text is the start of a question
    """
    pat = r"^Q[\. ] +[^ ]"

    match = re.match(pat, text)
    if match:
        return True
    else:
        return False


def is_identifying_questioner(text):
    """
    given text, determine if it is of the form:
    BY MS. SMITH:
    """
    pat = r"B[Yy][ -](M[RrSs]\. ?([A-Z][A-Za-z-]+\b ?)+|([A-Z][A-Za-z-]+\b ?)+(, ESQ))"
    match = re.search(pat, text)

    if match:
        return match.groups()[0].upper()
    else:
        return False


def find_first_question(lines):
    """
    find first question in deposition.
    returns index from lines if found
    """

    found = False
    for i, line in enumerate(lines):
        splits = split_into_num_text(line)
        if splits is None:
            continue

        _, text = splits

        if is_start_of_question(text):
            found = i
            break

    return found


def is_start_of_answer(text):
    """
    determine if the text is the start of an answer
    """
    pat = r"^A[\. ] +[^ ]"

    match = re.match(pat, text)
    if match:
        return True
    else:
        return None


def find_bad_files(dir_name):
    """
    a file is considered bad if the function find_first_question returns false
    manually checking suggests these files are not depositions
    """
    bad_files = []

    for filename in os.listdir(dir_name):
        with open(dir_name + filename, "r", encoding="windows-1252") as f:
            lines = f.readlines()

        lines = strip_lines(lines)
        lines = delete_blank_lines(lines)
        lines = remove_time_tags(lines)

        if not find_first_question(lines):
            bad_files.append(filename)

    return bad_files


def is_page_number(line):
    """
    given line from file, determine if corresponds to a page numbering
    if yes, return the page number
    if not, return 0
    """
    pat = r"0*(\d+)$"
    match = re.match(pat, line)
    if not match:
        return 0
    else:
        return int(match.groups()[0])


def find_current_page_number(lines):
    """
    given lines from file (after selecting core) determine current page number
    """
    # keep reading through lines until you find a page number
    # this will be the page number of the second page in lines
    # so subtract 1 to get page number of current page
    for line in lines:
        page_number = is_page_number(line)
        if page_number:
            return page_number - 1


def split_into_num_text(line):
    """
    given line, split it into line number and the text.
    """
    pat = r"^(\d*:)?(\d+) +(.*)$"
    match = re.match(pat, line)
    if match:
        return int(match.groups()[1]), match.groups()[2]
    else:
        return None


def is_start_of_side_chat(text):
    """
    given text, determine if it is first line of 'side chat'
    e.g. the second line in following:
    Q.  Doesn't this document imply x did y?
         MR. SMITH: Objection form.
    if yes, return the person speaking
    if not, return False
    """
    # this pattern is known to match too many things, e.g. WITNESS NAME:
    pat = r"^([A-Z]+\.?( [A-Z]+)+): +[\w\(-]"
    match = re.match(pat, text)

    if match:
        return match.groups()[0]
    else:
        return False


def is_start_of_brackets(text):
    """
    given text, determine it is start of text that is contained in brackets
    """
    pat = r"^\([^\(\)]*\)?$"
    match = re.match(pat, text)

    if match:
        return match.group()
    else:
        return False


def is_only_symbols(text):
    """
    given string, determine if it is made up of only symbols
    intention is to find things like linebreaks '- - -'
    """
    pat = "[a-zA-Z0-9]"
    match = re.search(pat, text)

    if match:
        return False
    # -- is used to indicate somebody was interrupted just before they were going to speak
    elif text == "--":
        return False
    else:
        return True


def create_dataframe_from_file(filename):
    """
    given a deposition in filename, create a dataframe
    """
    with open(dir_name + filename, "r", encoding="windows-1252") as f:
        lines = f.readlines()

    lines = strip_lines(lines)
    lines = delete_blank_lines(lines)
    lines = remove_time_tags(lines)

    return create_dataframe_from_lines(lines)


def create_dataframe_from_lines(lines):
    """
    this is the most important function in preprocessing. it loops through all the lines,
    checks what kind of line it is, and combines that line with previous lines as appropriate.
    then outputs a dataframe
    """
    # choose starting point as 5 lines above first question. this is because the questioner is almost always
    # identified within the few lines before first question
    start = find_first_question(lines) - 5
    if start is None:
        print("could not find start")
        return None

    # initialize various parameters
    ongoing_indice = 0
    current_page_number = find_current_page_number(lines[start:])
    current_line_number = 0
    ongoing_page_number = 0
    ongoing_line_number = 0
    ongoing_text = ""
    ongoing_line_type = None
    ongoing_questioner = None
    ongoing_speaker = None

    # the indice and time_added columns are included to help with debugging. they have no use for end-users
    columns = [
        "indice",
        "page_number",
        "line_number",
        "text",
        "text_type",
        "speaker",
        "time_added",
    ]
    data = []

    for i, line in enumerate(lines):
        # ignore preamble
        if i < start:
            continue

        # page_numbering is now determined by using line numbers and first page number
        # see few lines below
        if is_page_number(line):
            continue

        splits = split_into_num_text(line)
        if splits is None:
            continue

        line_number, text = splits

        if line_number < current_line_number:
            current_page_number += 1
        current_line_number = line_number

        if is_start_of_question(text):
            data.append(
                [
                    ongoing_indice,
                    ongoing_page_number,
                    ongoing_line_number,
                    ongoing_text,
                    ongoing_line_type,
                    ongoing_speaker,
                    "start_question",
                ]
            )

            ongoing_indice = i
            ongoing_page_number = current_page_number
            ongoing_line_number = current_line_number
            ongoing_text = text
            ongoing_line_type = "q"

            questioner = is_identifying_questioner(text)
            if questioner:
                ongoing_questioner = questioner
                # include something here to remove the identification of questioner from question
            ongoing_speaker = ongoing_questioner

        elif is_start_of_answer(text):
            data.append(
                [
                    ongoing_indice,
                    ongoing_page_number,
                    ongoing_line_number,
                    ongoing_text,
                    ongoing_line_type,
                    ongoing_speaker,
                    "start_answer",
                ]
            )

            ongoing_indice = i
            ongoing_page_number = current_page_number
            ongoing_line_number = current_line_number
            ongoing_text = text
            ongoing_line_type = "a"
            ongoing_speaker = "THE WITNESS"

        elif is_start_of_side_chat(text):
            data.append(
                [
                    ongoing_indice,
                    ongoing_page_number,
                    ongoing_line_number,
                    ongoing_text,
                    ongoing_line_type,
                    ongoing_speaker,
                    "start_chat",
                ]
            )

            ongoing_indice = i
            ongoing_page_number = current_page_number
            ongoing_line_number = current_line_number
            ongoing_text = text
            ongoing_line_type = "side_chat"
            ongoing_speaker = is_start_of_side_chat(text)

        elif is_identifying_questioner(text):
            ongoing_questioner = is_identifying_questioner(text)

        elif is_start_of_brackets(text):
            data.append(
                [
                    ongoing_indice,
                    ongoing_page_number,
                    ongoing_line_number,
                    ongoing_text,
                    ongoing_line_type,
                    ongoing_speaker,
                    "is brackets",
                ]
            )

            ongoing_indice = i
            ongoing_page_number = current_page_number
            ongoing_line_number = current_line_number
            ongoing_text = text
            ongoing_line_type = "brackets"
            ongoing_speaker = None

        elif is_only_symbols(text):
            data.append(
                [
                    ongoing_indice,
                    ongoing_page_number,
                    ongoing_line_number,
                    ongoing_text,
                    ongoing_line_type,
                    ongoing_speaker,
                    "symbols",
                ]
            )

            ongoing_indice = i
            ongoing_page_number = current_page_number
            ongoing_line_number = current_line_number
            ongoing_text = text
            ongoing_line_type = "symbols"
            ongoing_speaker = None

        else:
            ongoing_text += " " + text

    data.append(
        [
            ongoing_indice,
            ongoing_page_number,
            ongoing_line_number,
            ongoing_text,
            ongoing_line_type,
            ongoing_speaker,
            "end",
        ]
    )

    return pd.DataFrame(data, columns=columns)


def remove_a_q_from_text(text):
    """
    given a string, remove 'A. ' and 'Q. ' at the start of the string
    """
    pat = r"^[A|Q]\.? +(.*$)"
    match = re.match(pat, text)

    if match:
        return match.groups()[0]
    else:
        return text


def remove_a_q_from_text_in_frame(df):
    """
    given dataframe outputted from 'create_dataframe_from...',
    remove the 'A. ' and 'Q. ' at the beginning of texts of type 'a' or 'q'
    """
    df_temp = df.copy()

    aq_indices = df_temp.text_type.isin(["a", "q"])
    df_temp.loc[aq_indices, "text"] = df_temp.loc[aq_indices, "text"].map(
        remove_a_q_from_text
    )
    return df_temp


def remove_names_from_sidechat_text(df):
    """
    given dataframe outputted from 'create_dataframe_from...',
    remove the 'MR SMITH: ' or 'THE WITNESS: ' or similar
    from start of side chat
    """
    df_temp = df.copy()

    indices = df_temp.text_type == "side_chat"

    df_temp.loc[indices, "text"] = (
        df_temp.loc[indices, "text"].str.split(pat=": +").map(lambda x: x[1])
    )

    return df_temp


def create_csvs_from_directory(directory):
    """
    given a directory of text files of depositions,
    create csv files that extract data from them
    """
    for filename in tqdm(os.listdir(directory)):
        print(f"starting on {filename}")
        df = create_dataframe_from_file(filename)
        df = remove_a_q_from_text_in_frame(df)
        df = remove_names_from_sidechat_text(df)
        df.to_csv(dir_name_csv + filename[:-4] + ".csv")

## copy files from data_raw

In [4]:
!cp ../data_raw/*.txt ../data_preprocessed/

## replace spaces with underscores in filenames

In [5]:
replace_space_with_underscore()

08-09_Alba_Vidal.txt
08-09_Telesforo_Camacho-Lopez.txt
08-14_Robert_Rudd.txt
08-15_Carla_Brietman.txt
08-15_Harry_Orner.txt
08-16-19_Chris_McGrath_-_Cerneka_vs_Santa_Monica_Props.txt
08-16_Amy_Belz.txt
08-19_Francis_Stubbs.txt
08-20_Sherrilyn_Hensley.txt
08-20_Tyler_Labus.txt
08-23_Tracey_Proietto.txt
08-27_Francis_Switken.txt
08-29-2019_1520_Thursday.txt
081419L.txt
081419SG.txt
081519A-dc.txt
081519_derek_thomas.txt
082119drhoward.txt
082119duke.txt
082319.txt
082319MDavidson(1).txt
082319MDavidson.txt
082619_JAMES_GRAY.txt
082619_NGUYEN.txt
082619_TIEN_PHAM.txt
082619_TIEN_VU.txt
082719AC.txt
082719_BRANDON_CHESNEY.txt
082719_JASMINE_CHOUR.txt
082819TimothyJenningsMD-Lexitas.txt
082819_LE_2019-71488_Killion.txt
082919CC.txt
082919_DR_YARUS1.txt
09-11-19_C._Dayton.txt
09-11-19_C._Dayton_-_Time_Stamped.txt
090319_JAMES_HUNTER.txt
090319_JESSICA_MENDEZ.txt
090319_KHAALIQ_BRANCH.txt
090419_AMINATA_WELCOME(1).txt
090419_AMINATA_WELCOME(2).txt
090419_AMINATA_WELCOME.txt
090419_YUE_CHAN_LI

## manually remove files that aren't depositions
files discovered not to be depositions by manual inspection

In [None]:
filenames = [
    "RT091119-0823_xf.txt",
    "082919_DR_YARUS1.txt",
    "2019-75933.txt",
]

In [None]:
for filename in filenames:
    path = dir_name+filename
    if os.path.exists(path):
        os.remove(path)

In [None]:
with open("deleted_manual.txt", "w") as f:
    for file in filenames:
        f.write(file + "\n")

## identify bad files and delete from preprocessed folder
A file is considered bad if the function find_first_question returns false, i.e. if there does not seem to be any questions in the file. Manually checking suggests these files are not depositions.

In [6]:
bad_files = find_bad_files(dir_name)

In [7]:
for item in bad_files:
    print(item)

90903sta.txt
Natsu_Corporation_v._Penn-Star_-_8-23-19_-_Hearing_-_FINAL.txt
State_of_CA_vs._Verizon.txt


In [8]:
# manually view some lines from the bad files to check they are not depositions
for filename in bad_files:
    with open(dir_name + filename, "r", encoding="windows-1252") as f:
        lines = f.readlines()
    lines = strip_lines(lines)
    lines = delete_blank_lines(lines)

    print(filename)
    for line in lines[:400]:
        print(line)
    print(("=" * 50 + "\n") * 5)

90903sta.txt
1
1          IN THE UNITED STATES DISTRICT COURT
2               FOR THE DISTRICT OF MARYLAND
3                    SOUTHERN DIVISION
4
5     REGINA JOHNSON,             )
6     Plaintiff,                  ) Civil Action
7            vs.                  ) No.
8     WAL-MART ASSOCIATES,        ) 8:19-CV-00854-
9     INC., #2799,                ) PJM
10     Defendant.                  )
11
12
13                -     -     -     -     -
14             A statement for the record was taken in
15    the above-entitled matter on Tuesday, September
16    3, 2019, commencing at 9:15 a.m., at the offices
17    of McNamee Hosea, 6411 Ivy Lane, Suite 200,
18    Greenbelt, Maryland, before Melissa G. Fleming,
19    Notary Public.
20                  -    -    -    -    -
21
22
For The Record, Inc.
(301) 870-8025 - www.ftrinc.net -  (800) 921-5555
2
1                  A P P E A R A N C E S
2
3    ON BEHALF OF THE PLAINTIFF:
4            JOHN S. KEARNS, ESQUIRE
5            Law Offices o

In [9]:
for filename in bad_files:
    path = dir_name + filename
    if os.path.exists(path):
        os.remove(path)

In [10]:
with open("deleted_bad.txt", "w") as f:
    for file in bad_files:
        f.write(file + "\n")

## create csv files from depositions

In [11]:
create_csvs_from_directory(dir_name)

  0%|          | 0/162 [00:00<?, ?it/s]

starting on 08-09_Alba_Vidal.txt
starting on 08-09_Telesforo_Camacho-Lopez.txt
starting on 08-14_Robert_Rudd.txt
starting on 08-15_Carla_Brietman.txt
starting on 08-15_Harry_Orner.txt
starting on 08-16-19_Chris_McGrath_-_Cerneka_vs_Santa_Monica_Props.txt
starting on 08-16_Amy_Belz.txt
starting on 08-19_Francis_Stubbs.txt
starting on 08-20_Sherrilyn_Hensley.txt
starting on 08-20_Tyler_Labus.txt
starting on 08-23_Tracey_Proietto.txt
starting on 08-27_Francis_Switken.txt
starting on 08-29-2019_1520_Thursday.txt
starting on 081419L.txt
starting on 081419SG.txt
starting on 081519A-dc.txt
starting on 081519_derek_thomas.txt
starting on 082119drhoward.txt
starting on 082119duke.txt
starting on 082319.txt
starting on 082319MDavidson(1).txt
starting on 082319MDavidson.txt
starting on 082619_JAMES_GRAY.txt
starting on 082619_NGUYEN.txt
starting on 082619_TIEN_PHAM.txt
starting on 082619_TIEN_VU.txt
starting on 082719AC.txt
starting on 082719_BRANDON_CHESNEY.txt
starting on 082719_JASMINE_CHOUR.t

## remove duplicate files
this is done *after* preprocessing because some deposition files are different before preprocessing but the same after proprocessing. For example, have the same deposition with and without time tags.

In [12]:
hashes = {}
for filename in os.listdir(dir_name_csv):
    filehash = hashlib.md5(open(dir_name_csv + filename, "rb").read()).hexdigest()
    if filehash not in hashes:
        hashes[filehash] = [filename]
    else:
        hashes[filehash].append(filename)

In [13]:
duplicates = []
for value in hashes.values():
    if len(value) > 1:
        duplicates.append(value)

for i in duplicates:
    print(i)

['08-16-19_Chris_McGrath_-_Cerneka_vs_Santa_Monica_Props.csv', 'Cerneka_v._Russell_No._8_-_8-16-19_-_McGrath_-_Vol._II_-_FINAL.csv']
['082319MDavidson(1).csv', '082319MDavidson.csv']
['09-11-19_C._Dayton.csv', '09-11-19_C._Dayton_-_Time_Stamped.csv']
['090419_AMINATA_WELCOME(1).csv', '090419_AMINATA_WELCOME(2).csv', '090419_AMINATA_WELCOME.csv']
['090419_YUE_CHAN_LIN(1).csv', '090419_YUE_CHAN_LIN(2).csv', '090419_YUE_CHAN_LIN.csv']
['19-0813_Kelly_Boyle_Volume_2(1).csv', '19-0813_Kelly_Boyle_Volume_2(2).csv']
['2019-75313_-_Buffett.csv', 'NECA-IBEW_v._Precision_Castparts_-_8-28-19_-_Buffett_-_FINAL.csv']
['2019-75313_-_Combs.csv', 'NECA-IBEW_v._Precision_Castparts_-_8-28-19_-_Combs_-_FINAL.csv']
['8-20-19-B.csv', 'USA_v_Boston_-_8-20-19_-_Burns_-_FINAL.csv']
['8-20-19.csv', 'USA_v_Boston_-_8-20-19_-_Zurn_-_FINAL.csv']
['82719_Michael_VanderMaten.csv', 'Capitol_Indemnity_v._Euro_Motorcars_-_8-27-19_-_Maten_-_FINAL.csv']
['90815fec(1).csv', '90815fec.csv']
['90906mur(1).csv', '90906mur.c

In [14]:
# check that files identified as duplicates by hashing are indeed duplicates
# by manually viewing excerpts from them

for duplicate in duplicates:
    for filename in duplicate:
        print(filename)
        df = pd.read_csv(dir_name_csv + filename)
        print(df.head(10).text)
        print("=" * 10)

    print(("=" * 50 + "\n") * 5)

08-16-19_Chris_McGrath_-_Cerneka_vs_Santa_Monica_Props.csv
0     CHRISTOPHER McGRATH, having been first duly s...
1                           Good morning, Mr. McGrath.
2                                        Good morning.
3    Thank you for being with us again today. Do yo...
4                                           No, ma'am.
5    Okay.  We will go over them quickly. So before...
6                                          Yes, ma'am.
7    And remember always to give audible responses,...
8                                          Yes, ma'am.
9    Thank you. If for some reason you don't unders...
Name: text, dtype: object
Cerneka_v._Russell_No._8_-_8-16-19_-_McGrath_-_Vol._II_-_FINAL.csv
0     CHRISTOPHER McGRATH, having been first duly s...
1                           Good morning, Mr. McGrath.
2                                        Good morning.
3    Thank you for being with us again today. Do yo...
4                                           No, ma'am.
5    Okay.  We will go 

0     after having been first duly sworn, was depos...
1    Mr. Burns, the oath you have taken is the same...
2                                                 Yes.
3    The court reporter is taking down every spoken...
4                                                 Yes.
5    Okay.  If you don't understand a question, ple...
6                                                 Yes.
7    Is there any reason you cannot go forward toda...
8                                                  No.
9    Is there any reason you cannot testify fully a...
Name: text, dtype: object
USA_v_Boston_-_8-20-19_-_Burns_-_FINAL.csv
0     after having been first duly sworn, was depos...
1    Mr. Burns, the oath you have taken is the same...
2                                                 Yes.
3    The court reporter is taking down every spoken...
4                                                 Yes.
5    Okay.  If you don't understand a question, ple...
6                                                 Y

0     EDUARDO ROBLES, having been first duly sworn,...
1                                   Good morning, sir.
2                                        Good morning.
3    Would you state your complete name for the rec...
4                    Eduardo Pedroza Robles -- Robles.
5             Could you spell the middle name, please.
6                                       P-E-D-R-O-Z-A.
7                                How old are you, sir?
8                                                  54.
9    Okay.  Mr. Robles, my name is Frank Newton.  I...
Name: text, dtype: object



In [15]:
deleted_files = []

for duplicate in duplicates:
    for filename in duplicate[1:]:
        deleted_files.append(filename)
        path = dir_name_csv + filename
        if os.path.exists(path):
            os.remove(path)
        path = dir_name + filename[:-3] + ".txt"
        if os.path.exists(path):
            os.remove(path)

In [16]:
with open("deleted_duplicates.txt", "w") as f:
    for file in deleted_files:
        f.write(file + "\n")

## testing functions on all files

In [17]:
for filename in os.listdir(dir_name):
    #     print(filename)

    with open(dir_name + filename, "r", encoding="windows-1252") as f:
        lines = f.readlines()

    lines = strip_lines(lines)
    lines = delete_blank_lines(lines)
    lines = remove_time_tags(lines)

    found = find_first_question(lines)

    if not found:
        print("=" * 10)
        print(filename)
        print("=" * 10)
    elif not is_identifying_questioner(lines[found - 1]):
        print(filename)
        for line in lines[found - 3 : found + 1]:
            print(line)
        print("\n" * 3)

082319.txt
9        capacity as Trustee.
10    EXAMINATION BY
11    MR. CROKE:
12        Q.    Good morning, Mr. Mathur.




19-0813_Kelly_Boyle_Volume_2(1).txt
19                 MR. SAMMI:  -- any attorney-client                     
20  privileged information.                                               
21                 MR. SIMONS:  Not a problem.                            
22       Q    (BY MR. SIMONS)  I think if you go back to                  




19-0813_Kelly_Boyle_Volume_2(2).txt
19                 MR. SAMMI:  -- any attorney-client                     
20  privileged information.                                               
21                 MR. SIMONS:  Not a problem.                            
22       Q    (BY MR. SIMONS)  I think if you go back to                  




19-0813_Kelly_Boyle_Volume_2.txt
19                 MR. SAMMI:  -- any attorney-client
20  privileged information.
21                 MR. SIMONS:  Not a problem.
22       Q    (BY MR. SIMONS)  I 

## testing functions on individual files

In [None]:
filenames = os.listdir(dir_name)

In [None]:
filename = filenames[64]
# filename = "Morton,_David_-_Vol._1.txt"
print(filename)

In [None]:
with open(dir_name + filename, "r", encoding="windows-1252") as f:
    lines = f.readlines()
lines = strip_lines(lines)
lines = delete_blank_lines(lines)
lines = remove_time_tags(lines)

In [None]:
df = create_dataframe_from_lines(lines)

In [None]:
df.head()

In [None]:
with open(dir_name + filename, "r", encoding="windows-1252") as f:
    lines = f.readlines()

lines = strip_lines(lines)
lines = delete_blank_lines(lines)
lines = remove_time_tags(lines)

for i, line in enumerate(lines):
#     print(line)
    num_text = split_into_num_text(line)

    if num_text is None:
        continue
    if is_identifying_questioner(num_text[1]):
        print(is_identifying_questioner(num_text[1]))

## tracking structure of files
I skimmed through several depositions to find out what structure they have in common, to help determine how to carry out the preprocessing

08-09_Telesforo_Camacho-Lopez.txt
* Q and A. dots, many spaces
* page numbers. 
* start of dep. BY MS. NOSARI:
* side chats. THE INTERPRETER: ... 
* questions restarting after side chat. BY ...:
* questions restarting after misc.
* misc. - - - \n stuff - - - 

* Q and A.
* page numbers. 
* start of dep. 
* side chats.
* questions restarting after side chat.
* questions restarting after misc.
* misc


RT091119-0823_xf.txt. 
* Q and A. no dots, many spaces. spacing between Q and A are different
* page numbers. 
* start of dep. EXAMINATION BY MR. HELLER: (no new line!)
* side chats. ARBITRATOR COHEN
* questions restarting after side chat. BY MR. HARRISON:
* questions restarting after misc. BY MR. HARRISON:
* misc. Has headings. 'A. Qureshi - Heller'
* new lawyer, EXAMINATION \n BY MR. HARRISON:
* there are multiple witnesses..
* Z A I N A B ... being first duly affirmed by notary....   new witness questions start as normal. EXAMINATION \n BY..
* this is not a deposition!! deleted


Trudnak_and_Rancourt_v._Barth_-_8-27-19_-_Christian_Pizarro_M.D._-_FINAL.txt
* Q and A. dots, sevreal spaces.
* page numbers. yes
* start of dep. EXAMINATION \n BY MR. ...:
* side chats. MR. ... :
* questions restarting after side chat.
* questions restarting after misc.
* misc

8-20-19-TS.txt
* Q and A. Q. and A. with two spaces
* page numbers.
* start of dep. EXAMINATION \n BY MR. ELLERBE:
* side chats. THE VIDEOGRAPHER: MR. ...: 
* questions restarting after side chat. BY MR. ELLERBE:
* questions restarting after misc.
* misc. sometimes a question is answered in the side chat...
* misc. has long answers, with questioning saying OK in between to confirm their understanding...
* misc. marking exhibit done in brackets, then questions restarted with BY MR. ...:
* (reviewing documents) has same spacing as answers? less spaces than (exhibit x marked)
* has time tags that are only 1 space away from text...
* restarting after break. EXAMINATION \n BY MR. ROBINSON
* FURTHER EXAMINATION \n BY MR. ELL...:

Collins,_Mary_-_Vol._1.txt
* Q and A. with dots and multiple spaces
* page numbers.
* start of dep. EXAMINATION CONDUCTED \n BY MR. SUGARMAN:
* side chats. MR. SUGARMAN: THE WITNESS:  MR. DEATON:  more spaces than start of q and a, but same spacing as paragraphs within a q and a.
* questions restarting after side chat. nothing. just Q.   
* questions restarting after misc.
* misc.  separating sections. * * * * *
* misc. examined by 5 people
* misc. 'strike that'
* misc. had question, then side-chat, then question. so a question without an answer.
* misc. new lawyer.  EXAMINATION CONDUCTED \n BY MR. AGUDELO:
* misc. new lawyer.  EXAMINATION CONDUCTED \n  BY MS. VELLUCCI:
* marking exhibit done in brackets. Q.   But we will mark it as Exhibit 2. \n (Exhibit 2, Bottle of Powder, so\n marked.) \n   Q.   Now, a couple of follow-ups, ma'am,
* misc. A doesn't understand, in side chat, Q asks for testimony to be read. then in brackets we have (statement read out), then A continues to answer



Capitol_Indemnity_Corporation_v._Euro_Motorcars_Devon_-_8-28-19_-_Smith_-_FINAL.txt
* Q and A. without dots. Multiple spaces.
* Page numbers. top lines of each page
* start of dep. BY MR. BLUM:
* side chats. extra spaces. MR. GOGINENI:... THE WITNESS:... MR. BLUM: ...
* interupptions. your -- ... -- landlord?
* interruptions. left was cinder --.  A  concrete, cinderblock.  Q   cinderblock, okay. Where there any. (no -- on restart)
* misc. - - - (new line) exhibit (new line) - - -
* misc. can have -- in middle of sentence, not indicating interruptions, but pauses
* questions restarting after side chat. BY MR. BLUM:
* questions restarting after other. BY MR. BLUM: