In [1]:
import pandas as pd
import re
import io
import tqdm
import glob
import natsort
import os


# INPUT_PATH = "outputs/20231011_otter_all/"
# INPUT_PATH = "outputs/20231011_test/"
# INPUT_PATH = "outputs/20231011_redo/"
# INPUT_PATH = "outputs/20231026_gpt4parsed/"
INPUT_PATH = "outputs/"
OUTPUT_PATH = "outputs/"
QUESTIONS_TEMPLATE = "prompts/questions_8.txt"


filenames = glob.glob(os.path.join(INPUT_PATH, "C*.txt"))
filenames = natsort.natsorted(filenames)
expected_shape = (42, 3)  # (38,2) previously

# Utility
# def detect_delimiter(file_path):
#     with open(file_path, 'r', encoding='utf-8', newline='') as file:
#         sample = file.read(4096)
#         sniffer = csv.Sniffer()
#         delimiter = sniffer.sniff(sample).delimiter
#     return delimiter


def convert_whitespace_to_tabs(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
    # Replace occurrences of two or more whitespaces (excluding newlines) with a tab in each line
    tabbed_lines = [re.sub(r"[ \t][ \t]+", "\t", line) for line in lines]
    return "".join(tabbed_lines)


exception_files = []
broken_files = []
dfs = []
answers = []
evidences = []
num_loaded_correctly = 0
for file_name in tqdm.tqdm(filenames):
    try:
        # V1
        df = pd.read_csv(
            file_name,
            sep="\t",
            # header=True,
            index_col=0,
        )

        # V2
        # delimiter = detect_delimiter(file_name)
        # df = pd.read_csv(file_name, sep=delimiter, header=0, index_col=0, on_bad_lines='skip',)

        # V3
        # tabbed_data = convert_whitespace_to_tabs(file_name)
        # data_io = io.StringIO(tabbed_data)
        # # print(data_io.getvalue())
        # df = pd.read_csv(data_io, sep="\t", header=0, index_col=0)
        # # print(file_name, df.columns)

    except Exception as e:
        exception_files.append((file_name, e))
        print("Exception in", file_name, e)
        continue

    if df.shape != expected_shape:
        broken_files.append((file_name, df.shape))
        print("Error in", file_name, "expected shape", expected_shape, "got", df.shape)
        continue

    subject_id = file_name.split(".")[0].split("/")[-1]
    filename = file_name.split("/")[-1]
    # print(subject_id, df.shape, df.columns)

    # df.set_index('question', inplace=True)
    # if 'question' in df.columns:

    df = df.drop(columns=["question"])
    answer = df.drop(columns=["evidence"]).T
    evidence = df.drop(columns=["answer"]).T
    df = df.T
    df["file_name"] = filename
    df["subject_id"] = subject_id
    dfs.append(df)

    answer["file_name"] = filename
    answer["subject_id"] = subject_id
    answers.append(answer)

    evidence["file_name"] = filename
    evidence["subject_id"] = subject_id
    evidences.append(evidence)
    num_loaded_correctly += 1

print("Loaded", num_loaded_correctly, "files correctly")

100%|██████████| 93/93 [00:00<00:00, 443.47it/s]

Loaded 93 files correctly





In [2]:
# Debug
# for df in dfs:
#     print(
#         df["subject_id"][0],
#         # df.columns[:3],
#         df.columns[-5:-2],
#     )
# len(dfs)
# [df.shape for df in dfs]

print("value_counts of data-frame-shapes:")
print(pd.Series([str(df.columns) for df in dfs]).value_counts())
print()


print("value_counts of data-frame-shapes:")
print(pd.Series([str(df.shape) for df in dfs]).value_counts())
print()

df

value_counts of data-frame-shapes:
Index([           1,            2,            3,            4,            5,\n                  6,            7,            8,            9,           10,\n                 11,           12,           13,           14,           15,\n                 16,           17,           18,           19,           20,\n                 21,           22,           23,           24,           25,\n                 26,           27,           28,           29,           30,\n                 31,           32,           33,           34,           35,\n                 36,           37,           38,           39,           40,\n                 41,           42,  'file_name', 'subject_id'],\n      dtype='object', name='question_number')    93
Name: count, dtype: int64

value_counts of data-frame-shapes:
(2, 44)    93
Name: count, dtype: int64



question_number,1,2,3,4,5,6,7,8,9,10,...,35,36,37,38,39,40,41,42,file_name,subject_id
answer,23,"Houston, Texas, United States",Unmarried,No,,No,A third-year medical student,Third year,Baylor College of Medicine,,...,"Yes, the student acknowledges the need for hel...","No, the student finds fulfillment in the medic...","No, however, the student mentions that the red...","Yes, the student strongly advocates for mental...","No physical obstacles, but mentions mental obs...",Feels closely adhering to the Hippocratic Oath,"Yes, but with some ambivalence. The trainee ag...",Potential impact of the crisis on Prometric exams,C102.gpt-4-32k.txt,C102
evidence,I'm 23 years old.,"Currently, I live in Houston, Texas, United St...",I'm unmarried.,Do you have kids? No.,Do you have kids? No.,"Are you a caretaker otherwise? No, I'm not.","I remember medical students specifically, I'm ...",I'm a third year medical student.,And where are you completing your training? Ba...,,...,"Yeah, I think mental health is very important,...","Probably not, I just can't see myself doing so...",I can't say that the services have been on so ...,"Yeah, I think mental health is very important,...",I don't think that there's any physical obstac...,"I think I speak for myself and my classmates, ...","For the most part, I do agree that medical stu...","Potentially, I would ask, you know, about step...",C102.gpt-4-32k.txt,C102


In [3]:
merged = pd.concat(dfs).reset_index(drop=True)
answers = pd.concat(answers).reset_index(drop=True)
evidences = pd.concat(evidences).reset_index(drop=True)

In [4]:
# Debug
print("Concatenated data-frames shape:", merged.shape)
if merged.shape[0] != num_loaded_correctly:
    print("Error: concatenated data-frame has wrong number of rows")
    print("Expected:", num_loaded_correctly, "got", merged.shape[0])

merged.head(n=2)

Concatenated data-frames shape: (186, 44)
Error: concatenated data-frame has wrong number of rows
Expected: 93 got 186


question_number,1,2,3,4,5,6,7,8,9,10,...,35,36,37,38,39,40,41,42,file_name,subject_id
0,32,"Houston, Texas, United States of America",Single,No,,No,MD - Cardiology critical care,,University of Tennessee,Yes,...,"Yes, would seek help from Baylor or hospital r...","-Unknown Speaker 30:17 Oh, no, I love my job ...",,"Yes, would seek help from Baylor or hospital r...",No,,Not applicable,Unknown Speaker 31:02 Thank you for your time...,C001.gpt-4-32k.txt,C001
1,"Unknown Speaker 00:34: Houston, Texas, United...","Unknown Speaker 00:34: Houston, Texas, United...",Unknown Speaker 00:36: What's your marital st...,Unknown Speaker 00:36: What's your marital st...,Not applicable,Unknown Speaker 01:09 or liquidator? Otherwis...,Unknown Speaker 00:59 Cardiology critical car...,Not applicable,Unknown Speaker 01:21 University of Tennessee?,Answer implied during interview,...,"Unknown Speaker 29:06 Yes, absolutely. I am. ...",,Not asked during interview,"Unknown Speaker 29:06 Yes, absolutely. I am. ...","Unknown Speaker 30:36 Personally, no, I mean,...",Not applicable,,,C001.gpt-4-32k.txt,C001


In [5]:
print(merged.columns.to_list())
print(merged["subject_id"].to_list())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 'file_name', 'subject_id']
['C001', 'C001', 'C002', 'C002', 'C003', 'C003', 'C004', 'C004', 'C005', 'C005', 'C006', 'C006', 'C007', 'C007', 'C008', 'C008', 'C009', 'C009', 'C010', 'C010', 'C011', 'C011', 'C012', 'C012', 'C014', 'C014', 'C015', 'C015', 'C016', 'C016', 'C017', 'C017', 'C018', 'C018', 'C019', 'C019', 'C020', 'C020', 'C021', 'C021', 'C022', 'C022', 'C023', 'C023', 'C024', 'C024', 'C025', 'C025', 'C026', 'C026', 'C028', 'C028', 'C029', 'C029', 'C030', 'C030', 'C031', 'C031', 'C032', 'C032', 'C033', 'C033', 'C034', 'C034', 'C035', 'C035', 'C036', 'C036', 'C037', 'C037', 'C038', 'C038', 'C039', 'C039', 'C040', 'C040', 'C041', 'C041', 'C042', 'C042', 'C043', 'C043', 'C044', 'C044', 'C045', 'C045', 'C046', 'C046', 'C047', 'C047', 'C048', 'C048', 'C049', 'C049', 'C050', 'C050', 'C051', 'C051', 'C052', 'C052', 'C053', 'C05

In [6]:
questions = pd.read_csv(QUESTIONS_TEMPLATE, sep="\t", header=0, index_col=0)["question"]
print(questions.shape)
questions.head()

(42,)


question_number
1                        How old are you?
2                      Where do you live?
3            What is your marital status?
4                       Do you have kids?
5    If you do have kids, provide details
Name: question, dtype: object

In [7]:
# merged.columns = questions.to_list() + merged.columns[-2:].to_list()
# merged.columns
# merged.sort_values(by=["subject_id", "file_name"], inplace=True)
# merged.to_csv(f"{OUTPUT_PATH}/merged.tsv", sep="\t", index=False)
# merged.to_excel(
#     f"{OUTPUT_PATH}/merged.xlsx", index=False, sheet_name="Sheet1", engine="xlsxwriter"
# )
# print(merged.shape)
# merged.head()


def process_df(df, output_path, file_name):
    print(file_name)
    df.columns = questions.to_list() + df.columns[-2:].to_list()
    df.sort_values(by=["subject_id", "file_name"], inplace=True)
    df.to_csv(f"{output_path}/{file_name}.tsv", sep="\t", index=False)
    df.to_excel(
        f"{output_path}/{file_name}.xlsx",
        index=False,
        sheet_name="Sheet1",
        engine="xlsxwriter",
    )
    print(df.shape)
    # print(df.head())


process_df(merged, OUTPUT_PATH, "merged")
process_df(answers, OUTPUT_PATH, "answers")
process_df(evidences, OUTPUT_PATH, "evidences")

merged
(186, 44)
answers
(93, 44)
evidences
(93, 44)


## Look at broken files

In [8]:
# print(exception_files)
print("Exception files:")
print([fn[0].split("/")[-1].split(".")[0] for fn in exception_files])

print("Broken files:")
print([fn[0].split("/")[-1].split(".")[0] for fn in broken_files])

Exception files:
[]
Broken files:
[]


In [9]:
# print(broken_files)
for file_name, shape in broken_files:
    print(file_name, shape)
    df = pd.read_csv(
        file_name,
        sep="\t",
        header=0,
        index_col=0,
        on_bad_lines="skip",
    )
    print("Missing row numbers", set(range(1, 42)) - set(df.index))
    # print(df)

# Inspect outputs per Q

In [10]:
print(merged.columns)
merged.iloc[:, 0]

Index(['How old are you?', 'Where do you live?',
       'What is your marital status?', 'Do you have kids?',
       'If you do have kids, provide details',
       'Are you a caretaker otherwise? (if not own kids, eg elderly parents, adopted family member, etc)',
       'What type of healthcare professional or student/trainee are you?',
       'If student or trainee, what year are you in?',
       'What institution did you complete your (or are currently) training at?',
       'If you are a physician, did you train in the US at any point?',
       'What is your specialty (if student, what specialty are you thinking of)?',
       'How long have you been practicing?',
       'Over the past two months, have you practiced clinically in areas where you could be in touch with patients who have covid-19?',
       'Are you concerned about your safety, and how?',
       'Are you concerned about safety of loved ones, and how?',
       'Have you modified your routine to protect yourself or others,

0                                                     32
1      Unknown Speaker  00:34: Houston, Texas, United...
2                                                     48
3      I was born in 71. So I stopped counting. I thi...
4                                                     62
                             ...                        
181                                 How old are you? 23.
182                                                   25
183                           Unknown Speaker  01:47 25.
184                                                   23
185                                    I'm 23 years old.
Name: How old are you?, Length: 186, dtype: object