In [None]:
# Attach Google Drive to the Colab runtime's filesystem at /content/drive
from google.colab import drive
drive.mount(mountpoint='/content/drive')

In [None]:
# Extract the patient report archive stored on the mounted Google Drive.
# No -d target is given, so unzip extracts into the current working directory;
# -q suppresses unzip's per-file output.
!unzip -q "/content/drive/My Drive/Colab Notebooks/mimic-cxr-reports.zip"

In [None]:
# Process more than 200,000 patient reports
# Use regular expressions to identify ETT positive and ETT negative reports
# Select up to maxPerClass ETT positive and maxPerClass ETT negative reports
#
#   * ETT positive: the report mentions "endotracheal"/"ETT" followed by a
#     distance in cm and the word "carina"; written to ett.csv with the distance.
#   * ETT negative: the report mentions neither word; written to no-ett.csv.

# Import library for file operations
import glob

# Import library for regular expressions
import re

# Maximum number of reports to select for each class (ETT positive / ETT negative)
maxPerClass = 8000

# Maximum number of ETT negative reports accepted per two-character patient id prefix
maxPerPrefix = 800

# Search for all report files recursively:
reportFiles = glob.glob('mimic-cxr-reports/**/*.txt', recursive=True)

# Number of ETT negative reports (no "ett" or "endotracheal") written so far
noettCount = 0

# Number of ETT positive reports (with a distance to carina) written so far
ettCount = 0

# Highest distance to carina seen so far (starts at 0.0 so any larger value replaces it)
distHigh = 0.0

# Lowest distance to carina seen so far (starts at 100.0 so any smaller value replaces it)
distLow = 100.0

# Set the ETT regular expression to match whole word
ettRegEx = r"\bendotracheal\b|\bett\b"

# Compile the ETT regex to improve performance
ettReComp = re.compile(ettRegEx, re.IGNORECASE)

# Set the distance regular expression
# Distance may be in the form of 5, 5.5, 5.5., 5-5.5, 5-, -5, or blank: (\d*\.*\d*\-*\d*\.*\d*)
# NOTE(review): the \D+ before the distance group is greedy and also matches "." and "-",
# so a leading "." or "-" is normally consumed by \D+ (".5 cm" is captured as "5") -
# confirm whether that is intended before relying on fractional or negative values.
distRegEx = r"(\bendotracheal\b|\bett\b)+\D+(\d*\.*\d*\-*\d*\.*\d*)\s*cm\D+\bcarina[l]*\b"

# Compile the distance regex to improve performance
distReComp = re.compile(distRegEx, re.IGNORECASE)

# Last patient id accepted as an ETT negative candidate (used to skip repeats of the same patient)
lastPatientID = ""

# First two characters of the patient id of the current ETT negative candidate
prefixPatientID = ""

# Number of ETT negative candidates seen for the current prefix
prefixCount = 0


def parseReportPath(reportFile):
    """Return (patientID, studyID) extracted from a report file path.

    The patient id is the text between the last "/p" and the last "/s";
    the study id is the text between the last "/s" and the last ".txt".
    """
    studyMarker = reportFile.rfind("/s")
    patientID = reportFile[reportFile.rfind("/p") + 2:studyMarker]
    studyID = reportFile[studyMarker + 2:reportFile.rfind(".txt")]
    return patientID, studyID


def parseDistance(distStr):
    """Convert a distance string captured by distRegEx to a float.

    Accepted forms: "5", "5.5", "5.5." (one trailing period is dropped),
    "5-5.5" (average of both ends), "5-" (left end only) and "-5" (negative).

    Returns None when the string cannot be converted (for example ".", "-"
    or "1.5.2", all of which the capture group allows), so that one
    malformed report does not abort the whole scan with a ValueError.
    """
    # A single trailing period is sentence punctuation, not part of the number
    if distStr.endswith("."):
        distStr = distStr[:-1]

    try:
        if "-" not in distStr:
            return float(distStr)

        distList = distStr.split("-")

        # Blank left part: the dash is a sign, so convert the whole string
        if distList[0] == "":
            return float(distStr)

        dist = float(distList[0])

        # A non-blank right part makes it a range: use the midpoint
        if distList[1] != "":
            dist = (dist + float(distList[1])) / 2
        return dist
    except ValueError:
        return None


# Open the output files in write mode: the header line is written on every run,
# so append mode would duplicate the header and rows whenever the cell is re-run.
with open("no-ett.csv", "w") as noettFile, open("ett.csv", "w") as ettFile:
    # Add the header lines
    noettFile.write("subject_id,study_id,path\n")
    ettFile.write("subject_id,study_id,path,distance\n")

    # Process each report
    for reportFile in reportFiles:
        # Stop as soon as both classes are full
        if ettCount >= maxPerClass and noettCount >= maxPerClass:
            break

        # Get patient ID and study ID from the report file path
        patientID, studyID = parseReportPath(reportFile)

        # Read the report content as a single line
        with open(reportFile, 'r') as inputFile:
            data = inputFile.read().replace('\n', ' ')

        # Path written to the CSV files
        outputPath = reportFile.replace("mimic-cxr-reports", "files")

        # Find all matches for distance from carina; only the last match is used
        distMatch = distReComp.findall(data)

        if distMatch and distMatch[-1][1] != "":
            # ETT positive candidate
            if ettCount >= maxPerClass:
                continue

            dist = parseDistance(distMatch[-1][1])

            # Skip reports whose captured distance is not a valid number
            if dist is None:
                continue

            # Track the highest and lowest distances
            distHigh = max(distHigh, dist)
            distLow = min(distLow, dist)

            # Write to the ett file
            ettFile.write(",".join([patientID, studyID, outputPath, str(dist)]) + "\n")
            ettCount += 1
            continue

        # A report that mentions Endotracheal or ETT (without a usable distance) is not ETT negative
        if ettReComp.search(data) is not None:
            continue

        # Only one ETT negative candidate per consecutive run of the same patient
        if noettCount >= maxPerClass or patientID == lastPatientID:
            continue
        lastPatientID = patientID

        # Count candidates per two-character patient id prefix
        if patientID[0:2] != prefixPatientID:
            prefixPatientID = patientID[0:2]
            prefixCount = 1
        else:
            prefixCount += 1

        # Cap the number of ETT negative reports taken from one prefix
        if prefixCount <= maxPerPrefix:
            noettFile.write(",".join([patientID, studyID, outputPath]) + "\n")
            noettCount += 1

# Print variables
print("Reports with No ETT: " + str(noettCount))
print("Reports with ETT and Distance to Carina: " + str(ettCount))
print("Highest Distance: " + str(distHigh))
print("Lowest Distance: " + str(distLow))