In [16]:
# !pip install textract-trp

In [1]:
import boto3
import time
import sagemaker
import os 
import trp
import pandas as pd
from util.preprocess import *
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/medical_notes'

In [2]:
fileName =  'sample_report_1.pdf'
fileUploadPath = os.path.join('./data', fileName)
textractObjectName = os.path.join(prefix, 'data', fileName)

# Upload medical report file
boto3.Session().resource('s3').Bucket(bucket).Object(textractObjectName).upload_file(fileUploadPath)

In [3]:
textract = boto3.client('textract')
response = textract.start_document_analysis(
    DocumentLocation={
        'S3Object': {
            'Bucket': bucket,
            'Name': textractObjectName
        }},
    FeatureTypes=[
        'TABLES',
    ]
    )

textractJobId = response["JobId"]
print('job id is: ',textractJobId)

job id is:  ebb4521509f320238bfc72b5f0e669612d4fa2c19937fa2da97160a936554290


In [4]:
%%time
time.sleep(5)
response = textract.get_document_analysis(JobId=textractJobId)
status = response["JobStatus"]

while(status == "IN_PROGRESS"):
    time.sleep(5)
    response = textract.get_document_analysis(JobId=textractJobId)
    status = response["JobStatus"]
    print("Textract Job status: {}".format(status))
    

Textract Job status: SUCCEEDED
CPU times: user 68.3 ms, sys: 9.77 ms, total: 78 ms
Wall time: 11.9 s


In [5]:
%%time
pages = []

time.sleep(5)

response = textract.get_document_analysis(JobId=textractJobId)

pages.append(response)

nextToken = None
if('NextToken' in response):
    nextToken = response['NextToken']

while(nextToken):
    time.sleep(5)

    response = textract.get_document_analysis(JobId=textractJobId, NextToken=nextToken)

    pages.append(response)
    print("Resultset page recieved: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']

CPU times: user 62.3 ms, sys: 9.8 ms, total: 72.1 ms
Wall time: 6.47 s


In [8]:
doc = trp.Document(pages)
print("Total length of document is",len(doc.pages))
idx=1
for page in doc.pages:
    print(color.BOLD + f"Results from page {idx}: \n" + color.END, page.text)
    idx=idx+1

Total length of document is 2
[1mResults from page 1: 
[0m Discharge Summary
Name
Terri Hodosy
Birth Date
10/22/1962
Gender
female
Post Code
1826
Admission Date
01/01/2020
Discharge Date
01/20/2020
Medications
HISTORY: This 15-day-old female presents to Children's Hospital and transferred from Hospital
Emergency Department for further evaluation. Information is obtained in discussion with the mother
and the grandmother in review of previous medical records. This patient had the onset on the day of
presentation of a jelly-like red-brown stool started on Tuesday morning. Then, the patient was noted
to vomit after feeds. The patient was evaluated at Hospital with further evaluation with laboratory
data showing a white blood cell count elevated at 22.2; hemoglobin 14.1; sodium 138; potassium 7.2,
possibly hemolyzed; chloride 107; CO2 23; BUN 17; creatinine 1.2; and glucose of 50, which was
repeated and found to be stable in that range. The patient underwent a barium enema, which was
read

In [9]:
maxLength=20000

comprehendResponse = []
comprehend_medical_client = boto3.client(service_name='comprehendmedical', region_name='us-east-1')

for page in doc.pages:
    pageText = page.text
    
    for i in range(0, len(pageText), maxLength):
        response = comprehend_medical_client.detect_entities_v2(Text=pageText[0+i:maxLength+i])
        comprehendResponse.append(response)
    patient_string = ""

In [10]:
df_cm=extractMC_v2(comprehendResponse[0])
df_cm['ID']=1
df_cm.head

  df_mc = pd.DataFrame({'MEDICAL_CONDITION': pd.Series(medical_conditions), 'Score':pd.Series(scores),'Trait':pd.Series(traits)})


<bound method NDFrame.head of                  MEDICAL_CONDITION     Score      Trait  ID
0       jelly-like red-brown stool  0.366495       SIGN   1
1                            vomit  0.885382       SIGN   1
2                     hypertension  0.680734  DIAGNOSIS   1
3                    group B strep  0.759387        NaN   1
4                           herpes  0.591829        NaN   1
5                           Denied  0.512226        NaN   1
6                decreased feeding  0.561370       SIGN   1
7                         vomiting  0.998349        NaN   1
8                         diarrhea  0.992750        NaN   1
9                            small  0.776869       SIGN   1
10                  well-developed  0.944883       SIGN   1
11                 age appropriate  0.656153       SIGN   1
12                      atraumatic  0.948751       SIGN   1
13                   normocephalic  0.949870       SIGN   1
14                           equal  0.556062       SIGN   1
15        

In [11]:
boto3.Session().resource('s3').Bucket(bucket).Object(textractObjectName).delete()

{'ResponseMetadata': {'RequestId': '1XA72GVTCKTH7SPM',
  'HostId': 'Vkf3xcFA9g9DR+44SOesmK5mlyfv33//aYJNx/reIlriwbpOn/GFiG1/YIC4hg5eqz7txreck+k=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'Vkf3xcFA9g9DR+44SOesmK5mlyfv33//aYJNx/reIlriwbpOn/GFiG1/YIC4hg5eqz7txreck+k=',
   'x-amz-request-id': '1XA72GVTCKTH7SPM',
   'date': 'Sun, 09 Oct 2022 10:39:37 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}