In [23]:
from numpy import mean
import pandas as pd
import boto3
import os
import time
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from util.preprocess import *
import trp
import pickle
# load saved model
with open('model_pkl2' , 'rb') as f:
    model = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [24]:
# Retrieve the list of existing buckets
s3 = boto3.client('s3')
response = s3.list_buckets()

# Output the bucket names
for bucket in response['Buckets']:
    print(bucket);


{'Name': 'amplify-mediscan-dev-134554-deployment', 'CreationDate': datetime.datetime(2022, 10, 15, 8, 16, 2, tzinfo=tzlocal())}
{'Name': 'mediscan-aws-hackathon', 'CreationDate': datetime.datetime(2022, 10, 13, 12, 42, 15, tzinfo=tzlocal())}
{'Name': 'sagemaker-ap-south-1-134811648823', 'CreationDate': datetime.datetime(2022, 10, 13, 18, 0, 15, tzinfo=tzlocal())}


In [25]:
def extract_text(textractJobId,response):
    pages = []

    time.sleep(5)
    textract = boto3.client('textract')
    response = textract.get_document_analysis(JobId=textractJobId)

    pages.append(response)

    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']

    while(nextToken):
        time.sleep(5)
        textract = boto3.client('textract')
        response = textract.get_document_analysis(JobId=textractJobId, NextToken=nextToken)

        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']
    
    return pages 

In [26]:
def extractTextract(bucket,textractObjectName):
    textract = boto3.client('textract')
    response = textract.start_document_analysis(
        DocumentLocation={
            'S3Object': {
            'Bucket': 'mediscan-aws-hackathon',
            'Name': 'sample_report_1.pdf',

        }
            },
        FeatureTypes=[
            'TABLES',
        ]
        )

    textractJobId = response["JobId"]
    print('job id is: ',textractJobId)
    time.sleep(15)
    response = textract.get_document_analysis(JobId=textractJobId)
    status = response["JobStatus"]

    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = textract.get_document_analysis(JobId=textractJobId)
        status = response["JobStatus"]
        print("Textract Job status: {}".format(status))
    
    pages=extract_text(textractJobId,response)
    doc = trp.Document(pages)
    return doc

In [27]:
def extractMedical(doc):
    maxLength = 10000
    comprehendResponse = []
    comprehend_medical_client = boto3.client(service_name='comprehendmedical', region_name='ap-southeast-2')
    for page in doc.pages:
        pageText = page.text
    
        for i in range(0, len(pageText), maxLength):
            response = comprehend_medical_client.detect_entities_v2(Text=pageText[0+i:maxLength+i])
            comprehendResponse.append(response)
        patient_string = ""
        
    #df_cm=extractMC_v2(comprehendResponse[0])
    return comprehendResponse

#############################################################################################
############# functions to convert all medical conditions to 1 record ########################

In [28]:
# pipeline for new pdf
#extract data from textract
fileName =  'sample_report_1.pdf'
textractObjectName = os.path.join('public', fileName)
print("EHR file to be processed is at ", textractObjectName)
textract = boto3.client('textract')
doc=extractTextract(bucket,textractObjectName)
#extract data from comprehend medical
comprehendResponse=extractMedical(doc)
df_cm=extractMC_v2(comprehendResponse[0])
#Organize the extracted json file into dataframe
mclist, df_cm2=retrieve_mcList(df_cm, nFeature=20,threshold=0.9)
df_cm2=df_mc_generator_slim(df_cm2)

EHR file to be processed is at  public/sample_report_1.pdf
job id is:  1ca8bf776a732633fd0d44dda63ff0c2aee06f7bdc72fc7d788cb3c03b4e851a


  df_mc = pd.DataFrame({'MEDICAL_CONDITION': pd.Series(medical_conditions), 'Score':pd.Series(scores),'Trait':pd.Series(traits)})


In [29]:
df_cm2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      1 non-null      int64  
 1   nontender               1 non-null      float64
 2   foreign body            1 non-null      int64  
 3   edema                   1 non-null      int64  
 4   alert                   1 non-null      int64  
 5   murmur                  1 non-null      int64  
 6   chest pain              1 non-null      int64  
 7   vomiting                1 non-null      float64
 8   hiatal hernia           1 non-null      int64  
 9   distress                1 non-null      int64  
 10  hemostasis              1 non-null      int64  
 11  carpal tunnel syndrome  1 non-null      int64  
 12  endometriosis           1 non-null      int64  
 13  weakness                1 non-null      int64  
 14  pain                    1 non-null      int64 

In [30]:
#Prediction with the endpoint
df_cm2_aug = df_cm2.drop('ID', axis=1)
df_cm2_aug.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   nontender               1 non-null      float64
 1   foreign body            1 non-null      int64  
 2   edema                   1 non-null      int64  
 3   alert                   1 non-null      int64  
 4   murmur                  1 non-null      int64  
 5   chest pain              1 non-null      int64  
 6   vomiting                1 non-null      float64
 7   hiatal hernia           1 non-null      int64  
 8   distress                1 non-null      int64  
 9   hemostasis              1 non-null      int64  
 10  carpal tunnel syndrome  1 non-null      int64  
 11  endometriosis           1 non-null      int64  
 12  weakness                1 non-null      int64  
 13  pain                    1 non-null      int64  
 14  mass                    1 non-null      float6

In [31]:
sample_pred = model.predict(df_cm2_aug)
p = model.predict_proba(df_cm2_aug)
p=[int(sample_pred),0]
print(p)

[0]
