# Information Extraction from Images

In [4]:
# import libraries

import re
import os
import glob
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image

In [230]:
# reporting

from reportlab.pdfgen import canvas
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT
from reportlab.lib import colors
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

In [30]:
# point pytesseract at the tesseract executable

# Absolute path of the tesseract binary that pytesseract should invoke.
# NOTE(review): this looks like a Homebrew-style install location; adjust it
# for machines where tesseract lives elsewhere.
TESSERACT_BINARY = '/opt/homebrew/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = TESSERACT_BINARY

### Initialise

In [287]:
# input and output locations (relative to the working directory)

# folder holding the screenshots that will be OCR'd
nhs_blood_tests = 'blood_tests/NHS_app_screenshots/'

# PDF built from the cleaned OCR text
report_file = 'blood_tests/report.pdf'

# spreadsheet of extracted results
# (pivot each blood test by time; to be shown to doctor as well)
tabular_file = 'blood_tests/report_table.xlsx'

In [240]:
# get all relevant image files sorted

# Extensions are compared case-insensitively and with their leading dot, so
# "IMG_01.PNG" and "scan.jpeg" are picked up while a file that merely ends in
# the letters "png" (no extension) is not.
image_extensions = ('.png', '.jpg', '.jpeg')

# Sorted by file name so the OCR text is later concatenated in a stable order.
images_iter = sorted(
    name
    for name in os.listdir(nhs_blood_tests)
    if name.lower().endswith(image_extensions)
)

### Starting to Read Relevant Images

In [242]:
# reading all the images

# Strips a stray "e" (plus the whitespace after it) at the start of a line.
# NOTE(review): presumably a bullet glyph that tesseract misreads as "e" --
# confirm against a sample screenshot.
leading_e_pattern = re.compile(r'^e\s', flags=re.MULTILINE)

# One OCR'd string per screenshot, in the same order as images_iter.
text_list = []
for image in images_iter:
    # The context manager closes the underlying file as soon as OCR is done
    # instead of leaving one open handle per screenshot.
    with Image.open(os.path.join(nhs_blood_tests, image)) as img:
        text = pytesseract.image_to_string(img)
    text = leading_e_pattern.sub('', text)
    text_list.append(text)

In [243]:
# cleaning - 1

# Stitch the per-image OCR results together and break them into single lines.
combined_text = "\n".join(text_list)

# split("\n") already consumes every newline, so the replace below has nothing
# left to substitute; it is kept so each entry is guaranteed newline-free.
medical_text = [
    ocr_line.replace('\n', ' ')
    for ocr_line in combined_text.split("\n")
]

In [244]:
# cleaning - 2

# correcting dates
# Month names used to recognise date lines here and in the later cells.
months = [
    'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December'
]

for i, line in enumerate(medical_text):
    tokens = line.split(' ')
    # A date line starts with a numeric token followed by a month name.
    # The length check stops a line that holds only a number (e.g. "12")
    # from raising IndexError on tokens[1].
    if len(tokens) > 1 and tokens[0].isdigit() and tokens[1] in months:
        # Keep only the first three tokens; anything the OCR appended after
        # them on the same line is dropped.
        medical_text[i] = ' '.join(tokens[:3])

In [245]:
# cleaning - 3

# correcting line spacing
# Walk the list once and glue each non-empty line to the non-empty line that
# directly follows it. The list shrinks as lines are merged, so the bound is
# re-evaluated on every pass instead of relying on a stale range() and a bare
# `except:` to swallow the resulting IndexError.
i = 0
while i < len(medical_text) - 1:
    tokens = medical_text[i].split(' ')
    # Lines left untouched: date lines (number + month name) and lines made of
    # a single numeric token. The latter were skipped before as well, via the
    # IndexError they triggered.
    keep_as_is = tokens[0].isdigit() and (len(tokens) < 2 or tokens[1] in months)
    if not keep_as_is and medical_text[i] and medical_text[i + 1]:
        # Absorb the following line; the index still advances afterwards, so a
        # merged line does not absorb a second neighbour.
        medical_text[i] += ' ' + medical_text.pop(i + 1)
    i += 1

In [246]:
# cleaning - 4

# adding an empty line right after date
# Build the spaced-out list separately: inserting into medical_text while
# looping over a range computed beforehand shifts later entries out of the
# loop's reach, which left date lines near the end without their blank line.
spaced_text = []
last_index = len(medical_text) - 1
for index, line in enumerate(medical_text):
    spaced_text.append(line)
    tokens = line.split(' ')
    # The length check avoids an IndexError on a line holding only a number.
    is_date = len(tokens) > 1 and tokens[0].isdigit() and tokens[1] in months
    # As before, nothing is added after the very last line.
    if is_date and index < last_index:
        spaced_text.append('')

# Update in place so the list object itself stays the same.
medical_text[:] = spaced_text

## PDF Report

In [248]:
# storing into pdf

# Paragraph interprets its text as markup, so characters such as "<" or "&"
# coming out of the OCR are escaped before being handed over.
from xml.sax.saxutils import escape

# Define a custom ParagraphStyle for the date headings
# (currently unused in this cell: date lines are rendered with Heading4 below)
date_style = ParagraphStyle(name='date', fontSize=12, leading=14, textColor=colors.black, alignment=TA_LEFT)

# Define the text styles
styles = getSampleStyleSheet()
title_style = styles['Title']
heading_style = styles['Heading4']
normal_style = styles['Normal']

# Flowables (paragraphs and spacers) in the order they appear in the PDF
paragraphs = []

# replace empty lines with \n
# (the Excel cells further down filter on this '\n\n' placeholder)
medical_text = ['\n\n' if x == '' else x for x in medical_text]

# NHS blood report title
paragraphs.append(Paragraph('NHS Blood Tests', title_style))

# report style - store into pdf
for line in medical_text:
    tokens = line.split(' ')
    # The length check avoids an IndexError on a line holding only a number.
    is_date = len(tokens) > 1 and tokens[0].isdigit() and tokens[1] in months
    if is_date:
        # The spacer has to be appended to take effect; it was previously
        # created and thrown away, so no gap appeared above date headings.
        paragraphs.append(Spacer(1, 0.15 * inch))
        paragraphs.append(Paragraph(escape(line), heading_style))
    else:
        paragraphs.append(Paragraph(escape(line), normal_style))

# Create a SimpleDocTemplate object and add the paragraphs to it
doc = SimpleDocTemplate(report_file)
doc.build(paragraphs)

## Excel Report

In [250]:
# preprocessing - 1

# Keep every entry except the '\n\n' placeholders that stand in for blank
# lines; the table only needs lines that carry text.
medical_list = list(filter(lambda entry: entry != '\n\n', medical_text))

In [253]:
# preprocessing - 2

# getting dates, category, test sorted
# First pass at the table: one row per non-date line, tagged with the most
# recent date line seen above it. (`cat` is only initialised here; categories
# are filled in by the next cell.)
date = ''
cat = ''
test = ''
rows = []

for line in medical_list:
    tokens = line.split(' ')
    # The length check avoids an IndexError on a line holding only a number.
    if len(tokens) > 1 and tokens[0].isdigit() and tokens[1] in months:
        date = line
        continue
    if not date:
        # Lines before the first date line belong to no test date; skip them.
        continue
    test = line
    rows.append({'date': date, 'test': test})
        


In [284]:
# preprocessing - 3

# getting dates, category, test sorted
# Builds one dict per test line with the keys 'date', 'category', 'test' and,
# when they can be parsed, 'result' and 'normal range'.
date = ''
cat = ''
test = ''
rows = []

for line in medical_list:
    tokens = line.split(' ')
    # The length check avoids an IndexError on a line holding only a number.
    if len(tokens) > 1 and tokens[0].isdigit() and tokens[1] in months:
        # A new date line starts a new section and clears the category.
        date = line
        cat = ''
        continue
    if not date:
        # Lines before the first date line belong to no test date; skip them.
        continue
    if ':' not in line:
        # A line without a colon is taken as the category heading for the
        # test lines that follow it; it produces no row of its own.
        cat = line
        continue

    test = line
    # NOTE(review): presumably lines look like
    # "<name>: <value> (normal range: <low> - <high>)" -- confirm on real data.
    test_breakdown = test.split(': ')  # test name, test result[, normal range]
    row = {'date': date, 'category': cat, 'test': test_breakdown[0]}
    if len(test_breakdown) > 1:
        row['result'] = test_breakdown[1].replace(' (normal range', '')
    if len(test_breakdown) == 3 and 'normal range' in test_breakdown[1]:
        row['normal range'] = test_breakdown[2].replace(')', '')
    rows.append(row)

In [288]:
# write the extracted results to the spreadsheet

# One spreadsheet row per entry in `rows`; the DataFrame's own index is not
# written out.
results_table = pd.DataFrame(rows)
results_table.to_excel(tabular_file, index=False)