# Web Scraping Sample Patient Unstructured Data

From the Agency for Healthcare Research and Quality.

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd
import os

# Matches one HTML tag, non-greedily. DOTALL lets '.' cross newlines so that a
# tag whose attributes wrap onto another line is still removed as a whole.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def remove_html_tags(text):
    """Remove html tags from a string.

    Everything from a ``<`` up to the nearest following ``>`` is deleted,
    including tags that span several lines. Text outside of tags is returned
    untouched (HTML entities such as ``&amp;`` are not decoded).

    Parameters
    ----------
    text : str
        Markup to clean.

    Returns
    -------
    str
        ``text`` with every tag removed.
    """
    return _HTML_TAG_RE.sub('', text)

### Part I: Web Scraping Tool
Create a function that scrapes multiple links (the unstructured data), parses the HTML (the ETL step), and stores the result in a pandas DataFrame. The DataFrame is then converted to a CSV, and a job scheduler imports it into the cloud-based database.

In [15]:
# Positions (within soup.find_all('table')) of the tables this scraper reads.
_PATIENT_TABLE = 2
_VITALS_TABLE = 7
_TESTS_TABLE = 10


def _table_cell_texts(table):
    """Return the whitespace-stripped text of every <td> in ``table``, in document order."""
    return [cell.text.strip() for cell in table.find_all('td')]


def web_scrape_page(url, timeout=30):
    """Scrape one sample patient page into a single-row DataFrame.

    Four sections of the page are read as label/value pairs and laid side by
    side as columns, in this order: test results, vitals, appointment
    information (``<h4>`` headings with their following ``<p>``), and patient
    information.

    Parameters
    ----------
    url : str
        Address of the patient page to download.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30). Without a timeout a stalled server would hang the
        notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row (index label ``0``) whose column labels are the scraped
        labels and whose cells are the corresponding scraped values.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page contains fewer tables than the hard-coded positions need.
    ValueError
        If a section yields a different number of labels and values.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    resp.raise_for_status()

    # All info from webpage scraped
    soup = BeautifulSoup(resp.text, 'html.parser')
    tables = soup.find_all('table')  # looked up once and reused for every section

    # Scrape patient information. Each cell's text is split on ':' and the
    # pieces are flattened, so labels sit at even positions and values at odd ones.
    pieces = [
        piece
        for cell_text in _table_cell_texts(tables[_PATIENT_TABLE])
        for piece in cell_text.split(':')
    ]
    patient_information = pd.DataFrame({'value': pieces[0::2], 'data': pieces[1::2]})

    # Scrape patient appointment information: every <h4> heading is paired
    # with the text of its next <p> sibling.
    headings = []
    paragraphs = []
    for header in soup.find_all('h4'):
        headings.append(header.text)
        para = header.find_next_sibling('p')
        # A heading without a following <p> gets an empty value rather than
        # the literal string 'None' that str(None) would produce.
        paragraphs.append('' if para is None else remove_html_tags(str(para)))
    appt_information = pd.DataFrame(
        {'patient_information': headings, 'patient_data': paragraphs}
    )

    # Scrape vitals. Cells are read in groups of three; only the first
    # (label) and second (result) of each group are kept.
    vitals_cells = _table_cell_texts(tables[_VITALS_TABLE])
    medrec_vitals = pd.DataFrame({
        'vitals': [text.replace("\xa0", "") for text in vitals_cells[0::3]],
        'vitals_results': [text.replace("\xa0", "") for text in vitals_cells[1::3]],
    })

    # Scrape test results. The first two cells are skipped; the remainder
    # alternates test name, result.
    flowsheet = _table_cell_texts(tables[_TESTS_TABLE])[2:]
    medrec_tests = pd.DataFrame(
        {'test': flowsheet[0::2], 'test_results': flowsheet[1::2]}
    )

    # Stack all sections under the test frame's column names. pd.concat is
    # used because DataFrame.append was removed in pandas 2.0.
    shared_columns = list(medrec_tests.columns)
    sections = (medrec_tests, medrec_vitals, appt_information, patient_information)
    stacked = pd.concat([
        section.rename(columns=dict(zip(section.columns, shared_columns)))
        for section in sections
    ])

    # Pivot the label/value rows into one wide row: labels become the column
    # labels, values become the single row, which is relabelled to index 0.
    final_df = stacked.set_index('test')[['test_results']].T
    final_df = final_df.rename(index={'test_results': 0})

    return final_df

In [20]:
# Sample patient pages to scrape, one single-row DataFrame per page.
patient_urls = [
    "https://www.ahrq.gov/ncepcr/tools/pf-handbook/mod8-app-b-adam-pie.html",
    "https://www.ahrq.gov/ncepcr/tools/pf-handbook/mod8-app-n-bill-windows.html",
    "https://www.ahrq.gov/ncepcr/tools/pf-handbook/mod8-app-b-billy-gato.html",
    "https://www.ahrq.gov/ncepcr/tools/pf-handbook/mod8-app-b-john-donut.html",
    "https://www.ahrq.gov/ncepcr/tools/pf-handbook/mod8-app-b-steve-apple.html",
    "https://www.ahrq.gov/ncepcr/tools/pf-handbook/mod8-app-b-tom-gellato.html",
]
pg1, pg2, pg3, pg4, pg5, pg6 = [web_scrape_page(page_url) for page_url in patient_urls]

# Stack the per-patient rows (columns are unioned, not sorted) and renumber them 0..n-1.
frames = [pg1, pg2, pg3, pg4, pg5, pg6]
med_recs = pd.concat(frames, sort=False).reset_index(drop=True)

In [24]:
# Display the combined records keyed by patient name. set_index returns a new
# DataFrame and the result is not assigned, so med_recs itself is unchanged.
med_recs.set_index('Name')

Unnamed: 0_level_0,HEIGHT (in),WEIGHT (lb),TEMPERATURE (deg F),TEMP SITE,PULSE RATE (/min),PULSE RHYTHM,RESP RATE (/min),BP SYSTOLIC (mm Hg),BP DIASTOLIC (mm Hg),CHOLESTEROL (mg/dL),...,Review of Systems,WEIGHT (lb),TEMPERATURE (deg F),PULSE RATE (/min),RESP RATE (/min),BP SYSTOLIC (mm Hg),BP DIASTOLIC (mm Hg),CHOLESTEROL (mg/dL),HDL (mg/dL),LDL (mg/dL)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adam Pie,70,190.0,98.0,oral,72.0,,16.0,158.0,90.0,,...,,,,,,,,,,
Bill Windows,70,190.0,98.0,oral,72.0,,16.0,128.0,70.0,,...,"General: denies fatigue, malaise, fever, weigh...",,,,,,,,,
Billy Gato,65,,,oral,,,,,,,...,,180.0,98.0,72.0,16.0,134.0,92.0,,,121.0
John Donut,74,,,oral,,,,,,,...,,190.0,98.0,72.0,16.0,158.0,90.0,,102.0,125.0
Steve Apple,71,191.0,98.0,oral,72.0,,16.0,118.0,70.0,,...,"General: denies fatigue, malaise, fever, weigh...",,,,,,,,,
Tom Gellato,66,195.0,98.0,oral,72.0,,16.0,131.0,94.0,,...,,,,,,,,,,


In [28]:
# Save med_recs as a CSV file in the current working directory.
cwd = os.getcwd()
# os.path.join uses the platform's separator and avoids a doubled slash when
# the working directory is a filesystem root.
output_path = os.path.join(cwd, 'Med_Recs.csv')
# index=True writes the row index as the first CSV column.
med_recs.to_csv(output_path, index=True)