# Data Extraction from Disease Aritcles

The HTML dump is parsed using BeautifulSoup and Information like `disease_name`  `Symptoms`, `Treatment` or any other available information is captured.


Resules are stored in CSV where as the extracted information is JSON

### Headings captured

* Signs and symptoms
* Causes
* Diagnosis
* Treatment
* Prognosis
* Other First Level ToC headings

#### E.g JSON structure of the information Captured

`Thyroid Cancer`

```json
{
    "Signs and symptom": "Most often the first symptom of thyroid cancer is a nodule in the thyroid region of the neck. However, many adults have small nodules in their thyroids, but typically under 5% of these nodules are found to be cancerous Sometimes the first sign is an enlarged lymph node. ."
    
    "Causes" : "Thyroid cancers are thought to be related to a number of environmental and genetic predisposing factors, but significant uncertainty remains regarding its causes."
   
    "Diagnosis": "After a thyroid nodule is found during a physical examination, a referral to an endocrinologist or a thyroidologist may occur. "
    
}

```

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

from bs4 import BeautifulSoup
from functools import reduce



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import os


In [140]:

with open("./input/positive/Achalasia") as fp:
    soup = BeautifulSoup(fp, 'html.parser')
    body = soup.select('#mw-content-text')[0]
    


In [166]:
def get_disease_name(doc):
    return doc.select('h1#firstHeading')[0].get_text()

def read_doc(file_path):
    title = ""
    content = ""
    with open(file_path) as fp:
        soup = BeautifulSoup(fp, 'html.parser')
        
        
    
    return soup

def get_level_one_tocs(body):
    tocs = []
    tocs_node = body.select('#toc li.toclevel-1 a')
    if(len(tocs_node) > 0):
        tocs = [ toc['href'][1:len(toc['href'])] for toc in tocs_node]
    return tocs

def get_toc_section_contents(body, tocs):
    infos = body.find_all(recursive=False)
    disease_info = {}
    collecting = None

    for info in infos:


        toc_node = info.select('span.mw-headline')
        if(len(toc_node) > 0):
            toc_text = toc_node[0].get_text()
            if toc_text in tocs:
                collecting = toc_node[0].get_text()
                disease_info[collecting] = ""
                
        else:
            if collecting:
                node_text_acc = disease_info[collecting]
                text = info.get_text()
                node_text_acc = node_text_acc + " " + text
                disease_info[collecting] = node_text_acc
    
    
    return disease_info

def extract_all_info(file_path):
    info = {}
    soup = read_doc(file_path) 
    disease_name = get_disease_name(soup)
    body = soup.select('#mw-content-text')[0]
    tocs = get_level_one_tocs(body)
    disease_info  = get_toc_section_contents(body, tocs)
    
    return {'disease_name': disease_name, 'tocs': tocs, 'disease_info': disease_info}
        

In [30]:
positive_directory = os.fsencode('./input/positive')



In [168]:
# testing sample htmls

files = os.listdir(positive_directory)
disease_names = []
tocs = []
disease_info_list = []
for i in range(20):
    file_path = os.path.join(positive_directory, files[i])
    info = extract_all_info(file_path)
    disease_names.append(info['disease_name'])
    tocs.append(info['tocs'])
    disease_info_list.append(info['disease_info'])
    
dict_frame = { 'disease_name': disease_names, 'extracted_headings':tocs, 'disease_info': disease_info_list}
df = pd.DataFrame(dict_frame)
    
df.head()


Unnamed: 0,disease_info,disease_name,extracted_headings
0,{'Treatment': ' Many treatment strategies for ...,Peripheral neuropathy,"[Classification, Mononeuropathy, Mononeuritis_..."
1,{'Treatment': ' No specific treatment for CTF ...,Colorado tick fever,"[Epidemiology, Virology, Tick, Transmission_an..."
2,{},Rosselli–Gulienetti syndrome,[]
3,{'Neurological': ' Mental retardation ranging ...,Johanson–Blizzard syndrome,"[Characteristics, Exocrine, Endocrine, Nasal, ..."
4,{'Treatment': ' No cure is known for 22q11.2 d...,DiGeorge syndrome,"[Signs_and_symptoms, Cognitive_impairments, Sp..."


In [182]:
def extract_all_article_info(directory):
    disease_names = []
    tocs = []
    disease_info_list = []

    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)
        
        info = extract_all_info(file_path)
        disease_names.append(info['disease_name'])
        tocs.append(info['tocs'])
        disease_info_list.append(info['disease_info'])
    
    dict_frame = { 'disease_name': disease_names, 'extracted_headings':tocs, 'disease_info': disease_info_list}
    df = pd.DataFrame(dict_frame)
    return df
    

In [183]:
df = extract_all_article_info(positive_directory)
df.head()

Unnamed: 0,disease_info,disease_name,extracted_headings
0,{'Treatment': ' Many treatment strategies for ...,Peripheral neuropathy,"[Classification, Mononeuropathy, Mononeuritis_..."
1,{'Treatment': ' No specific treatment for CTF ...,Colorado tick fever,"[Epidemiology, Virology, Tick, Transmission_an..."
2,{},Rosselli–Gulienetti syndrome,[]
3,{'Neurological': ' Mental retardation ranging ...,Johanson–Blizzard syndrome,"[Characteristics, Exocrine, Endocrine, Nasal, ..."
4,{'Treatment': ' No cure is known for 22q11.2 d...,DiGeorge syndrome,"[Signs_and_symptoms, Cognitive_impairments, Sp..."


In [184]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3694 entries, 0 to 3693
Data columns (total 3 columns):
disease_info          3694 non-null object
disease_name          3694 non-null object
extracted_headings    3694 non-null object
dtypes: object(3)
memory usage: 86.7+ KB


In [185]:
df.to_csv('./input/extracted_info.csv', header=True, index=False)