# Preparing Diagnosis and Lab Test Data for Multinomial Naive Bayes

In [9]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [10]:
# Read the labelled spreadsheet and derive two three-column views from it:
# one keyed by diagnosis, one keyed by lab test name.
lbl_df = pd.read_excel('labelled_db.xlsx')

text_cols = ["Abstract", "Article Title"]
dx_df = lbl_df[text_cols + ["Diagnosis"]]
# Every missing value in the lab-test view (in any of its three columns)
# is replaced with the string "0".
lt_df = lbl_df[text_cols + ["Lab Name"]].fillna("0")

print(dx_df.shape)
dx_df.head()

(17062, 3)


Unnamed: 0,Abstract,Article Title,Diagnosis
0.0,To determine whether the repetition of the rap...,Repetition of the rapid antigen test in initia...,Group A Streptococcus (GAS) pharyngitis
1.0,To determine whether the repetition of the rap...,Repetition of the rapid antigen test in initia...,Group A Streptococcus (GAS) pharyngitis
2.0,Helicobacter pylori infection can be detected ...,"Evaluation of Pyloriset Screen, a rapid whole-...",H.pylori Infection
3.0,Helicobacter pylori infection can be detected ...,"Evaluation of Pyloriset Screen, a rapid whole-...",H.pylori Infection
4.0,There is some debate over the clinical utility...,Measuring thyroglobulin autoantibodies by sens...,Hashimoto thyroiditis


In [11]:
# Keep only rows whose 'Lab Name' is not the string "0", then renumber the
# surviving rows from zero.
has_lab_name = lt_df['Lab Name'] != "0"
lt_df = lt_df[has_lab_name].reset_index(drop=True)

print(lt_df.shape)
lt_df.head()

(17062, 3)


Unnamed: 0,Abstract,Article Title,Lab Name
0,To determine whether the repetition of the rap...,Repetition of the rapid antigen test in initia...,Strep Pyogenes Rapid Ag
1,To determine whether the repetition of the rap...,Repetition of the rapid antigen test in initia...,"Glucose, synovial fluid (decreased)"
2,Helicobacter pylori infection can be detected ...,"Evaluation of Pyloriset Screen, a rapid whole-...",Erythrocyte sedimentation rate (increased)
3,Helicobacter pylori infection can be detected ...,"Evaluation of Pyloriset Screen, a rapid whole-...","Lactate Dehydrogenase, synovial fluid (increas..."
4,There is some debate over the clinical utility...,Measuring thyroglobulin autoantibodies by sens...,"WBC (increased, > 10000/uL)"


In [12]:
# Parse the diagnosis abstracts page. The file is opened in a `with` block so
# the handle is released as soon as the soup has been built (the original left
# it open for the lifetime of the kernel).
with open('dx_abstracts.html') as dx_html:
    dx_soup = BeautifulSoup(dx_html, 'lxml')

# dx_list.txt holds one diagnosis name per line; strip the trailing newline
# and any surrounding whitespace from each entry.
with open('dx_list.txt', 'r') as dx_list_file:
    dx_names = [dx_name.strip() for dx_name in dx_list_file]

print(dx_names[:10])

['Rheumatism', 'Dyspepsia', 'Esophagitis', 'Hemiplegia', 'Dystocia', 'Thrombocytopenia', 'Meningismus', 'Myelofibrosis', 'Nonvisualization', 'Clostridium']


In [13]:
# For every diagnosis name, pull the matching titles and abstracts out of the
# parsed HTML and add them to dx_df as extra labelled rows.
#
# Frames are collected in a list and concatenated once at the end: growing a
# DataFrame with `.append` inside the loop copies the whole table on every
# iteration, and `DataFrame.append` no longer exists in pandas >= 2.0.
excluded_dxs = []
scraped_dx_frames = []
for dx in dx_names:
    # html tag ID's for BS4 identification
    title_tag = dx + "-Title"
    abstract_tag = dx + "-Abstract"
    titles = [h3.text for h3 in dx_soup.find_all('h3', {'id': title_tag})]
    abstracts = [p.text for p in dx_soup.find_all('p', {'id': abstract_tag})]
    # Only accept a diagnosis when titles and abstracts pair up one-to-one
    # and at least one article was found; otherwise record it as excluded.
    if len(titles) == len(abstracts) and len(titles) != 0:
        scraped_dx_frames.append(pd.DataFrame({
            "Abstract": abstracts,
            "Article Title": titles,
            "Diagnosis": [dx] * len(titles),
        }))
    else:
        excluded_dxs.append(dx)

# Leave dx_df untouched when nothing was scraped, as the original loop did.
if scraped_dx_frames:
    dx_df = pd.concat([dx_df] + scraped_dx_frames, ignore_index=True)

dx_df.shape

(27310, 3)

In [14]:
# Give every distinct diagnosis an integer ID and store it in a 'DxID' column.
#
# The IDs follow order of first appearance in dx_df. The original took them
# from `set` iteration order, which depends on per-process string hashing, so
# the same diagnosis could receive a different ID after a kernel restart.
unique_dxs = list(dx_df['Diagnosis'].unique())
dx_dict = {dx_name: dx_id for dx_id, dx_name in enumerate(unique_dxs)}
dx_df['DxID'] = dx_df['Diagnosis'].map(dx_dict)
print("# of diagnoses: " + str(len(unique_dxs)))
print("# of diagnoses excluded: " + str(len(excluded_dxs)))

# of diagnoses: 1055
# of diagnoses excluded: 1


In [15]:
from pandas import ExcelWriter

# Write the labelled diagnosis table to dx_labelled.xlsx. The sheet name is
# deliberately kept identical to the original ('dx_labelled.xlsx').
# Using the writer as a context manager saves and closes the workbook even if
# to_excel raises; the explicit `writer.save()` call was removed in pandas 2.0.
with ExcelWriter('dx_labelled.xlsx') as writer:
    dx_df.to_excel(writer, sheet_name='dx_labelled.xlsx')

In [None]:
# Parse the lab-test abstracts page (read as ISO-8859-1). The file is opened
# in a `with` block so the handle is released as soon as the soup has been
# built (the original left it open for the lifetime of the kernel).
with open('lt_abstracts.html', encoding='ISO-8859-1') as lt_html:
    lt_soup = BeautifulSoup(lt_html, 'lxml')

# lt_list.txt holds one lab test name per line; strip the trailing newline
# and any surrounding whitespace from each entry.
with open('lt_list.txt', 'r') as lt_list_file:
    lt_names = [lt_name.strip() for lt_name in lt_list_file]

print(lt_names[:10])


In [None]:
# For every lab test name, pull the matching titles and abstracts out of the
# parsed HTML and add them to lt_df as extra labelled rows.
#
# Frames are collected in a list and concatenated once at the end: growing a
# DataFrame with `.append` inside the loop copies the whole table on every
# iteration, and `DataFrame.append` no longer exists in pandas >= 2.0.
excluded_lts = []
scraped_lt_frames = []
for lt in lt_names:
    # html tag ID's for BS4 identification
    title_tag = lt + "-Title"
    abstract_tag = lt + "-Abstract"
    titles = [h3.text for h3 in lt_soup.find_all('h3', {'id': title_tag})]
    abstracts = [p.text for p in lt_soup.find_all('p', {'id': abstract_tag})]
    # Only accept a lab test when titles and abstracts pair up one-to-one
    # and at least one article was found; otherwise record it as excluded.
    if len(titles) == len(abstracts) and len(titles) != 0:
        scraped_lt_frames.append(pd.DataFrame({
            "Abstract": abstracts,
            "Article Title": titles,
            "Lab Name": [lt] * len(titles),
        }))
    else:
        excluded_lts.append(lt)

# Leave lt_df untouched when nothing was scraped, as the original loop did.
if scraped_lt_frames:
    lt_df = pd.concat([lt_df] + scraped_lt_frames, ignore_index=True)

lt_df.shape

In [None]:
# Give every distinct lab test name an integer ID and store it in 'LtID'.
#
# The IDs follow order of first appearance in lt_df. The original took them
# from `set` iteration order, which depends on per-process string hashing, so
# the same lab test could receive a different ID after a kernel restart.
unique_lts = list(lt_df['Lab Name'].unique())
lt_dict = {lt_name: lt_id for lt_id, lt_name in enumerate(unique_lts)}
lt_df['LtID'] = lt_df['Lab Name'].map(lt_dict)
print("# of lab tests: " + str(len(unique_lts)))
print("# of lab tests excluded: " + str(len(excluded_lts)))

In [None]:
# Write the labelled lab-test table to CleanedData.xlsx. The sheet name is
# deliberately kept identical to the original ('CleanedData.xlsx').
# Using the writer as a context manager saves and closes the workbook even if
# to_excel raises; the explicit `writer.save()` call was removed in pandas 2.0.
# `pd.ExcelWriter` is used so this cell does not rely on an import made in an
# earlier, unrelated cell.
with pd.ExcelWriter('CleanedData.xlsx') as writer:
    lt_df.to_excel(writer, sheet_name='CleanedData.xlsx')