In [207]:
import json
import re
import spacy
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import os

# Pattern applied to spaCy token shapes (`token.shape_`). spaCy writes a digit
# as the letter "d" in a shape string, so a match means the token has a digit.
find_d = re.compile(r'd')
nlp = spacy.load('en_core_web_sm')
# Location of the JSON export. Set the CT_DATA_SOURCE environment variable to
# point at another copy of the file; when it is unset or empty, the original
# local path is used so existing runs behave exactly as before.
json_data_source = (
    os.environ.get("CT_DATA_SOURCE")
    or "/Users/rthombley/Downloads/__data_export_new.json"
)

Build data structure from JSON

In [2]:
# Read the export as raw bytes and parse it into `jdata`.
with open(json_data_source, 'rb') as export_file:
    jdata = json.loads(export_file.read())

In [52]:
elements_of_interest = ['conditions','description_html', 'eligibility_by_age_open_to_18_and_over','eligibility_by_age_open_under_18',
                       'eligibility_by_sex_all','eligibility_by_sex_female','eligibility_by_sex_male',
                       'eligibility_exclusion_html','eligibility_inclusion_html','eligibility_healthy_volunteers',
                       'eligibility_tags','is_joinable','is_visible','keywords','summary','title_brief','title_official']

# For every top-level key of jdata, keep only the fields named in
# elements_of_interest (fields absent from a record are simply skipped).
analytic_data = {
    nct: {
        field: value
        for field, value in record.items()
        if field in elements_of_interest
    }
    for nct, record in jdata.items()
}

Build pre-processing function

In [47]:
import re
import html
# Any tag-like span: "<", the shortest possible run of characters, then ">".
find_html = re.compile(r'<.*?>')
# Two or more consecutive whitespace characters.
# (r'\s{2,+}' is not a valid quantifier: Python treats "{2," literally, so
# that pattern never matched ordinary text and nothing was collapsed.)
find_spaces = re.compile(r'\s{2,}')


def removeHTML(raw_html):
    """Return `raw_html` as plain text.

    Steps, in order: drop tag-like spans, turn newlines into spaces, decode
    HTML character references (e.g. ``&amp;``), collapse runs of whitespace
    into a single space, and trim both ends.

    Parameters
    ----------
    raw_html : str
        Text that may contain HTML tags and character references.

    Returns
    -------
    str
        The cleaned text.
    """
    cleantext = find_html.sub('', raw_html)
    # Replace line breaks with a space instead of deleting them, so words on
    # adjacent lines are not glued together; extra spaces are collapsed below.
    cleantext = cleantext.replace('\n', ' ')
    cleantext = html.unescape(cleantext)
    cleantext = find_spaces.sub(' ', cleantext)
    return cleantext.strip()


# Test it out:
html_test = "<html><head></head><h2>Hello\n world!</h2>"
print(removeHTML(html_test))

Hello world!


In [53]:
elements_with_html = ['description_html','eligibility_exclusion_html','eligibility_inclusion_html']

# Overwrite each HTML-bearing field of every trial record with its cleaned text.
for trial_record in analytic_data.values():
    for element in elements_with_html:
        trial_record[element] = removeHTML(trial_record[element])

The sklearn tools for performing TF-IDF require full document inputs (i.e., all text data at once) rather than pre-parsed data. We may decide that for other models, having the tokens processed will help us out, so it's nice to have our analytic_data data structure built to easily accommodate changes in our models (i.e., to have both processed sentences/tokens and full documents available for use).

Output an array of trial keys, condition and cluster. Here, our goal is to generate a list of (trial, condition) mapped to a specific cluster.

In [163]:
# Flatten every trial's conditions into (CLUSTER, CONDITION NAME, trial key)
# rows, with the cluster and the condition name upper-cased.
output = []
for trial_key, trial_info in analytic_data.items():
    for condition in trial_info['conditions']:
        for cluster in condition['clusters']:
            row = (cluster.upper(), condition['name'].upper(), trial_key)
            # One copy of the row per entry in condition['raw'], or a single
            # copy when 'raw' is empty. The entries themselves are not used,
            # only their count.
            copies = max(len(condition['raw']), 1)
            output.extend([row] * copies)

# Preview the first rows. Slicing rather than indexing 0..9 avoids an
# IndexError when fewer than ten rows were produced.
for row in output[:10]:
    print(row)

('INFECTIOUS DISEASES', 'HIV/AIDS', 'NCT01266616')
('MENTAL HEALTH', 'ATTENTION-DEFICIT/HYPERACTIVITY DISORDER', 'NCT01686724')
('OTHER', 'SHOULDER PAIN', 'NCT01332760')
('BRAIN AND NERVES', "PARKINSON'S DISEASE", 'NCT01022073')
('OTHER', 'ROBOTIC SURGERY', 'NCT02989415')
('OTHER', 'MECHANICAL VENTILATION', 'NCT02989415')
('OTHER', 'ANESTHESIA', 'NCT02989415')
('CANCER', 'MULTIPLE MYELOMA', 'NCT02728102')
('CANCER', 'PROSTATE CANCER', 'NCT02705469')
('BRAIN AND NERVES', 'AMYOTROPIC LATERAL SCLEROSIS (ALS)', 'NCT00243932')


Now - split out the "other" data from the training data.

In [165]:
# Show how the first ten rows map (trial + condition) to a cluster.
for row in output[:10]:
    print("({} + {}) => {}".format(row[2], row[1], row[0]))

# Rows whose cluster is 'OTHER' are set aside; every other row is training data.
# Using sets also drops any repeated rows.
others = {row for row in output if row[0] == 'OTHER'}
train = {row for row in output if row[0] != 'OTHER'}
    

(NCT01266616 + HIV/AIDS) => INFECTIOUS DISEASES
(NCT01686724 + ATTENTION-DEFICIT/HYPERACTIVITY DISORDER) => MENTAL HEALTH
(NCT01332760 + SHOULDER PAIN) => OTHER
(NCT01022073 + PARKINSON'S DISEASE) => BRAIN AND NERVES
(NCT02989415 + ROBOTIC SURGERY) => OTHER
(NCT02989415 + MECHANICAL VENTILATION) => OTHER
(NCT02989415 + ANESTHESIA) => OTHER
(NCT02728102 + MULTIPLE MYELOMA) => CANCER
(NCT02705469 + PROSTATE CANCER) => CANCER
(NCT00243932 + AMYOTROPIC LATERAL SCLEROSIS (ALS)) => BRAIN AND NERVES


And now we'll actually add in the data and build the "other_data" data structure

In [173]:
# Fields whose text is joined into a single document for each row.
tokens = ('title_brief', 'summary', 'eligibility_inclusion_html', 'description_html')

# Each entry is [trial key, "OTHER", condition, joined trial text].
other_data = [
    [
        row[2],
        "OTHER",
        row[1],
        '. '.join([analytic_data[row[2]][field] for field in tokens]),
    ]
    for row in others
]

Now we build the train_data data structure.

In [174]:
# Rebinds find_d to the same pattern the notebook's first cell defines.
find_d = re.compile(r'd')
# Fields whose text is joined into a single document for each row.
tokens = ('title_brief', 'summary', 'eligibility_inclusion_html', 'description_html')

# Each entry is [trial key, cluster, condition, joined trial text].
train_data = []
for row in train:
    trial_text = '. '.join([analytic_data[row[2]][field] for field in tokens])
    train_data.append([row[2], row[0], row[1], trial_text])

Our intermediate datasets (one training and one 'other') now contain our trial key, cluster, condition and relevant trial text.

And now, we will build a numeric dataset, first using the sklearn CountVectorizer (which we will tweak a little bit to do tokenization our way).
CountVectorizer builds a bag-of-words model (word counts in a document-term matrix).

In [188]:
# set everything to lowercase
def my_preprocessor(doc):
    """Return `doc` converted to lowercase."""
    lowered = doc.lower()
    return lowered

# tokenize the doc and build our custom token list
def my_tokenizer(doc):
    """Run `doc` through the spaCy pipeline and return the lemmas of the kept tokens.

    A token is dropped when its text has three characters or fewer, when it
    is a stop word, or when `find_d` matches its shape string (spaCy writes a
    digit as "d" in a shape). The lemma of every remaining token is returned,
    in document order.
    """
    return [
        token.lemma_
        for token in nlp(doc)
        if not (
            len(token.text) <= 3
            or token.is_stop
            or find_d.search(token.shape_)
        )
    ]

# create a Pandas dataframe from a word matrix
def wm2df(wm, feat_names):
    """Convert a document-term matrix into a labelled DataFrame.

    Parameters
    ----------
    wm
        Matrix with one row per document that provides ``.toarray()``.
    feat_names
        Column labels, one per matrix column.

    Returns
    -------
    pandas.DataFrame
        Dense copy of `wm` with rows labelled 'Doc0', 'Doc1', ... and columns
        labelled by `feat_names`.
    """
    # one label per matrix row
    n_docs = wm.shape[0]
    row_labels = list(map('Doc{:d}'.format, range(n_docs)))
    return pd.DataFrame(wm.toarray(), index=row_labels, columns=feat_names)

# Test using the same data as we used for our worksheet sample.
a = "To determine the toxicity and response to treatment with cytotoxic chemotherapy using doxorubicin (Adriamycin), bleomycin, and vincristine (DBV) for advanced AIDS-related Kaposi's sarcoma in combination with either didanosine (ddI) or zalcitabine (dideoxycytidine; ddC)."
b = "To define the toxicity and maximum-tolerated dose of weekly oral etoposide (VP-16) in patients with AIDS-related Kaposi's sarcoma; to determine the clinical pharmacology of orally administered VP-16 in AIDS patients."
# One document per element of the list
corpora = [a, b]

# Initialize the vectorizer
custom_vec = CountVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer)

# Fit the "model" to our documents
cwm = custom_vec.fit_transform(corpora)

# Extract the tokens used. scikit-learn 1.0 added get_feature_names_out() and
# 1.2 removed get_feature_names(), so prefer the new method and fall back to
# the old one on older installs. Either way `tokens` ends up a plain list.
if hasattr(custom_vec, 'get_feature_names_out'):
    tokens = list(custom_vec.get_feature_names_out())
else:
    tokens = custom_vec.get_feature_names()

# cwm is a sparse matrix (wm2df calls .toarray() on it), so it is hard to
# visualize directly. We use the helper function defined above to put it into
# a Pandas data frame

wm2df(cwm, tokens)


Unnamed: 0,administer,adriamycin,advanced,aid,bleomycin,chemotherapy,clinical,combination,cytotoxic,define,...,pharmacology,relate,response,sarcoma,tolerated,toxicity,treatment,vincristine,weekly,zalcitabine
Doc0,0,1,1,1,1,1,0,1,1,0,...,0,1,1,1,0,1,1,1,0,1
Doc1,1,0,0,2,0,0,1,0,0,1,...,1,1,0,1,1,1,0,0,1,0


The TfidfVectorizer from sklearn works similarly, but it creates a TF-IDF matrix instead. A few comments:
- norm = 'l1' tells the vectorizer to normalize our vectors (make it so the absolute values of the components of each vector sum to 1).
- smooth_idf tells the vectorizer to add 1 to every document frequency, as if one extra document containing every term had been seen, so we never get a divide by 0 when computing the IDF. Separately, sklearn adds 1 to each IDF so that a word appearing in all documents (whose log term is 0) is not ignored entirely.

In [211]:
# Initialize the vectorizer - with a few new options, described above.
vectorizer = TfidfVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer, norm='l1', smooth_idf=True)

# Fit the model
cwm = vectorizer.fit_transform(corpora)

# Extract our tokens. scikit-learn 1.0 added get_feature_names_out() and 1.2
# removed get_feature_names(), so prefer the new method and fall back to the
# old one on older installs. Either way `tokens` ends up a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    tokens = list(vectorizer.get_feature_names_out())
else:
    tokens = vectorizer.get_feature_names()

# Make it pretty
wm2df(cwm, tokens)

Unnamed: 0,administer,adriamycin,advanced,aid,bleomycin,chemotherapy,clinical,combination,cytotoxic,define,...,pharmacology,relate,response,sarcoma,tolerated,toxicity,treatment,vincristine,weekly,zalcitabine
Doc0,0.0,0.057907,0.057907,0.041201,0.057907,0.057907,0.0,0.057907,0.057907,0.0,...,0.0,0.041201,0.057907,0.041201,0.0,0.041201,0.057907,0.057907,0.0,0.057907
Doc1,0.055616,0.0,0.0,0.079142,0.0,0.0,0.055616,0.0,0.0,0.055616,...,0.055616,0.039571,0.0,0.039571,0.055616,0.039571,0.0,0.0,0.055616,0.0


In [210]:
# Distinct condition names: the third field of every train_data row.
x = {row[2] for row in train_data}

x

{'1P36 DELETION SYNDROME',
 '22Q11 DELETION SYNDROME',
 'ABNORMAL GLUCOSE TOLERANCE',
 'ABORTION',
 'ACUTE CORONARY SYNDROME',
 'ACUTE ERYTHROLEUKEMIA (M6)',
 'ACUTE INJURY OF ANTERIOR CRUCIATE LIGAMENT',
 'ACUTE KIDNEY INJURY',
 'ACUTE LIVER INJURY',
 'ACUTE LYMPHOBLASTIC LEUKEMIA',
 'ACUTE MYELOID LEUKEMIA',
 'ACUTE MYOCARDIAL INFARCTION',
 'ACUTE RENAL FAILURE',
 'ADA DEFICIENCY',
 'ADENOCARCINOMA',
 'ADENOCARCINOMA OF THE GASTROESOPHAGEAL JUNCTION',
 'ADENOCARCINOMA OF THE PANCREAS',
 'ADENOVIRUS',
 'AICARDI SYNDROME',
 'AIDS-RELATED MALIGNANCIES',
 'ALAGILLE SYNDROME',
 'ALLOGENEIC, HEMATOPOIETIC CELL TRANSPLANT (HCT)',
 'ALOPECIA AREATA',
 'ALOPECIA PARTIALIS',
 'ALOPECIA TOTALIS',
 'ALOPECIA UNIVERSALIS',
 'ALPHA 1-ANTITRYPSIN DEFICIENCY',
 'ALPHA-1 ANTITRYPSIN DEFICIENCY',
 "ALZHEIMER'S DISEASE",
 'AMBLYOPIA',
 'AMYOTROPIC LATERAL SCLEROSIS (ALS)',
 'ANAL INTRAEPITHELIAL NEOPLASIA',
 'ANAPLASTIC OLIGOASTROCYTOMA',
 'ANAPLASTIC OLIGODENDROGLIOMA',
 'ANEMIA',
 'ANEURYSM',
 'ANGEL