In [1]:
import pymongo, os, re, glob, sys
import pandas as pd
import numpy as np
from collections import OrderedDict, defaultdict
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import nltk
from com.ibm.watson.datasource.clinicaltrials import ClinicalTrials, ClinicalStudy
from com.ibm.watson.datasource.concepts import get_concepts_from_ctm
from com.ibm.watson import get_mongo_client

In [2]:
# Per-user base directories used by the rest of the notebook.
# expanduser("~") returns $HOME when it is set, and still resolves a home
# directory on platforms where HOME is not defined, where os.environ["HOME"]
# would raise KeyError.
HOME_DIR = os.path.expanduser("~")
DESKTOP = os.path.join(HOME_DIR, "Desktop")
DOCUMENTS = os.path.join(HOME_DIR, "Documents")

In [3]:
# Connection settings for the MongoDB instance queried by this notebook.
MONGO_HOST = "localhost"
MONGO_PORT = 27017

# `db` is the "WATSON" database handle that later cells index by collection name.
client = get_mongo_client(MONGO_HOST, MONGO_PORT)
db = client["WATSON"]

In [4]:
# Ask the ClinicalTrials data source how many results the bone-marrow search
# returns. The value is stored in `count`; nothing is displayed by this cell.
ct = ClinicalTrials()
search_term = "cancer AND \"bone marrow\""
count = ct.get_search_results_count(query=None, term=search_term, type='Intr')

In [3]:
# Directory holding the downloaded trial XML files, relative to DOCUMENTS.
_TRIALS_SUBPATH = ("Data", "CTGOV", "bone_marrow")
trials_dir = os.path.join(DOCUMENTS, *_TRIALS_SUBPATH)

# Matches the literal section labels "PURPOSE: " and "RATIONALE: ".
purpose_rationale_regex = re.compile("PURPOSE: |RATIONALE: ")
# Matches a newline followed by two or more spaces (a wrapped, indented line).
line_break_regex = re.compile("(\n[ ]{2,})")

def replace_specific_drug_names(text):
    """Replace the drug mentions found in ``text`` with the generic token "drug".

    ``text`` is passed to ``get_concepts_from_ctm``. Every returned concept
    whose ``semanticType`` is "phsu" is treated as a drug mention, and each
    occurrence of its ``coveredText`` in ``text`` is replaced by "drug".

    Args:
        text: the string to annotate and rewrite.

    Returns:
        The rewritten string, or ``text`` unchanged when the first
        "unstructured" entry of the response has no "data" key or contains no
        non-empty "phsu" mention.
    """
    response = get_concepts_from_ctm(text)
    annotations = response["unstructured"][0]
    if "data" not in annotations:
        return text

    # Distinct, non-empty mention strings. An empty alternative would match at
    # every position and scatter "drug" through the whole text.
    mentions = set(
        concept["coveredText"]
        for concept in annotations["data"]["concepts"]
        if concept["semanticType"] == "phsu" and concept["coveredText"]
    )
    if not mentions:
        return text

    # Regex alternation takes the first alternative that matches, so longer
    # mentions go first; otherwise a mention that is a prefix of another would
    # leave the tail of the longer one behind. The secondary sort key only
    # makes the pattern deterministic.
    ordered = sorted(mentions, key=lambda mention: (-len(mention), mention))

    # Mentions are literal text, not patterns: escape them so characters such
    # as "(", "+" or "." neither raise re.error nor match unintended text.
    pattern = re.compile("|".join(re.escape(mention) for mention in ordered))
    return pattern.sub("drug", text)

#text = "All patients entered will receive induction therapy with cyclophosphamide, adriamycin, methotrexate and 5-fluorouracil with hormonal synchronization utilizing tamoxifen and premarin as in a previous Medicine Branch protocol (MB-160C)"
#replace_specific_drug_names(text)

In [4]:
#bone_marrow_trial_summaries = open(os.path.join(trials_dir, "bone_marrow_trial_summaries.tsv"), "wb")
# Patterns used to locate the key terms inside a sentence. Only the start
# offset of each match is used below.
# NOTE(review): "(clinical)?" has no trailing space, so it cannot extend a
# match over "clinical trial", and "study" also matches inside "studying".
# Left as-is so the reported offsets do not change -- confirm the intent.
bone_marrow_regex = re.compile("marrow( transplant|transplantation)?", re.IGNORECASE)
study_regex = re.compile("(clinical)?(study|trial)", re.IGNORECASE)

# File names (not full paths) of the trial documents. They are sorted because
# the directory listing order is unspecified, and without a fixed order the
# [400:450] window below would not select the same files on every run.
trial_files = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(trials_dir, "*.xml"))
)

# For each trial in the window, print the first summary sentence that mentions
# a study/trial, bone marrow, and the token "drug", with the offset of each.
for f in trial_files[400:450]:
    cs = ClinicalStudy()
    print(f)
    try:
        cs.load_from_file(os.path.join(trials_dir, f))
        summary = cs.get_summary()
        summary = line_break_regex.sub(" ", summary)
        summary = purpose_rationale_regex.sub("", summary)
        summary = replace_specific_drug_names(summary)
        for j, sent in enumerate(nltk.sent_tokenize(summary)):
            study_match = study_regex.search(sent)
            marrow_match = bone_marrow_regex.search(sent)
            index_of_drug = sent.find("drug")
            if study_match and marrow_match and index_of_drug > -1:
                print(sent)
                print("\t%d: Study: %d, Bone Marrow: %d, Drug: %d" % (
                    j, study_match.start(), marrow_match.start(), index_of_drug))
                break
        #bone_marrow_trial_summaries.write("\t".join([cs.get_trial_id(), summary, "\n"]))
    except Exception as exc:
        # Report by file name: the study object may not have loaded, so asking
        # it for its trial id here could raise again and abort the whole loop.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working.
        print("Exception in %s: %s: %s" % (f, type(exc).__name__, exc))
        
#bone_marrow_trial_summaries.close()

NCT00429143.xml
NCT00438958.xml
This randomized phase III trial is studying drug-mobilized sibling donor peripheral stem cell transplant to see how well it works compared with sibling donor bone marrow transplant in treating patients with hematologic cancers or drug diseases.
	7: Study: 26, Bone Marrow: 163, Drug: 44
NCT00445731.xml
This randomized clinical trial is studying a structured walking drug to see how well it works compared with standard therapy in cancer patients undergoing a donor bone marrow transplant.
	1: Study: 25, Bone Marrow: 168, Drug: 64
NCT00446550.xml
NCT00449592.xml
NCT00450450.xml
This randomized phase III trial is studying donor bone marrow transplant with or without drug to compare how well they work in treating young patients with hematologic cancer or drug diseases.
	0: Study: 26, Bone Marrow: 55, Drug: 89
NCT00453206.xml
NCT00453388.xml
This phase II trial studies how well total-body irradiation (TBI) works when given together with drug and drug followed by

In [8]:
# Load a single trial document on its own and show its summary.
# print(...) with one argument produces the same output on Python 2 and is
# also valid on Python 3.
cs = ClinicalStudy()
trial_path = os.path.join(trials_dir, "NCT00446550.xml")
cs.load_from_file(trial_path)
print(cs.get_summary())

StructuralBadDocumentError: <class 'com.ibm.watson.schema.ct_gov.CTD_ANON_17'> cannot accept wildcard content <pyxb.utils.saxdom.Element object at 0x10f89aa50>

In [6]:
# Pull (NCT_ID, TITLE, SUMMARY) triples from the CTGOV collection for the
# documents matched by the metadata.TAGS filter.
# NOTE(review): {"$gt": []} is presumably meant as "TAGS is a non-empty
# array" -- confirm against the collection's schema.
tagged_trials = db["CTGOV"].find({"metadata.TAGS" :  {"$gt" : []}})
text = [ (x["NCT_ID"], x["TITLE"], x["SUMMARY"]) for x in tagged_trials ]
print(len(text))

7


In [48]:
# Bag-of-words experiment: fit one vocabulary on the trial titles and another
# on the summaries, then encode two hand-written examples with each and stack
# the two encodings side by side.
print(text[0][1])
titles = [ x[1] for x in text ]
summaries = [ x[2] for x in text ]
vectorizer = CountVectorizer(min_df=1)
vectorizer2 = CountVectorizer(min_df=1)

# Only the fitted vocabularies are needed here; the transformed training
# matrices were being computed and thrown away.
vectorizer.fit(titles)
vectorizer2.fit(summaries)
analyze = vectorizer.build_analyzer()
feature_names = vectorizer.get_feature_names()

new_titles = ["Study to evaluate Pharmacokinetics safety", "Another drug study of Trametinib"]
new_summaries = ["Pharmacokinetics Evaluate Safety for Oral Doses of Dabrafenib", "Oral doses of Trametinib should be limited"]
titles_array = vectorizer.transform(new_titles)
summaries_array = vectorizer2.transform(new_summaries)
print(titles_array.shape)
print(summaries_array.shape)

# Title columns come first, so a title-vocabulary index is also the column
# index of that term in `combined`.
combined = hstack([titles_array, summaries_array])

# Column index of each probe word in the title vocabulary (None if absent).
for word in ('pharmacokinetics', 'evaluate', 'safety', 'trametinib'):
    print(vectorizer.vocabulary_.get(word))

A Phase I Study to Evaluate the Pharmacokinetics and Safety of Repeat Oral Doses of Dabrafenib and the Combination of Dabrafenib With Trametinib in Chinese Subjects With Melanoma
(2, 103)
(2, 224)
72
35
85
96


In [51]:
# Show the per-document counts of the title term "trametinib".
# The column is looked up in the title vocabulary instead of hard-coding its
# index, which changes whenever the fitted corpus changes. Title columns are
# the leading columns of `combined`, so the vocabulary index applies directly.
# A missing term now raises KeyError rather than silently reading some other
# column.
trametinib_col = vectorizer.vocabulary_['trametinib']
print(combined.toarray()[:, trametinib_col])

[0 1]


In [25]:
# Small check of np.append along axis=1: a (2, 1) column is appended to the
# right of a (2, 2) array, giving a (2, 3) result.
# print(...) with one argument produces the same output on Python 2 and is
# also valid on Python 3.
array1 = np.array([[1, 2], [3, 4]])
array2 = np.array([[5], [6]])
print(array1)
print(array2)
array3 = np.append(array1, array2, axis=1)
print(array3)

[[1 2]
 [3 4]]
[[5]
 [6]]
[[1 2 5]
 [3 4 6]]
