In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import pymongo as pm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import dateutil.parser as prs

In [2]:
# Open a client for the MongoDB server at 127.0.0.1 (default database
# "clintrials"), then run listDatabases against the admin database; the
# command's reply is this cell's output.

conn_str = "mongodb://127.0.0.1/clintrials"

client = pm.MongoClient(host=conn_str)

client.admin.command("listDatabases")

NameError: name 'pm' is not defined

In [3]:
# Convenience handles used by the rest of the notebook:
#   db   -> the "clintrials" database
#   coll -> its "clinical_studies" collection

db = client.clintrials
coll = db.clinical_studies

# Show the collections in the database (the reply is the cell output).
db.command("listCollections")

{'cursor': {'firstBatch': [{'idIndex': {'key': {'_id': 1},
     'name': '_id_',
     'ns': 'clintrials.clinical_studies',
     'v': 2},
    'info': {'readOnly': False,
     'uuid': UUID('7b37d371-e8b4-42a0-a1b7-7e9f056d7d16')},
    'name': 'clinical_studies',
    'options': {},
    'type': 'collection'}],
  'id': 0,
  'ns': 'clintrials.$cmd.listCollections'},
 'ok': 1.0}

In [4]:
# Fetch one document (empty filter, so whichever the server returns first)
# and display it to get a feel for the fields a study record carries.

doc = coll.find_one({})
doc

{'_id': ObjectId('5aba645d97455c1288f91192'),
 'brief_summary': {'textblock': 'This study will test the ability of extended release nifedipine (Procardia XL), a blood\n      pressure medication, to permit a decrease in the dose of glucocorticoid medication children\n      take to treat congenital adrenal hyperplasia (CAH).'},
 'brief_title': 'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets',
 'condition': 'Congenital Adrenal Hyperplasia',
 'condition_browse': {'mesh_term': ['Hyperplasia',
   'Adrenal Hyperplasia, Congenital',
   'Adrenogenital Syndrome',
   'Adrenocortical Hyperfunction']},
 'detailed_description': {'textblock': 'This protocol is designed to assess both acute and chronic effects of the calcium channel\n      antagonist, nifedipine, on the hypothalamic-pituitary-adrenal axis in patients with\n      congenital adrenal hyperplasia. The multicenter trial is composed of two phases and will\n      involve a double-blind, placebo-controlled parallel de

In [1]:
# "overall_status" has no declared enum; list every distinct value stored
# in the collection to see which values are actually in use.

coll.distinct(key="overall_status")

NameError: name 'coll' is not defined

In [6]:
# Basic aggregation example: count studies per "condition" value, order the
# groups by count (largest first) and keep the first ten.
list(
    coll.aggregate(
        [
            {"$group": {"_id": "$condition", "n": {"$sum": 1}}},
            {"$sort": {"n": -1}},
            {"$limit": 10},
        ]
    )
)

[{'_id': 'Healthy', 'n': 5608},
 {'_id': 'Breast Cancer', 'n': 3222},
 {'_id': 'Prostate Cancer', 'n': 1924},
 {'_id': 'Asthma', 'n': 1863},
 {'_id': 'HIV Infections', 'n': 1799},
 {'_id': 'Obesity', 'n': 1617},
 {'_id': 'Hypertension', 'n': 1357},
 {'_id': 'Rheumatoid Arthritis', 'n': 1237},
 {'_id': 'Coronary Artery Disease', 'n': 1171},
 {'_id': 'Schizophrenia', 'n': 1133}]

In [14]:
# DataFrame example: the ten largest per-condition study counts, loaded
# into pandas so they render as a table.
top_conditions_pipeline = [
    {"$group": {"_id": "$condition", "n": {"$sum": 1}}},
    {"$sort": {"n": -1}},
    {"$limit": 10},
]
pd.DataFrame(list(coll.aggregate(top_conditions_pipeline)))

Unnamed: 0,_id,n
0,Healthy,5608
1,Breast Cancer,3222
2,Prostate Cancer,1924
3,Asthma,1863
4,HIV Infections,1799
5,Obesity,1617
6,Hypertension,1357
7,Rheumatoid Arthritis,1237
8,Coronary Artery Disease,1171
9,Schizophrenia,1133


In [7]:
# For the ten conditions with the most studies, collect every study's
# submission date:
#   1. parse the "study_first_submitted" string into a date ("startDate"),
#   2. group by condition, counting studies and pushing each startDate,
#   3. order by count (largest first) and keep the first ten groups.
data = list(
    coll.aggregate(
        [
            {
                "$addFields": {
                    "startDate": {
                        "$dateFromString": {"dateString": "$study_first_submitted"}
                    }
                }
            },
            {
                "$group": {
                    "_id": "$condition",
                    "n": {"$sum": 1},
                    "startDates": {"$push": "$startDate"},
                }
            },
            {"$sort": {"n": -1}},
            {"$limit": 10},
        ]
    )
)

In [8]:
# Histogram bin edges: 100 evenly spaced epoch-second values running from
# 1998-01-01 to 2018-05-01 (i.e. 99 bins).
start, end = (prs.parse(day).timestamp() for day in ('1998-01-01', '2018-05-01'))
bins = np.linspace(start, end, num=100)

In [9]:
# Look at the first submission date of the first group in two forms.
first_start_date = data[0]['startDates'][0]

# Integer `.value` of the pandas Timestamp -- evaluated, but not displayed,
# because only the cell's last expression is shown.
pd.Timestamp(first_start_date).value

# Epoch seconds as a float -- this is the value the cell displays.
first_start_date.timestamp()

941616000.0

In [10]:
# Overlaid histograms of study submission dates, one per condition in `data`
# (the top-ten aggregation), using the epoch-second bin edges in `bins`.
# matplotlib.pyplot is already imported in the notebook's first cell, so it is
# not re-imported here.

SECONDS_PER_DAY = 86400

# Matplotlib date number of the Unix epoch. Adding seconds / 86400 to it turns
# epoch seconds into Matplotlib date numbers; this replaces mdates.epoch2num,
# which was deprecated in Matplotlib 3.5 and removed in later releases.
# Going through date2num keeps the result correct whatever date epoch the
# installed Matplotlib uses.
unix_epoch_num = mdates.date2num(datetime(1970, 1, 1))
bin_edges = unix_epoch_num + np.asarray(bins) / SECONDS_PER_DAY

fig, ax = plt.subplots(1, 1)

for disease in data:
    # Dates go through .timestamp(), the same conversion used to build `bins`,
    # so samples and bin edges share one time axis.
    epoch_seconds = np.array([x.timestamp() for x in disease['startDates']])
    ax.hist(unix_epoch_num + epoch_seconds / SECONDS_PER_DAY,
            bin_edges, alpha=0.5, label=disease['_id'])

# One tick per year, labelled with a two-digit year such as '05.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('\'%y'))

ax.set_ylabel('number of studies started')
ax.set_xlabel('time of submission')

ax.legend(loc='upper right')
plt.show()

<IPython.core.display.Javascript object>

In [12]:
# Histogram of study submission dates for a single condition: the sixth
# group in `data` (index 5), using the epoch-second bin edges in `bins`.
# matplotlib.pyplot is already imported in the notebook's first cell, so it is
# not re-imported here.

SECONDS_PER_DAY = 86400

# Matplotlib date number of the Unix epoch. Adding seconds / 86400 to it turns
# epoch seconds into Matplotlib date numbers; this replaces mdates.epoch2num,
# which was deprecated in Matplotlib 3.5 and removed in later releases.
# Going through date2num keeps the result correct whatever date epoch the
# installed Matplotlib uses.
unix_epoch_num = mdates.date2num(datetime(1970, 1, 1))
bin_edges = unix_epoch_num + np.asarray(bins) / SECONDS_PER_DAY

fig, ax = plt.subplots(1, 1)

disease = data[5]
# Dates go through .timestamp(), the same conversion used to build `bins`,
# so samples and bin edges share one time axis.
epoch_seconds = np.array([x.timestamp() for x in disease['startDates']])
ax.hist(unix_epoch_num + epoch_seconds / SECONDS_PER_DAY,
        bin_edges, alpha=0.5, label=disease['_id'])

# One tick per year, labelled with a two-digit year such as '05.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('\'%y'))

ax.set_ylabel('number of studies started')
ax.set_xlabel('time of submission')

ax.legend(loc='upper right')
plt.show()

<IPython.core.display.Javascript object>