In [1]:
from pymongo import MongoClient
import pymongo
import pandas as pd
import numpy as np
import scipy as sp
from IPython.parallel import Client
import sys
import numbapro
import numba
from sklearn import svm, metrics

In [2]:
client = MongoClient()
db = client.VentDyssynchrony_db
test_data = db.TestData_collection
patient_data = db.PatientData_collection

In [3]:
unpack_col = []
def unpack(breath, unpack_col = unpack_col):
    for col in unpack_col:      
        key_names = set()
        keys = set()
        
        try:
            keys = breath[col].apply(lambda x: set(x.keys()) if isinstance(x, dict) else set())
        except:
            print(sys.exc_info()[0])
            
        
        try:
            for items in keys:
                key_names.update(items)
        except:
            print(sys.exc_info()[0])
            print(keys)

        try:
            for key in key_names:
                breath[col +':'+ key] = breath[col].apply(lambda x: x[key] if (isinstance(x, dict) and (key in x)) else np.nan)
            breath.drop(col, inplace=True, axis=1)
        except:
            print(sys.exc_info()[0])
            print('last', key, key_names)
    
    return breath


In [4]:
test_data.find_one()

{'_id': 'P114\\WF1773\\2015-01-27 15:57:09',
 'analysis': {'couplet': True,
  'ds': False,
  'ds_mid': True,
  'ds_sens': True,
  'ds_spec': False,
  'fl': False,
  'ie': False,
  'pds': True,
  'pl': True,
  'pvt': True},
 'breath_number': 1773,
 'characteristics': {'breath_time': 0.8959999999997308,
  'ds_vol': nan,
  'end_insp_vol': 15.0,
  'exp_time': 0.28799999999955617,
  'flow_max': [[9.0, 23.0]],
  'flow_min': [[2.0, 19.0]],
  'insp_time': 0.6080000000001746,
  'max_flow': "[[31.800000000000001, '0 days 01:12:21.680000', 3], [-0.75, '0 days 01:12:22.032000', 14], [-1.5, '0 days 01:12:22.160000', 18]]",
  'max_pressure': "[[15.199999999999999, '0 days 01:12:22.224000', 20]]",
  'max_vol': 115.0,
  'min_exp_flow': 31.8,
  'min_flow': "[[-3.75, '0 days 01:12:21.904000', 10], [-29.100000000000001, '0 days 01:12:22.288000', 22]]",
  'min_pressure': "[[2.6000000000000001, '0 days 01:12:21.648000', 2], [13.800000000000001, '0 days 01:12:22.064000', 15], [9.3000000000000007, '0 days 01

In [11]:
breath_df = pd.DataFrame(list(test_data.find({'validation.raw':{'$exists':1}})))
breath_df = unpack(breath_df, unpack_col = ['characteristics', 'vent_settings', 'data_frame', 'analysis', 'validation'])

headings = ['ds', 'pl', 'fl', 'ie', 'pvt' , 'dvt', 'normal']
for items in headings:
    breath_df['validation:'+ items] = breath_df['validation:raw'].apply(lambda x: True if (items in x) else False)

In [12]:
breath_df.drop(['analysis:ds_mid', 'analysis:ds_sens', 'analysis:ds_spec', 'characteristics:max_flow', 'characteristics:max_pressure', 'characteristics:min_flow', 'characteristics:min_pressure'], axis=1, inplace=True)

In [13]:
breath_df.columns

Index(['_id', 'breath_number', 'characteristics:ds_vol', 'end_time',
       'file_name', 'location', 'patientID', 'start_time',
       'characteristics:exp_time', 'characteristics:max_vol',
       'characteristics:breath_time', 'characteristics:ie_max',
       'characteristics:flow_max', 'characteristics:paw_min',
       'characteristics:min_exp_flow', 'characteristics:paw_max',
       'characteristics:insp_time', 'characteristics:pvt_max',
       'characteristics:end_insp_vol', 'characteristics:pvt_min',
       'characteristics:flow_min', 'characteristics:peak_pressure',
       'characteristics:min_vol', 'characteristics:peak_insp_flow',
       'characteristics:ie_min', 'vent_settings:ramp', 'vent_settings:ie',
       'vent_settings:leak', 'vent_settings:PEEP', 'vent_settings:p_mean',
       'vent_settings:set_VT', 'vent_settings:reference_doc',
       'vent_settings:file_name', 'vent_settings:distance',
       'vent_settings:load_errors', 'vent_settings:p_peak', 'vent_settings:ti',
 

In [61]:
train = breath_df[['characteristics:peak_insp_flow','characteristics:min_exp_flow', 'characteristics:min_vol', 
                  'characteristics:insp_time', 'characteristics:exp_time', 'characteristics:end_insp_vol', 
                  'characteristics:breath_time', 'validation:ds']].dropna(axis=0, how='any')
target = train['validation:ds'].values
train = train.drop(['validation:ds'], axis=1).values


In [62]:
target.shape

(1801,)

In [63]:
train.shape

(1801, 7)

In [64]:
clf = svm.SVC(probability=True)

In [68]:
clf.fit(X=train, y=target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [69]:
clf.decision_function(train)

array([-0.99984471,  0.22764524, -1.00001819, ..., -1.00033798,
       -1.00005994, -1.03552242])

In [70]:
clf.score(train, target)

1.0

In [71]:
prob_est = clf.predict_proba(train)

In [73]:
clf.classes_

array([False,  True], dtype=bool)

In [74]:
results = clf.predict(train)

In [75]:
results

array([False,  True, False, ..., False, False, False], dtype=bool)

In [28]:
df = pd.DataFrame(data={'results': results, 'train':train})

In [29]:
df['delta'] = df.apply(lambda x: False if x.results != x.train else True, axis=1)

In [30]:
df.head()

Unnamed: 0,results,train,delta
0,False,False,True
1,True,True,True
2,False,False,True
3,False,False,True
4,False,False,True


In [76]:
df['train'].value_counts()

False    1589
True      212
dtype: int64

In [56]:
metrics.roc_auc_score(train, prob_est[:,1])

1.0

In [57]:
metrics.precision_score(train, results)

1.0

In [58]:
metrics.recall_score(train, results)

1.0

In [60]:
print(metrics.classification_report(train, results))

             precision    recall  f1-score   support

      False       1.00      1.00      1.00      1589
       True       1.00      1.00      1.00       212

avg / total       1.00      1.00      1.00      1801

