In [1]:
%load_ext autoreload
%autoreload 2

In [80]:
import datetime
import os
from typing import List, Tuple
import pickle

import numpy as np
from sklearn import metrics

import piton
import piton.datasets

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgbm
from typing import Set

In [49]:
PATH_TO_PITON_DB = '/share/pi/nigam/data/som-rit-phi-starr-prod.starr_omop_cdm5_deid_2022_09_05_extract2'
PATH_TO_SAVE_MATRIX = "/share/pi/nigam/rthapa84/data"
LABELED_PATIENTS = "mortality_labeled_patients_v1.pickle"
FEATURIZED_DATA = "mortality_featurized_patients_v1.pickle"
SEED = 97

In [50]:
# Patient database
database = piton.datasets.PatientDatabase(PATH_TO_PITON_DB)

with open(os.path.join(PATH_TO_SAVE_MATRIX, FEATURIZED_DATA), 'rb') as f:
    featurized_data = pickle.load(f)

print("Data loaded")

Data loaded


In [78]:
# for code in database.get_code_dictionary():
#     code = bytes(code).decode("utf-8")
    
#     if "ICD10PCS/E08" in code:
#         print(code)

In [81]:
database = piton.datasets.PatientDatabase(PATH_TO_PITON_DB)
ontology = database.get_ontology()
diabetes_codes: Set[Tuple[str, int]] = set()
DIABETES_CODE = "SNOMED/44054006"

for code, code_str in enumerate(ontology.get_dictionary()):
    code_str = bytes(code_str).decode("utf-8")
    if code_str == DIABETES_CODE:
        diabetes_codes.add((code_str, code))

In [126]:
for patient in database:
    for event in patient.events:
        if type(event.value) != memoryview:
            print(event.value)
        # else:
        #     print(event.value)
        # print(type(event.value))
    break

None
None
None
None
None
None
None
None


In [117]:
database[3].events

(Event(start=1994-05-08 00:00:00, code=289),
 Event(start=1994-05-08 23:59:00, code=452),
 Event(start=1994-05-08 23:59:00, code=1380),
 Event(start=2017-03-26 13:43:00, code=1970),
 Event(start=2017-03-26 23:59:00, code=14),
 Event(start=2017-03-26 23:59:00, code=2979),
 Event(start=2017-03-26 23:59:00, code=9432),
 Event(start=2017-03-26 23:59:00, code=2028),
 Event(start=2017-04-04 11:57:00, code=1970),
 Event(start=2017-04-04 23:59:00, code=14),
 Event(start=2017-04-04 23:59:00, code=32),
 Event(start=2017-04-04 23:59:00, code=148),
 Event(start=2017-04-04 23:59:00, code=2979),
 Event(start=2017-04-04 23:59:00, code=9432),
 Event(start=2017-04-04 23:59:00, code=2028),
 Event(start=2017-04-08 23:59:00, code=32),
 Event(start=2017-04-08 23:59:00, code=7000),
 Event(start=2017-04-08 23:59:00, code=8088),
 Event(start=2017-04-08 23:59:00, code=11057),
 Event(start=2017-04-08 23:59:00, code=3770),
 Event(start=2017-04-08 23:59:00, code=4554),
 Event(start=2017-04-08 23:59:00, code=14838

In [87]:
list(diabetes_codes)[0][1]

1233

In [95]:
for item in ontology.get_children(1233):
    print(item)

680
24716
41731
46277
72965
105903


In [105]:
ontology.get_dictionary().index("LOINC/4548-4")

408

In [101]:
database.get_code_dictionary().index("LOINC/4548-4")

408

In [103]:
for item in ontology.get_children(408):
    print(item)

In [106]:
for item in ontology.get_all_parents(408):
    print(item)

408
101890
102046
102047
152118
152190


In [102]:
database.get_code_count(408)

1643698

In [108]:
bytes(database.get_code_dictionary()[102046]).decode("utf-8")

'LOINC/LG51070-7'

In [82]:
diabetes_codes

{('SNOMED/44054006', 1233)}

In [51]:
# with open(os.path.join(PATH_TO_SAVE_MATRIX, LABELED_PATIENTS), 'rb') as f:
#     labeled_patients = pickle.load(f)

# print(len(labeled_patients))

In [52]:
feature_matrix, labels, patient_ids = featurized_data[0], featurized_data[1], featurized_data[2]

In [53]:
labels.shape

(728350,)

In [54]:
feature_matrix.shape

(728350, 84869)

In [55]:
labels.sum()/len(labels)

0.12612892153497632

In [56]:
feature_matrix.shape

(728350, 84869)

In [57]:
hashed_pids = np.array([database.compute_split(SEED, pid) for pid in patient_ids])

In [58]:
train_pids_idx = np.where((hashed_pids < 70))[0]
dev_pids_idx = np.where(((hashed_pids >= 70) & (hashed_pids < 85)))[0]

In [59]:
train_pids_idx.shape, dev_pids_idx.shape

((511110,), (108492,))

In [60]:
X_train = feature_matrix[train_pids_idx]
y_train = labels[train_pids_idx]
X_test = feature_matrix[dev_pids_idx]
y_test = labels[dev_pids_idx]

In [61]:
print("Training Size:", X_train.shape)
print("Testing Size:", X_test.shape)

Training Size: (511110, 84869)
Testing Size: (108492, 84869)


In [62]:
print("Prevalence on total dataset:", round(sum(labels)/len(labels), 4))
print("Prevalence on training dataset:", round(sum(y_train)/len(y_train), 4))
print("Prevalence on testing dataset:", round(sum(y_test)/len(y_test), 4))

Prevalence on total dataset: 0.1261
Prevalence on training dataset: 0.1263
Prevalence on testing dataset: 0.1273


In [63]:
# Logistic Regresion
model = LogisticRegression().fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auroc = metrics.roc_auc_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7692621642098408 0.5005856835622114


In [64]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auroc = metrics.roc_auc_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9146576911645753 0.6700984332975418


In [65]:
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auroc = metrics.roc_auc_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9241498525167235 0.6974471333309197


On Training Set

In [66]:
# Logistic Regresion
model = LogisticRegression().fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_train)[::,1]
auroc = metrics.roc_auc_score(y_train, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8422722542474823 0.6371506462510677


In [67]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_train)[::,1]
auroc = metrics.roc_auc_score(y_train, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9830657658257993 0.9387466993513065


In [68]:
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_train)[::,1]
auroc = metrics.roc_auc_score(y_train, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9693857946319062 0.8839694913511226
