In [1]:
%load_ext autoreload
%autoreload 2

In [33]:
import datetime
import os
from typing import List, Tuple
import pickle

import numpy as np
from sklearn import metrics

import piton
import piton.datasets

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgbm

In [7]:
PATH_TO_PITON_DB = '/share/pi/nigam/data/som-rit-phi-starr-prod.starr_omop_cdm5_deid_2022_09_05_extract2'
PATH_TO_SAVE_MATRIX = "/share/pi/nigam/rthapa84/data"
LABELED_PATIENTS = "diabetes_labeled_patients_v5.pickle"
FEATURIZED_DATA = "diabetes_featurized_patients_v5.pickle"
SEED = 97

In [8]:
# Patient database
database = piton.datasets.PatientDatabase(PATH_TO_PITON_DB)

with open(os.path.join(PATH_TO_SAVE_MATRIX, FEATURIZED_DATA), 'rb') as f:
    featurized_data = pickle.load(f)

print("Data loaded")

Data loaded


In [45]:
with open(os.path.join(PATH_TO_SAVE_MATRIX, LABELED_PATIENTS), 'rb') as f:
    labeled_patients = pickle.load(f)

print(len(labeled_patients))

236818


KeysView(<piton.labelers.core.LabeledPatients object at 0x7fd3437271f0>)

In [9]:
feature_matrix, labels, patient_ids = featurized_data[0], featurized_data[1], featurized_data[2]

In [43]:
labels.shape

(713145,)

In [44]:
feature_matrix.shape

(713145, 84741)

In [38]:
labels.sum()/len(labels)

0.1714756466076324

In [10]:
feature_matrix.shape

(713145, 84741)

In [14]:
hashed_pids = np.array([database.compute_split(SEED, pid) for pid in patient_ids])

In [19]:
train_pids_idx = np.where((hashed_pids < 70))[0]
dev_pids_idx = np.where(((hashed_pids >= 70) & (hashed_pids < 85)))[0]

In [21]:
train_pids_idx.shape, dev_pids_idx.shape

((499916,), (105924,))

In [24]:
X_train = feature_matrix[train_pids_idx]
y_train = labels[train_pids_idx]
X_test = feature_matrix[dev_pids_idx]
y_test = labels[dev_pids_idx]

In [39]:
print("Training Size:", X_train.shape)
print("Testing Size:", X_test.shape)

Training Size: (499916, 84741)
Testing Size: (105924, 84741)


In [28]:
print("Prevalence on total dataset:", round(sum(labels)/len(labels), 4))
print("Prevalence on training dataset:", round(sum(y_train)/len(y_train), 4))
print("Prevalence on testing dataset:", round(sum(y_test)/len(y_test), 4))

Prevalence on total dataset: 0.1715
Prevalence on training dataset: 0.1708
Prevalence on testing dataset: 0.1653


In [35]:
# Logistic Regresion
model = LogisticRegression().fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auroc = metrics.roc_auc_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8951867139458299 0.7705493955475127


In [36]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auroc = metrics.roc_auc_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9597899473675853 0.8752171354254592


In [37]:
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auroc = metrics.roc_auc_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9645770698195877 0.8826291947565964


On Training Set

In [40]:
# Logistic Regresion
model = LogisticRegression().fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_train)[::,1]
auroc = metrics.roc_auc_score(y_train, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9458851736876972 0.8625296935660999


In [41]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_train)[::,1]
auroc = metrics.roc_auc_score(y_train, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9917347476093102 0.9722929534102924


In [42]:
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_train)[::,1]
auroc = metrics.roc_auc_score(y_train, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
auc_precision_recall = auc(recall, precision)
print(auroc, auc_precision_recall)

0.9863995471868983 0.952022986630187
