In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

from utils import load_env_file, set_mpl_configs
from utils import leave_percentile, distribution_analysis

load_env_file()
set_mpl_configs()

DATA_DIR = os.getenv('DATA_DIR')
print('DATA_DIR: {}'.format(DATA_DIR))

load env file
  root dir:
    /Users/k/Repo/gp-ibd
  current system:
    Darwin
  load .env.darwin
  loaded data dir:
    /Users/k/Nutstore Files/毕设-EHR/DB
done.
set matplotlib configs
  font family:
    ['Times New Roman']
done.
DATA_DIR: /Users/k/Nutstore Files/毕设-EHR/DB


In [2]:
df = pd.read_csv(os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv'))
df_demography = df.groupby('subject_id').agg({
    'gender': 'first',
    'anchor_age': 'first',
    'anchor_year_group': 'first',
    'dod': 'first',
})

df_demography.head()

Unnamed: 0_level_0,gender,anchor_age,anchor_year_group,dod
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10001186,F,46,2011 - 2013,
10007174,M,70,2011 - 2013,
10018852,M,19,2011 - 2013,
10024331,M,72,2008 - 2010,2145-01-23
10025647,M,83,2008 - 2010,2181-06-16


In [69]:
def get_suspect_icd_list(threshold=300) -> list:
    stats = df.groupby(['subject_id', 'icd_code']).agg({
        'icd_code': ['count'],
    })
    stats.columns = ['count']
    stats = stats.reset_index()
    stats = stats.groupby('icd_code').agg({
        'subject_id': ['nunique']
    })
    stats.columns = ['count']
    return [_ for _ in stats[stats['count'] > threshold].index.tolist() if _ not in ['5559', '5569']]

In [105]:
# X
suspect_icd_list = get_suspect_icd_list(100)
X = np.zeros((len(df_demography.index), 3 + len(suspect_icd_list)))

'''
0: gender | 1: age_group | 2: anchor_year_group | 3: icd_2724 | 4: icd_27651 | 5: icd_2859 | 6: icd_30000 | 7: icd_311 | 8: icd_4019 | 9: icd_53081 | 10: icd_5849 | 11: icd_V1582
'''



for i in range(len(df_demography.index)):
    subject_id = df_demography.index[i]
    tmp_df = df[df['subject_id'] == subject_id]
    
    X[i, 0] = 1 if df_demography.loc[subject_id, 'gender'] == 'F' else 0
    X[i, 1] = int(df_demography.loc[subject_id, 'anchor_age'])
    # X[i, 2] = 1 if df_demography.loc[subject_id, 'anchor_year_group'] == 'Year 4' else 0
    
    for j in range(len(suspect_icd_list)):
        X[i, 3 + j] = 1 if (tmp_df['icd_code'] == suspect_icd_list[j]).any() else 0

X

array([[ 1., 46.,  0., ...,  0.,  0.,  0.],
       [ 0., 70.,  0., ...,  0.,  0.,  0.],
       [ 0., 19.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 1., 30.,  0., ...,  0.,  1.,  0.],
       [ 1., 37.,  0., ...,  1.,  0.,  1.],
       [ 0., 37.,  0., ...,  0.,  0.,  0.]])

In [106]:
# y
labelDod = lambda x: 1 if type(x) == str else 0

y = df_demography['dod'].apply(labelDod).values
print('  dead nums: {} live nums: {}'.format(y[y == 0].shape, y[y == 1].shape))

  dead nums: (2037,) live nums: (380,)


In [107]:
# Save files
np.savetxt('../ r scripts/X.csv', X, delimiter=',', header=','.join(['gender', 'age', 'anchor_year_group'] + suspect_icd_list), comments='')
np.savetxt('../ r scripts/y.csv', y, delimiter=',', header='dod', comments='')

# Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('--> Logistic Regression')
print('  accuracy:\n    {:.4f}'.format(accuracy))
print('  precision:\n    {:.4f}'.format(precision))
print('  recall:\n    {:.4f}'.format(recall))
print('  f1:\n    {:.4f}'.format(f1))


--> Logistic Regression
  accuracy:
    0.8719
  precision:
    0.6250
  recall:
    0.2206
  f1:
    0.3261


In [21]:
def logistic_regression(X: np.ndarray, y: np.ndarray, random_state: int = 42, test_size: float = 0.2) -> LogisticRegression:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print('--> Logistic Regression')
    print('  accuracy:\n    {:.4f}'.format(accuracy))
    print('  precision:\n    {:.4f}'.format(precision))
    print('  recall:\n    {:.4f}'.format(recall))
    print('  f1:\n    {:.4f}'.format(f1))
    
    return model

In [12]:
model = logistic_regression(X, y)

--> Logistic Regression
  accuracy:
    0.8864
  precision:
    0.6444
  recall:
    0.4265
  f1:
    0.5133


In [27]:
model = logistic_regression(X[:, [1, 3, 5, 7, 10, 11]], y, test_size=0.2, random_state=45)

--> Logistic Regression
  accuracy:
    0.8574
  precision:
    0.7179
  recall:
    0.3256
  f1:
    0.4480


In [108]:
stats = df.groupby(['subject_id', 'icd_code']).agg({
    'icd_code': ['count'],
})
stats.columns = ['count']
stats = stats.reset_index()
stats = stats.groupby('icd_code').agg({
    'subject_id': ['nunique']
})
stats.columns = ['count']

stats[stats.index == '5552']

Unnamed: 0_level_0,count
icd_code,Unnamed: 1_level_1
5552,228
