In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
import numpy as np
# Load variables from a local .env file into the process environment.
load_dotenv()

# Create a connection to the database.
# SERVER_DATABASE is spliced in directly after 'mssql+pyodbc://@', so it is
# presumably of the form '<server>/<database>' -- TODO confirm against the .env file.
server_database = os.getenv("SERVER_DATABASE")
if not server_database:
    # os.getenv returns None when the variable is unset; without this guard the
    # literal text 'None' would be formatted into the URL and the failure would
    # only show up later as an opaque driver error.
    raise RuntimeError(
        "Environment variable SERVER_DATABASE is not set; "
        "define it in the environment or in the .env file."
    )
# The URL carries no username/password; it passes trusted_connection=yes to the driver.
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the patient list, render first_date back to a 'YYYY-MM-DD' string, and
# drop rows whose person_id is 0.
# NOTE(review): the original comment tagged person_id 0 as "BUG" -- presumably a
# placeholder/invalid record in the source data; confirm with the data owner.
df = (
    pd.read_csv('../data/patient_list.csv', parse_dates=['first_date'])
    .assign(first_date=lambda frame: frame['first_date'].dt.strftime('%Y-%m-%d'))
    .loc[lambda frame: frame['person_id'] != 0]
)

In [3]:
# Comma-separated text of every person_id (element reprs joined by ', '),
# later substituted into the SQL template.
person_id_list_str = ', '.join(map(repr, df['person_id'].to_list()))


In [4]:
# Same comma-separated format, restricted to the first 1000 person_id values.
person_id_list_1000 = ', '.join(repr(pid) for pid in df['person_id'][:1000].to_list())

In [5]:
# Read the SQL template from disk.
# The path uses forward slashes: the previous backslash-separated literal
# contained invalid escape sequences (backslash followed by 's', 'e', 'd'),
# which newer Python versions warn about, and it only resolved on Windows.
# Forward slashes are accepted by open() on every platform and match the CSV
# path used when loading the patient list.
with open('../sql/exclusion/drug_diag_pair.sql') as file:
    # The context manager closes the file on exit; no explicit close() is needed.
    sql_command = file.read()

# Substitute every occurrence of the placeholder text with the comma-separated
# person_id list.
# NOTE(review): this is plain string substitution into SQL; acceptable only
# while the ids come from the trusted patient list.
sql_command = sql_command.replace('insert_list_person_id', person_id_list_str)

# Run the query over the open connection and display the resulting frame.
result = pd.read_sql(sql_command, conn)
result

Unnamed: 0,person_id,first_hf_date,first_ckd_date,first_obs_date,glp_start_date,sglt2_start_date
0,569837,2022-04-08,,,,2022-04-08
1,4686081,,2023-05-28,,,2022-11-27
2,3122487,,2017-08-05,,,2020-10-31
3,475287,,2015-05-09,,,2020-07-04
4,1991360,,2015-03-12,,,2022-12-23
...,...,...,...,...,...,...
60504,3194285,,2020-02-15,,,
60505,1343807,,2017-06-07,,,
60506,2258582,,2021-09-13,,,
60507,2810549,,2020-01-12,,,


In [6]:
# Print column dtypes and non-null counts for the query result.
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60509 entries, 0 to 60508
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   person_id         60509 non-null  int64 
 1   first_hf_date     14674 non-null  object
 2   first_ckd_date    38744 non-null  object
 3   first_obs_date    20835 non-null  object
 4   glp_start_date    2662 non-null   object
 5   sglt2_start_date  5565 non-null   object
dtypes: int64(1), object(5)
memory usage: 2.8+ MB


In [7]:
# Each flag column is True when the first-event date is strictly less than the
# matching drug start date. The date columns are object dtype (see result.info());
# in the displayed rows a missing value on either side yields False.
# Tuples are (flag column, first-event date column, drug start date column).
for flag_col, event_col, start_col in (
    ('sglt2_hf', 'first_hf_date', 'sglt2_start_date'),
    ('sglt2_ckd', 'first_ckd_date', 'sglt2_start_date'),
    ('glp_obs', 'first_obs_date', 'glp_start_date'),
):
    result[flag_col] = result[event_col] < result[start_col]

In [8]:
# Show only the rows whose sglt2_hf flag is True.
result.loc[result['sglt2_hf']]

Unnamed: 0,person_id,first_hf_date,first_ckd_date,first_obs_date,glp_start_date,sglt2_start_date,sglt2_hf,sglt2_ckd,glp_obs
5,4407194,2021-05-10,2021-04-01,,,2022-07-12,True,True,False
6,4655597,2023-04-11,,,,2023-05-18,True,False,False
14,1849442,2008-08-20,,,,2023-01-04,True,False,False
17,4770429,2023-07-10,,,,2023-08-23,True,False,False
18,1808638,2016-12-07,2015-11-04,2017-07-07,2020-12-15,2021-10-12,True,True,True
...,...,...,...,...,...,...,...,...,...
7121,3237347,2021-01-07,2021-03-03,,,2021-02-22,True,False,False
7131,3082631,2010-12-09,2017-07-12,,2017-11-29,2016-08-24,True,False,False
7133,4121640,2018-09-18,2019-10-28,,,2020-10-08,True,True,False
7138,2603711,2008-01-23,2019-02-03,,,2018-05-06,True,False,False


In [9]:
# Count the True values in each flag column.
result.loc[:, ['sglt2_hf', 'sglt2_ckd', 'glp_obs']].sum(axis=0)

sglt2_hf     1457
sglt2_ckd    2870
glp_obs      1546
dtype: int64

In [10]:
# Left-join the three flag columns onto the patient list by person_id; patients
# with no row in `result` keep missing values in the flag columns.
df_merged = pd.merge(
    df,
    result.loc[:, ['person_id', 'sglt2_hf', 'sglt2_ckd', 'glp_obs']],
    how='left',
    on='person_id',
)

In [11]:
# Display the merged frame (the cell's last expression is rendered as output).
df_merged

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria,sglt2_hf,sglt2_ckd,glp_obs
0,377893,0,1,0,2021-05-15,lab,,,
1,378094,1,0,0,2018-12-21,diag,,,
2,378195,1,0,0,2021-02-27,diag,,,
3,378311,1,0,1,2013-07-06,diag++drug,False,False,False
4,378342,1,1,1,2015-08-14,diag+lab+drug,,,
...,...,...,...,...,...,...,...,...,...
171236,4825363,0,0,1,2024-02-17,drug,,,
171237,4825466,0,0,1,2024-02-18,drug,,,
171238,4825638,0,0,1,2024-02-19,drug,,,
171239,4826969,0,0,1,2018-02-14,drug,,,


In [12]:
def _drug_only_flagged_ids(frame, flag_col):
    """Return the person_id values flagged by ``flag_col`` among drug-only rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'criteria', 'person_id' and ``flag_col``.
    flag_col : str
        Name of a column holding True / False / missing values.

    Returns
    -------
    list
        person_id of every row where ``criteria == 'drug'`` and the flag is True.
    """
    is_drug_only = frame['criteria'] == 'drug'
    # After the left merge the flag columns contain missing values; .eq(True)
    # maps them to False explicitly instead of relying on pandas' implicit
    # coercion when an object-dtype column is used in `&`.
    is_flagged = frame[flag_col].eq(True)
    return frame.loc[is_drug_only & is_flagged, 'person_id'].to_list()


sglt2_hf_pt = _drug_only_flagged_ids(df_merged, 'sglt2_hf')
sglt2_ckd_pt = _drug_only_flagged_ids(df_merged, 'sglt2_ckd')
glp_obs_pt = _drug_only_flagged_ids(df_merged, 'glp_obs')

print("pt. with sglt2_hf: ", len(sglt2_hf_pt))
print("pt. with sglt2_ckd: ", len(sglt2_ckd_pt))
print("pt. with glp_obs: ", len(glp_obs_pt))

# Union of the three lists; the set removes patients that appear in more than one.
all_exclude_pt = list(set(sglt2_hf_pt + sglt2_ckd_pt + glp_obs_pt))


pt. with sglt2_hf:  360
pt. with sglt2_ckd:  372
pt. with glp_obs:  320


In [13]:
# Number of distinct person_ids in the combined exclusion list.
len(all_exclude_pt)

921

In [14]:
# Display the combined exclusion list.
# NOTE(review): this echoes every excluded person_id into the saved notebook
# output; consider showing a short slice (e.g. all_exclude_pt[:10]) before sharing.
all_exclude_pt

[3078144,
 4757505,
 1742849,
 2129921,
 657409,
 4413442,
 1433613,
 4229133,
 4796433,
 2639905,
 1196066,
 3278883,
 1118245,
 811047,
 847912,
 3975208,
 2033709,
 3831856,
 1931317,
 2973750,
 2172986,
 3870780,
 2918467,
 3829827,
 4735044,
 2304074,
 2941007,
 1103952,
 3985490,
 2752596,
 4610133,
 3205215,
 4151393,
 786531,
 2025573,
 2605160,
 3602547,
 4300917,
 3763662,
 751736,
 2642043,
 3807357,
 2936959,
 3467392,
 1808516,
 2523269,
 3004549,
 809095,
 2838663,
 2855049,
 3186826,
 3393675,
 4794503,
 1861767,
 1865864,
 2396300,
 1280144,
 3016844,
 2029714,
 3893391,
 3723416,
 4022426,
 3446939,
 526503,
 2513067,
 3686572,
 4765867,
 4495534,
 893102,
 3004592,
 981170,
 1185973,
 1132727,
 3639479,
 3989687,
 700604,
 1267901,
 1775805,
 3733695,
 4774079,
 2429132,
 3674319,
 3858640,
 4724946,
 2050261,
 3928284,
 2185441,
 2986211,
 2044136,
 559340,
 919793,
 4161782,
 1857782,
 3129592,
 1827066,
 4671738,
 4751614,
 4573440,
 3127555,
 3426569,
 2730254,
 3