In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
import numpy as np
load_dotenv()

# Create a connection to the database.
# SERVER_DATABASE is read from the process environment (populated by load_dotenv above)
# and is interpolated verbatim after the "@" in the SQLAlchemy URL.
server_database = os.getenv("SERVER_DATABASE")
if not server_database:
    # Without this guard a missing variable is formatted into the URL as the
    # literal text "None" and only surfaces later as an opaque connection error.
    raise RuntimeError("SERVER_DATABASE is not set; define it in the environment or in the .env file")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [29]:
# Load the patient list. first_date is parsed as a datetime on read and then
# re-formatted as a 'YYYY-MM-DD' string.
df = pd.read_csv('../data/patient_list_28feb24.csv', parse_dates=['first_date'])
df = df.assign(first_date=df['first_date'].dt.strftime('%Y-%m-%d'))

# Drop every row whose person_id is 0 (marked as a BUG by the author; cause not shown here).
has_real_id = df['person_id'] != 0
df = df.loc[has_real_id]

In [31]:
# Preview the patient list after the person_id == 0 rows were removed
df

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria
1,377893,0,1,0,2021-05-15,lab
2,378094,1,0,0,2018-12-21,diag
3,378170,0,1,0,2021-08-20,lab
4,378195,1,0,0,2021-02-27,diag
5,378311,1,0,1,2013-07-06,diag++drug
...,...,...,...,...,...,...
170732,4790220,1,0,1,2023-09-29,diag++drug
170733,4790276,1,0,0,2023-09-30,diag
170734,4790308,1,0,1,2023-09-30,diag++drug
170735,4833480,0,0,1,2018-02-14,drug


In [32]:
# All person_ids as one comma-separated string ("1, 2, 3"), i.e. the list repr
# without its surrounding brackets, for substitution into a SQL template.
person_id_list_str = ", ".join(map(repr, df['person_id'].to_list()))


In [33]:
# Same comma-separated format, restricted to the first 1000 patients
# (presumably a small sample for trying queries — confirm whether still needed).
person_id_list_1000 = ", ".join(map(repr, df['person_id'].iloc[:1000].to_list()))

### Exclusion criteria:
Drug-diagnosis pair

In [34]:
# Load the drug-diagnosis query text. Forward slashes keep the path portable across
# operating systems and avoid the invalid '\s', '\e' and '\d' escape sequences that
# the backslash spelling produced inside a normal (non-raw) string literal.
with open('../sql/exclusion/drug_diag_pair.sql') as file:
    # The with-block closes the file on exit, so no explicit close() is needed.
    sql_command = file.read()

# Substitute the comma-separated cohort ids for the 'insert_list_person_id' marker
# in the query text. This is a plain text replacement, not a bound SQL parameter.
sql_command = sql_command.replace('insert_list_person_id', person_id_list_str)

# Run the query and show the returned frame.
result = pd.read_sql(sql_command, conn)
result

Unnamed: 0,person_id,first_hf_date,first_ckd_date,first_obs_date,glp_start_date,sglt2_start_date
0,1180009,2010-03-03,,2008-02-13,2015-07-22,2015-11-02
1,2159442,,,2007-07-19,,2015-11-03
2,1156613,2023-06-13,2022-08-16,,,2016-05-30
3,748912,,,2008-03-05,2013-06-05,2016-08-30
4,2229116,2006-10-29,2019-09-02,,,2016-09-02
...,...,...,...,...,...,...
60141,3194285,,2020-02-15,,,
60142,1343807,,2017-06-07,,,
60143,2258582,,2021-09-13,,,
60144,2810549,,2020-01-12,,,


In [35]:
# Inspect column dtypes and non-null counts of the query result
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60146 entries, 0 to 60145
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   person_id         60146 non-null  int64 
 1   first_hf_date     14600 non-null  object
 2   first_ckd_date    38604 non-null  object
 3   first_obs_date    20611 non-null  object
 4   glp_start_date    2585 non-null   object
 5   sglt2_start_date  5370 non-null   object
dtypes: int64(1), object(5)
memory usage: 2.8+ MB


In [36]:
# For each flag, compare a first-diagnosis date column against a drug start date
# column: the flag is the element-wise result of "diagnosis date < drug start date".
# NOTE(review): these date columns are object dtype and contain missing values;
# confirm that rows with a missing date are meant to be treated as not flagged.
date_pairs = {
    'sglt2_hf': ('first_hf_date', 'sglt2_start_date'),
    'sglt2_ckd': ('first_ckd_date', 'sglt2_start_date'),
    'glp_obs': ('first_obs_date', 'glp_start_date'),
}
for flag_col, (dx_col, drug_col) in date_pairs.items():
    result[flag_col] = result[dx_col] < result[drug_col]

In [37]:
# Count how many rows have each flag set
result[['sglt2_hf', 'sglt2_ckd', 'glp_obs']].sum()

sglt2_hf     1381
sglt2_ckd    2778
glp_obs      1501
dtype: int64

In [38]:
# Attach the three flags to the patient list. The left join keeps every patient;
# patients that do not appear in `result` end up with missing flag values.
exclusion_flags = result[['person_id', 'sglt2_hf', 'sglt2_ckd', 'glp_obs']]
df_merged = pd.merge(df, exclusion_flags, how='left', on='person_id')

In [39]:
# Preview the patient list with the three flag columns attached
df_merged

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria,sglt2_hf,sglt2_ckd,glp_obs
0,377893,0,1,0,2021-05-15,lab,,,
1,378094,1,0,0,2018-12-21,diag,,,
2,378170,0,1,0,2021-08-20,lab,False,False,False
3,378195,1,0,0,2021-02-27,diag,,,
4,378311,1,0,1,2013-07-06,diag++drug,False,False,False
...,...,...,...,...,...,...,...,...,...
170731,4790220,1,0,1,2023-09-29,diag++drug,,,
170732,4790276,1,0,0,2023-09-30,diag,False,False,False
170733,4790308,1,0,1,2023-09-30,diag++drug,False,False,False
170734,4833480,0,0,1,2018-02-14,drug,,,


In [40]:
# Only patients whose inclusion criteria is exactly 'drug' are candidates here.
drug_only = df_merged['criteria'] == 'drug'

# person_ids of drug-only patients for whom each flag is set
sglt2_hf_pt = df_merged.loc[drug_only & df_merged['sglt2_hf'], 'person_id'].to_list()
sglt2_ckd_pt = df_merged.loc[drug_only & df_merged['sglt2_ckd'], 'person_id'].to_list()
glp_obs_pt = df_merged.loc[drug_only & df_merged['glp_obs'], 'person_id'].to_list()

for label, patients in (
    ("pt. with sglt2_hf: ", sglt2_hf_pt),
    ("pt. with sglt2_ckd: ", sglt2_ckd_pt),
    ("pt. with glp_obs: ", glp_obs_pt),
):
    print(label, len(patients))

# Union of the three lists, de-duplicated through a set
exc_dx_drug = list(set(sglt2_hf_pt + sglt2_ckd_pt + glp_obs_pt))

print("pt. with sglt2_hf or sglt2_ckd or glp_obs: ", len(exc_dx_drug))


pt. with sglt2_hf:  282
pt. with sglt2_ckd:  274
pt. with glp_obs:  273
pt. with sglt2_hf or sglt2_ckd or glp_obs:  717


### Exclusion criteria:
Age < 18 years at `first_date`

In [41]:
# Fetch person_id and year_of_birth for every row of cdm.person
# (the query has no WHERE clause, so it is not limited to the cohort).
sql = """
SELECT person_id, year_of_birth
FROM cdm.person
"""

df_yob = pd.read_sql(sql=sql, con=conn)

In [42]:
# Add year_of_birth to the patient list; the left join keeps patients that have
# no matching row in df_yob (their year_of_birth is missing).
# Dropping a year_of_birth column left over from an earlier execution keeps this
# cell re-runnable: without it, a second run produces year_of_birth_x /
# year_of_birth_y and the later age computation fails on the missing column.
df = (
    df.drop(columns=['year_of_birth'], errors='ignore')
      .merge(df_yob, on='person_id', how='left')
)

In [43]:
# Convert first_date back to datetimes so the year can be extracted
first_dates = pd.to_datetime(df['first_date'])
df['first_date'] = first_dates

# Age at first_date, as a difference of calendar years (only the birth year is available)
df['age'] = first_dates.dt.year - df['year_of_birth']

In [44]:
# person_ids of patients younger than 18 at first_date.
# A missing age compares as False here, so patients without a year_of_birth are not listed.
is_minor = df['age'] < 18
age18_pt = df.loc[is_minor, 'person_id'].tolist()

### Exclusion criteria:
Has a T1DM diagnosis

In [45]:
# Read the T1DM exclusion query from disk; the with-block closes the file on exit.
with open("../sql/exclusion/t1dm.sql", "r") as sql_file:
    sql = sql_file.read()

# Run the query as-is (no substitution is applied to this one).
t1dm = pd.read_sql(sql, conn)

In [46]:
# Preview the first rows returned by the T1DM query
t1dm.head()

Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag
0,697579,443412,2013-09-04,22
1,725874,443412,2021-03-31,45
2,1258936,443412,2014-07-03,16
3,1301500,443412,2020-06-02,21
4,1109182,443412,2019-04-23,59


In [47]:
# person_ids returned by the T1DM query, one entry per result row
# NOTE(review): if the query can return several rows per person, this list holds repeats — confirm.
t1dm_pt = t1dm['person_id'].tolist()

### Union all excluded patients

In [48]:
# Report the length of each exclusion list (list lengths, so a repeated id counts each time)
for label, patients in (
    ("pt. with age < 18: ", age18_pt),
    ("pt. with t1dm: ", t1dm_pt),
    ("pt. with drug_diag: ", exc_dx_drug),
):
    print(label, len(patients))

# Combine the three lists into one set of unique person_ids
all_exclude_pt = set(age18_pt).union(t1dm_pt, exc_dx_drug)

print("All excluded pt. : ", len(all_exclude_pt))

pt. with age < 18:  1168
pt. with t1dm:  1961
pt. with drug_diag:  717
All excluded pt. :  3340


In [49]:
# Keep only the patients whose person_id is not in the exclusion set
is_excluded = df['person_id'].isin(all_exclude_pt)
df = df.loc[~is_excluded]

In [50]:
# Preview the cohort that remains after removing the excluded patients
df

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria,year_of_birth,age
0,377893,0,1,0,2021-05-15,lab,1982.0,39.0
1,378094,1,0,0,2018-12-21,diag,1969.0,49.0
2,378170,0,1,0,2021-08-20,lab,1952.0,69.0
3,378195,1,0,0,2021-02-27,diag,1982.0,39.0
4,378311,1,0,1,2013-07-06,diag++drug,1939.0,74.0
...,...,...,...,...,...,...,...,...
170731,4790220,1,0,1,2023-09-29,diag++drug,1953.0,70.0
170732,4790276,1,0,0,2023-09-30,diag,1937.0,86.0
170733,4790308,1,0,1,2023-09-30,diag++drug,1958.0,65.0
170734,4833480,0,0,1,2018-02-14,drug,,


In [51]:
# Write the final cohort to disk, omitting the DataFrame index
cohort_csv = '../data/cohort_28feb24.csv'
df.to_csv(cohort_csv, index=False)

In [52]:
# Number of remaining patients per inclusion-criteria combination
df['criteria'].value_counts()

criteria
diag             63445
diag+lab+drug    60910
diag++drug       19819
diag+lab          9952
lab               7272
drug              4739
lab+drug          1701
Name: count, dtype: int64