In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
import numpy as np
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('../data/patient_list_25mar24.csv', parse_dates=['first_date'])
df['first_date'] = df['first_date'].dt.strftime('%Y-%m-%d')

# remove person with person_id = 0, BUG
df = df[df['person_id'] != 0]

In [3]:
df

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria
1,377893,0,1,0,2021-05-15,lab
2,378094,1,0,0,2018-12-21,diag
3,378195,1,0,0,2021-02-27,diag
4,378311,1,0,1,2013-07-06,diag++drug
5,378342,1,1,1,2015-08-14,diag+lab+drug
...,...,...,...,...,...,...
169637,4790220,1,0,0,2023-09-29,diag
169638,4790276,1,0,0,2023-09-30,diag
169639,4790308,1,0,0,2023-09-30,diag
169640,4834464,0,0,1,2018-02-14,drug


### Exclusion criteria:
Drug-diagnosis pair

In [4]:
with open('..\sql\exclusion\drug_diag_pair.sql', 'r') as f:
    sql_q = f.read()
    f.close()

# Due to memory problem, we will query in batch
# We will query 10000 at a time
pt_list = df['person_id'].to_list()
n_per_batch = 5000
n_batches = len(pt_list) // n_per_batch + 1

result=None

for i in range(n_batches):
    print("starting batch ", i+1, " of ", n_batches, "subject ", i*n_per_batch, " to ", (i+1)*n_per_batch - 1)
    pt_sub = str(pt_list[i*n_per_batch:(i+1)*n_per_batch])[1:-1]
    sql_q_sub = sql_q.replace('insert_list_person_id', pt_sub)

    if result is None:
        print("no exisiting dataframe, creating new one")
        result = pd.read_sql(sql_q_sub, conn)
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(result))
    else:
        next_result = pd.read_sql(sql_q_sub, conn)
        
        result = pd.concat([result, next_result])
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(result))


starting batch  1  of  34 subject  0  to  4999
no exisiting dataframe, creating new one
finising batch  1  of  34 number of records 2122
starting batch  2  of  34 subject  5000  to  9999
finising batch  2  of  34 number of records 4219
starting batch  3  of  34 subject  10000  to  14999
finising batch  3  of  34 number of records 6313
starting batch  4  of  34 subject  15000  to  19999
finising batch  4  of  34 number of records 8475
starting batch  5  of  34 subject  20000  to  24999
finising batch  5  of  34 number of records 10642
starting batch  6  of  34 subject  25000  to  29999
finising batch  6  of  34 number of records 12826
starting batch  7  of  34 subject  30000  to  34999
finising batch  7  of  34 number of records 14918
starting batch  8  of  34 subject  35000  to  39999
finising batch  8  of  34 number of records 16893
starting batch  9  of  34 subject  40000  to  44999
finising batch  9  of  34 number of records 18927
starting batch  10  of  34 subject  45000  to  49999

In [5]:
result.head()

Unnamed: 0,person_id,first_hf_date,first_ckd_date,first_obs_date,glp_start_date,sglt2_start_date
0,511984,2007-01-22,2018-08-15,,,2020-12-09
1,502358,,2017-01-23,,,2020-12-09
2,537154,,2010-06-16,,,2020-05-20
3,435961,,2010-03-05,,2019-05-17,
4,395549,,2019-07-11,2008-07-16,,2018-01-30


In [24]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60045 entries, 0 to 1205
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   person_id         60045 non-null  int64 
 1   first_hf_date     14568 non-null  object
 2   first_ckd_date    38610 non-null  object
 3   first_obs_date    20584 non-null  object
 4   glp_start_date    2593 non-null   object
 5   sglt2_start_date  5412 non-null   object
 6   sglt2_hf          60045 non-null  bool  
 7   sglt2_ckd         60045 non-null  bool  
 8   glp_obs           60045 non-null  bool  
dtypes: bool(3), int64(1), object(5)
memory usage: 3.4+ MB


In [25]:
result['sglt2_hf'] = result['first_hf_date'] < result['sglt2_start_date']
result['sglt2_ckd'] = result['first_ckd_date'] < result['sglt2_start_date']
result['glp_obs'] = result['first_obs_date'] < result['glp_start_date']

In [8]:
result[['sglt2_hf', 'sglt2_ckd', 'glp_obs']].sum()

sglt2_hf     1389
sglt2_ckd    2798
glp_obs      1503
dtype: int64

In [9]:
df_merged = df.merge(result[['person_id','sglt2_hf', 'sglt2_ckd', 'glp_obs']], on='person_id', how='left')

In [10]:
df_merged

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria,sglt2_hf,sglt2_ckd,glp_obs
0,377893,0,1,0,2021-05-15,lab,,,
1,378094,1,0,0,2018-12-21,diag,,,
2,378195,1,0,0,2021-02-27,diag,,,
3,378311,1,0,1,2013-07-06,diag++drug,False,False,False
4,378342,1,1,1,2015-08-14,diag+lab+drug,,,
...,...,...,...,...,...,...,...,...,...
169636,4790220,1,0,0,2023-09-29,diag,,,
169637,4790276,1,0,0,2023-09-30,diag,False,False,False
169638,4790308,1,0,0,2023-09-30,diag,False,False,False
169639,4834464,0,0,1,2018-02-14,drug,,,


In [11]:
sglt2_hf_pt = df_merged.loc[(df_merged['criteria'] == 'drug') & df_merged['sglt2_hf'], 'person_id'].to_list()
sglt2_ckd_pt = df_merged.loc[(df_merged['criteria'] == 'drug') & df_merged['sglt2_ckd'], 'person_id'].to_list()
glp_obs_pt = df_merged.loc[(df_merged['criteria'] == 'drug') & df_merged['glp_obs'], 'person_id'].to_list()

print("pt. with sglt2_hf: ", len(sglt2_hf_pt))
print("pt. with sglt2_ckd: ", len(sglt2_ckd_pt))
print("pt. with glp_obs: ", len(glp_obs_pt))

# union all the person_id together
exc_dx_drug = list(set(sglt2_hf_pt + sglt2_ckd_pt + glp_obs_pt))

print("pt. with sglt2_hf or sglt2_ckd or glp_obs: ", len(exc_dx_drug))


pt. with sglt2_hf:  287
pt. with sglt2_ckd:  279
pt. with glp_obs:  274
pt. with sglt2_hf or sglt2_ckd or glp_obs:  727


### Exclusion criteria:
Age < 18 year

In [12]:
## Get year at birth
sql = """
SELECT person_id, year_of_birth
FROM cdm.person
"""

df_yob = pd.read_sql(sql, conn)

In [13]:
df = df.merge(df_yob, on='person_id', how='left')

In [14]:
df['first_date'] = pd.to_datetime(df['first_date'])

# Get age of patient at first diagnosis date
df['age'] = df['first_date'].dt.year - df['year_of_birth']

In [15]:
age18_pt = df.loc[df['age'] < 18, 'person_id'].to_list()

### Exclusion criteria
have T1DM diagnosis

In [16]:
with open("../sql/exclusion/t1dm.sql", "r") as file:
    sql = file.read()
    file.close()

t1dm = pd.read_sql(sql, conn)

In [17]:
t1dm.head()

Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag
0,1966930,443412,2021-09-21,64
1,2169771,443412,2020-03-18,78
2,3242643,4063042,2019-11-13,23
3,3256215,4063042,2016-08-26,30
4,3500615,443412,2014-01-21,18


In [18]:
t1dm_pt = t1dm['person_id'].to_list()

### Union all excluded patients

In [19]:
print("pt. with age < 18: ", len(age18_pt))
print("pt. with t1dm: ", len(t1dm_pt))
print("pt. with drug_diag: ", len(exc_dx_drug))

all_exclude_pt = set(age18_pt + t1dm_pt + exc_dx_drug)

print("All excluded pt. : ", len(all_exclude_pt))

pt. with age < 18:  1123
pt. with t1dm:  1961
pt. with drug_diag:  727
All excluded pt. :  3307


In [20]:
# remove excluded pt. from the df
df = df[~df['person_id'].isin(all_exclude_pt)]

In [21]:
df

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria,year_of_birth,age
0,377893,0,1,0,2021-05-15,lab,1982.0,39.0
1,378094,1,0,0,2018-12-21,diag,1969.0,49.0
2,378195,1,0,0,2021-02-27,diag,1982.0,39.0
3,378311,1,0,1,2013-07-06,diag++drug,1939.0,74.0
4,378342,1,1,1,2015-08-14,diag+lab+drug,1964.0,51.0
...,...,...,...,...,...,...,...,...
169636,4790220,1,0,0,2023-09-29,diag,1953.0,70.0
169637,4790276,1,0,0,2023-09-30,diag,1937.0,86.0
169638,4790308,1,0,0,2023-09-30,diag,1958.0,65.0
169639,4834464,0,0,1,2018-02-14,drug,2000.0,18.0


In [22]:
df.to_csv('../data/cohort_25mar24.csv', index=False)

In [23]:
df['criteria'].value_counts()

criteria
diag             64887
diag+lab+drug    60096
diag++drug       20158
diag+lab          9227
lab               6135
drug              4825
lab+drug          1455
Name: count, dtype: int64