In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
import numpy as np
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('../data/patient_list_3oct24.csv', parse_dates=['first_date'])
df['first_date'] = df['first_date'].dt.strftime('%Y-%m-%d')

# remove person with person_id = 0, BUG
df = df[df['person_id'] != 0]

### Exclusion criteria:
Drug-diagnosis pair

In [3]:
with open('..\sql\exclusion\drug_diag_pair.sql', 'r') as f:
    sql_q = f.read()
    f.close()

# Due to memory problem, we will query in batch
# We will query 10000 at a time
pt_list = df['person_id'].to_list()
n_per_batch = 5000
n_batches = len(pt_list) // n_per_batch + 1

result=None

for i in range(n_batches):
    print("starting batch ", i+1, " of ", n_batches, "subject ", i*n_per_batch, " to ", (i+1)*n_per_batch - 1)
    pt_sub = str(pt_list[i*n_per_batch:(i+1)*n_per_batch])[1:-1]
    sql_q_sub = sql_q.replace('insert_list_person_id', pt_sub)

    if result is None:
        print("no exisiting dataframe, creating new one")
        result = pd.read_sql(sql_q_sub, conn)
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(result))
    else:
        next_result = pd.read_sql(sql_q_sub, conn)
        
        result = pd.concat([result, next_result])
        print("finising batch ", i+1, " of ", n_batches, 'number of records',len(result))


starting batch  1  of  36 subject  0  to  4999
no exisiting dataframe, creating new one
finising batch  1  of  36 number of records 2090
starting batch  2  of  36 subject  5000  to  9999
finising batch  2  of  36 number of records 4150
starting batch  3  of  36 subject  10000  to  14999
finising batch  3  of  36 number of records 6239
starting batch  4  of  36 subject  15000  to  19999
finising batch  4  of  36 number of records 8359
starting batch  5  of  36 subject  20000  to  24999
finising batch  5  of  36 number of records 10494
starting batch  6  of  36 subject  25000  to  29999
finising batch  6  of  36 number of records 12575
starting batch  7  of  36 subject  30000  to  34999
finising batch  7  of  36 number of records 14669
starting batch  8  of  36 subject  35000  to  39999
finising batch  8  of  36 number of records 16648
starting batch  9  of  36 subject  40000  to  44999
finising batch  9  of  36 number of records 18653
starting batch  10  of  36 subject  45000  to  49999

In [4]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61016 entries, 0 to 75
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   person_id         61016 non-null  int64 
 1   first_hf_date     14632 non-null  object
 2   first_ckd_date    40013 non-null  object
 3   first_obs_date    19858 non-null  object
 4   glp_start_date    2528 non-null   object
 5   sglt2_start_date  5599 non-null   object
dtypes: int64(1), object(5)
memory usage: 3.3+ MB


In [5]:
result['sglt2_hf'] = result['first_hf_date'] < result['sglt2_start_date']
result['sglt2_ckd'] = result['first_ckd_date'] < result['sglt2_start_date']
result['glp_obs'] = result['first_obs_date'] < result['glp_start_date']

In [6]:
result[['sglt2_hf', 'sglt2_ckd', 'glp_obs']].sum()

sglt2_hf     1350
sglt2_ckd    2933
glp_obs      1419
dtype: int64

In [7]:
df_merged = df.merge(result[['person_id','sglt2_hf', 'sglt2_ckd', 'glp_obs']], on='person_id', how='left')

In [8]:
df_merged.to_pickle("../data/exclude_df_drug_diag_pair_3oct.pkl")


In [9]:
df_merged = pd.read_pickle("../data/exclude_df_drug_diag_pair_3oct.pkl")

In [10]:
sglt2_hf_pt = df_merged.loc[(df_merged['criteria'] == 'drug') & df_merged['sglt2_hf'], 'person_id'].to_list()
sglt2_ckd_pt = df_merged.loc[(df_merged['criteria'] == 'drug') & df_merged['sglt2_ckd'], 'person_id'].to_list()
glp_obs_pt = df_merged.loc[(df_merged['criteria'] == 'drug') & df_merged['glp_obs'], 'person_id'].to_list()

print("pt. with sglt2_hf: ", len(sglt2_hf_pt))
print("pt. with sglt2_ckd: ", len(sglt2_ckd_pt))
print("pt. with glp_obs: ", len(glp_obs_pt))

# union all the person_id together
exc_dx_drug = list(set(sglt2_hf_pt + sglt2_ckd_pt + glp_obs_pt))

print("pt. with sglt2_hf or sglt2_ckd or glp_obs: ", len(exc_dx_drug))


pt. with sglt2_hf:  240
pt. with sglt2_ckd:  255
pt. with glp_obs:  149
pt. with sglt2_hf or sglt2_ckd or glp_obs:  549


### Exclusion criteria:
Age < 18 year

In [11]:
## Get year at birth
sql = """
SELECT person_id, year_of_birth
FROM omop.person
"""

df_yob = pd.read_sql(sql, conn)

In [12]:
df = df.merge(df_yob, on='person_id', how='left')

In [13]:
df['first_date'] = pd.to_datetime(df['first_date'])

# Get age of patient at first diagnosis date
df['age'] = df['first_date'].dt.year - df['year_of_birth']

In [14]:
age18_pt = df.loc[df['age'] < 18, 'person_id'].to_list()

### Exclusion criteria
have T1DM diagnosis

In [15]:
with open("../sql/exclusion/t1dm.sql", "r") as file:
    sql = file.read()
    file.close()

t1dm = pd.read_sql(sql, conn)

In [16]:
t1dm_pt = t1dm['person_id'].to_list()

### Union all excluded patients

In [17]:
print("pt. with age < 18: ", len(age18_pt))
print("pt. with t1dm: ", len(t1dm_pt))
print("pt. with drug_diag: ", len(exc_dx_drug))

all_exclude_pt = set(age18_pt + t1dm_pt + exc_dx_drug)

print("All excluded pt. : ", len(all_exclude_pt))

pt. with age < 18:  1360
pt. with t1dm:  2372
pt. with drug_diag:  549
All excluded pt. :  3827


In [18]:
# remove excluded pt. from the df
df = df[~df['person_id'].isin(all_exclude_pt)]

In [19]:
df.person_id.nunique()

172286

In [20]:
df.to_csv('../data/cohort_3oct.csv', index=False)

In [21]:
df['criteria'].value_counts()

criteria
diag+lab+drug    58052
diag             55239
diag++drug       28116
diag+lab         12979
lab               8893
drug              7114
lab+drug          1893
Name: count, dtype: int64