In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Diagnosis criteria

In [2]:
with open('..\sql\inclusion\icd.sql') as file:
    sql_command = file.read()
    pt_icd = pd.read_sql(sql_command, conn)
    file.close()

print("Number of patients included by OMOP concept IDs:",len(pt_icd))

Number of patients included by OMOP concept IDs: 155703


In [3]:
pt_icd.head()

Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag
0,471492,4193704,2013-11-28,91
1,471521,4193704,2013-06-05,52
2,486914,4193704,2013-06-18,84
3,486941,4193704,2018-01-16,43
4,486955,4193704,2017-12-21,66


### Lab Criteria

In [4]:
with open('..\sql\inclusion\lab.sql') as file:
    sql_command = file.read()
    pt_lab = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by lab criteria:",len(pt_lab))

Number of patients included by lab criteria: 81377


In [5]:
pt_lab

Unnamed: 0,person_id,first_diag_datetime
0,826685,2015-10-31 07:40:17
1,1976535,2023-05-31 07:36:17
2,1697171,2023-02-01 09:07:17
3,392697,2018-07-20 06:30:08
4,4691677,2023-04-27 09:51:39
...,...,...
81372,1650238,2018-05-22 09:41:39
81373,1514698,2019-07-02 13:49:49
81374,4041356,2017-11-27 08:36:24
81375,1720866,2016-03-18 08:33:48


### Drug Criteria

In [6]:
with open('..\sql\inclusion\drug.sql') as file:
    sql_command = file.read()
    pt_drug = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by drug criteria:",len(pt_drug))

Number of patients included by drug criteria: 91588


In [7]:
pt_drug

Unnamed: 0,person_id,drug_source_value,first_drug_start_date,age_at_first_drug
0,461389,GLIT,2021-08-24,90.0
1,542642,METTFM5,2021-11-01,28.0
2,523543,METTF,2013-08-21,63.0
3,533518,GLITCM3,2013-06-16,78.0
4,501079,METTF,2015-11-07,81.0
...,...,...,...,...
91583,4654374,GLITP5,2022-08-10,81.0
91584,4681189,NOVIMF,2022-12-24,75.0
91585,4681232,METTDE1,2022-12-29,49.0
91586,4720329,SAXI,2024-01-22,22.0


### Merging all patient hn

In [8]:
all_data = pd.merge(pd.merge(pt_icd, pt_lab, on='person_id', how='outer'), pt_drug, on='person_id', how='outer')
print("Number of patients included by all criteria:",len(all_data))
all_data.head()

Number of patients included by all criteria: 172630


Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag,first_diag_datetime,drug_source_value,first_drug_start_date,age_at_first_drug
0,0,,NaT,,2015-11-11 11:42:32,,,
1,377893,,NaT,,2021-05-15 09:52:13,,,
2,378094,4193704.0,2018-12-21,49.0,NaT,,,
3,378170,,NaT,,2021-08-20 07:02:43,,,
4,378195,4193704.0,2021-02-27,39.0,NaT,,,


In [9]:
all_data['diag'] = (~all_data['condition_concept_id'].isna()).astype(int)
all_data['lab'] = (~all_data['first_diag_datetime'].isna()).astype(int)
all_data['drug'] = (~all_data['drug_source_value'].isna()).astype(int)

In [10]:
all_data['first_drug_start_date'] = all_data['first_drug_start_date'].astype('datetime64[ns]')

In [11]:
all_data['first_date'] = all_data[['first_diag', 'first_diag_datetime', 'first_drug_start_date']].min(axis=1)

In [12]:
all_data = all_data[['person_id', 'diag', 'lab', 'drug', 'first_date']]

In [19]:
# filter only pt. between June 2013 - Sep 2023
all_data_filter = all_data[all_data['first_date'] >= '2013-06-01']
all_data_filter = all_data_filter[all_data_filter['first_date'] <= '2023-09-30']

In [25]:
all_data_filter

Unnamed: 0,person_id,diag,lab,drug,first_date,criteria
0,0,0,1,0,2015-11-11 11:42:32,lab
1,377893,0,1,0,2021-05-15 09:52:13,lab
2,378094,1,0,0,2018-12-21 00:00:00,diag
3,378170,0,1,0,2021-08-20 07:02:43,lab
4,378195,1,0,0,2021-02-27 00:00:00,diag
...,...,...,...,...,...,...
172287,4790220,1,0,1,2023-09-29 00:00:00,diag++drug
172291,4790276,1,0,0,2023-09-30 00:00:00,diag
172292,4790308,1,0,1,2023-09-30 00:00:00,diag++drug
172628,4833480,0,0,1,2018-02-14 00:00:00,drug


In [26]:
all_data_filter.person_id.nunique()

170737

In [27]:
dx = ['diag' if x == 1 else '' for x in all_data['diag']]
lab =['lab' if x == 1 else '' for x in all_data['lab']]
rx = ['drug' if x == 1 else '' for x in all_data['drug']]
result = [i+' '+j+' '+k for i, j, k in zip(dx, lab, rx)]
result = [x.strip().replace(' ', '+') for x in result]

all_data['criteria'] = result


In [28]:
all_data['criteria'].value_counts()

criteria
diag             63646
diag+lab+drug    61964
diag++drug       20093
diag+lab         10000
drug              7514
lab               7396
lab+drug          2017
Name: count, dtype: int64

In [29]:
all_data_filter.to_csv('..\data\patient_list_28feb24.csv', index=False)