In [27]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

### Diagnosis criteria

In [28]:
with open('..\sql\inclusion\icd.sql') as file:
    sql_command = file.read()
    pt_icd = pd.read_sql(sql_command, conn)
    file.close()

print("Number of patients included by OMOP concept IDs:",len(pt_icd))

Number of patients included by OMOP concept IDs: 155703


In [29]:
pt_icd.head()

Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag
0,475514,4193704,2013-06-06,79
1,489684,4193704,2023-01-05,44
2,489754,4193704,2013-08-15,79
3,489777,4193704,2013-07-29,73
4,441954,4193704,2013-07-08,56


### Lab Criteria

In [30]:
with open('..\sql\inclusion\lab.sql') as file:
    sql_command = file.read()
    pt_lab = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by lab criteria:",len(pt_lab))

Number of patients included by lab criteria: 78209


In [31]:
pt_lab

Unnamed: 0,person_id,first_diag_datetime
0,3409284,2016-10-04 09:11:50
1,2691902,2019-05-23 06:58:47
2,3625376,2016-01-26 17:12:26
3,965016,2016-04-25 07:40:08
4,2866895,2016-04-19 08:25:59
...,...,...
78204,1201706,2015-10-22 08:32:58
78205,3668015,2021-02-27 09:29:51
78206,4041356,2017-11-27 08:36:24
78207,3182938,2018-05-30 08:03:17


### Drug Criteria

In [32]:
with open('..\sql\inclusion\drug.sql') as file:
    sql_command = file.read()
    pt_drug = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by drug criteria:",len(pt_drug))

Number of patients included by drug criteria: 89066


In [33]:
pt_drug

Unnamed: 0,person_id,drug_source_value,first_drug_start_date,age_at_first_drug
0,437794,GLIT,2020-02-06,74
1,564801,METTF,2015-09-21,79
2,564840,NOVIMF,2013-06-21,80
3,437926,GLUT5,2013-08-17,49
4,437994,JANTM50,2013-09-07,63
...,...,...,...,...
89061,4772155,TRATD255,2023-09-20,60
89062,4750920,GLITP5,2023-03-15,43
89063,4769447,DIATM60,2023-07-04,55
89064,4785966,GLITP5,2023-09-14,77


### Merging all patient hn

In [45]:
all_data = pd.merge(pd.merge(pt_icd, pt_lab, on='person_id', how='outer'), pt_drug, on='person_id', how='outer')
print("Number of patients included by all criteria:",len(all_data))
all_data.head()

Number of patients included by all criteria: 169642


Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag,first_diag_datetime,drug_source_value,first_drug_start_date,age_at_first_drug
0,0,,NaT,,2015-11-11 11:42:32,,,
1,377893,,NaT,,2021-05-15 09:52:13,,,
2,378094,4193704.0,2018-12-21,49.0,NaT,,,
3,378195,4193704.0,2021-02-27,39.0,NaT,,,
4,378311,4193704.0,2013-07-06,74.0,NaT,METTF,2013-07-21,74.0


In [35]:
all_data['diag'] = (~all_data['condition_concept_id'].isna()).astype(int)
all_data['lab'] = (~all_data['first_diag_datetime'].isna()).astype(int)
all_data['drug'] = (~all_data['drug_source_value'].isna()).astype(int)

In [36]:
all_data['first_drug_start_date'] = all_data['first_drug_start_date'].astype('datetime64[ns]')

In [37]:
all_data['first_date'] = all_data[['first_diag', 'first_diag_datetime', 'first_drug_start_date']].min(axis=1)

In [38]:
all_data = all_data[['person_id', 'diag', 'lab', 'drug', 'first_date']]

In [39]:
# filter only pt. between June 2013 - Sep 2023
all_data_filter = all_data[all_data['first_date'] >= '2013-06-01']
all_data_filter = all_data_filter[all_data_filter['first_date'] <= '2023-09-30']

In [40]:
all_data_filter

Unnamed: 0,person_id,diag,lab,drug,first_date
0,0,0,1,0,2015-11-11 11:42:32
1,377893,0,1,0,2021-05-15 09:52:13
2,378094,1,0,0,2018-12-21 00:00:00
3,378195,1,0,0,2021-02-27 00:00:00
4,378311,1,0,1,2013-07-06 00:00:00
...,...,...,...,...,...
169637,4790220,1,0,0,2023-09-29 00:00:00
169638,4790276,1,0,0,2023-09-30 00:00:00
169639,4790308,1,0,0,2023-09-30 00:00:00
169640,4834464,0,0,1,2018-02-14 00:00:00


In [46]:
all_data_filter.person_id.nunique()

169642

In [42]:
dx = ['diag' if x == 1 else '' for x in all_data['diag']]
lab =['lab' if x == 1 else '' for x in all_data['lab']]
rx = ['drug' if x == 1 else '' for x in all_data['drug']]
result = [i+' '+j+' '+k for i, j, k in zip(dx, lab, rx)]
result = [x.strip().replace(' ', '+') for x in result]

all_data_filter['criteria'] = result


In [43]:
all_data_filter['criteria'].value_counts()

criteria
diag             65094
diag+lab+drug    60973
diag++drug       20364
diag+lab          9272
lab               6210
drug              5975
lab+drug          1754
Name: count, dtype: int64

In [44]:
all_data_filter.to_csv('..\data\patient_list_25mar24.csv', index=False)