In [36]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

### Diagnosis criteria

In [37]:
with open('..\sql\inclusion\icd.sql') as file:
    sql_command = file.read()
    pt_icd = pd.read_sql(sql_command, conn)
    file.close()

print("Number of patients included by OMOP concept IDs:",len(pt_icd))

Number of patients included by OMOP concept IDs: 155703


In [38]:
pt_icd.head()

Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag
0,4333959,4193704,2020-07-24,68
1,4333969,4193704,2020-06-06,66
2,4334053,4193704,2020-05-01,76
3,4332902,4193704,2021-01-14,69
4,4333055,4193704,2020-06-19,66


### Lab Criteria

In [39]:
with open('..\sql\inclusion\lab.sql') as file:
    sql_command = file.read()
    pt_lab = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by lab criteria:",len(pt_lab))

Number of patients included by lab criteria: 78209


In [40]:
pt_lab

Unnamed: 0,person_id,first_diag_datetime
0,1771964,2020-10-07 08:58:05
1,1013345,2016-08-01 16:49:28
2,964117,2018-06-25 09:13:30
3,2454983,2015-10-27 09:03:03
4,1271914,2015-11-03 09:03:06
...,...,...
78204,1160121,2017-03-22 07:48:36
78205,2163929,2018-01-19 07:20:52
78206,2302831,2019-02-23 07:41:25
78207,3447093,2019-06-07 08:34:25


### Drug Criteria

In [41]:
with open('..\sql\inclusion\drug.sql') as file:
    sql_command = file.read()
    pt_drug = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by drug criteria:",len(pt_drug))

Number of patients included by drug criteria: 91501


In [42]:
pt_drug

Unnamed: 0,person_id,drug_source_value,first_drug_start_date,age_at_first_drug
0,4277315,PIOT15,2022-09-29,67.0
1,4368800,METTFM5,2020-11-10,65.0
2,4466898,METTFM5,2021-08-20,66.0
3,4612063,GLITP5,2022-05-31,67.0
4,4644683,AMAT3,2022-06-18,67.0
...,...,...,...,...
91496,2071268,HUMIC1,2015-06-13,71.0
91497,2377243,DIATM60,2013-09-17,48.0
91498,2470473,JARTD5,2018-11-29,63.0
91499,2848177,GLIT,2013-06-27,68.0


### Merging all patient hn

In [43]:
all_data = pd.merge(pd.merge(pt_icd, pt_lab, on='person_id', how='outer'), pt_drug, on='person_id', how='outer')
print("Number of patients included by all criteria:",len(all_data))
all_data.head()

Number of patients included by all criteria: 171242


Unnamed: 0,person_id,condition_concept_id,first_diag,age_at_first_diag,first_diag_datetime,drug_source_value,first_drug_start_date,age_at_first_drug
0,0,,NaT,,2015-11-11 11:42:32,,,
1,377893,,NaT,,2021-05-15 09:52:13,,,
2,378094,4193704.0,2018-12-21,49.0,NaT,,,
3,378195,4193704.0,2021-02-27,39.0,NaT,,,
4,378311,4193704.0,2013-07-06,74.0,NaT,METTF,2013-07-21,74.0


In [44]:
all_data['diag'] = (~all_data['condition_concept_id'].isna()).astype(int)
all_data['lab'] = (~all_data['first_diag_datetime'].isna()).astype(int)
all_data['drug'] = (~all_data['drug_source_value'].isna()).astype(int)

In [52]:
all_data['first_drug_start_date'] = all_data['first_drug_start_date'].astype('datetime64[ns]')

In [53]:
all_data['first_date'] = all_data[['first_diag', 'first_diag_datetime', 'first_drug_start_date']].min(axis=1)

In [55]:
all_data = all_data[['person_id', 'diag', 'lab', 'drug', 'first_date']]

In [56]:
all_data

Unnamed: 0,person_id,diag,lab,drug,first_date
0,0,0,1,0,2015-11-11 11:42:32
1,377893,0,1,0,2021-05-15 09:52:13
2,378094,1,0,0,2018-12-21 00:00:00
3,378195,1,0,0,2021-02-27 00:00:00
4,378311,1,0,1,2013-07-06 00:00:00
...,...,...,...,...,...
171237,4825363,0,0,1,2024-02-17 00:00:00
171238,4825466,0,0,1,2024-02-18 00:00:00
171239,4825638,0,0,1,2024-02-19 00:00:00
171240,4826969,0,0,1,2018-02-14 00:00:00


In [57]:
dx = ['diag' if x == 1 else '' for x in all_data['diag']]
lab =['lab' if x == 1 else '' for x in all_data['lab']]
rx = ['drug' if x == 1 else '' for x in all_data['drug']]
result = [i+' '+j+' '+k for i, j, k in zip(dx, lab, rx)]
result = [x.strip().replace(' ', '+') for x in result]

all_data['criteria'] = result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['criteria'] = result


In [58]:
all_data['criteria'].value_counts()

criteria
diag             64709
diag+lab+drug    61281
diag++drug       20749
diag+lab          8964
drug              7575
lab               6068
lab+drug          1896
Name: count, dtype: int64

In [59]:
all_data.to_csv('..\data\patient_list.csv', index=False)