In [14]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

### Diagnosis criteria

In [15]:
with open('..\sql\inclusion\icd.sql') as file:
    sql_command = file.read()
    pt_icd = pd.read_sql(sql_command, conn)
    file.close()

print("Number of patients included by OMOP concept IDs:",len(pt_icd))

Number of patients included by OMOP concept IDs: 155703


In [16]:
pt_icd.head()

Unnamed: 0,person_id,condition_concept_id,age_at_first_diag
0,667547,4193704,70
1,679135,4193704,64
2,1680262,4193704,66
3,1483795,4193704,64
4,1561394,4193704,68


### Lab Criteria

In [17]:
with open('..\sql\inclusion\lab.sql') as file:
    sql_command = file.read()
    pt_lab = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by lab criteria:",len(pt_lab))

Number of patients included by lab criteria: 66485


In [18]:
pt_lab

Unnamed: 0,person_id,lab_diag
0,3583934,0
1,655501,0
2,1005591,0
3,1534317,0
4,1581658,0
...,...,...
66480,446319,1
66481,1950653,1
66482,3368614,1
66483,4634169,1


### Drug Criteria

In [19]:
with open('..\sql\inclusion\drug.sql') as file:
    sql_command = file.read()
    pt_drug = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by drug criteria:",len(pt_drug))

Number of patients included by drug criteria: 91501


In [20]:
pt_drug

Unnamed: 0,person_id,drug_source_value,age_at_first_drug
0,4386123,METTFM5,79.0
1,4401454,METTFM5,68.0
2,4401494,METTFM5,77.0
3,4401553,METTFM5,74.0
4,4401663,METTFM5,86.0
...,...,...,...
91496,3668156,METTFM5,66.0
91497,4020652,PIOT30,51.0
91498,4358071,METTFM5,49.0
91499,2663526,GLITB,47.0


### Merging all patient hn

In [21]:
all_data = pd.merge(pd.merge(pt_icd, pt_lab, on='person_id', how='outer'), pt_drug, on='person_id', how='outer')
print("Number of patients included by all criteria:",len(all_data))
all_data.head()

Number of patients included by all criteria: 169101


Unnamed: 0,person_id,condition_concept_id,age_at_first_diag,lab_diag,drug_source_value,age_at_first_drug
0,0,,,1.0,,
1,377893,,,2.0,,
2,378094,4193704.0,49.0,,,
3,378195,4193704.0,39.0,,,
4,378311,4193704.0,74.0,,METTF,74.0


In [22]:
all_data['diag'] = (~all_data['condition_concept_id'].isna()).astype(int)
all_data['lab'] = (~all_data['lab_diag'].isna()).astype(int)
all_data['drug'] = (~all_data['drug_source_value'].isna()).astype(int)

In [23]:
all_data = all_data[['person_id', 'diag', 'lab', 'drug']]

In [24]:
dx = ['diag' if x == 1 else '' for x in all_data['diag']]
lab =['lab' if x == 1 else '' for x in all_data['lab']]
rx = ['drug' if x == 1 else '' for x in all_data['drug']]
result = [i+' '+j+' '+k for i, j, k in zip(dx, lab, rx)]
result = [x.strip().replace(' ', '+') for x in result]

all_data['criteria'] = result


In [25]:
all_data['criteria'].value_counts()

criteria
diag             67778
diag+lab+drug    55208
diag++drug       26822
drug              8016
diag+lab          5895
lab               3927
lab+drug          1455
Name: count, dtype: int64

In [26]:
# all_data.to_csv('..\data\patient_list.csv')