In [67]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

### Diagnosis criteria

In [68]:
with open('..\sql\inclusion\icd.sql') as file:
    sql_command = file.read()
    pt_icd = pd.read_sql(sql_command, conn)
    file.close()

print("Number of patients included by OMOP concept IDs:",len(pt_icd))

Number of patients included by OMOP concept IDs: 155703


In [69]:
pt_icd.head()

Unnamed: 0,person_id,condition_concept_id,age_at_first_diag
0,3826218,4193704,69
1,3832062,4193704,73
2,3965186,4193704,72
3,3942587,4193704,70
4,4312571,4193704,73


### Lab Criteria

In [70]:
with open('..\sql\inclusion\lab.sql') as file:
    sql_command = file.read()
    pt_lab = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by lab criteria:",len(pt_lab))

Number of patients included by lab criteria: 67917


In [71]:
pt_lab

Unnamed: 0,person_id,lab_diag
0,1198004,0
1,3246917,0
2,2334838,0
3,2029750,0
4,2550950,0
...,...,...
67912,4516184,1
67913,1287379,1
67914,3022623,1
67915,2621575,1


### Drug Criteria

In [72]:
with open('..\sql\inclusion\drug.sql') as file:
    sql_command = file.read()
    pt_drug = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by drug criteria:",len(pt_drug))

Number of patients included by drug criteria: 91501


In [73]:
pt_drug

Unnamed: 0,person_id,drug_source_value,age_at_first_drug
0,4386123,GLIT,79.0
1,4401454,METTFM5,68.0
2,4401494,METTFM5,77.0
3,4401553,METTFM5,74.0
4,4401663,METTFM5,86.0
...,...,...,...
91496,2462455,METTFM5,54.0
91497,3237512,METTF8,45.0
91498,3485128,GENIR,45.0
91499,4023811,METTFM5,52.0


### Merging all patient hn

In [80]:
all_data = pd.merge(pd.merge(pt_icd, pt_lab, on='person_id', how='outer'), pt_drug, on='person_id', how='outer')
print("Number of patients included by all criteria:",len(all_data))
all_data.head()

Number of patients included by all criteria: 169684


Unnamed: 0,person_id,condition_concept_id,age_at_first_diag,lab_diag,drug_source_value,age_at_first_drug
0,0,,,0.0,,
1,245616,,,2.0,,
2,377893,,,2.0,,
3,378094,4193704.0,49.0,,,
4,378195,4193704.0,39.0,,,


In [81]:
all_data['diag'] = (~all_data['condition_concept_id'].isna()).astype(int)
all_data['lab'] = (~all_data['lab_diag'].isna()).astype(int)
all_data['drug'] = (~all_data['drug_source_value'].isna()).astype(int)

In [82]:
all_data = all_data[['person_id', 'diag', 'lab', 'drug']]

In [83]:
dx = ['diag' if x == 1 else '' for x in all_data['diag']]
lab =['lab' if x == 1 else '' for x in all_data['lab']]
rx = ['drug' if x == 1 else '' for x in all_data['drug']]
result = [i+' '+j+' '+k for i, j, k in zip(dx, lab, rx)]
result = [x.strip().replace(' ', '+') for x in result]

all_data['criteria'] = result


In [84]:
all_data['criteria'].value_counts()

criteria
diag             67175
diag+lab+drug    55410
diag++drug       26620
drug              7972
diag+lab          6498
lab               4510
lab+drug          1499
Name: count, dtype: int64

In [None]:
all_data.to_csv('..\data\patient_list.csv')