In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
load_dotenv()

# Create a connection to the database
server_database = os.getenv("SERVER_DATABASE")
engine = sal.create_engine(f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server')
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Diagnosis criteria

In [2]:
with open('..\sql\inclusion\icd.sql') as file:
    sql_command = file.read()
    pt_icd = pd.read_sql(sql_command, conn)
    file.close()

print("Number of patients included by OMOP concept IDs:",len(pt_icd))

Number of patients included by OMOP concept IDs: 155702


### Lab Criteria

In [3]:
with open('..\sql\inclusion\lab.sql') as file:
    sql_command = file.read()
    pt_lab = pd.read_sql(sql_command, conn) 
    file.close()
print("Number of patients included by lab criteria:",len(pt_lab))

Number of patients included by lab criteria: 83147


### Drug Criteria

In [4]:
with open('..\sql\inclusion\drug.sql') as file:
    sql_command = file.read()
    pt_drug = pd.read_sql(sql_command, conn)
    file.close()
print("Number of patients included by drug criteria:",len(pt_drug))

Number of patients included by drug criteria: 97812


### Merging all patient hn

In [5]:
all_data = pd.merge(pd.merge(pt_icd, pt_lab, on='person_id', how='outer'), pt_drug, on='person_id', how='outer')
print("Number of patients included by all criteria:",len(all_data))


Number of patients included by all criteria: 175319


In [6]:
all_data['diag'] = (~all_data['condition_concept_id'].isna()).astype(int)
all_data['lab'] = (~all_data['first_diag_datetime'].isna()).astype(int)
all_data['drug'] = (~all_data['drug_source_value'].isna()).astype(int)

In [7]:
all_data['first_drug_start_date'] = pd.to_datetime(all_data['first_drug_start_date'])
all_data['first_diag'] = pd.to_datetime(all_data['first_diag'])
all_data['first_diag_datetime'] = pd.to_datetime(all_data['first_diag_datetime'])

In [8]:
all_data['first_date'] = all_data[['first_diag', 'first_diag_datetime', 'first_drug_start_date']].min(axis=1, skipna=True)

all_data = all_data.rename(columns={
    "first_drug_start_date": "first_drug_date",
    "first_diag": "first_diag_date",
    "first_diag_datetime": "first_lab_date"
    })

In [9]:
all_data = all_data[['person_id', 'diag', 'lab', 'drug', 'first_date', "first_drug_date", "first_diag_date", "first_lab_date"]]

# create column to track which criteria was met first
all_data['first_criteria'] = all_data[['first_diag_date', 'first_lab_date', 'first_drug_date']].idxmin(axis=1)
all_data['first_criteria'] = all_data['first_criteria'].apply(lambda x: x.split('_')[1])



In [10]:
# filter only pt. between June 2013 - Sep 2023
all_data_filter = all_data[all_data['first_date'] >= '2013-06-01']
all_data_filter = all_data_filter[all_data_filter['first_date'] <= '2023-09-30']

In [11]:
all_data_filter.person_id.nunique()

175319

In [12]:
all_data_filter.first_criteria.value_counts()

first_criteria
diag    139391
drug     19321
lab      16607
Name: count, dtype: int64

In [13]:
dx = ['diag' if x == 1 else '' for x in all_data['diag']]
lab =['lab' if x == 1 else '' for x in all_data['lab']]
rx = ['drug' if x == 1 else '' for x in all_data['drug']]
result = [i+' '+j+' '+k for i, j, k in zip(dx, lab, rx)]
result = [x.strip().replace(' ', '+') for x in result]

all_data_filter['criteria'] = result


In [14]:
all_data_filter['criteria'].value_counts()

criteria
diag+lab+drug    58886
diag             55425
diag++drug       28368
diag+lab         13023
lab               9059
drug              8379
lab+drug          2179
Name: count, dtype: int64

In [15]:
all_data_filter.to_csv('..\data\patient_list_3oct24.csv', index=False)