In [1]:
import pandas as pd
import sqlalchemy as sal
from dotenv import load_dotenv
import os
load_dotenv()

# Create a connection to the database.
# SERVER_DATABASE comes from the environment (or the .env file loaded above)
# and is placed directly after the "@" in the SQLAlchemy URL.
# NOTE(review): presumably of the form "<server>/<database>" - confirm against .env.
server_database = os.getenv("SERVER_DATABASE")
if not server_database:
    # os.getenv returns None when the variable is missing; the f-string below
    # would then embed the text "None" into the connection URL.
    raise RuntimeError(
        "Environment variable SERVER_DATABASE is not set; "
        "define it in the environment or in the .env file."
    )

# trusted_connection=yes: no user name or password is put into the URL.
engine = sal.create_engine(
    f'mssql+pyodbc://@{server_database}?trusted_connection=yes&driver=SQL+Server'
)
conn = engine.connect()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the patient cohort table from its CSV export
pt = pd.read_csv(filepath_or_buffer='../data/cohort_28feb24.csv')

In [3]:
# Keep only the three cohort columns used in this notebook
keep_cols = ['person_id', 'first_date', 'year_of_birth']
pt_df = pt[keep_cols]

# Compare the row count with the number of distinct person_id values;
# equal numbers mean every patient appears on exactly one row.
n_rows = len(pt_df)
n_patients = len(pt_df['person_id'].unique())
print(f"Total rows: {n_rows}")
print(f"Total number of pt.: {n_patients}")

Total rows: 167838
Total number of pt.: 167838


In [4]:
# Distinct patient ids as built-in Python values.
# .tolist() converts NumPy scalars to plain ints/strs. The batch queries further
# down build their SQL IN-list from str(list), and under NumPy >= 2 a list of
# NumPy scalars prints as "np.int64(1), ..." instead of "1, ...".
pt_list = pt_df['person_id'].unique().tolist()
print(len(pt_list))


167838


# Lab extraction

In [14]:
# Load the lab-extraction SQL template (the `with` block closes the file itself).
with open('../sql/feature_extraction/lab_person.sql', 'r') as f:
    sql_q = f.read()

# Due to memory problems the query is run in batches of patients,
# 10000 person ids at a time.
n_per_batch = 10000
# Ceiling division: a cohort size that is an exact multiple of n_per_batch must
# not produce a trailing empty batch (which would render an empty IN-list).
n_batches = (len(pt_list) + n_per_batch - 1) // n_per_batch

# Per-batch results are collected here and concatenated once after the loop;
# concatenating inside the loop would re-copy all earlier rows on every batch.
lab_batches = []
n_records = 0  # running total of fetched rows, used for progress output only

for i in range(n_batches):
    print("starting batch ", i+1, " of ", n_batches, "subject ", i*n_per_batch, " to ", (i+1)*n_per_batch - 1)
    # str(list) gives "[a, b, c]"; stripping the brackets leaves "a, b, c",
    # which replaces the INSERT_PERSON_ID_LIST placeholder in the template.
    pt_sub = str(pt_list[i*n_per_batch:(i+1)*n_per_batch])[1:-1]
    sql_q_sub = sql_q.replace('INSERT_PERSON_ID_LIST', pt_sub)

    if not lab_batches:
        print("no exisiting dataframe, creating new one")

    next_lab_df = pd.read_sql(sql_q_sub, conn)
    lab_batches.append(next_lab_df)
    n_records += len(next_lab_df)
    print("finising batch ", i+1, " of ", n_batches, 'number of records',n_records)

# One concat over all batches keeps the same row order and index values as
# concatenating batch by batch; an empty cohort yields an empty DataFrame.
lab_df = pd.concat(lab_batches) if lab_batches else pd.DataFrame()


starting batch  1  of  17 subject  0  to  9999
no exisiting dataframe, creating new one
finising batch  1  of  17 number of records 283254
starting batch  2  of  17 subject  10000  to  19999
finising batch  2  of  17 number of records 563049
starting batch  3  of  17 subject  20000  to  29999
finising batch  3  of  17 number of records 842662
starting batch  4  of  17 subject  30000  to  39999
finising batch  4  of  17 number of records 1103157
starting batch  5  of  17 subject  40000  to  49999
finising batch  5  of  17 number of records 1347699
starting batch  6  of  17 subject  50000  to  59999
finising batch  6  of  17 number of records 1595090
starting batch  7  of  17 subject  60000  to  69999
finising batch  7  of  17 number of records 1831631
starting batch  8  of  17 subject  70000  to  79999
finising batch  8  of  17 number of records 2053692
starting batch  9  of  17 subject  80000  to  89999
finising batch  9  of  17 number of records 2263415
starting batch  10  of  17 subj

In [17]:
# Write the combined lab extract to CSV, leaving out the DataFrame index
lab_df.to_csv(path_or_buf='../data/lab/lab_person_22mar24.csv', index=False)

In [7]:
# Read the lab extract back from the CSV file written above
lab_df = pd.read_csv(filepath_or_buffer='../data/lab/lab_person_22mar24.csv')

# Comorbidities

In [20]:
# Load the comorbidity SQL template (the `with` block closes the file itself).
with open('../sql/feature_extraction/comorb.sql', 'r') as f:
    sql_q = f.read()

# Due to memory problems the query is run in batches of patients,
# 10000 person ids at a time.
n_per_batch = 10000
# Ceiling division: a cohort size that is an exact multiple of n_per_batch must
# not produce a trailing empty batch (which would render an empty IN-list).
n_batches = (len(pt_list) + n_per_batch - 1) // n_per_batch

# Per-batch results are collected here and concatenated once after the loop;
# concatenating inside the loop would re-copy all earlier rows on every batch.
comorb_batches = []
n_records = 0  # running total of fetched rows, used for progress output only

for i in range(n_batches):
    print("starting batch ", i+1, " of ", n_batches, "subject ", i*n_per_batch, " to ", (i+1)*n_per_batch - 1)
    # str(list) gives "[a, b, c]"; stripping the brackets leaves "a, b, c",
    # which replaces the INSERT_PERSON_ID_LIST placeholder in the template.
    pt_sub = str(pt_list[i*n_per_batch:(i+1)*n_per_batch])[1:-1]
    sql_q_sub = sql_q.replace('INSERT_PERSON_ID_LIST', pt_sub)

    if not comorb_batches:
        print("no exisiting dataframe, creating new one")

    next_comorb_df = pd.read_sql(sql_q_sub, conn)
    comorb_batches.append(next_comorb_df)
    n_records += len(next_comorb_df)
    print("finising batch ", i+1, " of ", n_batches, 'number of records',n_records)

# One concat over all batches keeps the same row order and index values as
# concatenating batch by batch; an empty cohort yields an empty DataFrame.
comorb_df = pd.concat(comorb_batches) if comorb_batches else pd.DataFrame()


starting batch  1  of  17 subject  0  to  9999
no exisiting dataframe, creating new one


finising batch  1  of  17 number of records 10000
starting batch  2  of  17 subject  10000  to  19999
finising batch  2  of  17 number of records 20000
starting batch  3  of  17 subject  20000  to  29999
finising batch  3  of  17 number of records 30000
starting batch  4  of  17 subject  30000  to  39999
finising batch  4  of  17 number of records 40000
starting batch  5  of  17 subject  40000  to  49999
finising batch  5  of  17 number of records 50000
starting batch  6  of  17 subject  50000  to  59999
finising batch  6  of  17 number of records 60000
starting batch  7  of  17 subject  60000  to  69999
finising batch  7  of  17 number of records 70000
starting batch  8  of  17 subject  70000  to  79999
finising batch  8  of  17 number of records 80000
starting batch  9  of  17 subject  80000  to  89999
finising batch  9  of  17 number of records 90000
starting batch  10  of  17 subject  90000  to  99999
finising batch  10  of  17 number of records 100000
starting batch  11  of  17 su

In [None]:
# Persist the comorbidity table as a pickle.
# Forward slashes are used throughout: a backslash in a normal string literal
# starts an escape sequence ("\c" is an invalid one), and "/" works on every OS.
comorb_df.to_pickle('../data/comorb/comorb_person_22mar24.pkl')

In [8]:
# Reload the comorbidity table from its pickle file.
# Forward slashes avoid the invalid "\c" escape sequence and work on every OS.
# Unpickling can execute arbitrary code, so only load files this project wrote itself.
comorb_df = pd.read_pickle('../data/comorb/comorb_person_22mar24.pkl')