# This is the notebook for quick experimentations

In [1]:
import pandas as pd
import logging
import yaml

In [2]:

# Create a StreamHandler for console output
console_handler = logging.StreamHandler()

# Set the level and format for the handler
console_handler.setLevel(logging.DEBUG)
console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(console_formatter)

# Get the root logger and attach the handler
logger = logging.getLogger()  # Root logger
logger.setLevel(logging.DEBUG)  # Set logger level
logger.addHandler(console_handler)

# Example log message
logger.debug("This is a DEBUG message")


2024-11-29 17:18:53,718 - DEBUG - This is a DEBUG message


In [17]:
filepath_csv = '../data/raw_data/encounters.csv'
df_encounters_raw = pd.read_csv(filepath_csv)

In [18]:
df_encounters_raw.head(2)

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,294d0dab-907e-8fce-7a47-0c0d322a5734,2012-04-01T09:04:48Z,2012-04-01T10:02:47Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,1567.0,87.2,,
1,2ccec874-cbaa-e280-7abb-f2bc2b603961,2013-04-07T09:04:48Z,2013-04-07T09:55:49Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,704.2,0.0,,


In [5]:
# Describe the dataframe
df_encounters_raw.describe()

Unnamed: 0,CODE,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE
count,7049.0,7049.0,7049.0,7049.0,4613.0
mean,128815100000.0,108.447557,2236.399206,1541.488302,862637400000.0
std,7632098000000.0,27.050623,4834.429394,3869.689828,14031640000000.0
min,1505002.0,75.0,75.0,0.0,3718001.0
25%,185347000.0,85.55,535.87,0.0,66383010.0
50%,185347000.0,85.55,879.52,436.6,125605000.0
75%,308335000.0,136.8,1791.03,1103.8,431857000.0
max,453131000000000.0,146.18,67764.63,56814.68,442571000000000.0


In [6]:
print(len(df_encounters_raw))

7049


In [7]:
print(df_encounters_raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   7049 non-null   object 
 1   START                7049 non-null   object 
 2   STOP                 7049 non-null   object 
 3   PATIENT              7049 non-null   object 
 4   ORGANIZATION         7049 non-null   object 
 5   PROVIDER             7049 non-null   object 
 6   PAYER                7049 non-null   object 
 7   ENCOUNTERCLASS       7049 non-null   object 
 8   CODE                 7049 non-null   int64  
 9   DESCRIPTION          7049 non-null   object 
 10  BASE_ENCOUNTER_COST  7049 non-null   float64
 11  TOTAL_CLAIM_COST     7049 non-null   float64
 12  PAYER_COVERAGE       7049 non-null   float64
 13  REASONCODE           4613 non-null   float64
 14  REASONDESCRIPTION    4613 non-null   object 
dtypes: float64(4), int64(1), object(10)
me

In [8]:
# Test logging 
log_info = df_encounters_raw.info()
logger.debug(f'df_info: {log_info}')

2024-11-29 17:18:56,661 - DEBUG - df_info: None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   7049 non-null   object 
 1   START                7049 non-null   object 
 2   STOP                 7049 non-null   object 
 3   PATIENT              7049 non-null   object 
 4   ORGANIZATION         7049 non-null   object 
 5   PROVIDER             7049 non-null   object 
 6   PAYER                7049 non-null   object 
 7   ENCOUNTERCLASS       7049 non-null   object 
 8   CODE                 7049 non-null   int64  
 9   DESCRIPTION          7049 non-null   object 
 10  BASE_ENCOUNTER_COST  7049 non-null   float64
 11  TOTAL_CLAIM_COST     7049 non-null   float64
 12  PAYER_COVERAGE       7049 non-null   float64
 13  REASONCODE           4613 non-null   float64
 14  REASONDESCRIPTION    4613 non-null   object 
dtypes: float64(4), int64(1), object(10)
me

In [9]:
# Fix Column names 
column_names = df_encounters_raw.columns.to_list()
print(column_names)

['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PROVIDER', 'PAYER', 'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST', 'TOTAL_CLAIM_COST', 'PAYER_COVERAGE', 'REASONCODE', 'REASONDESCRIPTION']


In [10]:
# define Column mappings 
# This is manual but can be automated
'''
column_mapping_map = {
    'Id': {'normalized_colname': 'id', 'type': 'str'},
    'START': {'normalized_colname': 'start_time', 'type': 'datetime64[ns]'},
    'STOP': {'normalized_colname': 'stop_time', 'type': 'datetime64[ns]'},
    'PATIENT': {'normalized_colname': 'patient_id', 'type': 'str'},
    'ORGANIZATION': {'normalized_colname': 'organization_id', 'type': 'str'},
    'PROVIDER': {'normalized_colname': 'provider_id', 'type': 'str'},
    'PAYER': {'normalized_colname': 'payer_id', 'type': 'str'},
    'ENCOUNTERCLASS': {'normalized_colname': 'encounter_class', 'type': 'str'},
    'CODE': {'normalized_colname': 'code', 'type': 'str'},
    'DESCRIPTION': {'normalized_colname': 'description', 'type': 'str'},
    'BASE_ENCOUNTER_COST': {'normalized_colname': 'base_encounter_cost', 'type': 'float'},
    'TOTAL_CLAIM_COST': {'normalized_colname': 'total_claim_cost', 'type': 'float'},
    'PAYER_COVERAGE': {'normalized_colname': 'payer_coverage', 'type': 'float'},
    'REASONCODE': {'normalized_colname': 'reason_code', 'type': 'str'},
    'REASONDESCRIPTION': {'normalized_colname': 'reason_description', 'type': 'str'}
}
print(column_mapping_map)
''' 

# Put them in YAML

"\ncolumn_mapping_map = {\n    'Id': {'normalized_colname': 'id', 'type': 'str'},\n    'START': {'normalized_colname': 'start_time', 'type': 'datetime64[ns]'},\n    'STOP': {'normalized_colname': 'stop_time', 'type': 'datetime64[ns]'},\n    'PATIENT': {'normalized_colname': 'patient_id', 'type': 'str'},\n    'ORGANIZATION': {'normalized_colname': 'organization_id', 'type': 'str'},\n    'PROVIDER': {'normalized_colname': 'provider_id', 'type': 'str'},\n    'PAYER': {'normalized_colname': 'payer_id', 'type': 'str'},\n    'ENCOUNTERCLASS': {'normalized_colname': 'encounter_class', 'type': 'str'},\n    'CODE': {'normalized_colname': 'code', 'type': 'str'},\n    'DESCRIPTION': {'normalized_colname': 'description', 'type': 'str'},\n    'BASE_ENCOUNTER_COST': {'normalized_colname': 'base_encounter_cost', 'type': 'float'},\n    'TOTAL_CLAIM_COST': {'normalized_colname': 'total_claim_cost', 'type': 'float'},\n    'PAYER_COVERAGE': {'normalized_colname': 'payer_coverage', 'type': 'float'},\n    

In [11]:
# Load the mappings in dictionary

YAML_location = '../config/encounters.yaml' 
with open(YAML_location, "r") as file:
    dict_column_mappings = yaml.safe_load(file)

# Print the dictionary
print(type(dict_column_mappings))
print('-'*5)
print(dict_column_mappings.keys())
print('-'*5)
print(dict_column_mappings['encounters'].keys())
print('-'*5)
print(dict_column_mappings['encounters'])


<class 'dict'>
-----
dict_keys(['encounters'])
-----
dict_keys(['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PROVIDER', 'PAYER', 'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST', 'TOTAL_CLAIM_COST', 'PAYER_COVERAGE', 'REASONCODE', 'REASONDESCRIPTION'])
-----
{'Id': {'normalised_colname': 'id', 'type': 'str'}, 'START': {'normalised_colname': 'start_time', 'type': 'datetimestamp'}, 'STOP': {'normalised_colname': 'stop_time', 'type': 'datetimestamp'}, 'PATIENT': {'normalised_colname': 'patient_id', 'type': 'str'}, 'ORGANIZATION': {'normalised_colname': 'organization_id', 'type': 'str'}, 'PROVIDER': {'normalised_colname': 'provider_id', 'type': 'str'}, 'PAYER': {'normalised_colname': 'payer_id', 'type': 'str'}, 'ENCOUNTERCLASS': {'normalised_colname': 'encounter_class', 'type': 'str'}, 'CODE': {'normalised_colname': 'code', 'type': 'str'}, 'DESCRIPTION': {'normalised_colname': 'description', 'type': 'str'}, 'BASE_ENCOUNTER_COST': {'normalised_colname': 'base_encounter_c

In [12]:
print(df_encounters_raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   7049 non-null   object 
 1   START                7049 non-null   object 
 2   STOP                 7049 non-null   object 
 3   PATIENT              7049 non-null   object 
 4   ORGANIZATION         7049 non-null   object 
 5   PROVIDER             7049 non-null   object 
 6   PAYER                7049 non-null   object 
 7   ENCOUNTERCLASS       7049 non-null   object 
 8   CODE                 7049 non-null   int64  
 9   DESCRIPTION          7049 non-null   object 
 10  BASE_ENCOUNTER_COST  7049 non-null   float64
 11  TOTAL_CLAIM_COST     7049 non-null   float64
 12  PAYER_COVERAGE       7049 non-null   float64
 13  REASONCODE           4613 non-null   float64
 14  REASONDESCRIPTION    4613 non-null   object 
dtypes: float64(4), int64(1), object(10)
me

In [19]:
# Standardizing Column Names
def func_rename_and_cast_columns(df, column_mappings):
    list_unnormalised_colnames = df.columns.to_list()
    for unnormalised_colname in list_unnormalised_colnames:
        normalised_colname = column_mappings[unnormalised_colname]['normalised_colname']
        col_type = column_mappings[unnormalised_colname]['type']
        print(f"renaming column: {unnormalised_colname} to {normalised_colname} and casting type to: {col_type}")
        df.rename(columns={unnormalised_colname: normalised_colname}, inplace=True)
        if col_type == 'str':
            # Fill NAs and empty strings to "unknowns"
            if df[normalised_colname].isnull().any() or df[normalised_colname].isna().any():
                print(f"column: {normalised_colname} contains null values thus filling with unknown")
                df[normalised_colname] = df[normalised_colname].fillna("Unknown")
            df[normalised_colname] = df[normalised_colname].astype(str)
            print(f"{normalised_colname} casted to type(str)")
        if col_type == 'datetimestamp':
            df[normalised_colname] = pd.to_datetime(df[normalised_colname], errors='coerce')
            print(f"{normalised_colname} casted to datetime")
        if col_type == 'numeric':
            df[normalised_colname] = pd.to_numeric(df[normalised_colname], errors='coerce')
            print(f"{normalised_colname} casted to numeric")
        print('-'*5)

In [20]:
# df_encounters_raw['id'].isnull().any()
# df_encounters_raw['id'].isna().any()

In [21]:
func_rename_and_cast_columns(df_encounters_raw, dict_column_mappings['encounters'])

renaming column: Id to id and casting type to: str
id casted to type(str)
-----
renaming column: START to start_time and casting type to: datetimestamp
start_time casted to datetime
-----
renaming column: STOP to stop_time and casting type to: datetimestamp
stop_time casted to datetime
-----
renaming column: PATIENT to patient_id and casting type to: str
patient_id casted to type(str)
-----
renaming column: ORGANIZATION to organization_id and casting type to: str
organization_id casted to type(str)
-----
renaming column: PROVIDER to provider_id and casting type to: str
provider_id casted to type(str)
-----
renaming column: PAYER to payer_id and casting type to: str
payer_id casted to type(str)
-----
renaming column: ENCOUNTERCLASS to encounter_class and casting type to: str
encounter_class casted to type(str)
-----
renaming column: CODE to code and casting type to: str
code casted to type(str)
-----
renaming column: DESCRIPTION to description and casting type to: str
description casted

In [22]:
df_encounters_raw.head(5)

Unnamed: 0,id,start_time,stop_time,patient_id,organization_id,provider_id,payer_id,encounter_class,code,description,base_encounter_cost,total_claim_cost,payer_coverage,reason_code,reason_description
0,294d0dab-907e-8fce-7a47-0c0d322a5734,2012-04-01 09:04:48+00:00,2012-04-01 10:02:47+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,1567.0,87.2,Unknown,Unknown
1,2ccec874-cbaa-e280-7abb-f2bc2b603961,2013-04-07 09:04:48+00:00,2013-04-07 09:55:49+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,704.2,0.0,Unknown,Unknown
2,953c5138-ce17-4084-3432-1ac23f184528,2015-09-28 09:04:48+00:00,2015-09-28 11:02:48+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,db106514-f254-3402-b6a4-6d210c78c7e2,2c4b7d17-0ded-3e16-b5eb-6dda1d6a81bb,d31fccc3-1767-390d-966a-22a5156f4219,emergency,50849002,Emergency room admission (procedure),146.18,1008.98,0.0,125605004.0,Fracture of bone (disorder)
3,17dd3b88-0b85-2b6f-c342-c9d6cf5315cb,2015-10-31 11:02:48+00:00,2015-10-31 11:17:48+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f8918a95-31e8-3ac4-8d12-29ca6080ebda,b4d9fbc9-fdca-369d-bbba-019479923f08,d31fccc3-1767-390d-966a-22a5156f4219,ambulatory,185349003,Encounter for check up (procedure),85.55,85.55,3.95,359817006.0,Closed fracture of hip (disorder)
4,0b03e41b-06a6-66fa-b972-acc5a83b134a,2016-04-10 09:04:48+00:00,2016-04-10 10:00:45+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,2039.18,464.94,Unknown,Unknown


In [23]:
# Data Quality

# Check missing Values

# Missing values
# print(df_encounters_raw.isnull().sum())
# print('-'*5)
print(df_encounters_raw.info())

# Fill 



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   id                   7049 non-null   object             
 1   start_time           7049 non-null   datetime64[ns, UTC]
 2   stop_time            7049 non-null   datetime64[ns, UTC]
 3   patient_id           7049 non-null   object             
 4   organization_id      7049 non-null   object             
 5   provider_id          7049 non-null   object             
 6   payer_id             7049 non-null   object             
 7   encounter_class      7049 non-null   object             
 8   code                 7049 non-null   object             
 9   description          7049 non-null   object             
 10  base_encounter_cost  7049 non-null   float64            
 11  total_claim_cost     7049 non-null   float64            
 12  payer_coverage      

In [24]:
df_encounters_raw.head(5)

Unnamed: 0,id,start_time,stop_time,patient_id,organization_id,provider_id,payer_id,encounter_class,code,description,base_encounter_cost,total_claim_cost,payer_coverage,reason_code,reason_description
0,294d0dab-907e-8fce-7a47-0c0d322a5734,2012-04-01 09:04:48+00:00,2012-04-01 10:02:47+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,1567.0,87.2,Unknown,Unknown
1,2ccec874-cbaa-e280-7abb-f2bc2b603961,2013-04-07 09:04:48+00:00,2013-04-07 09:55:49+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,704.2,0.0,Unknown,Unknown
2,953c5138-ce17-4084-3432-1ac23f184528,2015-09-28 09:04:48+00:00,2015-09-28 11:02:48+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,db106514-f254-3402-b6a4-6d210c78c7e2,2c4b7d17-0ded-3e16-b5eb-6dda1d6a81bb,d31fccc3-1767-390d-966a-22a5156f4219,emergency,50849002,Emergency room admission (procedure),146.18,1008.98,0.0,125605004.0,Fracture of bone (disorder)
3,17dd3b88-0b85-2b6f-c342-c9d6cf5315cb,2015-10-31 11:02:48+00:00,2015-10-31 11:17:48+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f8918a95-31e8-3ac4-8d12-29ca6080ebda,b4d9fbc9-fdca-369d-bbba-019479923f08,d31fccc3-1767-390d-966a-22a5156f4219,ambulatory,185349003,Encounter for check up (procedure),85.55,85.55,3.95,359817006.0,Closed fracture of hip (disorder)
4,0b03e41b-06a6-66fa-b972-acc5a83b134a,2016-04-10 09:04:48+00:00,2016-04-10 10:00:45+00:00,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,2039.18,464.94,Unknown,Unknown


In [25]:
# Check duplicates
duplicates = df_encounters_raw.duplicated().sum()

In [26]:
duplicates

0

In [None]:
# Check if id 
df_encounters_raw['encounter_id'].value_counts()[df_encounters_raw['encounter_id'].value_counts() > 1]

Series([], Name: count, dtype: int64)