# Question 1 
What are the top 5 most common valid procedure codes?   
How many patients are associated with at least one of those procedures?  Please do not use the result values from 1a - provide code that will find the answer without specifying explicitly those code values.  


In [17]:
# we ingest the csv files
import pandas as pd
from os import listdir, getcwd
from os.path import isfile, join
path = getcwd()
filelist = [f for f in listdir(path) if isfile(join(path, f))]
filelist

['Data Acquisition Engineer Assessment Instructions_.docx',
 'DAQ Engineer.ipynb',
 'sample_claims.csv',
 'valid_cpt_codes.csv',
 'valid_icd_10_codes.csv',
 'Table Definitions.csv']

In [18]:
import chardet
for file in filelist:
    with open(file, 'rb') as f:
        result = chardet.detect(f.read(10000))
        print(result)

{'encoding': None, 'confidence': 0.0, 'language': None}
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [19]:
#inspect the datatypes
df_def = pd.read_csv(filelist[5], encoding="us-ascii").convert_dtypes()
df_cpt = pd.read_csv(filelist[3], encoding="us-ascii").convert_dtypes()
df_icd = pd.read_csv(filelist[4], encoding="us-ascii").convert_dtypes()
df_claims = pd.read_csv(filelist[2], encoding="us-ascii").convert_dtypes()
print("procedure types:\n",df_cpt.dtypes)
print("diagnosis types:\n",df_icd.dtypes)
print("claims types:\n",df_claims.dtypes)

procedure types:
 code                  Int64
short_description    string
dtype: object
diagnosis types:
 code    string
dtype: object
claims types:
 patient_id         string
claim_id            Int64
diagnosis_codes    string
procedure_code     string
date_service       string
date_received      string
dtype: object


In [20]:
#change datatype so it matches column to join
df_cpt['code'] = df_cpt['code'].astype("string")

In [21]:
#join to get valid codes, then count and rank codes
top_procedures= (df_claims.join(df_cpt.set_index("code"), how="inner", on="procedure_code")
                 .value_counts("procedure_code").head())
top_procedures

procedure_code
88175    155
87591    123
87491    122
87798    116
85049    107
dtype: int64

# Question 2 
What are the top 5 most common valid diagnosis codes? 

In [37]:
import re
#extract codes
df_claims_diag_split = (df_claims["diagnosis_codes"].str.extractall(r"([A-Z]{1}\d{2}\.?\d*)")
                        .reset_index())
#remove periods
df_claims_diag_split[0] = df_claims_diag_split[0].apply(lambda x: x.replace(".",""))
df_claims_diag_split.rename(columns={'level_0':'sample_row',"match":"entry",0:"code"}, inplace=True)

#join to get valid codes
df_claims_dc = df_claims_diag_split.join(df_icd.set_index("code"), how="inner", on="code").copy()

#count and rank codes
top_diagnosis = df_claims_dc.value_counts("code").head()
top_diagnosis


code
Z113      947
E559      498
Z01419    404
Z3481     347
N926      326
dtype: int64

# Question 3 
We need to review this file for overall data quality and highlight any potential problems so that they can be discussed with the data provider.  Write a series of tests that will identify any errors in the file and provide a list of all errors found.  Do not consider any medical details of the treatments, this question does not require any healthcare knowledge.
These quality checks should include, but are not limited to
Sample data matches the table definition
All standardized codes are valid based on given reference material
Date values are logical and chronological trending is consistent

In [23]:
#check if procedure code in cpt table (refers to index row not claim id)
valid_proc_code = df_claims["procedure_code"].isin(df_cpt['code'])
#list of not valid
df_claims[~valid_proc_code]

Unnamed: 0,patient_id,claim_id,diagnosis_codes,procedure_code,date_service,date_received
0,A1670,1,Z01.419^Z11.51,99999,2021-01-25,2021-01-26
1,A0086,2,Z01.419^Z12.4,99999,2021-01-27,2021-01-29
10,A0311,11,Z00.00^E55.9,99999,2021-01-19,2021-01-20
11,A0311,12,Z01.419^Z11.51^N95.1,99999,2021-01-28,2021-01-30
13,A0311,14,N76.0,99999,2021-01-25,2021-01-26
...,...,...,...,...,...,...
4974,A3396,4977,N89.8^R10.2^N91.1,99999,2021-01-06,2021-01-08
4976,A3396,4979,Z01.419^E28.2^R53.83^E03.9,99999,2021-01-05,2021-01-06
4978,A3396,4981,Z00.00^Z11.3^Z11.59^Z13.228^Z13.29^Z13.79^Z13....,99999,2021-01-04,2021-01-08
4986,A3396,4989,Z11.3,Z11.8,2173-10-13,2021-01-11


In [24]:
#check if claim_id exists
claim_id_exists =  df_claims["claim_id"].notna()
#list of not valid
df_claims[~claim_id_exists]

Unnamed: 0,patient_id,claim_id,diagnosis_codes,procedure_code,date_service,date_received
4998,A1623,,N92.6^Z11.3^Z11.8^Z32.01,99999,2021-01-04,2021-01-09
4999,A1258,,R41.3^E78.2^E03.9,84450,2021-01-27,2021-01-28


In [25]:
#check if there diagnostic codes are valid
df_claims_diag_split["is_valid"]= df_claims_diag_split["code"].isin(df_icd["code"])

valid_diag_code = df_claims_diag_split[["sample_row","is_valid"]]
#list of not valid
valid_diag_code[valid_diag_code["is_valid"]==False]

Unnamed: 0,sample_row,is_valid
13,6,False
47,24,False
56,26,False
79,39,False
98,47,False
...,...,...
9972,4922,False
10005,4942,False
10033,4958,False
10100,4989,False


In [26]:
#check if row has valid diagnostic code format
codeformat = r'[A-Z]{1}\d{2}\.?\d*'
has_valid_diag_code_format = (df_claims["diagnosis_codes"].apply(lambda x: bool(re.search(codeformat, str(x))))
                        .rename("validformat_diagcode"))
#list of not valid
df_claims[~has_valid_diag_code_format]

Unnamed: 0,patient_id,claim_id,diagnosis_codes,procedure_code,date_service,date_received
5,A0311,6,,85049,2021-01-08,2021-01-09
12,A0311,13,,82465,2021-01-20,2021-01-21
15,A0311,16,,99999,2021-01-06,2021-01-07
19,A0311,20,,87512,2021-01-09,2021-01-10
21,A0311,22,,87491,2021-01-06,2021-01-07
...,...,...,...,...,...,...
4973,A3396,4976,,83540,2021-01-08,2021-01-09
4984,A3396,4987,,88175,2021-01-06,2021-01-07
4988,A3396,4991,,82950,2021-01-05,2021-01-06
4992,A3396,4995,,86900,2021-01-27,2021-01-28


In [27]:
#check if date is in right format
dateformat = r'^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$'
valid_date_service_format = (df_claims["date_service"].apply(lambda x: bool(re.search(dateformat, str(x))))
                             .rename("validformat_serviced"))
#list of not valid
df_claims[~valid_date_service_format]

Unnamed: 0,patient_id,claim_id,diagnosis_codes,procedure_code,date_service,date_received
125,A0482,125,Z34.82^O09.32,N92.6,2141-05-30,2021-01-01
570,A1415,572,M15.0^L65.9^M85.9^E55.9^R73.09^E78.5,86850,,2021-01-29
593,A1415,595,Z31.83^Z11.3^Z31.49^Z13.29^Z11.59^Z11.4^Z00.00...,Z13.79,2173-10-13,2021-01-18
720,A1517,724,Z01.419^N89.8^Z11.3,99999,,2021-01-27
765,A1549,768,Z34.01^O34.01,Q51.4,2173-10-13,2021-01-01
932,A1580,934,Z11.3^Z34.81,85049,,2021-01-21
985,A1580,987,N76.0,99999,,2021-01-09
995,A1580,997,M15.0^L65.9^M85.9^E55.9^R73.09,E78.5,2130-05-26,2021-01-07
1589,A1617,1595,Z00.00^Z11.3^Z11.59^Z13.228^Z13.29^Z13.79^Z13....,82247,,2021-01-05
1613,A1617,1619,J45,84460,,2021-01-13


In [28]:
#check if date is in right format
valid_date_received_format = (df_claims["date_received"].apply(lambda x: bool(re.search(dateformat, str(x))))
                             .rename("validformat_received"))
df_claims[~valid_date_received_format]

Unnamed: 0,patient_id,claim_id,diagnosis_codes,procedure_code,date_service,date_received


In [29]:
#create dates dataframe
import datetime as dt
claims_dates = (df_claims[["date_service","date_received"]].reset_index().rename(columns={"index":"sample_row"})
                .join(valid_date_service_format).join(valid_date_received_format))
claims_dates

Unnamed: 0,sample_row,date_service,date_received,validformat_serviced,validformat_received
0,0,2021-01-25,2021-01-26,True,True
1,1,2021-01-27,2021-01-29,True,True
2,2,2021-01-07,2021-01-10,True,True
3,3,2021-01-15,2021-01-16,True,True
4,4,2021-01-06,2021-01-07,True,True
...,...,...,...,...,...
4995,4995,2021-01-22,2021-01-26,True,True
4996,4996,2021-01-27,2021-01-28,True,True
4997,4997,2021-01-03,2021-01-05,True,True
4998,4998,2021-01-04,2021-01-09,True,True


In [30]:
#check if date_service is before present
claims_valid_date_service = (claims_dates[claims_dates["validformat_serviced"]==True].copy()
                             .filter(items=("sample_row","date_service","validformat_serviced")))
claims_valid_date_service["date_service"]= pd.to_datetime(claims_valid_date_service['date_service'])
claims_valid_date_service["before_present_s"] = claims_valid_date_service["date_service"] < dt.datetime.now()
#list of not valid
claims_valid_date_service[claims_valid_date_service["before_present_s"] == False]

Unnamed: 0,sample_row,date_service,validformat_serviced,before_present_s


In [31]:
#check if date_received is before present
claims_valid_date_received = (claims_dates[claims_dates["validformat_received"]==True].copy()
                             .filter(items=("sample_row","date_received","validformat_received")))
claims_valid_date_received["date_received"]= pd.to_datetime(claims_valid_date_received['date_received'])
claims_valid_date_received["before_present_r"] = claims_valid_date_received["date_received"] < dt.datetime.now()
#list of not valid
claims_valid_date_received[claims_valid_date_received["before_present_r"] == False]

Unnamed: 0,sample_row,date_received,validformat_received,before_present_r


In [32]:
#check if date_service is less or equal to date_received
all_dates_valid = (claims_valid_date_service
                   .join(claims_valid_date_received.set_index("sample_row"), how="inner",on="sample_row"))
all_dates_valid["valid_date_order"] = all_dates_valid["date_service"] <= all_dates_valid["date_received"]
#list of not valid
all_dates_valid[all_dates_valid["valid_date_order"]==False]

Unnamed: 0,sample_row,date_service,validformat_serviced,before_present_s,date_received,validformat_received,before_present_r,valid_date_order
50,50,2021-01-05,True,True,2020-12-26,True,True,False
118,118,2021-01-15,True,True,2021-01-05,True,True,False
261,261,2021-01-26,True,True,2021-01-16,True,True,False
414,414,2021-01-16,True,True,2021-01-06,True,True,False
491,491,2021-01-07,True,True,2020-12-28,True,True,False
...,...,...,...,...,...,...,...,...
4615,4615,2021-01-25,True,True,2021-01-15,True,True,False
4641,4641,2021-01-19,True,True,2021-01-09,True,True,False
4777,4777,2021-01-23,True,True,2021-01-13,True,True,False
4793,4793,2021-01-11,True,True,2021-01-01,True,True,False
