In [31]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import great_expectations as ge

In [2]:
valid_cpt_codes = './data/valid_cpt_codes.csv'
sample_claims = './data/sample_claims.csv'
valid_icd_10_codes = './data/valid_icd_10_codes.csv'

In [3]:
sample_claim_df = pd.read_csv(sample_claims)
sample_claim_df.head(3)

Unnamed: 0,patient_id,claim_id,diagnosis_codes,procedure_code,date_service,date_received
0,A1670,1.0,Z01.419^Z11.51,99999,2021-01-25,2021-01-26
1,A0086,2.0,Z01.419^Z12.4,99999,2021-01-27,2021-01-29
2,A0086,3.0,Z12.4^Z11.51,87491,2021-01-07,2021-01-10


In [4]:
valid_cpt_codes_df = pd.read_csv(valid_cpt_codes)
valid_cpt_codes_df.head(3)

Unnamed: 0,code,short_description
0,96409,CHEMO IV PUSH SNGL DRUG
1,99335,DOMICIL/R-HOME VISIT EST PAT
2,86413,SARS-COV-2 ANTB QUANTITATIVE


In [5]:
valid_cpt_codes_df.info()  # code is type int

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   code               2861 non-null   int64 
 1   short_description  2861 non-null   object
dtypes: int64(1), object(1)
memory usage: 44.8+ KB


In [6]:
valid_icd_10_codes_df = pd.read_csv(valid_icd_10_codes)
valid_icd_10_codes_df.head(3)

Unnamed: 0,code
0,A56
1,A560
2,A5600


Q1a: What are the top 5 most common valid procedure codes?

In [8]:
valid_codes = [str(x) for x in list(valid_cpt_codes_df.code.unique())]
filtered_sample_claim_df = sample_claim_df[sample_claim_df['procedure_code'].isin(valid_codes)]
result1 = pd.value_counts(filtered_sample_claim_df['procedure_code']).index.tolist()[:5]
result1

['88175', '87591', '87491', '87798', '85049']

Q1b: How many patients are associated with at least one of those procedures? 

NOTE
> Please do not use the result values from 1a - provide code that will find the answer without specifying explicitly those code values.

In [10]:
result2 = sample_claim_df[sample_claim_df['procedure_code'].isin(result1)].patient_id.nunique()
result2

58

Q2: What are the top 5 most common valid diagnosis codes?

NOTE
> Diagnosis code field needs to be split

In [16]:
valid_codes = [str(x) for x in list(valid_icd_10_codes_df.code.unique())]

In [18]:
most_common_diagonis = sample_claim_df['diagnosis_codes'].str.split('^', n = 0, expand = False).explode().value_counts().index.tolist()

result1 = [x for x in most_common_diagonis if x in valid_codes]
result1[:5]

['I10', 'R05', 'N72', 'C61', 'C20']

Q3: We need to review this file for overall data quality and highlight any potential problems so that they can be discussed with the data provider. Write a series of tests that will identify any errors in the file and provide a list of all errors found. Do not consider any medical details of the treatments, this question does not require any healthcare knowledge

In [19]:
#check duplicates
assert sample_claim_df.count().to_dict() == sample_claim_df.drop_duplicates().count().to_dict()  #assert pass

{'patient_id': 4994,
 'claim_id': 4998,
 'diagnosis_codes': 3714,
 'procedure_code': 4997,
 'date_service': 4990,
 'date_received': 5000}

In [23]:
#standardized codes are valid
valid_codes = [str(x) for x in list(valid_icd_10_codes_df.code.unique())]
assert len(valid_codes) == valid_icd_10_codes_df[valid_icd_10_codes_df['code'].isin(valid_codes)].count()[0]

In [29]:
try:
    pd.to_datetime(sample_claim_df['date_received']).sort_values() 
except Exception as e:
    print('Date received column contains illegal datatime values')
    print(e)

In [30]:
try:
    pd.to_datetime(sample_claim_df['date_service']).sort_values() 
except Exception as e:
    print('Date received column contains illegal datatime values')
    print(e)

Date received column contains illegal datatime values
year 85014 is out of range: 85014 present at position 42


In [32]:
health_data_profile = ProfileReport(sample_claim_df, title= 'Report', minimal = False)

In [36]:
health_data_profile.to_file("report.html")

Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 68.25it/s]


In [34]:
ge_df = ge.from_pandas(sample_claim_df)                # 6 values are missing/ or null in patient_id
ge_df.expect_column_values_to_not_be_null("patient_id")

{
  "meta": {},
  "result": {
    "element_count": 5000,
    "unexpected_count": 6,
    "unexpected_percent": 0.12,
    "unexpected_percent_total": 0.12,
    "partial_unexpected_list": []
  },
  "success": false,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [35]:
ge_df.expect_column_values_to_not_be_null("claim_id")  # 2 values are missing

{
  "meta": {},
  "result": {
    "element_count": 5000,
    "unexpected_count": 2,
    "unexpected_percent": 0.04,
    "unexpected_percent_total": 0.04,
    "partial_unexpected_list": []
  },
  "success": false,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

**List of errors found**

- The service time is not in order with the date received. I think there should be relation between these two dates,
- It shows that either the service is not provided on time or the service is delayed relative to date received
- diagnosis_codes is missing > 25% of records.  Note: Diagnosis code can have null values 
- date_service has 49 distinct values whereas date_received has only 44

**Good aspect**

- There are no duplicates, all entries are unique

Q: Using your choice of Python test framework (pytest, unittest, DocTest etc), write a test and assert for:
- duplicate claim_id 
- Null or empty diagnosis_codes

In [37]:
import unittest

class TestStringMethods(unittest.TestCase):

    def setUp(self):
       self.sample_claim_df = pd.read_csv(sample_claims)

    def test_duplicate_claims_id(self):
        # check count claim_id equals count after droping duplicates
        self.assertEqual(sample_claim_df.claim_id.count(),sample_claim_df.claim_id.drop_duplicates().count())

    def test_missing_values(self):
        # Null values, drop and check, drop na, blank ' ', convert ' ' to null
        self.assertEqual(len(sample_claim_df.diagnosis_codes),sample_claim_df.diagnosis_codes.dropna().count())
    

unittest.main(argv=[''], verbosity=2, exit=False)

test_duplicate_claims_id (__main__.TestStringMethods) ... FAIL
test_missing_values (__main__.TestStringMethods) ... FAIL

FAIL: test_duplicate_claims_id (__main__.TestStringMethods)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/var/folders/jv/23m40pmn69j7nnhhc17mqvmm0000gn/T/ipykernel_3932/881189407.py", line 10, in test_duplicate_claims_id
    self.assertEqual(sample_claim_df.claim_id.count(),sample_claim_df.claim_id.drop_duplicates().count())
AssertionError: 4998 != 4971

FAIL: test_missing_values (__main__.TestStringMethods)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/var/folders/jv/23m40pmn69j7nnhhc17mqvmm0000gn/T/ipykernel_3932/881189407.py", line 14, in test_missing_values
    self.assertEqual(len(sample_claim_df.diagnosis_codes),sample_claim_df.diagnosis_codes.dropna().count())
AssertionError: 5000 != 3714

----------------------------------

<unittest.main.TestProgram at 0x7fc5c09ab370>