### Import packages

In [1]:
import pandas as pd
import numpy as np

### Import utilities

In [2]:
import sys
sys.path.append('../src/')
import utils

# Mapping imported from the local utils module; it is passed to .replace()
# further down to relabel ICD-10-CM chapter names as disease categories.
rename_map = utils.icd_category_map

### Import CCSR data

In [3]:
# Load the DXCCSR v2021-1 ICD-10-CM -> CCSR crosswalk. Column names and code
# values in this file are wrapped in literal single quotes (cleaned below).
icd_map = pd.read_csv('../data/ccsr/DXCCSR_v2021-1.CSV')

In [4]:
# Preview the raw crosswalk; note the single quotes around headers and codes.
icd_map.head()

Unnamed: 0,'ICD-10-CM CODE','ICD-10-CM CODE DESCRIPTION','Default CCSR CATEGORY IP,'Default CCSR CATEGORY DESCRIPTION IP','Default CCSR CATEGORY OP,'Default CCSR CATEGORY DESCRIPTION OP','CCSR CATEGORY 1','CCSR CATEGORY 1 DESCRIPTION','CCSR CATEGORY 2','CCSR CATEGORY 2 DESCRIPTION','CCSR CATEGORY 3','CCSR CATEGORY 3 DESCRIPTION','CCSR CATEGORY 4','CCSR CATEGORY 4 DESCRIPTION','CCSR CATEGORY 5','CCSR CATEGORY 5 DESCRIPTION','CCSR CATEGORY 6','CCSR CATEGORY 6 DESCRIPTION'
0,'A000',"Cholera due to Vibrio cholerae 01, biovar chol...",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',
1,'A001',"Cholera due to Vibrio cholerae 01, biovar eltor",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',
2,'A009',"Cholera, unspecified",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',
3,'A0100',"Typhoid fever, unspecified",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',
4,'A0101',Typhoid meningitis,'NVS001',Meningitis,'NVS001',Meningitis,'INF003',Bacterial infections,'NVS001',Meningitis,' ',,' ',,' ',,' ',


### Clean CCSR data

In [5]:
# Drop the literal single quotes embedded in the raw column names.
icd_map.columns = icd_map.columns.str.replace("'", '', regex=False)

In [6]:
# Keep the first six columns: the ICD-10-CM code, its description, and the
# default inpatient / outpatient CCSR category with their descriptions.
# Plain positional slicing replaces the tuple-style index `columns[:6,]`, and
# the explicit copy gives an independent frame so the in-place cleaning in the
# following cell does not operate on a slice (SettingWithCopyWarning).
icd_map = icd_map.iloc[:, :6].copy()

In [7]:
# Strip the wrapping single quotes from the ICD-10-CM code column and the two
# default CCSR category code columns (positions 0, 2 and 4).
# The vectorized .str accessor leaves missing values as NaN, whereas a
# Python-level x.replace(...) would raise on a non-string cell.
for n in [0, 2, 4]:
    icd_map.iloc[:, n] = icd_map.iloc[:, n].str.replace("'", '', regex=False)

In [8]:
# Confirm the quotes are gone and only the six retained columns remain.
icd_map.head()

Unnamed: 0,ICD-10-CM CODE,ICD-10-CM CODE DESCRIPTION,Default CCSR CATEGORY IP,Default CCSR CATEGORY DESCRIPTION IP,Default CCSR CATEGORY OP,Default CCSR CATEGORY DESCRIPTION OP
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",DIG001,Intestinal infection,DIG001,Intestinal infection
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",DIG001,Intestinal infection,DIG001,Intestinal infection
2,A009,"Cholera, unspecified",DIG001,Intestinal infection,DIG001,Intestinal infection
3,A0100,"Typhoid fever, unspecified",DIG001,Intestinal infection,DIG001,Intestinal infection
4,A0101,Typhoid meningitis,NVS001,Meningitis,NVS001,Meningitis


### Import CCSR categories and naming conventions

In [9]:
# Load the list of CCSR categories from the reference workbook, skipping the
# first row of the sheet.
cat = pd.read_excel(
    '../data/ccsr/DXCCSR-Reference-File-v2021-1.xlsx',
    sheet_name='CCSR_Categories',
    skiprows=1,
)

In [10]:
# Load the naming-convention sheet (3-character abbreviation -> diagnosis
# chapter) from the same workbook, again skipping the first row.
nam = pd.read_excel(
    '../data/ccsr/DXCCSR-Reference-File-v2021-1.xlsx',
    sheet_name='Naming_Conventions',
    skiprows=1,
)

### Clean CCSR categories and naming conventions

In [11]:
#extract broad category
# The first three characters of a CCSR category code (e.g. 'BLD' in 'BLD001')
# are the abbreviation used as the join key against the naming conventions.
# .str[:3] is vectorized and yields NaN for a missing code instead of raising.
cat['abb'] = cat['CCSR Category'].str[:3]

In [12]:
# Attach the diagnosis chapter to each CCSR category through its 3-character
# abbreviation; a left join keeps every category row.
cats = cat.merge(
    nam,
    how='left',
    left_on='abb',
    right_on='3-Character Abbreviation',
)

In [13]:
# Preview the category table with its chapter name attached.
cats.head()

Unnamed: 0,CCSR Category,CCSR Category Description,abb,ICD-10-CM Diagnosis Chapter,3-Character Abbreviation
0,BLD001,Nutritional anemia,BLD,Diseases of the Blood and Blood Forming Organs...,BLD
1,BLD002,Hemolytic anemia,BLD,Diseases of the Blood and Blood Forming Organs...,BLD
2,BLD003,Aplastic anemia,BLD,Diseases of the Blood and Blood Forming Organs...,BLD
3,BLD004,Acute posthemorrhagic anemia,BLD,Diseases of the Blood and Blood Forming Organs...,BLD
4,BLD005,Sickle cell trait/anemia,BLD,Diseases of the Blood and Blood Forming Organs...,BLD


In [14]:
# Join the crosswalk to the category/chapter table using the default
# *inpatient* CCSR category; a left join keeps every ICD-10-CM code.
merge = icd_map.merge(
    cats,
    how='left',
    left_on='Default CCSR CATEGORY IP',
    right_on='CCSR Category',
)

In [15]:
# Preview the joined table (crosswalk columns followed by category/chapter columns).
merge.head()

Unnamed: 0,ICD-10-CM CODE,ICD-10-CM CODE DESCRIPTION,Default CCSR CATEGORY IP,Default CCSR CATEGORY DESCRIPTION IP,Default CCSR CATEGORY OP,Default CCSR CATEGORY DESCRIPTION OP,CCSR Category,CCSR Category Description,abb,ICD-10-CM Diagnosis Chapter,3-Character Abbreviation
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",DIG001,Intestinal infection,DIG001,Intestinal infection,DIG001,Intestinal infection,DIG,Diseases of the Digestive System,DIG
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",DIG001,Intestinal infection,DIG001,Intestinal infection,DIG001,Intestinal infection,DIG,Diseases of the Digestive System,DIG
2,A009,"Cholera, unspecified",DIG001,Intestinal infection,DIG001,Intestinal infection,DIG001,Intestinal infection,DIG,Diseases of the Digestive System,DIG
3,A0100,"Typhoid fever, unspecified",DIG001,Intestinal infection,DIG001,Intestinal infection,DIG001,Intestinal infection,DIG,Diseases of the Digestive System,DIG
4,A0101,Typhoid meningitis,NVS001,Meningitis,NVS001,Meningitis,NVS001,Meningitis,NVS,Diseases of the Nervous System,NVS


In [16]:
#save a copy of this data to document which CCSR categories were reassigned
# DataFrame.copy() is a deep copy by default, so this snapshot is unaffected by
# the renaming and recoding applied to `merge` below; it also avoids importing
# the `copy` module in the middle of the notebook.
for_comparison = merge.copy()

### Rename and recategorize disease categories

In [17]:
# Number of ICD-10-CM codes per original diagnosis chapter (before relabelling).
merge['ICD-10-CM Diagnosis Chapter'].value_counts()

Injury, Poisoning and Certain Other Consequences of External Causes                                    39193
Unacceptable principal diagnosis (inpatient data) or first-listed diagnosis (outpatient data)           9906
Diseases of the Musculoskeletal System and Connective Tissue                                            6230
Diseases of the Eye and Adnexa                                                                          2633
Pregnancy, Childbirth and the Puerperium                                                                2233
Mental, Behavioral and Neurodevelopmental Disorders                                                     1761
Neoplasms                                                                                               1701
Diseases of the Circulatory System                                                                      1389
Diseases of the Digestive System                                                                         888
Endocrine, Nutritio

In [18]:
# Relabel chapter names as disease categories. The replacement is limited to
# the chapter column (as the comparison section at the end of the notebook also
# does) because a frame-wide replace would rewrite any cell in any column that
# happens to equal one of the mapping's keys.
# NOTE(review): assumes rename_map is only meant to relabel chapter names.
merge['ICD-10-CM Diagnosis Chapter'] = merge['ICD-10-CM Diagnosis Chapter'].replace(rename_map)

In [19]:
# Same tally after relabelling: the chapter column now holds disease category names.
merge['ICD-10-CM Diagnosis Chapter'].value_counts()

Injuries and external causes          39193
Nonspecific                           10670
Musculoskeletal diseases               6230
Other noncommunicable diseases         5252
Pregnancy and childbirth               2741
Mental and substance use disorders     1761
Neoplasms                              1701
Cardiovascular diseases                1389
Gastrointestinal diseases               888
Endocrine diseases                      879
Genitourinary diseases                  719
Nervous system diseases                 716
Infections                              690
Respiratory diseases                    376
Name: ICD-10-CM Diagnosis Chapter, dtype: int64

### Clean data

Select default inpatient category

In [20]:
# Switch to snake_case names. The default inpatient CCSR *description* becomes
# `ccsr_category`, and the relabelled chapter becomes `disease_category`.
column_renames = {
    'ICD-10-CM CODE': 'icd_10_cm_code',
    'ICD-10-CM CODE DESCRIPTION': 'icd_10_cm_code_desc',
    'Default CCSR CATEGORY DESCRIPTION IP': 'ccsr_category',
    'ICD-10-CM Diagnosis Chapter': 'disease_category',
}
merge = merge.rename(columns=column_renames)

In [21]:
# Keep only the four hierarchy columns. The explicit copy makes `merge` an
# independent frame, so the .loc assignments in the recoding cells below do not
# write into a slice of the wider table (SettingWithCopyWarning).
merge = merge[['icd_10_cm_code','icd_10_cm_code_desc','ccsr_category','disease_category']].copy()

In [22]:
# Preview the trimmed, renamed table.
merge.head()

Unnamed: 0,icd_10_cm_code,icd_10_cm_code_desc,ccsr_category,disease_category
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",Intestinal infection,Gastrointestinal diseases
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",Intestinal infection,Gastrointestinal diseases
2,A009,"Cholera, unspecified",Intestinal infection,Gastrointestinal diseases
3,A0100,"Typhoid fever, unspecified",Intestinal infection,Gastrointestinal diseases
4,A0101,Typhoid meningitis,Meningitis,Nervous system diseases


### Recode certain ICD10 codes to new CCSR category

In [23]:
# Despite the name, these are ICD-10-CM code *descriptions* (matched against
# `icd_10_cm_code_desc` in the next cell), not the codes themselves.
stroke_codes = [
    "Middle cerebral artery syndrome",
    "Anterior cerebral artery syndrome",
    "Posterior cerebral artery syndrome",
    "Brain stem stroke syndrome",
    "Cerebellar stroke syndrome",
    "Pure motor lacunar syndrome",
    "Pure sensory lacunar syndrome",
    "Other lacunar syndromes",
    "Other vascular syndromes of brain in cerebrovascular diseases",
]

In [24]:
# Move every row whose description is one of the listed stroke syndromes into
# the cerebrovascular CCSR category and the cardiovascular disease category.
is_stroke_syndrome = merge['icd_10_cm_code_desc'].isin(stroke_codes)
merge.loc[is_stroke_syndrome, 'ccsr_category'] = 'Other and ill-defined cerebrovascular disease'
merge.loc[is_stroke_syndrome, 'disease_category'] = 'Cardiovascular diseases'

### Import intermediate codes

CCSR categories were grouped into "intermediate" umbrella categories

In [25]:
# Load the lookup that assigns each CCSR category an intermediate category and
# a disease category (columns shown in the preview below).
intermediate = pd.read_excel('../data/ccsr/intermediate_category.xlsx')

In [26]:
# Preview the intermediate-category lookup.
intermediate.head()

Unnamed: 0,ccsr_category,int_category,disease_category
0,Chronic rheumatic heart disease,Other cardiovascular disorders,Cardiovascular diseases
1,Acute rheumatic heart disease,Other cardiovascular disorders,Cardiovascular diseases
2,Nonrheumatic and unspecified valve disorders,Other cardiovascular disorders,Cardiovascular diseases
3,Endocarditis and endocardial disease,Other cardiovascular disorders,Cardiovascular diseases
4,Myocarditis and cardiomyopathy,Other cardiovascular disorders,Cardiovascular diseases


In [27]:
# Give the lookup's disease category a distinct name so it does not collide
# with the chapter-derived `disease_category` column when the tables are joined.
intermediate = intermediate.rename(
    columns={'disease_category': 'disease_category_int'},
)

In [28]:
# Attach the intermediate category (and its disease category) to every code by
# CCSR category name; a left join keeps codes without a match.
merge = pd.merge(
    merge,
    intermediate,
    how='left',
    on='ccsr_category',
)

In [29]:
# List codes whose CCSR category found no match in the intermediate lookup;
# an empty result means every category was matched.
merge[merge['int_category'].isna()]

Unnamed: 0,icd_10_cm_code,icd_10_cm_code_desc,ccsr_category,disease_category,int_category,disease_category_int


In [30]:
# Table size, then the disease-category breakdown of the codes whose CCSR
# category name ends in 'sequela' or 'subsequent encounter' (case-insensitive).
print(merge.shape)
ccsr_lower = merge['ccsr_category'].str.lower()
for suffix in ('sequela', 'subsequent encounter'):
    print('\n')
    print(merge.loc[ccsr_lower.str.endswith(suffix), 'disease_category'].value_counts())

(73205, 6)


Injuries and external causes          9146
Mental and substance use disorders     388
Musculoskeletal diseases               216
Name: disease_category, dtype: int64


Injuries and external causes          18341
Musculoskeletal diseases                828
Mental and substance use disorders      318
Name: disease_category, dtype: int64


In [31]:
# Every row must carry a distinct ICD-10-CM code; otherwise one of the left
# joins above duplicated rows. Enforce this rather than comparing the printed
# count with the table shape by eye.
print(merge['icd_10_cm_code'].nunique())
assert merge['icd_10_cm_code'].is_unique, 'duplicate ICD-10-CM codes after merging'

73205


The number of unique ICD-10-CM codes (73,205) equals the number of rows, so there are no duplicate codes and each code maps to exactly one category.

In [32]:
#remove codes that map to ccsr_category that end with sequela and subsequent encounter
# The match is case-insensitive and anchored at the end of the category name.
merge = merge[~merge['ccsr_category'].str.lower().str.endswith('sequela')]
print('Without sequela: ',merge.shape)
# The final .copy() makes the filtered table an independent frame, so the
# column assignments in the cleaning cells below do not write into a slice of
# the unfiltered table (SettingWithCopyWarning).
merge = merge[~merge['ccsr_category'].str.lower().str.endswith('subsequent encounter')].copy()
print('Without subsequent encounter: ',merge.shape)

Without sequela:  (63455, 6)
Without subsequent encounter:  (43968, 6)


In [33]:
# Number of distinct CCSR categories.
print(merge['ccsr_category'].nunique())

# Distinct (CCSR category, disease category) pairs. De-duplicating on BOTH
# columns is what makes this a real check: if any CCSR category were paired
# with more than one disease category, the row count printed here would exceed
# the number of unique categories. De-duplicating on 'ccsr_category' alone
# forces the two numbers to be equal regardless of the data.
test_ccsr_duplicates = merge[['ccsr_category','disease_category']].drop_duplicates()
print(test_ccsr_duplicates.shape)
print(test_ccsr_duplicates['ccsr_category'].nunique())

446
(446, 2)
446


There are no duplicate CCSR categories, so each one maps uniquely to an aggregated disease category.

In [34]:
# Preview the table with the intermediate columns attached.
merge.head()

Unnamed: 0,icd_10_cm_code,icd_10_cm_code_desc,ccsr_category,disease_category,int_category,disease_category_int
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",Intestinal infection,Gastrointestinal diseases,Other gastrointestinal disorders,Gastrointestinal diseases
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",Intestinal infection,Gastrointestinal diseases,Other gastrointestinal disorders,Gastrointestinal diseases
2,A009,"Cholera, unspecified",Intestinal infection,Gastrointestinal diseases,Other gastrointestinal disorders,Gastrointestinal diseases
3,A0100,"Typhoid fever, unspecified",Intestinal infection,Gastrointestinal diseases,Other gastrointestinal disorders,Gastrointestinal diseases
4,A0101,Typhoid meningitis,Meningitis,Nervous system diseases,Other neurologic disorders,Nervous system diseases


### Clean merged data

In [35]:
#rename stroke to cerebrovascular disease
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place call
# acts on an intermediate Series, is deprecated in pandas 2.x, and leaves
# `merge` unchanged once copy-on-write is enabled.
merge['int_category'] = merge['int_category'].replace({'Stroke': 'Cerebrovascular disease'})

In [36]:
# Split the 'Respiratory cancers' CCSR category: the five listed codes are
# assigned to 'Other neoplasms'; every other code in that category becomes
# 'Lung cancer'.
resp_cancer_codes_keep = ['C33','C399','D021','D023','D024']
in_resp_cancers = merge['ccsr_category'] == 'Respiratory cancers'
resp_cancer_codes = merge.loc[in_resp_cancers, 'icd_10_cm_code'].tolist()
lung_cancer_codes = np.setdiff1d(resp_cancer_codes, resp_cancer_codes_keep)

icd_codes = merge['icd_10_cm_code']
merge.loc[icd_codes.isin(lung_cancer_codes), 'int_category'] = 'Lung cancer'
merge.loc[icd_codes.isin(resp_cancer_codes_keep), 'int_category'] = 'Other neoplasms'

In [37]:
# Split the 'Neurocognitive disorders' CCSR category: the three listed codes
# are assigned to 'Other neurologic disorders'; every other code in that
# category becomes 'Neurodegenerative diseases'.
neurocog_codes_keep = ['F04','F05','F0781']
in_neurocog = merge['ccsr_category'] == 'Neurocognitive disorders'
neurocog_codes = merge.loc[in_neurocog, 'icd_10_cm_code'].tolist()
neurodegen_codes = np.setdiff1d(neurocog_codes, neurocog_codes_keep)

icd_codes = merge['icd_10_cm_code']
merge.loc[icd_codes.isin(neurodegen_codes), 'int_category'] = 'Neurodegenerative diseases'
merge.loc[icd_codes.isin(neurocog_codes_keep), 'int_category'] = 'Other neurologic disorders'

In [38]:
# Discard the chapter-derived disease category and promote the one supplied by
# the intermediate lookup to be the table's `disease_category`.
merge = (
    merge
    .drop(columns='disease_category')
    .rename(columns={'disease_category_int': 'disease_category'})
)

### Final data inspection

In [39]:
# Table size, followed by the number of distinct values at each level of the
# hierarchy, from the most specific (code) to the broadest (disease category).
print(merge.shape)
for level in ('icd_10_cm_code', 'ccsr_category', 'int_category', 'disease_category'):
    print(merge[level].nunique())

(43968, 5)
43968
446
144
14


In [40]:
# Number of ICD-10-CM codes in each final disease category.
merge['disease_category'].value_counts()

Injuries and external causes          11706
Nonspecific                           10670
Musculoskeletal diseases               5437
Other noncommunicable diseases         4955
Pregnancy and childbirth               2233
Neoplasms                              1701
Cardiovascular diseases                1497
Mental and substance use disorders     1055
Gastrointestinal diseases              1005
Endocrine diseases                      879
Genitourinary diseases                  878
Nervous system diseases                 769
Infections                              737
Respiratory diseases                    446
Name: disease_category, dtype: int64

### Export data

In [41]:
# Write the finished hierarchy to CSV, omitting the positional index.
merge.to_csv('../data/ccsr/disease_hierarchy.csv', index=False)

### Compare CCSR category assignments

In [178]:
# Reduce the pre-recoding snapshot to one row per distinct
# (CCSR category, chapter) pair and add the chapter's relabelled disease
# category. Built as a single chain so no column is assigned on a slice of the
# snapshot (which would raise SettingWithCopyWarning).
for_comparison = (
    for_comparison[['Default CCSR CATEGORY DESCRIPTION IP', 'ICD-10-CM Diagnosis Chapter']]
    .rename(columns={
        'Default CCSR CATEGORY DESCRIPTION IP': 'ccsr_category',
        'ICD-10-CM Diagnosis Chapter': 'chapter',
    })
    .assign(disease_category=lambda frame: frame['chapter'].replace(rename_map))
    .drop_duplicates()
)

In [179]:
# Preview the de-duplicated category / chapter / disease-category table.
for_comparison.head()

Unnamed: 0,ccsr_category,chapter,disease_category
0,Intestinal infection,Diseases of the Digestive System,Gastrointestinal diseases
4,Meningitis,Diseases of the Nervous System,Nervous system diseases
5,Bacterial infections,Certain Infectious and Parasitic Diseases,Infections
6,Pneumonia (except that caused by tuberculosis),Diseases of the Respiratory System,Respiratory diseases
7,Infective arthritis,Diseases of the Musculoskeletal System and Con...,Musculoskeletal diseases


In [180]:
# Show the intermediate lookup again; its disease category column is now named
# disease_category_int.
intermediate.head()

Unnamed: 0,ccsr_category,int_category,disease_category_int
0,Chronic rheumatic heart disease,Other cardiovascular disorders,Cardiovascular diseases
1,Acute rheumatic heart disease,Other cardiovascular disorders,Cardiovascular diseases
2,Nonrheumatic and unspecified valve disorders,Other cardiovascular disorders,Cardiovascular diseases
3,Endocarditis and endocardial disease,Other cardiovascular disorders,Cardiovascular diseases
4,Myocarditis and cardiomyopathy,Other cardiovascular disorders,Cardiovascular diseases


#### Merge data

In [181]:
# Line up each intermediate-lookup row with the chapter-derived disease
# category of the same CCSR category; a left join keeps lookup rows that have
# no chapter-derived match.
for_comparison_merge = intermediate.merge(
    for_comparison,
    how='left',
    on='ccsr_category',
)

In [190]:
# Export the rows where the lookup's disease category differs from the
# chapter-derived one; rows with no chapter-derived value also count as
# differing because a comparison with NaN is never equal.
# NOTE(review): '../data/asdf.csv' looks like a scratch filename - confirm the
# intended output path.
category_differs = (
    for_comparison_merge['disease_category_int'] != for_comparison_merge['disease_category']
)
for_comparison_merge[category_differs].to_csv('../data/asdf.csv')

In [196]:
# Spot check: chapter and chapter-derived disease category for one CCSR category.
for_comparison[for_comparison['ccsr_category'] == 'Diabetes mellitus with complication']

Unnamed: 0,ccsr_category,chapter,disease_category
3102,Diabetes mellitus with complication,"Endocrine, Nutritional and Metabolic Diseases",Endocrine diseases


In [186]:
# Size of the intermediate lookup as (rows, columns).
intermediate.shape

(532, 3)

In [195]:
# Spot check: rows of the final table that belong to the same CCSR category.
merge[merge['ccsr_category'] == 'Diabetes mellitus with complication']

Unnamed: 0,icd_10_cm_code,icd_10_cm_code_desc,ccsr_category,int_category,disease_category
3102,E08321,Diabetes mellitus due to underlying condition ...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
3107,E08329,Diabetes mellitus due to underlying condition ...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
3112,E08331,Diabetes mellitus due to underlying condition ...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
3117,E08339,Diabetes mellitus due to underlying condition ...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
3122,E08341,Diabetes mellitus due to underlying condition ...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
...,...,...,...,...,...
3555,E13641,Other specified diabetes mellitus with hypogly...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
3556,E13649,Other specified diabetes mellitus with hypogly...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
3557,E1365,Other specified diabetes mellitus with hypergl...,Diabetes mellitus with complication,Diabetes,Endocrine diseases
3558,E1369,Other specified diabetes mellitus with other s...,Diabetes mellitus with complication,Diabetes,Endocrine diseases


In [194]:
# Count, per CCSR category, the codes whose description mentions 'diabetes'
# (case-insensitive substring match).
merge[merge['icd_10_cm_code_desc'].str.lower().str.contains('diabetes')]['ccsr_category'].value_counts()

Diabetes mellitus with complication                                                             378
Unacceptable PDX                                                                                 89
Diabetes or abnormal glucose tolerance complicating pregnancy; childbirth; or the puerperium     42
Diabetes mellitus without complication                                                            4
Other specified and unspecified perinatal conditions                                              2
Abnormal findings without diagnosis                                                               1
Pituitary disorders                                                                               1
Other specified and unspecified diseases of kidney and ureters                                    1
Name: ccsr_category, dtype: int64