In [None]:
# Notebook setup: analysis libraries plus the Google Drive mount.
import pandas as pd
import numpy as np
from datetime import datetime  # NOTE(review): not referenced in the cells shown here - confirm before removing

# Colab-only step: mount Google Drive at /content/drive so the CSV inputs and
# outputs under /content/drive/MyDrive can be read and written.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Root data folder on the mounted Google Drive. Every input path below is built
# from it, so relocating the data only requires changing this one value.
DATA_DIR = '/content/drive/MyDrive/Data'

# Inpatient fee-for-service claims extract.
inpatient_file_path = DATA_DIR + '/All FFS Claims/inpatient.csv'

# "LU" = look-up files that translate health codes into descriptions.
LU_drg_file_path = DATA_DIR + '/HealthCodes/DRG.csv'

# ICD-10 diagnosis look-ups.
# The 2025 ICD code download did not contain about 90 of the codes needed, so the
# remaining ones were looked up with Perplexity.AI and saved in the second file.
# Ideal approach: download the 2015-2025 ICD code sets, build a database from
# them, and do the look-up against that.
LU_ICD_file_path = DATA_DIR + '/HealthCodes/ICD10Diagnosis.csv'
LU_ICD2_file_path = DATA_DIR + '/HealthCodes/ICD_DIAG_CD_RemainingCodes.csv'

In [None]:
# Load the code-to-description look-up tables.
# The DRG file and the supplemental ICD file are trimmed to the two columns
# used later; the main ICD file is kept exactly as read.
LU_drg = pd.read_csv(LU_drg_file_path).loc[:, ['DRG', 'Description']]
LU_ICD = pd.read_csv(LU_ICD_file_path)
LU_ICD2 = pd.read_csv(LU_ICD2_file_path).loc[:, ['ICD_DIAG_CD', 'Description']]


In [None]:
# Inspect the DRG look-up: column dtypes and non-null counts.
LU_drg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 773 entries, 0 to 772
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DRG          773 non-null    int64 
 1   Description  773 non-null    object
dtypes: int64(1), object(1)
memory usage: 12.2+ KB


In [None]:

# Give each look-up's description column its own name so both descriptions can
# be joined onto the same claims table without colliding.
LU_drg = LU_drg.rename(columns={'Description': 'DRG_Description'})
# DRG codes are matched as zero-padded 3-character strings (e.g. 7 -> '007').
LU_drg['DRG'] = LU_drg['DRG'].fillna(0).astype(int).astype(str).str.zfill(3)

LU_ICD = LU_ICD.rename(columns={'Description': 'ICD_Description'})
LU_ICD2 = LU_ICD2.rename(columns={'Description': 'ICD_Description'})

# Stack the main and supplemental ICD look-ups into a single table.
# A code that appears more than once (within a file or in both files) would
# duplicate claim/diagnosis rows in every later left join, so keep only the
# first occurrence of each code; the main file takes precedence.
LU_ICD_final = (
    pd.concat([LU_ICD, LU_ICD2], ignore_index=True)
    .drop_duplicates(subset='ICD_DIAG_CD', keep='first')
    .reset_index(drop=True)
)



In [None]:
# Read the pipe-delimited inpatient claims file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids columns holding a mix of types (and
# the DtypeWarning pandas emits when that happens).
inpatient = pd.read_csv(inpatient_file_path, sep="|", low_memory=False)
# How were columns read in? One row per column with its inferred dtype.
col_inpatient = pd.DataFrame(inpatient.dtypes, columns=['type'])
col_inpatient


  inpatient = pd.read_csv(inpatient_file_path, sep="|")


Unnamed: 0,type
BENE_ID,int64
CLM_ID,int64
NCH_NEAR_LINE_REC_IDENT_CD,object
NCH_CLM_TYPE_CD,int64
CLM_FROM_DT,object
...,...
CLM_UNCOMPD_CARE_PMT_AMT,float64
CLM_LINE_NUM,int64
REV_CNTR,int64
HCPCS_CD,object


In [None]:


# Clean the raw claims table.
# NOTE: this cell rewrites `inpatient` (it drops CLM_DRG_CD and filters rows), so
# it can only be run once per read of the claims file; re-read the file before
# running it again.

# Identifiers and categorical codes are labels, not quantities: store them as
# strings. astype(str) alone would turn a missing value into the literal text
# 'nan', so missing entries are masked back to NaN afterwards.
for label_col in ['BENE_ID', 'CLM_ID', 'PTNT_DSCHRG_STUS_CD', 'CLM_IP_ADMSN_TYPE_CD']:
    inpatient[label_col] = inpatient[label_col].astype(str).where(inpatient[label_col].notna())

# Parse the claim start / end dates (day-abbreviated month-year text).
for date_col in ['CLM_FROM_DT', 'CLM_THRU_DT']:
    inpatient[date_col] = pd.to_datetime(inpatient[date_col], format='%d-%b-%Y')

# Length of stay in days, counting both the first and last day, and the
# calendar year of the claim's through date.
inpatient['LOS'] = (inpatient['CLM_THRU_DT'] - inpatient['CLM_FROM_DT']).dt.days + 1
inpatient['YR'] = inpatient['CLM_THRU_DT'].dt.year

# Build a zero-padded 3-character DRG key. Values that are missing or not
# numeric are coerced to NaN and end up as '000'.
inpatient['CLM_DRG_CD'] = pd.to_numeric(inpatient['CLM_DRG_CD'], errors='coerce')
inpatient['DRG'] = inpatient['CLM_DRG_CD'].fillna(0).astype(int).astype(str).str.zfill(3)

# Drop the raw DRG column (replaced by 'DRG') together with the POA, UPIN,
# ICD_DGNS_E_CD* and PRCDR_DT* columns.
columns_to_drop = ['CLM_DRG_CD'] + [
    col for col in inpatient.columns
    if 'POA' in col
    or col.endswith('UPIN')
    or col.startswith('ICD_DGNS_E_CD')
    or col.startswith('PRCDR_DT')
]
inpatient = inpatient.drop(columns=columns_to_drop)

# Keep only the first line of each claim. The explicit copy makes the result an
# independent frame, so the column assignment below is not a write to a slice
# (which would trigger SettingWithCopyWarning).
inpatient = inpatient.loc[inpatient['CLM_LINE_NUM'] == 1].copy()

# ER flag: revenue center 450 on the retained claim line and a one-day stay.
# NOTE(review): only the first claim line's REV_CNTR is examined and only code
# 450 is matched - confirm this is the intended definition of an ER visit.
inpatient['ER_flag'] = np.where((inpatient['REV_CNTR'] == 450) & (inpatient['LOS'] == 1), 1, 0)


In [None]:
# Preview beneficiary, year and every diagnosis-code column for the first five rows.
inpatient[['BENE_ID', 'YR', 'PRNCPAL_DGNS_CD'] +[col for col in inpatient.columns if col.startswith('ICD_DGNS_CD')]].head(5)

Unnamed: 0,BENE_ID,YR,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,ICD_DGNS_CD2,ICD_DGNS_CD3,ICD_DGNS_CD4,ICD_DGNS_CD5,ICD_DGNS_CD6,ICD_DGNS_CD7,...,ICD_DGNS_CD16,ICD_DGNS_CD17,ICD_DGNS_CD18,ICD_DGNS_CD19,ICD_DGNS_CD20,ICD_DGNS_CD21,ICD_DGNS_CD22,ICD_DGNS_CD23,ICD_DGNS_CD24,ICD_DGNS_CD25
0,-10000010254618,2015,S134XX,S134XX,R4689,E781,J329,E119,D649,E849,...,,,,,,,,,,
1,-10000010254653,2015,Z3480,T7432X,E669,C50919,,,,,...,,,,,,,,,,
2,-10000010254653,2017,T7432X,T7432X,E669,C50929,,,,,...,,,,,,,,,,
3,-10000010254656,2017,S8290X,S8290X,G40909,R569,Z8669,,,,...,,,,,,,,,,
5,-10000010254656,2018,Z3480,Z5989,Z5941,G40909,R569,Z8669,,,...,,,,,,,,,,


In [None]:
# Build a long table with one row per distinct (beneficiary, year, diagnosis code).
# Source columns: the principal diagnosis plus every ICD_DGNS_CD* column.
dx_code_columns = ['PRNCPAL_DGNS_CD'] + [
    name for name in inpatient.columns if name.startswith('ICD_DGNS_CD')
]

diagnosis = (
    inpatient[['BENE_ID', 'YR'] + dx_code_columns]
    # Wide to long: each diagnosis column becomes its own set of rows.
    .melt(id_vars=['BENE_ID', 'YR'],
          var_name='ICD_DIAG_COL',
          value_name='ICD_DIAG_CD')
    # Keep only rows where a diagnosis code is actually populated.
    .loc[lambda long_dx: long_dx['ICD_DIAG_CD'].notna() & (long_dx['ICD_DIAG_CD'] != '')]
    # The originating column name is no longer needed.
    .drop(columns=['ICD_DIAG_COL'])
    # The same code can sit in several columns / claims; keep one copy.
    .drop_duplicates()
)

In [None]:
# Spot check: show the diagnosis rows kept for a single beneficiary.
diagnosis[diagnosis['BENE_ID'] == '-10000010254618']


Unnamed: 0,BENE_ID,YR,ICD_DIAG_CD
0,-10000010254618,2015,S134XX
41734,-10000010254618,2015,R4689
62601,-10000010254618,2015,E781
83468,-10000010254618,2015,J329
104335,-10000010254618,2015,E119
125202,-10000010254618,2015,D649
146069,-10000010254618,2015,E849
166936,-10000010254618,2015,B965
187803,-10000010254618,2015,N469


In [None]:

# Attach the ICD description to every diagnosis row; codes without a match in
# the look-up are kept and get a missing description (left join).
diagnosis = diagnosis.merge(LU_ICD_final, how='left', on='ICD_DIAG_CD')


In [None]:
# Number of distinct diagnosis codes per beneficiary and year.
# Counting unique codes (rather than rows) keeps the figure correct even if the
# earlier join to the ICD look-up produced more than one row for a code.
num_diagnosis = (
    diagnosis.groupby(['BENE_ID', 'YR'])['ICD_DIAG_CD']
    .nunique()
    .reset_index(name='NUM_DIAG')
)

In [None]:
# Assemble the encounter-level table: selected claim fields enriched with the
# per-beneficiary-year diagnosis count and the ICD / DRG descriptions.
encounter_fields = [
    'BENE_ID', 'CLM_ID', 'CLM_FROM_DT', 'CLM_THRU_DT', 'YR', 'LOS',
    'DRG', 'PRNCPAL_DGNS_CD', 'PTNT_DSCHRG_STUS_CD', 'CLM_IP_ADMSN_TYPE_CD',
    'ER_flag', 'CLM_TOT_CHRG_AMT',
]

# All joins are left joins, so every encounter row is kept even without a match.
inpatient_encounters = (
    inpatient[encounter_fields]
    # Diagnosis count for the encounter's beneficiary and year.
    .merge(num_diagnosis, how='left', on=['BENE_ID', 'YR'])
    # Description of the principal diagnosis; the look-up's key column is
    # redundant afterwards and is dropped.
    .merge(LU_ICD_final, how='left',
           left_on='PRNCPAL_DGNS_CD', right_on='ICD_DIAG_CD')
    .drop(columns=['ICD_DIAG_CD'])
    # Description of the DRG.
    .merge(LU_drg, how='left', on='DRG')
)


In [None]:
# Display the assembled encounter table (pandas shows only the first and last rows).
inpatient_encounters

Unnamed: 0,BENE_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,YR,LOS,DRG,PRNCPAL_DGNS_CD,PTNT_DSCHRG_STUS_CD,CLM_IP_ADMSN_TYPE_CD,ER_flag,CLM_TOT_CHRG_AMT,NUM_DIAG,ICD_Description,DRG_Description
0,-10000010254618,-10000930037831,2015-03-25,2015-03-25,2015,1,551,S134XX,1,1,1,96.65,9,Sprain of ligaments of cervical spine,Medical Back Problems with MCC
1,-10000010254653,-10000930038030,2015-09-24,2015-09-24,2015,1,951,Z3480,1,1,1,6311.88,4,Encounter for supervision of other normal preg...,Other Factors Influencing Health Status
2,-10000010254653,-10000930038031,2017-05-09,2017-05-10,2017,2,923,T7432X,1,3,0,8545.72,3,Child psychological abuse,"Other Injury, Poisoning and Toxic Effect Diagn..."
3,-10000010254656,-10000930038162,2017-01-14,2017-01-14,2017,1,564,S8290X,1,1,1,1014.85,4,Unspecified fracture of unspecified lower leg,Other Musculoskeletal System and Connective Ti...
4,-10000010254656,-10000930038163,2018-03-17,2018-03-17,2018,1,951,Z3480,1,1,1,9911.41,6,Encounter for supervision of other normal preg...,Other Factors Influencing Health Status
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20862,-10000010288008,-10000931485965,2020-01-08,2020-01-14,2020,7,000,T50901A,1,3,0,166.97,12,"Poisoning by unspecified drugs, medicaments an...",
20863,-10000010288008,-10000931485967,2020-10-23,2020-10-23,2020,1,000,T50901A,1,1,1,166.97,12,"Poisoning by unspecified drugs, medicaments an...",
20864,-10000010288008,-10000931485969,2021-08-22,2021-08-23,2021,2,000,T50901A,1,1,0,166.97,13,"Poisoning by unspecified drugs, medicaments an...",
20865,-10000010288008,-10000931485971,2021-09-13,2021-09-16,2021,4,000,T50901A,1,2,0,166.97,13,"Poisoning by unspecified drugs, medicaments an...",


In [None]:

# Write both result tables to the Drive output folder as CSV, without the index column.
for result_frame, destination in (
    (inpatient_encounters, '/content/drive/MyDrive/Data/Output Data/inpatient_encounters.csv'),
    (diagnosis, '/content/drive/MyDrive/Data/Output Data/inpatient_diagnosis.csv'),
):
    result_frame.to_csv(destination, index=False)