In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the test-set CSV (path is relative to the working directory) and
# report its (rows, columns) shape as loaded.
datatest = pd.read_csv(filepath_or_buffer='test_data.csv')
datatest.shape

(50436, 31)

In [None]:
# Remove rows that are exact duplicates across every column, keeping the
# first occurrence of each, then show the reduced (rows, columns) shape.
datatest = datatest.drop_duplicates(subset=None, keep='first')
datatest.shape


(46915, 31)

In [None]:
# Parse the three day-month-year date columns into datetime values.
# ('Recieved Date' is spelled exactly as the column appears in the data.)
for date_col in ('Service Date', 'Recieved Date', 'Paid Date'):
    datatest[date_col] = pd.to_datetime(datatest[date_col], format="%d-%m-%Y")

# Derive whole-day intervals between the claim milestones:
#   service -> received, received -> paid, and service -> paid.
service_date = datatest['Service Date']
received_date = datatest['Recieved Date']
paid_date = datatest['Paid Date']

datatest['Claim Processing Time'] = (received_date - service_date).dt.days
datatest['Claim Payment Time'] = (paid_date - received_date).dt.days
datatest['Total Claim Duration'] = (paid_date - service_date).dt.days

In [None]:
# Preview the first five rows, including the newly derived duration columns.
datatest.head(n=5)

Unnamed: 0,Claim ID,Service Date,Recieved Date,Paid Date,Patient ID,Member Age,Gender,Marital Status,Ethnicity,LOB,...,ICD10 Code 8,ICD10 Code 9,ICD10 Code 10,Service Type,Service Code,Modifiers,High Cost Claim,Claim Processing Time,Claim Payment Time,Total Claim Duration
0,8110afaab1020212f58193e233369c1ac352baf280637c...,2022-01-15,2022-01-17,2022-01-25,99f69b741b2784601452afbe7f9083b5ab984b1b7acc64...,10-19 Yrs Old,F,Single,Caucasian/White,SGH,...,,,,PROC,U0005,,,2,8,10
1,8110afaab1020212f58193e233369c1ac352baf280637c...,2022-01-15,2022-01-17,2022-01-25,99f69b741b2784601452afbe7f9083b5ab984b1b7acc64...,10-19 Yrs Old,F,Single,Caucasian/White,SGH,...,,,,PROC,U0003,,,2,8,10
2,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,2021-12-14,2021-12-16,2021-12-25,634c777aef420c76fdeede3aeeedda6a31134e7888b32f...,60-69 Yrs Old,M,Widowed,Caucasian/White,IOT,...,,,,PROC,U0003,,,2,9,11
3,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,2021-12-14,2021-12-16,2021-12-25,634c777aef420c76fdeede3aeeedda6a31134e7888b32f...,60-69 Yrs Old,M,Widowed,Caucasian/White,IOT,...,,,,PROC,U0005,,,2,9,11
4,f570fb586d0d14e4509f9fcc8b0f0a485407ce0bbe21af...,2022-01-22,2022-01-23,2022-02-01,46129d9a9e8f26cda7f473d1e00492efe554c8a452481c...,30-39 Yrs Old,M,Single,Caucasian/White,SOT,...,,,,PROC,99214,,,1,9,10


In [None]:

# Columns kept for prediction (this notebook defines no target column).
# NOTE(review): 'Claim ID' is presumably carried along as a row identifier
# rather than as a model input — confirm before feeding this list to a model.
features = [
    'Claim ID', 'Member Age', 'Claim Category', 'Place of Service',
    'ICD10 Code 1', 'ICD10 Code 2', 'ICD10 Code 3', 'Service Type', 'Service Code',
    'Claim Processing Time', 'Total Claim Duration', 'Claim Payment Time', 'Network Status',
]

# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells write to it
# directly instead of triggering pandas' SettingWithCopyWarning.
datatest = datatest[features].copy()


In [None]:
# Count missing values per column for the selected features.
datatest.isna().sum()


Unnamed: 0,0
Claim ID,0
Member Age,0
Claim Category,0
Place of Service,0
ICD10 Code 1,136
ICD10 Code 2,14866
ICD10 Code 3,24087
Service Type,0
Service Code,0
Claim Processing Time,0


In [None]:
# Show the dtype of every column currently in the frame.
datatest.dtypes

Unnamed: 0,0
Claim ID,object
Member Age,object
Claim Category,object
Place of Service,object
ICD10 Code 1,float64
ICD10 Code 2,float64
ICD10 Code 3,float64
Service Type,object
Service Code,object
Claim Processing Time,int64


In [None]:
# Map each age-band label to a single representative number: roughly the
# midpoint of the band, with "< 1 Yrs Old" -> 0 and "100+ Yrs Old" -> 100.
age_mapping = {
    "< 1 Yrs Old": 0,
    "1-9 Yrs Old": 5,
    "10-19 Yrs Old": 15,
    "20-29 Yrs Old": 25,
    "30-39 Yrs Old": 35,
    "40-49 Yrs Old": 45,
    "50-59 Yrs Old": 55,
    "60-69 Yrs Old": 65,
    "70-79 Yrs Old": 75,
    "80-89 Yrs Old": 85,
    "90-99 Yrs Old": 95,
    "100+ Yrs Old": 100
}

# Map into a temporary Series first: Series.map turns any label missing from
# the dictionary into NaN without warning. That also happens if this cell is
# run a second time (the column then already holds numbers, not labels), which
# would silently wipe the whole column. Fail loudly instead of overwriting.
member_age_encoded = datatest['Member Age'].map(age_mapping)
unmapped_labels = datatest.loc[
    member_age_encoded.isna() & datatest['Member Age'].notna(), 'Member Age'
].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unmapped 'Member Age' values (unknown label, or this cell was already run): "
        f"{sorted(map(str, unmapped_labels))}"
    )
datatest['Member Age'] = member_age_encoded

# Display the transformed column
print(datatest[['Member Age']].head())  # Check the transformed column

   Member Age
0          15
1          15
2          65
3          65
4          35


In [None]:
# Integer-encode 'Claim Category'. LabelEncoder (imported in the notebook's
# import cell) numbers the distinct values seen during fit in sorted order.
# NOTE(review): the encoder is fitted on this prediction file, so the codes
# depend on which categories happen to be present here. Confirm they match the
# codes the model was trained with (ideally reuse the encoder fitted on the
# training data).
label_encoder = LabelEncoder()
datatest['Claim Category'] = label_encoder.fit_transform(datatest['Claim Category'])

In [None]:
# Frequency-encode 'Place of Service': every value is replaced by the number
# of rows in this frame that carry that same value.
# NOTE(review): counts are computed from this dataset alone — confirm that is
# consistent with how the training features were built.
place_of_service = datatest['Place of Service']
Place_of_Service_counts = place_of_service.value_counts().to_dict()
datatest['Place of Service'] = place_of_service.map(Place_of_Service_counts)

# Show the first rows of the encoded column
print(datatest[['Place of Service']].head(5))

   Place of Service
0             22453
1             22453
2             22453
3             22453
4             22453


In [None]:
# Frequency-encode 'Service Code': every code is replaced by how many rows in
# this frame use it.
# NOTE(review): counts are computed from this dataset alone — confirm that is
# consistent with how the training features were built.
service_code = datatest['Service Code']
Service_Code_counts = service_code.value_counts().to_dict()
datatest['Service Code'] = service_code.map(Service_Code_counts)

# Show the first rows of the encoded column
print(datatest[['Service Code']].head(5))

   Service Code
0            31
1            31
2            31
3            31
4          1774


In [None]:
# Integer-encode 'Service Type' with a fresh LabelEncoder (already imported
# earlier in the notebook, so it is not re-imported here). Codes follow the
# sorted order of the distinct values seen during fit.
# NOTE(review): the encoder is fitted on this prediction file — confirm the
# resulting codes match the ones the model was trained with.
label_encoder = LabelEncoder()
datatest['Service Type'] = label_encoder.fit_transform(datatest['Service Type'])

In [None]:
# Frequency-encode 'Network Status': every status is replaced by the number of
# rows in this frame that share it.
# NOTE(review): counts are computed from this dataset alone — confirm that is
# consistent with how the training features were built.
network_status = datatest['Network Status']
network_status_counts = network_status.value_counts().to_dict()
datatest['Network Status'] = network_status.map(network_status_counts)

# Show the first rows of the encoded column
print(datatest[['Network Status']].head(5))


   Network Status
0           28468
1           28468
2           28468
3           28468
4           28468


In [None]:
# Frequency-encode the three ICD10 code columns, one column at a time.
icd10_columns = ['ICD10 Code 1', 'ICD10 Code 2', 'ICD10 Code 3']

for col in icd10_columns:
    code_values = datatest[col]
    # value_counts() leaves missing codes out of the table, so rows without a
    # code stay NaN after the map and the column ends up float-typed.
    icd10_counts = code_values.value_counts().to_dict()
    datatest[col] = code_values.map(icd10_counts)

# Show the first rows of the encoded columns
print(datatest[icd10_columns].head(5))

   ICD10 Code 1  ICD10 Code 2  ICD10 Code 3
0          22.0           NaN           NaN
1          22.0           NaN           NaN
2         274.0           NaN           NaN
3         274.0           NaN           NaN
4         652.0         286.0         173.0


In [None]:
# Re-check column dtypes now that the encoding cells above have run.
datatest.dtypes

Unnamed: 0,0
Claim ID,object
Member Age,int64
Claim Category,int64
Place of Service,int64
ICD10 Code 1,float64
ICD10 Code 2,float64
ICD10 Code 3,float64
Service Type,int64
Service Code,int64
Claim Processing Time,int64


In [None]:
# Preview the first five rows of the encoded frame.
datatest.head(n=5)

Unnamed: 0,Claim ID,Member Age,Claim Category,Place of Service,ICD10 Code 1,ICD10 Code 2,ICD10 Code 3,Service Type,Service Code,Claim Processing Time,Total Claim Duration,Claim Payment Time,Network Status
0,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,,,1,31,2,10,8,28468
1,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,,,1,31,2,10,8,28468
2,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,,,1,31,2,11,9,28468
3,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,,,1,31,2,11,9,28468
4,f570fb586d0d14e4509f9fcc8b0f0a485407ce0bbe21af...,35,5,22453,652.0,286.0,173.0,1,1774,1,10,9,28468


In [None]:
# Count missing values per column after encoding.
datatest.isna().sum()

Unnamed: 0,0
Claim ID,0
Member Age,0
Claim Category,0
Place of Service,0
ICD10 Code 1,136
ICD10 Code 2,14866
ICD10 Code 3,24087
Service Type,0
Service Code,0
Claim Processing Time,0


In [None]:
# After frequency encoding, a row with no ICD10 code is still NaN in that
# column. Impute those with 0, i.e. treat "no code" as a frequency of zero.
# NOTE(review): the original note says the same null values exist in the other
# dataset as well — presumably it is imputed the same way; confirm.
icd10_columns = ['ICD10 Code 1', 'ICD10 Code 2', 'ICD10 Code 3']
datatest[icd10_columns] = datatest[icd10_columns].fillna(value=0)

In [None]:
# Count missing values per column again to verify the ICD10 columns are filled.
datatest.isna().sum()

Unnamed: 0,0
Claim ID,0
Member Age,0
Claim Category,0
Place of Service,0
ICD10 Code 1,0
ICD10 Code 2,0
ICD10 Code 3,0
Service Type,0
Service Code,0
Claim Processing Time,0


In [None]:
# Preview the first five rows after the ICD10 NaN values were replaced by 0.
datatest.head(n=5)

Unnamed: 0,Claim ID,Member Age,Claim Category,Place of Service,ICD10 Code 1,ICD10 Code 2,ICD10 Code 3,Service Type,Service Code,Claim Processing Time,Total Claim Duration,Claim Payment Time,Network Status
0,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,0.0,0.0,1,31,2,10,8,28468
1,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,0.0,0.0,1,31,2,10,8,28468
2,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,0.0,0.0,1,31,2,11,9,28468
3,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,0.0,0.0,1,31,2,11,9,28468
4,f570fb586d0d14e4509f9fcc8b0f0a485407ce0bbe21af...,35,5,22453,652.0,286.0,173.0,1,1774,1,10,9,28468


In [None]:
# Write the preprocessed frame to disk without the row index.
datatest.to_csv(path_or_buf='preprocessed_data_for_prediction.csv', index=False)

In [None]:
# Preview the first five rows of the frame that was written to CSV.
datatest.head(n=5)

Unnamed: 0,Claim ID,Member Age,Claim Category,Place of Service,ICD10 Code 1,ICD10 Code 2,ICD10 Code 3,Service Type,Service Code,Claim Processing Time,Total Claim Duration,Claim Payment Time,Network Status
0,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,0.0,0.0,1,31,2,10,8,28468
1,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,0.0,0.0,1,31,2,10,8,28468
2,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,0.0,0.0,1,31,2,11,9,28468
3,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,0.0,0.0,1,31,2,11,9,28468
4,f570fb586d0d14e4509f9fcc8b0f0a485407ce0bbe21af...,35,5,22453,652.0,286.0,173.0,1,1774,1,10,9,28468


In [None]:
# List the column labels of the final frame.
datatest.columns

Index(['Claim ID', 'Member Age', 'Claim Category', 'Place of Service',
       'ICD10 Code 1', 'ICD10 Code 2', 'ICD10 Code 3', 'Service Type',
       'Service Code', 'Claim Processing Time', 'Total Claim Duration',
       'Claim Payment Time', 'Network Status'],
      dtype='object')

In [None]:
# Final look at the first five rows of the preprocessed frame.
datatest.head(n=5)

Unnamed: 0,Claim ID,Member Age,Claim Category,Place of Service,ICD10 Code 1,ICD10 Code 2,ICD10 Code 3,Service Type,Service Code,Claim Processing Time,Total Claim Duration,Claim Payment Time,Network Status
0,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,0.0,0.0,1,31,2,10,8,28468
1,8110afaab1020212f58193e233369c1ac352baf280637c...,15,5,22453,22.0,0.0,0.0,1,31,2,10,8,28468
2,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,0.0,0.0,1,31,2,11,9,28468
3,1875703175f70b4ce2fe1987473b3971f9ea48c95bf186...,65,5,22453,274.0,0.0,0.0,1,31,2,11,9,28468
4,f570fb586d0d14e4509f9fcc8b0f0a485407ce0bbe21af...,35,5,22453,652.0,286.0,173.0,1,1774,1,10,9,28468
