# **Healthcare Data Cleaning and Imputation**

### **Date/Time Conversion**

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [7]:
# Load the raw patient visit records from PatientHistory.csv into df_patient
df_patient = pd.read_csv(filepath_or_buffer='PatientHistory.csv')

In [6]:
# Parse the two date columns into pandas datetimes.
# Run this after the load cell: re-reading the CSV replaces df_patient and discards the parsed values.
for date_column in ('DateOfVisit', 'LastPaymentDate'):
    df_patient[date_column] = pd.to_datetime(df_patient[date_column])

In [8]:
# Coerce BillAmount and FollowUpDays to numeric dtypes;
# values that cannot be parsed become NaN (errors='coerce') instead of raising
for numeric_column in ('BillAmount', 'FollowUpDays'):
    df_patient[numeric_column] = pd.to_numeric(df_patient[numeric_column], errors='coerce')

In [9]:
# Treat blank Doctor entries as missing.
# The pattern matches empty strings and whitespace-only strings, so values such as ' '
# are also converted to np.nan; non-blank names are left untouched.
df_patient['Doctor'] = df_patient['Doctor'].replace(r'^\s*$', np.nan, regex=True)

In [42]:
# Display df_patient to inspect the records after the conversions above
df_patient

Unnamed: 0,PatientID,PatientName,DateOfVisit,Diagnosis,Treatment,Doctor,BillAmount,PaymentStatus,LastPaymentDate,FollowUpDays
0,P001,John Doe,2023-01-15,Common Cold,Rest and Fluids,Dr. Smith,50.0,Paid,2023-01-15,7.0
1,P002,Jane Smith,2023-02-01,Migraine,Medication,Dr. Jones,120.5,Paid,2023-02-05,14.0
2,P003,Alice Brown,2023-02-10,Sprained Ankle,Physical Therapy,Dr. Davis,300.75,Pending,,21.0
3,P004,Bob White,2023-03-05,Seasonal Allergies,Antihistamines,Dr. Smith,75.0,Paid,2023-03-05,7.0
4,P005,Charlie Green,2023-03-20,Strep Throat,Antibiotics,Dr. Jones,90.25,Paid,2023-03-22,5.0
5,P006,Diana Prince,2023-04-01,Routine Check-up,Preventive Care,Dr. Evans,80.0,Paid,2023-04-01,30.0
6,P007,Edward Stark,2023-04-12,Back Pain,Chiropractic Adjustment,Dr. Wilson,450.0,Partially Paid,2023-04-20,
7,P008,Fiona Glenn,2023-05-03,Insomnia,Sleep Study,Dr. Reed,600.0,Pending,,90.0
8,P009,George King,2023-05-18,Flu,Antiviral Medication,Dr. Smith,110.0,Paid,2023-05-18,7.0
9,P010,Hannah Scott,2023-06-01,Dermatitis,Topical Cream,Dr. Miller,,Paid,2023-06-02,10.0


In [41]:
# Show column dtypes and non-null counts to verify the conversions.
# DateOfVisit and LastPaymentDate should report a datetime64 dtype; if they show
# 'object', the datetime conversion was not applied to the current df_patient
# (for example, the CSV was re-read afterwards) and the cells need re-running in order.
df_patient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PatientID        30 non-null     object 
 1   PatientName      30 non-null     object 
 2   DateOfVisit      30 non-null     object 
 3   Diagnosis        30 non-null     object 
 4   Treatment        30 non-null     object 
 5   Doctor           29 non-null     object 
 6   BillAmount       27 non-null     float64
 7   PaymentStatus    30 non-null     object 
 8   LastPaymentDate  25 non-null     object 
 9   FollowUpDays     26 non-null     float64
dtypes: float64(2), object(8)
memory usage: 2.5+ KB


In [39]:
# Count the missing values in each column of df_patient
df_patient.isna().sum()

Unnamed: 0,0
PatientID,0
PatientName,0
DateOfVisit,0
Diagnosis,0
Treatment,0
Doctor,1
BillAmount,3
PaymentStatus,0
LastPaymentDate,5
FollowUpDays,4


### **Handle Duplicates**

In [43]:
# Count the rows that exactly repeat an earlier row
# (the first occurrence of each record is not counted as a duplicate)
duplicates_count = df_patient.duplicated(keep='first').sum()
print(f"Number of duplicate rows: {duplicates_count}")

Number of duplicate rows: 5


In [14]:
# Remove repeated rows, keeping the first occurrence of each record.
# The result is stored in df_patient_no_duplicates; df_patient itself is left unchanged.
df_patient_no_duplicates = df_patient.drop_duplicates(keep='first')

In [44]:
# Display df_patient_no_duplicates (the original row index is retained, so labels may have gaps)
df_patient_no_duplicates

Unnamed: 0,PatientID,PatientName,DateOfVisit,Diagnosis,Treatment,Doctor,BillAmount,PaymentStatus,LastPaymentDate,FollowUpDays
0,P001,John Doe,2023-01-15,Common Cold,Rest and Fluids,Dr. Smith,50.0,Paid,2023-01-15,7.0
1,P002,Jane Smith,2023-02-01,Migraine,Medication,Dr. Jones,120.5,Paid,2023-02-05,14.0
2,P003,Alice Brown,2023-02-10,Sprained Ankle,Physical Therapy,Dr. Davis,300.75,Pending,,21.0
3,P004,Bob White,2023-03-05,Seasonal Allergies,Antihistamines,Dr. Smith,75.0,Paid,2023-03-05,7.0
4,P005,Charlie Green,2023-03-20,Strep Throat,Antibiotics,Dr. Jones,90.25,Paid,2023-03-22,5.0
5,P006,Diana Prince,2023-04-01,Routine Check-up,Preventive Care,Dr. Evans,80.0,Paid,2023-04-01,30.0
6,P007,Edward Stark,2023-04-12,Back Pain,Chiropractic Adjustment,Dr. Wilson,450.0,Partially Paid,2023-04-20,
7,P008,Fiona Glenn,2023-05-03,Insomnia,Sleep Study,Dr. Reed,600.0,Pending,,90.0
8,P009,George King,2023-05-18,Flu,Antiviral Medication,Dr. Smith,110.0,Paid,2023-05-18,7.0
9,P010,Hannah Scott,2023-06-01,Dermatitis,Topical Cream,Dr. Miller,,Paid,2023-06-02,10.0


In [45]:
# Count the missing values per column once duplicate rows have been removed
df_patient_no_duplicates.isna().sum()

Unnamed: 0,0
PatientID,0
PatientName,0
DateOfVisit,0
Diagnosis,0
Treatment,0
Doctor,1
BillAmount,2
PaymentStatus,0
LastPaymentDate,5
FollowUpDays,4


### **Median Imputation**

In [17]:
# Work on an independent copy so the median imputation does not alter df_patient_no_duplicates
df_median = df_patient_no_duplicates.copy(deep=True)

In [34]:
# Compute the median of BillAmount, ignoring missing values, and keep it in median_bill_amount
median_bill_amount = df_median['BillAmount'].median(skipna=True)
print(f"Median BillAmount: {median_bill_amount}")

Median BillAmount: 100.0


In [19]:
# Replace every missing BillAmount in df_median with median_bill_amount
df_median.loc[df_median['BillAmount'].isna(), 'BillAmount'] = median_bill_amount

In [46]:
# Display df_median to confirm BillAmount no longer has missing values
df_median

Unnamed: 0,PatientID,PatientName,DateOfVisit,Diagnosis,Treatment,Doctor,BillAmount,PaymentStatus,LastPaymentDate,FollowUpDays
0,P001,John Doe,2023-01-15,Common Cold,Rest and Fluids,Dr. Smith,50.0,Paid,2023-01-15,7.0
1,P002,Jane Smith,2023-02-01,Migraine,Medication,Dr. Jones,120.5,Paid,2023-02-05,14.0
2,P003,Alice Brown,2023-02-10,Sprained Ankle,Physical Therapy,Dr. Davis,300.75,Pending,,21.0
3,P004,Bob White,2023-03-05,Seasonal Allergies,Antihistamines,Dr. Smith,75.0,Paid,2023-03-05,7.0
4,P005,Charlie Green,2023-03-20,Strep Throat,Antibiotics,Dr. Jones,90.25,Paid,2023-03-22,5.0
5,P006,Diana Prince,2023-04-01,Routine Check-up,Preventive Care,Dr. Evans,80.0,Paid,2023-04-01,30.0
6,P007,Edward Stark,2023-04-12,Back Pain,Chiropractic Adjustment,Dr. Wilson,450.0,Partially Paid,2023-04-20,
7,P008,Fiona Glenn,2023-05-03,Insomnia,Sleep Study,Dr. Reed,600.0,Pending,,90.0
8,P009,George King,2023-05-18,Flu,Antiviral Medication,Dr. Smith,110.0,Paid,2023-05-18,7.0
9,P010,Hannah Scott,2023-06-01,Dermatitis,Topical Cream,Dr. Miller,100.0,Paid,2023-06-02,10.0


### **Mode Imputation**

In [21]:
# Work on an independent copy of df_median so the mode imputation leaves it unchanged
df_mode = df_median.copy(deep=True)

In [47]:
# Find the most frequent Doctor value (missing values excluded) and keep it in mode_doctor.
# mode() can return several values when there is a tie; the first one is used.
mode_doctor = df_mode['Doctor'].mode(dropna=True)[0]
print(f"Mode Doctor: {mode_doctor}")

Mode Doctor: Dr. Smith


In [23]:
# Replace every missing Doctor in df_mode with mode_doctor
df_mode.loc[df_mode['Doctor'].isna(), 'Doctor'] = mode_doctor

In [48]:
# Display df_mode to confirm Doctor no longer has missing values
df_mode

Unnamed: 0,PatientID,PatientName,DateOfVisit,Diagnosis,Treatment,Doctor,BillAmount,PaymentStatus,LastPaymentDate,FollowUpDays
0,P001,John Doe,2023-01-15,Common Cold,Rest and Fluids,Dr. Smith,50.0,Paid,2023-01-15,7.0
1,P002,Jane Smith,2023-02-01,Migraine,Medication,Dr. Jones,120.5,Paid,2023-02-05,14.0
2,P003,Alice Brown,2023-02-10,Sprained Ankle,Physical Therapy,Dr. Davis,300.75,Pending,,21.0
3,P004,Bob White,2023-03-05,Seasonal Allergies,Antihistamines,Dr. Smith,75.0,Paid,2023-03-05,7.0
4,P005,Charlie Green,2023-03-20,Strep Throat,Antibiotics,Dr. Jones,90.25,Paid,2023-03-22,5.0
5,P006,Diana Prince,2023-04-01,Routine Check-up,Preventive Care,Dr. Evans,80.0,Paid,2023-04-01,30.0
6,P007,Edward Stark,2023-04-12,Back Pain,Chiropractic Adjustment,Dr. Wilson,450.0,Partially Paid,2023-04-20,
7,P008,Fiona Glenn,2023-05-03,Insomnia,Sleep Study,Dr. Reed,600.0,Pending,,90.0
8,P009,George King,2023-05-18,Flu,Antiviral Medication,Dr. Smith,110.0,Paid,2023-05-18,7.0
9,P010,Hannah Scott,2023-06-01,Dermatitis,Topical Cream,Dr. Miller,100.0,Paid,2023-06-02,10.0


### **K-Nearest Neighbors (KNN) Imputation**

In [25]:
# Names of the numeric columns that are passed to the KNN imputer
columns_for_knn = [
    'BillAmount',
    'FollowUpDays',
]

In [26]:
# Take an independent copy of just the columns in columns_for_knn from df_mode
df_knn = df_mode.loc[:, columns_for_knn].copy()

In [27]:
# Create the KNN imputer (KNNImputer and numpy are imported in the first cell).
# Each missing value is filled from the 5 nearest rows; the weighting and distance
# metric are spelled out at their library defaults.
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

In [28]:
# Fit the imputer on df_knn, then transform the same frame;
# imputed_data holds the transformed result with the gaps filled in
imputed_data = imputer.fit(df_knn).transform(df_knn)

In [52]:
# Wrap imputed_data in a DataFrame named df_knn_imputed.
# Reusing df_knn's index and the columns_for_knn labels lets the values be mapped
# back to the original rows later.
df_knn_imputed = pd.DataFrame(
    data=imputed_data,
    index=df_knn.index,
    columns=columns_for_knn,
)
df_knn_imputed

Unnamed: 0,BillAmount,FollowUpDays
0,50.0,7.0
1,120.5,14.0
2,300.75,21.0
3,75.0,7.0
4,90.25,5.0
5,80.0,30.0
6,450.0,116.0
7,600.0,90.0
8,110.0,7.0
9,100.0,10.0


In [51]:
# Count the missing values per column in df_knn_imputed
df_knn_imputed.isna().sum()

Unnamed: 0,0
BillAmount,0
FollowUpDays,0


In [55]:
# Build the final frame: every column of df_mode, with the KNN columns replaced by
# their imputed values.
# NOTE: this rebinds df_knn_imputed from the KNN-columns-only frame to the full record set.
df_knn_imputed = df_mode.copy()

# imputed_data carries no labels, so re-attach the row index and the shared
# columns_for_knn list (rather than repeating the column names) before assigning.
df_knn_imputed[columns_for_knn] = pd.DataFrame(
    imputed_data, columns=columns_for_knn, index=df_knn_imputed.index
)

# Display the result. No cell in this notebook imputes LastPaymentDate.
df_knn_imputed

Unnamed: 0,PatientID,PatientName,DateOfVisit,Diagnosis,Treatment,Doctor,BillAmount,PaymentStatus,LastPaymentDate,FollowUpDays
0,P001,John Doe,2023-01-15,Common Cold,Rest and Fluids,Dr. Smith,50.0,Paid,2023-01-15,7.0
1,P002,Jane Smith,2023-02-01,Migraine,Medication,Dr. Jones,120.5,Paid,2023-02-05,14.0
2,P003,Alice Brown,2023-02-10,Sprained Ankle,Physical Therapy,Dr. Davis,300.75,Pending,,21.0
3,P004,Bob White,2023-03-05,Seasonal Allergies,Antihistamines,Dr. Smith,75.0,Paid,2023-03-05,7.0
4,P005,Charlie Green,2023-03-20,Strep Throat,Antibiotics,Dr. Jones,90.25,Paid,2023-03-22,5.0
5,P006,Diana Prince,2023-04-01,Routine Check-up,Preventive Care,Dr. Evans,80.0,Paid,2023-04-01,30.0
6,P007,Edward Stark,2023-04-12,Back Pain,Chiropractic Adjustment,Dr. Wilson,450.0,Partially Paid,2023-04-20,116.0
7,P008,Fiona Glenn,2023-05-03,Insomnia,Sleep Study,Dr. Reed,600.0,Pending,,90.0
8,P009,George King,2023-05-18,Flu,Antiviral Medication,Dr. Smith,110.0,Paid,2023-05-18,7.0
9,P010,Hannah Scott,2023-06-01,Dermatitis,Topical Cream,Dr. Miller,100.0,Paid,2023-06-02,10.0
