In [19]:
import pandas as pd
import numpy as np
from datetime import datetime

In [26]:
# Location of the raw appointments export, kept in one named constant.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
RAW_CSV_PATH = '/Users/Sai/Downloads/task1/Raw_Medical_Appoinment.csv'

print("Loading dataset...")
df = pd.read_csv(RAW_CSV_PATH)

# Remember the (rows, columns) of the untouched frame for the final summary.
original_shape = df.shape



Loading dataset...


In [27]:
print("\nCleaning column names...")
# Lower-case every header, then turn hyphens and spaces into underscores so
# each column name is a plain snake_case-style identifier.
normalized_names = df.columns.str.lower()
for separator in ('-', ' '):
    normalized_names = normalized_names.str.replace(separator, '_')
df.columns = normalized_names





Cleaning column names...


In [28]:
print("\nConverting date columns...")
# Parse both timestamp columns from their raw CSV representation into
# pandas datetimes.
for date_col in ('scheduledday', 'appointmentday'):
    df[date_col] = pd.to_datetime(df[date_col])





Converting date columns...


In [29]:
print("\nFixing data types...")

# Store age as a plain integer column.
age_as_int = df['age'].astype(int)
df['age'] = age_as_int




Fixing data types...


In [30]:
# Flag columns that are cast to integers in a single astype call.
binary_columns = ['scholarship', 'hipertension', 'diabetes', 'alcoholism', 'sms_received']
df = df.astype({flag_col: int for flag_col in binary_columns})



In [31]:
# Normalise gender labels to lower case (safe to re-run).
df['gender'] = df['gender'].str.lower()


# Encode the target as 1 where the raw label is 'Yes' and 0 where it is 'No'.
# The guard makes this cell safe to re-run: once the column is numeric,
# comparing it to 'Yes' again would silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['no_show']):
    no_show_labels = df['no_show']
    unexpected_labels = no_show_labels[~no_show_labels.isin(['Yes', 'No'])]
    if not unexpected_labels.empty:
        # Fail loudly instead of coercing missing or misspelled labels to 0.
        raise ValueError(
            f"Unexpected no_show labels: {unexpected_labels.unique().tolist()}"
        )
    df['no_show'] = (no_show_labels == 'Yes').astype(int)


In [32]:
print("\nHandling missing values...")
print("Missing values before cleaning:")
# Per-column count of null entries, shown before any imputation happens.
missing_before = df.isna().sum()
print(missing_before)




Handling missing values...
Missing values before cleaning:
patientid         0
appointmentid     0
gender            0
scheduledday      0
appointmentday    0
age               0
neighbourhood     0
scholarship       0
hipertension      0
diabetes          0
alcoholism        0
handcap           0
sms_received      0
no_show           0
dtype: int64


In [33]:

# Fill remaining gaps: median for numeric columns, most frequent value for
# text columns. Columns without gaps are left untouched.

# Identifier columns are never imputed: a median of IDs is not a real ID and
# would silently attach a row to a fabricated patient/appointment.
identifier_cols = {'patientid', 'appointmentid'}

numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if col in identifier_cols or not df[col].isna().any():
        continue
    df[col] = df[col].fillna(df[col].median())

categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if not df[col].isna().any():
        continue
    most_frequent = df[col].mode()
    if most_frequent.empty:
        # An all-null column has no mode; indexing it would raise, so skip.
        continue
    df[col] = df[col].fillna(most_frequent.iloc[0])




In [34]:
print("\nRemoving duplicates...")
# Row count before de-duplication, used to report how many rows were dropped.
initial_rows = df.shape[0]
df = df.drop_duplicates()
removed_rows = initial_rows - df.shape[0]
print(f"Removed {removed_rows} duplicate rows")



Removing duplicates...
Removed 0 duplicate rows


In [35]:
print("\nAdding derived features...")
# Compare calendar dates rather than raw timestamps. A timedelta's `.days`
# floors toward negative infinity, so subtracting full timestamps reports -1
# whenever the scheduling time-of-day is later than the appointment's
# time-of-day on the same date (e.g. an appointment stamped at midnight).
# NOTE(review): assumes the time-of-day on `appointmentday` is not meaningful
# for this feature; confirm against the raw file.
appointment_date = df['appointmentday'].dt.normalize()
scheduled_date = df['scheduledday'].dt.normalize()
df['days_until_appointment'] = (appointment_date - scheduled_date).dt.days



Adding derived features...


In [36]:
# Headline figures of the cleaning run, printed one line at a time.
summary_lines = [
    "\nCleaning Summary:",
    "-" * 50,
    f"Original shape: {original_shape}",
    f"Final shape: {df.shape}",
    f"Duplicates removed: {removed_rows}",
    "\nColumns in cleaned dataset:",
]
for summary_line in summary_lines:
    print(summary_line)

# Iterating a DataFrame yields its column labels.
for col in df:
    print(f"- {col}: {df[col].dtype}")

print("\nMissing values after cleaning:")
missing_after = df.isna().sum()
print(missing_after)



Cleaning Summary:
--------------------------------------------------
Original shape: (110527, 14)
Final shape: (110527, 15)
Duplicates removed: 0

Columns in cleaned dataset:
- patientid: float64
- appointmentid: int64
- gender: object
- scheduledday: datetime64[ns, UTC]
- appointmentday: datetime64[ns, UTC]
- age: int64
- neighbourhood: object
- scholarship: int64
- hipertension: int64
- diabetes: int64
- alcoholism: int64
- handcap: int64
- sms_received: int64
- no_show: int64
- days_until_appointment: int64

Missing values after cleaning:
patientid                 0
appointmentid             0
gender                    0
scheduledday              0
appointmentday            0
age                       0
neighbourhood             0
scholarship               0
hipertension              0
diabetes                  0
alcoholism                0
handcap                   0
sms_received              0
no_show                   0
days_until_appointment    0
dtype: int64


In [37]:
# Single source of truth for the output file, so the confirmation message can
# never drift from the path actually written.
CLEANED_CSV_PATH = 'cleaned_medical_appointment.csv'

print("\nSaving cleaned dataset...")
# index=False: the row index is not written to the file.
df.to_csv(CLEANED_CSV_PATH, index=False)
print(f"Cleaned dataset saved as '{CLEANED_CSV_PATH}'")


Saving cleaned dataset...
Cleaned dataset saved as 'cleaned_medical_appointment.csv'
