In [14]:
import pandas as pd

# Example: Load dataset (replace with your file path)
df = pd.read_csv("task 1 csv.csv")

In [15]:
# 1. Identify and handle missing values
print("Missing values before handling:\n", df.isnull().sum())

# Example strategy: fill numeric with mean, categorical with mode
for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

print("\nMissing values after handling:\n", df.isnull().sum())

Missing values before handling:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

Missing values after handling:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [16]:
# 2. Remove duplicate rows
df.drop_duplicates(inplace=True)

In [38]:

# 3. Standardize text values (example: Gender, Country)
# Convert all text columns to lowercase and strip spaces
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip().str.lower()
df['gender'] = df['gender'].astype(str).str.strip().str.upper().map({
    'F': 'female',
    'M': 'male'
})

In [None]:
# 4. Convert date formats to a consistent type

# --- Convert scheduledday to datetime if it exists ---
if 'scheduledday' in df.columns:
	df['scheduledday'] = pd.to_datetime(df['scheduledday'], errors='coerce')
	# Extract date (dd-mm-yyyy) and replace original column
	df['scheduled_date'] = df['scheduledday'].dt.strftime('%d-%m-%Y')
	# Extract time (HH:MM:SS) and create new column
	df['scheduled_time'] = df['scheduledday'].dt.strftime('%H:%M:%S')
	# Drop old scheduledday column if you don’t need it
	df.drop(columns=['scheduledday'], inplace=True)

# --- Do the same for appointmentday if needed ---
if 'appointmentday' in df.columns:
	df['appointmentday'] = pd.to_datetime(df['appointmentday'], errors='coerce')
	df['appointment_date'] = df['appointmentday'].dt.strftime('%d-%m-%Y')
	df['appointment_time'] = df['appointmentday'].dt.strftime('%H:%M:%S')
	df.drop(columns=['appointmentday'], inplace=True)



In [57]:
# 5. Rename column headers (lowercase, no spaces)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
df.drop(columns=['appointment_time'], inplace=True)

In [59]:
# 6. Check and fix data types
if 'age' in df.columns:
    df['age'] = df['age'].astype(int, errors='ignore')  # if convertible
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Save cleaned data
df.to_csv("cleaned_data.csv", index=False)
print("cleaned_data.csv")
print("\n Data cleaning completed! Cleaned file saved as 'cleaned_data.csv'")


cleaned_data.csv

 Data cleaning completed! Cleaned file saved as 'cleaned_data.csv'
