In [None]:
import pandas as pd
import numpy as np

# Columns that must hold numeric measurements.
numeric_columns = ['Age']

# Columns that hold categorical labels.
categorical_columns = ['Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia',
                       'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing',
                       'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']

# Labels each categorical column may legally contain.  A single shared
# whitelist would accept e.g. 'Yes' in Gender or 'Male' in class, so every
# column gets its own set.  Labels are kept in alphabetical order, which is
# the order astype('category') would infer from the data.
# NOTE(review): assumes Gender is Male/Female, class is Positive/Negative and
# every other column is Yes/No -- confirm against the data source.
valid_categories = {col: ['No', 'Yes'] for col in categorical_columns}
valid_categories['Gender'] = ['Female', 'Male']
valid_categories['class'] = ['Negative', 'Positive']


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:

    1. Blank / whitespace-only cells and the literal string 'NULL' become NaN.
    2. Columns in ``numeric_columns`` are coerced to numbers (unparseable -> NaN).
    3. 'Age' values outside the inclusive range 0-120 become NaN.
    4. Each column in ``categorical_columns`` is cast to a categorical whose
       categories are exactly ``valid_categories[col]``; any other label
       becomes NaN.
    5. Rows in which every cell is NaN are dropped.
    6. Remaining gaps are imputed: median for 'Age', per-column mode for the
       categorical columns.  A column with no valid value at all is left as
       NaN instead of raising.
    7. The index is reset to 0..n-1.

    Raises:
        KeyError: if an expected column is missing from *df*.
    """
    # replace() returns a new frame, so later assignments never touch the
    # caller's object.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df.replace('NULL', np.nan)

    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # between() is False for NaN, so missing ages simply stay missing.
    df['Age'] = df['Age'].where(df['Age'].between(0, 120))

    # Casting to a fixed CategoricalDtype turns every label outside the
    # column's own whitelist into NaN and leaves no stale categories behind.
    for col in categorical_columns:
        dtype = pd.CategoricalDtype(categories=valid_categories[col])
        df[col] = df[col].astype(dtype)

    df = df.dropna(how='all')

    age_median = df['Age'].median()
    if pd.notna(age_median):
        df['Age'] = df['Age'].fillna(age_median)

    for col in categorical_columns:
        modes = df[col].mode()
        # mode() is empty when the column has no valid value (e.g. an empty
        # frame); there is nothing sensible to impute in that case.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    return df.reset_index(drop=True)


# Run the pipeline when executed as a notebook cell or a script.  `df` is
# deliberately left as a top-level name so that later cells can keep using it.
if __name__ == '__main__':
    # Read the CSV file
    df = pd.read_csv('/data/raw.csv')

    df = clean_dataframe(df)

    # Save the cleaned dataset
    df.to_csv('cleaned_dataset.csv', index=False)

    print("Data cleaning completed. Cleaned dataset saved as 'cleaned_dataset.csv'.")


Data cleaning completed. Cleaned dataset saved as 'cleaned_dataset.csv'.
