In [28]:
# --- Cell 1: Import Required Libraries ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [29]:
# --- Cell 2: Load Dataset ---
# Read the raw CSV into `df` and print its (rows, columns) shape as a quick
# sanity check. The path is relative, so it depends on the working directory.
raw_csv_path = "../Data/your_dataset.csv"
df = pd.read_csv(raw_csv_path)
print("✅ Raw dataset loaded. Shape:", df.shape)

✅ Raw dataset loaded. Shape: (3000, 25)


In [30]:
# --- Cell 3: Drop Duplicates ---
# Keep the first occurrence of every fully identical row (original index is
# preserved). .copy() yields an independent frame for later column assignments.
is_repeat = df.duplicated()
df = df.loc[~is_repeat].copy()
print("✅ Duplicates removed. Shape:", df.shape)

✅ Duplicates removed. Shape: (3000, 25)


In [31]:
# --- Cell 4: Replace Missing Value Indicators ---
# Any cell whose whole value is the literal "?" is treated as missing.
df = df.replace(to_replace="?", value=np.nan)
print("✅ Missing indicators replaced with NaN.")

✅ Missing indicators replaced with NaN.


In [32]:
# --- Cell 5: Convert Columns to Numeric ---
# Columns that are parsed as numbers; the list is reused by the imputation
# and scaling cells further down.
numeric_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot',
                'hemo', 'pcv', 'wc', 'rc']
# errors='coerce' turns every unparseable entry into NaN instead of raising.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
print("✅ Numeric conversion completed.")

✅ Numeric conversion completed.


In [33]:
# --- Cell 6: Encode Categorical/Binary Columns ---
# Two-level categorical columns are encoded as 0 (normal / absent / no / good)
# or 1 (abnormal / present / yes / poor).
binary_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad',
               'appet', 'pe', 'ane']
# Lookup is applied to stripped, lower-cased text. 'notpresent' is accepted
# next to 'not present' because both spellings occur in CKD exports
# (assumption about the input file — confirm against the raw data).
binary_map = {
    'normal': 0, 'abnormal': 1,
    'not present': 0, 'notpresent': 0, 'present': 1,
    'no': 0, 'yes': 1,
    'good': 0, 'poor': 1,
}
for col in binary_cols:
    raw = df[col]
    # Normalise whitespace/case so values like ' yes' or 'Yes\t' are not
    # silently turned into NaN by an exact-match lookup; real NaNs stay NaN.
    cleaned = raw.astype(str).str.strip().str.lower().where(raw.notna())
    encoded = cleaned.map(binary_map)
    # Anything still unmapped would be mode-imputed later without notice,
    # so surface it here instead of hiding it.
    unmapped = cleaned[encoded.isna() & cleaned.notna()].unique()
    if len(unmapped):
        print(f"⚠️ {col}: unrecognised values set to NaN: {sorted(unmapped)}")
    df[col] = encoded
print("✅ Binary columns encoded.")

✅ Binary columns encoded.


In [34]:
# --- Cell 7: Encode Target Column ---
# Encode the label as 1 (ckd) / 0 (notckd).
target_raw = df['classification']
# Strip whitespace and lower-case first: with an exact-match lookup a label
# such as 'ckd\t' would become NaN and the row would then be dropped as
# "missing target". Real NaNs stay NaN.
target_clean = target_raw.astype(str).str.strip().str.lower().where(target_raw.notna())
target_encoded = target_clean.map({'ckd': 1, 'notckd': 0})
# Report labels that still could not be mapped so row loss is visible.
unknown_labels = target_clean[target_encoded.isna() & target_clean.notna()].unique()
if len(unknown_labels):
    print(f"⚠️ classification: unrecognised labels set to NaN: {sorted(unknown_labels)}")
df['classification'] = target_encoded
print("✅ Target encoded.")

✅ Target encoded.


In [35]:
# --- Cell 8: Drop Rows with Missing Target ---
# Rows without a label cannot be used for supervised training, so keep only
# those where 'classification' is present (original index is preserved).
has_target = df['classification'].notna()
df = df.loc[has_target].copy()
print("✅ Dropped rows with missing target. Shape:", df.shape)

✅ Dropped rows with missing target. Shape: (3000, 25)


In [36]:
# --- Cell 9: Fill Missing Values ---
# Numeric columns are imputed with their column median, binary columns with
# their most frequent value (first row of .mode()).
numeric_medians = df[numeric_cols].median()
binary_modes = df[binary_cols].mode().iloc[0]
df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
df[binary_cols] = df[binary_cols].fillna(binary_modes)
print("✅ Missing values filled.")

✅ Missing values filled.


In [37]:
# --- Cell 10: Feature Scaling ---
# Standardise every column in numeric_cols; the fitted scaler is kept in
# `scaler` because a later cell persists it to disk.
# NOTE(review): the scaler is fitted on all of numeric_cols, while a later
# cell keeps only six feature columns — confirm whoever loads the saved
# scaler supplies the full numeric_cols layout in this order.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])
print("✅ Numeric features scaled.")

✅ Numeric features scaled.


In [38]:
# --- Cell 11: Select Final Columns ---
# Columns written to the cleaned dataset: six features plus the target.
final_cols = ['age', 'sc', 'hemo', 'al', 'bp', 'sg', 'classification']
df = df.loc[:, final_cols]
print("✅ Final selected columns:", final_cols)

✅ Final selected columns: ['age', 'sc', 'hemo', 'al', 'bp', 'sg', 'classification']


In [39]:
# --- Cell 12: Save Cleaned Dataset ---
# Write the processed frame as CSV; index=False keeps the row index out of
# the file.
df.to_csv(path_or_buf="../Data/cleaned_ckd_data.csv", index=False)
print("✅ Cleaned dataset saved to ../Data/cleaned_ckd_data.csv")

✅ Cleaned dataset saved to ../Data/cleaned_ckd_data.csv


In [40]:
# --- Cell 13: Save Scaler for Flask App ---
from pathlib import Path

import joblib

# Persist the fitted StandardScaler so it can be reloaded outside this notebook.
# NOTE(review): `scaler` was fitted on every column in numeric_cols, whereas
# the cleaned CSV keeps only six features — confirm the consumer of this file
# passes inputs in the full numeric_cols layout.
scaler_path = Path("../App/model/scaler.pkl")
# Create the target folder first; joblib.dump fails if the directory is missing.
scaler_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, scaler_path)
print("✅ Scaler saved to ../App/model/scaler.pkl")


✅ Scaler saved to ../App/model/scaler.pkl
