In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter

Step 1: Load Dataset

In [9]:
from google.colab import files
uploaded = files.upload()

df = pd.read_csv('/content/sample_data/kidney_disease.csv')

Saving kidney_disease.csv to kidney_disease (1).csv


**PRE-PROCESSING**

Step 2: Rename Columns

In [10]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

Step 3: Drop 'id' column

In [11]:
df.drop(columns=['id'], inplace=True)

Step 4: Clean messy strings

In [12]:
for col in df.columns:
    df[col] = df[col].astype(str).str.strip().str.replace('\t', '').replace('?', np.nan)


Step 5: Convert numeric-looking columns to numeric

In [13]:
for col in ['pcv', 'wc', 'rc']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

Step 6: Check and Fill Missing Values

In [14]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])  # fill categorical with mode
    else:
        df[col] = df[col].fillna(df[col].mean())     # fill numeric with mean


Step 7: Label Encode Categorical Columns

In [15]:
catcols = df.select_dtypes(include='object').columns

le = LabelEncoder()
for col in catcols:
    df[col] = le.fit_transform(df[col])

In [25]:
print("Preprocessing Completed!")
print("Final Dataset Shape:", df.shape)
print("First 5 Rows:\n")


Preprocessing Completed!
Final Dataset Shape: (400, 25)
First 5 Rows:



In [26]:
print(df.head())

   age  bp  sg  al  su  rbc  pc  pcc  ba  bgr  ...   pcv      wc        rc  \
0   36   8   3   1   0    1   2    1   1   21  ...  44.0  7800.0  5.200000   
1   60   5   3   4   0    1   2    1   1  146  ...  38.0  6000.0  4.707435   
2   52   8   1   2   3    2   2    1   1  114  ...  31.0  7500.0  4.707435   
3   36   7   0   4   0    2   0    2   1   17  ...  32.0  6700.0  3.900000   
4   40   8   1   2   0    2   2    1   1    6  ...  35.0  7300.0  4.600000   

   htn  dm  cad  appet  pe  ane  classification  
0    2   2    1      0   1    1               0  
1    1   1    1      0   1    1               0  
2    1   2    1      2   1    2               0  
3    2   1    1      2   2    2               0  
4    1   1    1      0   1    1               0  

[5 rows x 25 columns]
