In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import files
uploaded = files.upload()

Saving kidney_disease.csv to kidney_disease.csv


In [3]:
df = pd.read_csv('/content/kidney_disease.csv')

In [4]:
# Rename columns for better readability (correcting abbreviations)
column_rename_map = {
    'bp': 'blood_pressure',
    'sg': 'specific_gravity',
    'al': 'albumin',
    'su': 'sugar',
    'rbc': 'red_blood_cells',
    'pc': 'pus_cell',
    'pcc': 'pus_cell_clumps',
    'ba': 'bacteria',
    'bgr': 'blood_glucose_random',
    'bu': 'blood_urea',
    'sc': 'serum_creatinine',
    'sod': 'sodium',
    'pot': 'potassium',
    'hemo': 'hemoglobin',
    'pcv': 'packed_cell_volume',
    'wc': 'white_blood_cell_count',
    'rc': 'red_blood_cell_count',
    'htn': 'hypertension',
    'dm': 'diabetes_mellitus',
    'cad': 'coronary_artery_disease',
    'appet': 'appetite',
    'pe': 'pedal_edema',
    'ane': 'anemia'
}

df = df.rename(columns=column_rename_map)
print("\nRenamed columns:", df.columns.tolist())


Renamed columns: ['id', 'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium', 'potassium', 'hemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count', 'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'pedal_edema', 'anemia', 'classification']


In [5]:
print("\nInitial shape:", df.shape)
print("\nInitial info:")
print(df.info())


Initial shape: (400, 26)

Initial info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       400 non-null    int64  
 1   age                      391 non-null    float64
 2   blood_pressure           388 non-null    float64
 3   specific_gravity         353 non-null    float64
 4   albumin                  354 non-null    float64
 5   sugar                    351 non-null    float64
 6   red_blood_cells          248 non-null    object 
 7   pus_cell                 335 non-null    object 
 8   pus_cell_clumps          396 non-null    object 
 9   bacteria                 396 non-null    object 
 10  blood_glucose_random     356 non-null    float64
 11  blood_urea               381 non-null    float64
 12  serum_creatinine         383 non-null    float64
 13  sodium                   313 non-null  

In [6]:
print(df.isnull().sum())

id                           0
age                          9
blood_pressure              12
specific_gravity            47
albumin                     46
sugar                       49
red_blood_cells            152
pus_cell                    65
pus_cell_clumps              4
bacteria                     4
blood_glucose_random        44
blood_urea                  19
serum_creatinine            17
sodium                      87
potassium                   88
hemoglobin                  52
packed_cell_volume          70
white_blood_cell_count     105
red_blood_cell_count       130
hypertension                 2
diabetes_mellitus            2
coronary_artery_disease      2
appetite                     1
pedal_edema                  1
anemia                       1
classification               0
dtype: int64


In [7]:
# For numerical columns: fill with median
num_cols = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
            'blood_glucose_random', 'blood_urea', 'serum_creatinine',
            'sodium', 'potassium', 'hemoglobin', 'packed_cell_volume',
            'white_blood_cell_count', 'red_blood_cell_count']

for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, handling errors
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [8]:
# For categorical columns: fill with mode
cat_cols = ['red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
            'hypertension', 'diabetes_mellitus', 'coronary_artery_disease',
            'appetite', 'pedal_edema', 'anemia']

for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [9]:
print("\nNull values after handling:")
print(df.isnull().sum())


Null values after handling:
id                         0
age                        0
blood_pressure             0
specific_gravity           0
albumin                    0
sugar                      0
red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
blood_glucose_random       0
blood_urea                 0
serum_creatinine           0
sodium                     0
potassium                  0
hemoglobin                 0
packed_cell_volume         0
white_blood_cell_count     0
red_blood_cell_count       0
hypertension               0
diabetes_mellitus          0
coronary_artery_disease    0
appetite                   0
pedal_edema                0
anemia                     0
classification             0
dtype: int64


In [10]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
# Remove the target variable 'classification' from categorical columns
cat_cols.remove('classification')
print("\nCategorical columns:", cat_cols)

# Let's examine the unique values in each categorical column
print("\nUnique values in categorical columns:")
for col in cat_cols:
    print(f"{col}: {df[col].unique()}")


Categorical columns: ['red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria', 'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'pedal_edema', 'anemia']

Unique values in categorical columns:
red_blood_cells: ['normal' 'abnormal']
pus_cell: ['normal' 'abnormal']
pus_cell_clumps: ['notpresent' 'present']
bacteria: ['notpresent' 'present']
hypertension: ['yes' 'no']
diabetes_mellitus: ['yes' 'no' ' yes' '\tno' '\tyes']
coronary_artery_disease: ['no' 'yes' '\tno']
appetite: ['good' 'poor']
pedal_edema: ['no' 'yes']
anemia: ['no' 'yes']


In [11]:
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace({' yes': 'yes', '\tyes': 'yes', '\tno': 'no'})
df['coronary_artery_disease'] = df['coronary_artery_disease'].replace({'\tno': 'no'})
df['classification'] = df['classification'].replace({'ckd\t': 'ckd'})

In [12]:
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
    print(f"\n{col} after encoding:")
    print(df[col].value_counts())

# Also encode the target variable
le = LabelEncoder()
df['classification'] = le.fit_transform(df['classification'])
print("\nTarget variable after encoding:")
print(df['classification'].value_counts())



red_blood_cells after encoding:
red_blood_cells
1    353
0     47
Name: count, dtype: int64

pus_cell after encoding:
pus_cell
1    324
0     76
Name: count, dtype: int64

pus_cell_clumps after encoding:
pus_cell_clumps
0    358
1     42
Name: count, dtype: int64

bacteria after encoding:
bacteria
0    378
1     22
Name: count, dtype: int64

hypertension after encoding:
hypertension
0    253
1    147
Name: count, dtype: int64

diabetes_mellitus after encoding:
diabetes_mellitus
0    263
1    137
Name: count, dtype: int64

coronary_artery_disease after encoding:
coronary_artery_disease
0    366
1     34
Name: count, dtype: int64

appetite after encoding:
appetite
0    318
1     82
Name: count, dtype: int64

pedal_edema after encoding:
pedal_edema
0    324
1     76
Name: count, dtype: int64

anemia after encoding:
anemia
0    340
1     60
Name: count, dtype: int64

Target variable after encoding:
classification
0    250
1    150
Name: count, dtype: int64


In [13]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['blood_urea'] = scaler.fit_transform(df[['blood_urea']])

print("\nBlood urea after normalization:")
print(df['blood_urea'].describe())


Blood urea after normalization:
count    400.000000
mean       0.141702
std        0.126817
min        0.000000
25%        0.065469
50%        0.103979
75%        0.154685
max        1.000000
Name: blood_urea, dtype: float64


In [14]:
print("\nFinal null check:")
print(df.isnull().sum().sum())


Final null check:
0
