In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [2]:
# Absolute, machine-specific location of the raw CSV.
# NOTE(review): consider a configurable/relative data directory so the notebook runs on other machines.
raw_csv_path = r"C:\Users\Tejas\OneDrive\Desktop\chronic kidney disease\data\ckd-dataset-v2.csv"
df = pd.read_csv(raw_csv_path)

# Skip the first two rows, renumber the index from zero, and discard
# every column that holds no values at all.
df_cleaned = (
    df.iloc[2:]
    .reset_index(drop=True)
    .dropna(axis=1, how="all")
)
print("Cleaned dataset shape:", df_cleaned.shape)

Cleaned dataset shape: (200, 29)


In [3]:
# Canonical column names: surrounding whitespace trimmed, lower-cased,
# and inner spaces replaced by underscores.
normalized_names = df_cleaned.columns.str.strip()
normalized_names = normalized_names.str.lower()
df_cleaned.columns = normalized_names.str.replace(" ", "_")


In [4]:
def convert_to_numeric(val):
    """Convert a raw cell value into a float.

    Supported inputs:
      * numbers (int / float / numpy scalar) -> returned as ``float``
      * plain numeric strings, including negatives and scientific
        notation (e.g. ``"12"``, ``"-5"``, ``"1e-3"``)
      * ranges ``"low - high"`` (e.g. ``"1.019 - 1.021"``) -> midpoint
      * lower-bound thresholds ``"≥ x"`` -> ``x``
      * upper-bound thresholds ``"< x"`` -> ``x - 0.1``

    Parameters
    ----------
    val : object
        The raw value to convert.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the value cannot be
        interpreted (unsupported type, empty or malformed text).
    """
    # Values that are already numeric must pass through; testing them with
    # `'-' in val` would raise TypeError. bool is excluded so it is still
    # treated as unparseable.
    if isinstance(val, (int, float, np.integer, np.floating)) and not isinstance(val, bool):
        return float(val)
    if not isinstance(val, str):
        return np.nan

    text = val.strip()
    try:
        # Threshold markers are checked before anything involving '-' so
        # that a negative threshold such as "< -1" is not mistaken for a range.
        if '≥' in text:
            return float(text.split('≥')[-1].strip())
        if '<' in text:
            # Nudge just below the stated upper bound.
            return float(text.split('<')[-1].strip()) - 0.1

        try:
            # Plain numbers, including "-5" and "1e-3", which contain '-'
            # but are not ranges.
            return float(text)
        except ValueError:
            pass

        # Range "low - high": search for the separator from index 1 so a
        # leading minus sign on the low bound is not taken as the separator.
        sep = text.find('-', 1)
        if sep == -1:
            return np.nan
        low, high = text[:sep], text[sep + 1:]
        return (float(low.strip()) + float(high.strip())) / 2
    except ValueError:
        # Malformed numeric text; keep the best-effort NaN contract.
        return np.nan

In [5]:
# Columns whose raw values are parsed into floats by convert_to_numeric
numeric_cols = [
    'sg', 'al', 'su', 'bgr', 'bu',
    'sod', 'sc', 'pot', 'hemo', 'pcv',
    'rbcc', 'wbcc', 'grf', 'age', 'bp_(diastolic)',
]

# Parse each column, then fill whatever failed to parse (NaN) with that
# column's own median.
for name in numeric_cols:
    parsed = df_cleaned[name].apply(convert_to_numeric)
    df_cleaned[name] = parsed.fillna(parsed.median())

# Target column is cast to integers
df_cleaned['affected'] = df_cleaned['affected'].astype(int)

# Columns that are label-encoded later in the notebook
categorical_cols = [
    'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm',
    'cad', 'appet', 'pe', 'ane', 'bp_limit', 'stage',
]


In [6]:
# One dedicated encoder per categorical column, kept so the same mapping
# can be reused after training.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    # Encoders are fitted on the full frame, i.e. before the train/test split below.
    df_cleaned[name] = encoder.fit_transform(df_cleaned[name])

# Features are every column except 'class' and the target 'affected'
y = df_cleaned['affected']
X = df_cleaned.drop(columns=['class', 'affected'])

# 80/20 split, stratified on the target, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# Random-forest hyperparameters gathered in one place
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "min_samples_split": 5,
    "class_weight": 'balanced',
    "random_state": 42,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Score the held-out split: overall accuracy, then the per-class report
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
report_text = classification_report(y_test, y_pred)
print(report_text)

Accuracy: 0.97
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.96      1.00      0.98        26

    accuracy                           0.97        40
   macro avg       0.98      0.96      0.97        40
weighted avg       0.98      0.97      0.97        40



In [8]:
# Importance of each feature, labelled with the training column names
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(10)
print("\nTop 10 features:")
print(top_features)

# Persist the model, the feature order, and the fitted label encoders
# (file name -> object, written in this order).
artifacts = {
    "ckd_model_v3.pkl": model,
    "ckd_features_v3.pkl": X.columns.tolist(),
    "ckd_label_encoders_v3.pkl": label_encoders,
}
for filename, payload in artifacts.items():
    joblib.dump(payload, filename)

print("Model and artifacts saved successfully!")


Top 10 features:
hemo     0.208009
stage    0.147630
pcv      0.130460
grf      0.112026
sg       0.073784
al       0.057638
dm       0.054532
htn      0.053938
rbcc     0.050298
bu       0.022442
dtype: float64
Model and artifacts saved successfully!
